
Merge branch 'develop' of https://github.com/paboyle/Grid into merge

Chulwoo Jung 2018-03-07 15:24:11 -05:00
commit 0b63e2e9cd
175 changed files with 12512 additions and 4800 deletions

.gitignore

@ -93,6 +93,7 @@ build*/*
*.xcodeproj/*
build.sh
.vscode
+*.code-workspace
# Eigen source #
################
@ -123,3 +124,8 @@ make-bin-BUCK.sh
lib/qcd/spin/gamma-gen/*.h
lib/qcd/spin/gamma-gen/*.cc
+# vs code editor files #
+########################
+.vscode/
+.vscode/settings.json
+settings.json

.travis.yml

@ -44,3 +44,4 @@ script:
- make -j4
- ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
- make check

README.md

@ -187,10 +187,11 @@ Alternatively, some CPU codenames can be directly used:
| `<code>` | Description |
| ----------- | -------------------------------------- |
| `KNL` | [Intel Xeon Phi codename Knights Landing](http://ark.intel.com/products/codename/48999/Knights-Landing) |
+| `SKL` | [Intel Skylake with AVX512 extensions](https://ark.intel.com/products/codename/37572/Skylake#@server) |
| `BGQ` | Blue Gene/Q |
#### Notes:
-- We currently support AVX512 only for the Intel compiler. Support for GCC and clang will appear in future versions of Grid when the AVX512 support within GCC and clang will be more advanced.
+- We currently support AVX512 for the Intel compiler and GCC (KNL and SKL target). Support for clang will appear in future versions of Grid when the AVX512 support in the compiler will be more advanced.
- For BG/Q only [bgclang](http://trac.alcf.anl.gov/projects/llvm-bgq) is supported. We do not presently plan to support more compilers for this platform.
- BG/Q performances are currently rather poor. This is being investigated for future versions.
- The vector size for the `GEN` target can be specified with the `configure` script option `--enable-gen-simd-width`.

TODO

@ -1,20 +1,36 @@
TODO:
---------------
-Large item work list:
+Code item work list
+a) namespaces & indentation
+GRID_BEGIN_NAMESPACE();
+GRID_END_NAMESPACE();
+-- delete QCD namespace
+b) GPU branch
+- start branch
+- Increase Macro use in core library support; prepare for change
+- Audit volume of "device" code
+- Virtual function audit
+- Start port once Nvidia box is up
+- Cut down volume of code for first port? How?
+Physics item work list:
1)- BG/Q port and check ; Andrew says ok.
-2)- Christoph's local basis expansion Lanczos
---
-3a)- RNG I/O in ILDG/SciDAC (minor)
-3b)- Precision conversion and sort out localConvert <-- partial/easy
-3c)- Consistent linear solver flop count/rate -- PARTIAL, time but no flop/s yet
-4)- Physical propagator interface
-5)- Conserved currents
-6)- Multigrid Wilson and DWF, compare to other Multigrid implementations
-7)- HDCR resume
-----------------------------
+2)- Consistent linear solver flop count/rate -- PARTIAL, time but no flop/s yet
+3)- Physical propagator interface
+4)- Multigrid Wilson and DWF, compare to other Multigrid implementations
+5)- HDCR resume
Recent DONE
+-- RNG I/O in ILDG/SciDAC (minor)
+-- Precision conversion and sort out localConvert <-- partial/easy
+-- Conserved currents (Andrew)
+-- Split grid
+-- Christoph's local basis expansion Lanczos
-- MultiRHS with spread out extra dim -- Go through filesystem with SciDAC I/O ; <-- DONE ; bmark cori
-- Lanczos Remove DenseVector, DenseMatrix; Use Eigen instead. <-- DONE
-- GaugeFix into central location <-- DONE


@ -106,7 +106,7 @@ int main (int argc, char ** argv)
for(int i=0;i<Nloop;i++){ for(int i=0;i<Nloop;i++){
double start=usecond(); double start=usecond();
-std::vector<CartesianCommunicator::CommsRequest_t> requests;
+std::vector<CommsRequest_t> requests;
ncomm=0; ncomm=0;
for(int mu=0;mu<4;mu++){ for(int mu=0;mu<4;mu++){
@ -202,7 +202,7 @@ int main (int argc, char ** argv)
int recv_from_rank; int recv_from_rank;
{ {
-std::vector<CartesianCommunicator::CommsRequest_t> requests;
+std::vector<CommsRequest_t> requests;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
Grid.SendToRecvFromBegin(requests, Grid.SendToRecvFromBegin(requests,
(void *)&xbuf[mu][0], (void *)&xbuf[mu][0],
@ -215,7 +215,7 @@ int main (int argc, char ** argv)
comm_proc = mpi_layout[mu]-1; comm_proc = mpi_layout[mu]-1;
{ {
-std::vector<CartesianCommunicator::CommsRequest_t> requests;
+std::vector<CommsRequest_t> requests;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
Grid.SendToRecvFromBegin(requests, Grid.SendToRecvFromBegin(requests,
(void *)&xbuf[mu+4][0], (void *)&xbuf[mu+4][0],
@ -290,7 +290,7 @@ int main (int argc, char ** argv)
dbytes=0; dbytes=0;
ncomm=0; ncomm=0;
-std::vector<CartesianCommunicator::CommsRequest_t> requests;
+std::vector<CommsRequest_t> requests;
for(int mu=0;mu<4;mu++){ for(int mu=0;mu<4;mu++){
@ -383,7 +383,7 @@ int main (int argc, char ** argv)
for(int i=0;i<Nloop;i++){ for(int i=0;i<Nloop;i++){
double start=usecond(); double start=usecond();
-std::vector<CartesianCommunicator::CommsRequest_t> requests;
+std::vector<CommsRequest_t> requests;
dbytes=0; dbytes=0;
ncomm=0; ncomm=0;
for(int mu=0;mu<4;mu++){ for(int mu=0;mu<4;mu++){
@ -481,7 +481,7 @@ int main (int argc, char ** argv)
for(int i=0;i<Nloop;i++){ for(int i=0;i<Nloop;i++){
double start=usecond(); double start=usecond();
-std::vector<CartesianCommunicator::CommsRequest_t> requests;
+std::vector<CommsRequest_t> requests;
dbytes=0; dbytes=0;
ncomm=0; ncomm=0;
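The only substantive change in the hunks above is the declared element type of `requests`: the nested `CartesianCommunicator::CommsRequest_t` spelling is replaced by the plain `CommsRequest_t`. A minimal, self-contained sketch (hypothetical stand-in types, not Grid's actual headers) of the kind of alias arrangement under which both spellings name the same type, which would make the change purely cosmetic:

```cpp
#include <vector>

namespace comms {
    struct MpiRequest {};                              // stand-in for an MPI request handle
    using CommsRequest_t = MpiRequest;                 // namespace-scope alias: the short spelling

    class CartesianCommunicator {
    public:
        using CommsRequest_t = comms::CommsRequest_t;  // nested name kept for existing callers
    };
}

int main()
{
    std::vector<comms::CommsRequest_t>                        a; // new, shorter spelling
    std::vector<comms::CartesianCommunicator::CommsRequest_t> b; // old spelling, same type
    a.swap(b);  // legal only because the two element types are identical
    return 0;
}
```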


@ -48,7 +48,6 @@ int main (int argc, char ** argv)
int threads = GridThread::GetThreads(); int threads = GridThread::GetThreads();
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
std::vector<int> latt4 = GridDefaultLatt(); std::vector<int> latt4 = GridDefaultLatt();
int Ls=16; int Ls=16;
@ -57,6 +56,10 @@ int main (int argc, char ** argv)
std::stringstream ss(argv[i+1]); ss >> Ls; std::stringstream ss(argv[i+1]); ss >> Ls;
} }
GridLogLayout();
long unsigned int single_site_flops = 8*QCD::Nc*(7+16*QCD::Nc);
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
@ -187,7 +190,7 @@ int main (int argc, char ** argv)
FGrid->Barrier(); FGrid->Barrier();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-double flops=1344*volume*ncall;
+double flops=single_site_flops*volume*ncall;
std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl; std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
// std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl; // std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
@ -226,7 +229,7 @@ int main (int argc, char ** argv)
FGrid->Barrier(); FGrid->Barrier();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-double flops=1344*volume*ncall;
+double flops=single_site_flops*volume*ncall;
std::cout<<GridLogMessage << "Called half prec comms Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl; std::cout<<GridLogMessage << "Called half prec comms Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl; std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
@ -277,7 +280,7 @@ int main (int argc, char ** argv)
double t1=usecond(); double t1=usecond();
FGrid->Barrier(); FGrid->Barrier();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-double flops=1344*volume*ncall;
+double flops=single_site_flops*volume*ncall;
std::cout<<GridLogMessage << "Called Dw s_inner "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl; std::cout<<GridLogMessage << "Called Dw s_inner "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl; std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
@ -355,7 +358,7 @@ int main (int argc, char ** argv)
// sDw.stat.print(); // sDw.stat.print();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-double flops=(1344.0*volume*ncall)/2;
+double flops=(single_site_flops*volume*ncall)/2.0;
std::cout<<GridLogMessage << "sDeo mflop/s = "<< flops/(t1-t0)<<std::endl; std::cout<<GridLogMessage << "sDeo mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "sDeo mflop/s per rank "<< flops/(t1-t0)/NP<<std::endl; std::cout<<GridLogMessage << "sDeo mflop/s per rank "<< flops/(t1-t0)/NP<<std::endl;
@ -478,7 +481,7 @@ int main (int argc, char ** argv)
FGrid->Barrier(); FGrid->Barrier();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-double flops=(1344.0*volume*ncall)/2;
+double flops=(single_site_flops*volume*ncall)/2.0;
std::cout<<GridLogMessage << "Deo mflop/s = "<< flops/(t1-t0)<<std::endl; std::cout<<GridLogMessage << "Deo mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "Deo mflop/s per rank "<< flops/(t1-t0)/NP<<std::endl; std::cout<<GridLogMessage << "Deo mflop/s per rank "<< flops/(t1-t0)/NP<<std::endl;


@ -51,6 +51,7 @@ int main (int argc, char ** argv)
{ {
Grid_init(&argc,&argv); Grid_init(&argc,&argv);
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl; std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl; std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl; std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
@ -107,6 +108,7 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
long unsigned int single_site_flops = 8*QCD::Nc*(7+16*QCD::Nc);
std::vector<int> seeds4({1,2,3,4}); std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8}); std::vector<int> seeds5({5,6,7,8});
@ -196,7 +198,7 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
if ( ! report ) { if ( ! report ) {
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-double flops=1344*volume*ncall;
+double flops=single_site_flops*volume*ncall;
std::cout <<"\t"<<NP<< "\t"<<flops/(t1-t0)<< "\t"; std::cout <<"\t"<<NP<< "\t"<<flops/(t1-t0)<< "\t";
} }
@ -228,7 +230,7 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
if(!report){ if(!report){
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-double flops=(1344.0*volume*ncall)/2;
+double flops=(single_site_flops*volume*ncall)/2.0;
std::cout<< flops/(t1-t0); std::cout<< flops/(t1-t0);
} }
} }
@ -237,6 +239,7 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
#define CHECK_SDW #define CHECK_SDW
void benchsDw(std::vector<int> & latt4, int Ls, int threads, int report ) void benchsDw(std::vector<int> & latt4, int Ls, int threads, int report )
{ {
long unsigned int single_site_flops = 8*QCD::Nc*(7+16*QCD::Nc);
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
@ -321,7 +324,7 @@ void benchsDw(std::vector<int> & latt4, int Ls, int threads, int report )
Counter.Report(); Counter.Report();
} else { } else {
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-double flops=1344*volume*ncall;
+double flops=single_site_flops*volume*ncall;
std::cout<<"\t"<< flops/(t1-t0); std::cout<<"\t"<< flops/(t1-t0);
} }
@ -358,7 +361,7 @@ void benchsDw(std::vector<int> & latt4, int Ls, int threads, int report )
CounterSdw.Report(); CounterSdw.Report();
} else { } else {
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-double flops=(1344.0*volume*ncall)/2;
+double flops=(single_site_flops*volume*ncall)/2.0;
std::cout<<"\t"<< flops/(t1-t0); std::cout<<"\t"<< flops/(t1-t0);
} }
} }


@ -107,7 +107,7 @@ int main (int argc, char ** argv)
FGrid->Barrier(); FGrid->Barrier();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-double flops=2*1344*volume*ncall;
+double flops=2*1320*volume*ncall;
std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl; std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
// std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl; // std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
@ -134,7 +134,7 @@ int main (int argc, char ** argv)
FGrid->Barrier(); FGrid->Barrier();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-double flops=2*1344*volume*ncall;
+double flops=2*1320*volume*ncall;
std::cout<<GridLogMessage << "Called half prec comms Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl; std::cout<<GridLogMessage << "Called half prec comms Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl; std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
@ -174,7 +174,7 @@ int main (int argc, char ** argv)
FGrid_d->Barrier(); FGrid_d->Barrier();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-double flops=2*1344*volume*ncall;
+double flops=2*1320*volume*ncall;
std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl; std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
// std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl; // std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;

benchmarks/Benchmark_wilson.cc

@ -4,7 +4,7 @@
Source file: ./benchmarks/Benchmark_wilson.cc Source file: ./benchmarks/Benchmark_wilson.cc
-Copyright (C) 2015
+Copyright (C) 2018
Author: Peter Boyle <paboyle@ph.ed.ac.uk> Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk> Author: paboyle <paboyle@ph.ed.ac.uk>
@ -32,6 +32,9 @@ using namespace std;
using namespace Grid; using namespace Grid;
using namespace Grid::QCD; using namespace Grid::QCD;
#include "Grid/util/Profiling.h"
template<class d> template<class d>
struct scal { struct scal {
d internal; d internal;
@ -45,6 +48,7 @@ struct scal {
}; };
bool overlapComms = false; bool overlapComms = false;
bool perfProfiling = false;
int main (int argc, char ** argv) int main (int argc, char ** argv)
{ {
@ -53,6 +57,12 @@ int main (int argc, char ** argv)
if( GridCmdOptionExists(argv,argv+argc,"--asynch") ){ if( GridCmdOptionExists(argv,argv+argc,"--asynch") ){
overlapComms = true; overlapComms = true;
} }
if( GridCmdOptionExists(argv,argv+argc,"--perf") ){
perfProfiling = true;
}
long unsigned int single_site_flops = 8*QCD::Nc*(7+16*QCD::Nc);
std::vector<int> latt_size = GridDefaultLatt(); std::vector<int> latt_size = GridDefaultLatt();
std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
@ -61,10 +71,15 @@ int main (int argc, char ** argv)
GridRedBlackCartesian RBGrid(&Grid); GridRedBlackCartesian RBGrid(&Grid);
int threads = GridThread::GetThreads(); int threads = GridThread::GetThreads();
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
GridLogLayout();
std::cout<<GridLogMessage << "Grid floating point word size is REALF"<< sizeof(RealF)<<std::endl; std::cout<<GridLogMessage << "Grid floating point word size is REALF"<< sizeof(RealF)<<std::endl;
std::cout<<GridLogMessage << "Grid floating point word size is REALD"<< sizeof(RealD)<<std::endl; std::cout<<GridLogMessage << "Grid floating point word size is REALD"<< sizeof(RealD)<<std::endl;
std::cout<<GridLogMessage << "Grid floating point word size is REAL"<< sizeof(Real)<<std::endl; std::cout<<GridLogMessage << "Grid floating point word size is REAL"<< sizeof(Real)<<std::endl;
std::cout<<GridLogMessage << "Grid number of colours : "<< QCD::Nc <<std::endl;
std::cout<<GridLogMessage << "Benchmarking Wilson operator in the fundamental representation" << std::endl;
std::vector<int> seeds({1,2,3,4}); std::vector<int> seeds({1,2,3,4});
GridParallelRNG pRNG(&Grid); GridParallelRNG pRNG(&Grid);
@ -134,9 +149,25 @@ int main (int argc, char ** argv)
Dw.Dhop(src,result,0); Dw.Dhop(src,result,0);
} }
double t1=usecond(); double t1=usecond();
-double flops=1344*volume*ncall;
+double flops=single_site_flops*volume*ncall;
if (perfProfiling){
std::cout<<GridLogMessage << "Profiling Dw with perf"<<std::endl;
System::profile("kernel", [&]() {
for(int i=0;i<ncall;i++){
Dw.Dhop(src,result,0);
}
});
std::cout<<GridLogMessage << "Generated kernel.data"<<std::endl;
std::cout<<GridLogMessage << "Use with: perf report -i kernel.data"<<std::endl;
}
std::cout<<GridLogMessage << "Called Dw"<<std::endl; std::cout<<GridLogMessage << "Called Dw"<<std::endl;
std::cout<<GridLogMessage << "flops per site " << single_site_flops << std::endl;
std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl; std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl; std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl; std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;


@ -62,6 +62,7 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl; std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl; std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl; std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Number of colours "<< QCD::Nc <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking WilsonFermionR::Dhop "<<std::endl; std::cout << GridLogMessage<< "* Benchmarking WilsonFermionR::Dhop "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl; std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl; if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
@ -69,13 +70,15 @@ int main (int argc, char ** argv)
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl; if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl; if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl; if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage << "* OpenMP threads : "<< GridThread::GetThreads() <<std::endl;
std::cout << GridLogMessage << "* MPI tasks : "<< GridCmdVectorIntToString(mpi_layout) << std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl; std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout<<GridLogMessage << "============================================================================="<< std::endl; std::cout<<GridLogMessage << "================================================================================================="<< std::endl;
std::cout<<GridLogMessage << "= Benchmarking Wilson" << std::endl; std::cout<<GridLogMessage << "= Benchmarking Wilson operator in the fundamental representation" << std::endl;
std::cout<<GridLogMessage << "============================================================================="<< std::endl; std::cout<<GridLogMessage << "================================================================================================="<< std::endl;
std::cout<<GridLogMessage << "Volume\t\t\tWilson/MFLOPs\tWilsonDag/MFLOPs" << std::endl; std::cout<<GridLogMessage << "Volume\t\t\tWilson/MFLOPs\tWilsonDag/MFLOPs\tWilsonEO/MFLOPs\tWilsonDagEO/MFLOPs" << std::endl;
std::cout<<GridLogMessage << "============================================================================="<< std::endl; std::cout<<GridLogMessage << "================================================================================================="<< std::endl;
int Lmax = 32; int Lmax = 32;
int dmin = 0; int dmin = 0;
@ -98,12 +101,19 @@ int main (int argc, char ** argv)
GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(seeds); GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(seeds);
LatticeGaugeField Umu(&Grid); random(pRNG,Umu); LatticeGaugeField Umu(&Grid); random(pRNG,Umu);
LatticeFermion src(&Grid); random(pRNG,src); LatticeFermion src(&Grid); random(pRNG,src);
LatticeFermion src_o(&RBGrid); pickCheckerboard(Odd,src_o,src);
LatticeFermion result(&Grid); result=zero; LatticeFermion result(&Grid); result=zero;
LatticeFermion result_e(&RBGrid); result_e=zero;
double volume = std::accumulate(latt_size.begin(),latt_size.end(),1,std::multiplies<int>()); double volume = std::accumulate(latt_size.begin(),latt_size.end(),1,std::multiplies<int>());
WilsonFermionR Dw(Umu,Grid,RBGrid,mass,params); WilsonFermionR Dw(Umu,Grid,RBGrid,mass,params);
// Full operator
bench_wilson(src,result,Dw,volume,DaggerNo);
bench_wilson(src,result,Dw,volume,DaggerYes);
std::cout << "\t";
// EO
bench_wilson(src,result,Dw,volume,DaggerNo); bench_wilson(src,result,Dw,volume,DaggerNo);
bench_wilson(src,result,Dw,volume,DaggerYes); bench_wilson(src,result,Dw,volume,DaggerYes);
std::cout << std::endl; std::cout << std::endl;
@ -122,9 +132,26 @@ void bench_wilson (
int const dag ) int const dag )
{ {
int ncall = 1000; int ncall = 1000;
long unsigned int single_site_flops = 8*QCD::Nc*(7+16*QCD::Nc);
double t0 = usecond(); double t0 = usecond();
for(int i=0; i<ncall; i++) { Dw.Dhop(src,result,dag); } for(int i=0; i<ncall; i++) { Dw.Dhop(src,result,dag); }
double t1 = usecond(); double t1 = usecond();
-double flops = 1344 * volume * ncall;
+double flops = single_site_flops * volume * ncall;
std::cout << flops/(t1-t0) << "\t\t";
}
void bench_wilson_eo (
LatticeFermion & src,
LatticeFermion & result,
WilsonFermionR & Dw,
double const volume,
int const dag )
{
int ncall = 1000;
long unsigned int single_site_flops = 8*QCD::Nc*(7+16*QCD::Nc);
double t0 = usecond();
for(int i=0; i<ncall; i++) { Dw.DhopEO(src,result,dag); }
double t1 = usecond();
double flops = (single_site_flops * volume * ncall)/2.0;
std::cout << flops/(t1-t0) << "\t\t"; std::cout << flops/(t1-t0) << "\t\t";
} }
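The new bench_wilson_eo helper reports half of the full-operator count. Assuming, as the even-odd decomposition implies, that `DhopEO` updates only one checkerboard (half of the V lattice sites per application), the rate it prints corresponds to

$$
\mathrm{flops}_{\mathrm{EO}} \;=\; \frac{8\,N_c\,(7+16N_c)\;V\;n_{\mathrm{call}}}{2}
\;=\; \frac{1320\;V\;n_{\mathrm{call}}}{2} \quad (N_c = 3),
$$

with V the four-dimensional volume and n_call the number of calls, matching the `(single_site_flops * volume * ncall)/2.0` expression in the code.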

configure.ac

@ -340,15 +340,11 @@ case ${ac_PRECISION} in
esac esac
###################### Shared memory allocation technique under MPI3 ###################### Shared memory allocation technique under MPI3
-AC_ARG_ENABLE([shm],[AC_HELP_STRING([--enable-shm=shmget|shmopen|hugetlbfs],
+AC_ARG_ENABLE([shm],[AC_HELP_STRING([--enable-shm=shmopen|hugetlbfs],
[Select SHM allocation technique])],[ac_SHM=${enable_shm}],[ac_SHM=shmopen]) [Select SHM allocation technique])],[ac_SHM=${enable_shm}],[ac_SHM=shmopen])
case ${ac_SHM} in case ${ac_SHM} in
shmget)
AC_DEFINE([GRID_MPI3_SHMGET],[1],[GRID_MPI3_SHMGET] )
;;
shmopen) shmopen)
AC_DEFINE([GRID_MPI3_SHMOPEN],[1],[GRID_MPI3_SHMOPEN] ) AC_DEFINE([GRID_MPI3_SHMOPEN],[1],[GRID_MPI3_SHMOPEN] )
;; ;;
@ -370,7 +366,7 @@ AC_ARG_ENABLE([shmpath],[AC_HELP_STRING([--enable-shmpath=path],
AC_DEFINE_UNQUOTED([GRID_SHM_PATH],["$ac_SHMPATH"],[Path to a hugetlbfs filesystem for MMAPing]) AC_DEFINE_UNQUOTED([GRID_SHM_PATH],["$ac_SHMPATH"],[Path to a hugetlbfs filesystem for MMAPing])
############### communication type selection ############### communication type selection
-AC_ARG_ENABLE([comms],[AC_HELP_STRING([--enable-comms=none|mpi|mpi-auto|mpi3|mpi3-auto|shmem],
+AC_ARG_ENABLE([comms],[AC_HELP_STRING([--enable-comms=none|mpi|mpi-auto],
[Select communications])],[ac_COMMS=${enable_comms}],[ac_COMMS=none]) [Select communications])],[ac_COMMS=${enable_comms}],[ac_COMMS=none])
case ${ac_COMMS} in case ${ac_COMMS} in
@ -378,22 +374,10 @@ case ${ac_COMMS} in
AC_DEFINE([GRID_COMMS_NONE],[1],[GRID_COMMS_NONE] ) AC_DEFINE([GRID_COMMS_NONE],[1],[GRID_COMMS_NONE] )
comms_type='none' comms_type='none'
;; ;;
-mpi3*)
+mpi*)
AC_DEFINE([GRID_COMMS_MPI3],[1],[GRID_COMMS_MPI3] ) AC_DEFINE([GRID_COMMS_MPI3],[1],[GRID_COMMS_MPI3] )
comms_type='mpi3' comms_type='mpi3'
;; ;;
mpit)
AC_DEFINE([GRID_COMMS_MPIT],[1],[GRID_COMMS_MPIT] )
comms_type='mpit'
;;
mpi*)
AC_DEFINE([GRID_COMMS_MPI],[1],[GRID_COMMS_MPI] )
comms_type='mpi'
;;
shmem)
AC_DEFINE([GRID_COMMS_SHMEM],[1],[GRID_COMMS_SHMEM] )
comms_type='shmem'
;;
*) *)
AC_MSG_ERROR([${ac_COMMS} unsupported --enable-comms option]); AC_MSG_ERROR([${ac_COMMS} unsupported --enable-comms option]);
;; ;;

extras/Hadrons/Application.cc

@ -4,8 +4,7 @@ Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Application.cc Source file: extras/Hadrons/Application.cc
-Copyright (C) 2015
-Copyright (C) 2016
+Copyright (C) 2015-2018
Author: Antonin Portelli <antonin.portelli@me.com> Author: Antonin Portelli <antonin.portelli@me.com>
@ -43,12 +42,7 @@ using namespace Hadrons;
// constructors //////////////////////////////////////////////////////////////// // constructors ////////////////////////////////////////////////////////////////
Application::Application(void) Application::Application(void)
{ {
LOG(Message) << "Modules available:" << std::endl; initLogger();
auto list = ModuleFactory::getInstance().getBuilderList();
for (auto &m: list)
{
LOG(Message) << " " << m << std::endl;
}
auto dim = GridDefaultLatt(), mpi = GridDefaultMpi(), loc(dim); auto dim = GridDefaultLatt(), mpi = GridDefaultMpi(), loc(dim);
locVol_ = 1; locVol_ = 1;
for (unsigned int d = 0; d < dim.size(); ++d) for (unsigned int d = 0; d < dim.size(); ++d)
@ -73,12 +67,6 @@ Application::Application(const std::string parameterFileName)
parameterFileName_ = parameterFileName; parameterFileName_ = parameterFileName;
} }
// environment shortcut ////////////////////////////////////////////////////////
Environment & Application::env(void) const
{
return Environment::getInstance();
}
// access ////////////////////////////////////////////////////////////////////// // access //////////////////////////////////////////////////////////////////////
void Application::setPar(const Application::GlobalPar &par) void Application::setPar(const Application::GlobalPar &par)
{ {
@ -94,14 +82,13 @@ const Application::GlobalPar & Application::getPar(void)
// execute ///////////////////////////////////////////////////////////////////// // execute /////////////////////////////////////////////////////////////////////
void Application::run(void) void Application::run(void)
{ {
-if (!parameterFileName_.empty() and (env().getNModule() == 0))
+if (!parameterFileName_.empty() and (vm().getNModule() == 0))
{ {
parseParameterFile(parameterFileName_); parseParameterFile(parameterFileName_);
} }
-if (!scheduled_)
-{
+vm().printContent();
+env().printContent();
schedule();
-}
printSchedule(); printSchedule();
configLoop(); configLoop();
} }
@ -124,12 +111,20 @@ void Application::parseParameterFile(const std::string parameterFileName)
LOG(Message) << "Building application from '" << parameterFileName << "'..." << std::endl; LOG(Message) << "Building application from '" << parameterFileName << "'..." << std::endl;
read(reader, "parameters", par); read(reader, "parameters", par);
setPar(par); setPar(par);
push(reader, "modules"); if (!push(reader, "modules"))
push(reader, "module"); {
HADRON_ERROR(Parsing, "Cannot open node 'modules' in parameter file '"
+ parameterFileName + "'");
}
if (!push(reader, "module"))
{
HADRON_ERROR(Parsing, "Cannot open node 'modules/module' in parameter file '"
+ parameterFileName + "'");
}
do do
{ {
read(reader, "id", id); read(reader, "id", id);
-env().createModule(id.name, id.type, reader);
+vm().createModule(id.name, id.type, reader);
} while (reader.nextElement("module")); } while (reader.nextElement("module"));
pop(reader); pop(reader);
pop(reader); pop(reader);
@ -139,7 +134,7 @@ void Application::saveParameterFile(const std::string parameterFileName)
{ {
XmlWriter writer(parameterFileName); XmlWriter writer(parameterFileName);
ObjectId id; ObjectId id;
-const unsigned int nMod = env().getNModule();
+const unsigned int nMod = vm().getNModule();
LOG(Message) << "Saving application to '" << parameterFileName << "'..." << std::endl; LOG(Message) << "Saving application to '" << parameterFileName << "'..." << std::endl;
write(writer, "parameters", getPar()); write(writer, "parameters", getPar());
@ -147,10 +142,10 @@ void Application::saveParameterFile(const std::string parameterFileName)
for (unsigned int i = 0; i < nMod; ++i) for (unsigned int i = 0; i < nMod; ++i)
{ {
push(writer, "module"); push(writer, "module");
-id.name = env().getModuleName(i);
-id.type = env().getModule(i)->getRegisteredName();
+id.name = vm().getModuleName(i);
+id.type = vm().getModule(i)->getRegisteredName();
write(writer, "id", id);
-env().getModule(i)->saveParameters(writer, "options");
+vm().getModule(i)->saveParameters(writer, "options");
pop(writer); pop(writer);
} }
pop(writer); pop(writer);
@ -158,96 +153,14 @@ void Application::saveParameterFile(const std::string parameterFileName)
} }
// schedule computation //////////////////////////////////////////////////////// // schedule computation ////////////////////////////////////////////////////////
#define MEM_MSG(size)\
sizeString((size)*locVol_) << " (" << sizeString(size) << "/site)"
#define DEFINE_MEMPEAK \
GeneticScheduler<unsigned int>::ObjFunc memPeak = \
[this](const std::vector<unsigned int> &program)\
{\
unsigned int memPeak;\
bool msg;\
\
msg = HadronsLogMessage.isActive();\
HadronsLogMessage.Active(false);\
env().dryRun(true);\
memPeak = env().executeProgram(program);\
env().dryRun(false);\
env().freeAll();\
HadronsLogMessage.Active(true);\
\
return memPeak;\
}
void Application::schedule(void) void Application::schedule(void)
{ {
-DEFINE_MEMPEAK;
+if (!scheduled_ and !loadedSchedule_)
// build module dependency graph
LOG(Message) << "Building module graph..." << std::endl;
auto graph = env().makeModuleGraph();
auto con = graph.getConnectedComponents();
// constrained topological sort using a genetic algorithm
LOG(Message) << "Scheduling computation..." << std::endl;
LOG(Message) << " #module= " << graph.size() << std::endl;
LOG(Message) << " population size= " << par_.genetic.popSize << std::endl;
LOG(Message) << " max. generation= " << par_.genetic.maxGen << std::endl;
LOG(Message) << " max. cst. generation= " << par_.genetic.maxCstGen << std::endl;
LOG(Message) << " mutation rate= " << par_.genetic.mutationRate << std::endl;
unsigned int k = 0, gen, prevPeak, nCstPeak = 0;
std::random_device rd;
GeneticScheduler<unsigned int>::Parameters par;
par.popSize = par_.genetic.popSize;
par.mutationRate = par_.genetic.mutationRate;
par.seed = rd();
memPeak_ = 0;
CartesianCommunicator::BroadcastWorld(0, &(par.seed), sizeof(par.seed));
for (unsigned int i = 0; i < con.size(); ++i)
{ {
-GeneticScheduler<unsigned int> scheduler(con[i], memPeak, par);
+program_ = vm().schedule(par_.genetic);
gen = 0;
do
{
LOG(Debug) << "Generation " << gen << ":" << std::endl;
scheduler.nextGeneration();
if (gen != 0)
{
if (prevPeak == scheduler.getMinValue())
{
nCstPeak++;
}
else
{
nCstPeak = 0;
}
}
prevPeak = scheduler.getMinValue();
if (gen % 10 == 0)
{
LOG(Iterative) << "Generation " << gen << ": "
<< MEM_MSG(scheduler.getMinValue()) << std::endl;
}
gen++;
} while ((gen < par_.genetic.maxGen)
and (nCstPeak < par_.genetic.maxCstGen));
auto &t = scheduler.getMinSchedule();
if (scheduler.getMinValue() > memPeak_)
{
memPeak_ = scheduler.getMinValue();
}
for (unsigned int j = 0; j < t.size(); ++j)
{
program_.push_back(t[j]);
}
}
scheduled_ = true; scheduled_ = true;
} }
}
void Application::saveSchedule(const std::string filename) void Application::saveSchedule(const std::string filename)
{ {
@ -256,21 +169,19 @@ void Application::saveSchedule(const std::string filename)
if (!scheduled_) if (!scheduled_)
{ {
HADRON_ERROR("Computation not scheduled"); HADRON_ERROR(Definition, "Computation not scheduled");
} }
LOG(Message) << "Saving current schedule to '" << filename << "'..." LOG(Message) << "Saving current schedule to '" << filename << "'..."
<< std::endl; << std::endl;
for (auto address: program_) for (auto address: program_)
{ {
-program.push_back(env().getModuleName(address));
+program.push_back(vm().getModuleName(address));
} }
write(writer, "schedule", program); write(writer, "schedule", program);
} }
void Application::loadSchedule(const std::string filename) void Application::loadSchedule(const std::string filename)
{ {
DEFINE_MEMPEAK;
TextReader reader(filename); TextReader reader(filename);
std::vector<std::string> program; std::vector<std::string> program;
@ -280,24 +191,24 @@ void Application::loadSchedule(const std::string filename)
program_.clear(); program_.clear();
for (auto &name: program) for (auto &name: program)
{ {
-program_.push_back(env().getModuleAddress(name));
+program_.push_back(vm().getModuleAddress(name));
} }
-scheduled_ = true;
+loadedSchedule_ = true;
memPeak_ = memPeak(program_);
} }
void Application::printSchedule(void) void Application::printSchedule(void)
{ {
if (!scheduled_) if (!scheduled_)
{ {
HADRON_ERROR("Computation not scheduled"); HADRON_ERROR(Definition, "Computation not scheduled");
} }
LOG(Message) << "Schedule (memory peak: " << MEM_MSG(memPeak_) << "):" auto peak = vm().memoryNeeded(program_);
LOG(Message) << "Schedule (memory needed: " << sizeString(peak) << "):"
<< std::endl; << std::endl;
for (unsigned int i = 0; i < program_.size(); ++i) for (unsigned int i = 0; i < program_.size(); ++i)
{ {
LOG(Message) << std::setw(4) << i + 1 << ": " LOG(Message) << std::setw(4) << i + 1 << ": "
-<< env().getModuleName(program_[i]) << std::endl;
+<< vm().getModuleName(program_[i]) << std::endl;
} }
} }
@ -310,8 +221,8 @@ void Application::configLoop(void)
{ {
LOG(Message) << BIG_SEP << " Starting measurement for trajectory " << t LOG(Message) << BIG_SEP << " Starting measurement for trajectory " << t
<< " " << BIG_SEP << std::endl; << " " << BIG_SEP << std::endl;
-env().setTrajectory(t);
-env().executeProgram(program_);
+vm().setTrajectory(t);
+vm().executeProgram(program_);
} }
LOG(Message) << BIG_SEP << " End of measurement " << BIG_SEP << std::endl; LOG(Message) << BIG_SEP << " End of measurement " << BIG_SEP << std::endl;
env().freeAll(); env().freeAll();
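One systematic change in this file (and in Environment.cc further down) is that `HADRON_ERROR` now takes an error category as its first argument, e.g. `HADRON_ERROR(Definition, ...)` or `HADRON_ERROR(Parsing, ...)`. A minimal sketch of the shape of such a two-argument macro, using a string-tagged exception; this is illustrative only and is not Hadrons' actual definition, which is not shown in this diff:

```cpp
#include <iostream>
#include <stdexcept>
#include <string>

// Hypothetical stand-in for the real macro defined in the Hadrons headers.
#define HADRON_ERROR_SKETCH(cat, msg) \
    throw std::runtime_error(std::string(#cat) + " error: " + (msg))

int main()
{
    try {
        HADRON_ERROR_SKETCH(Parsing, "Cannot open node 'modules'");
    } catch (const std::exception &e) {
        std::cout << e.what() << std::endl;  // prints: Parsing error: Cannot open node 'modules'
    }
    return 0;
}
```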

extras/Hadrons/Application.hpp

@ -4,8 +4,7 @@ Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Application.hpp Source file: extras/Hadrons/Application.hpp
-Copyright (C) 2015
-Copyright (C) 2016
+Copyright (C) 2015-2018
Author: Antonin Portelli <antonin.portelli@me.com> Author: Antonin Portelli <antonin.portelli@me.com>
@ -31,8 +30,7 @@ See the full license in the file "LICENSE" in the top level distribution directo
#define Hadrons_Application_hpp_ #define Hadrons_Application_hpp_
#include <Grid/Hadrons/Global.hpp> #include <Grid/Hadrons/Global.hpp>
-#include <Grid/Hadrons/Environment.hpp>
+#include <Grid/Hadrons/VirtualMachine.hpp>
#include <Grid/Hadrons/ModuleFactory.hpp>
#include <Grid/Hadrons/Modules.hpp> #include <Grid/Hadrons/Modules.hpp>
BEGIN_HADRONS_NAMESPACE BEGIN_HADRONS_NAMESPACE
@ -51,24 +49,12 @@ public:
unsigned int, end, unsigned int, end,
unsigned int, step); unsigned int, step);
}; };
class GeneticPar: Serializable
{
public:
GeneticPar(void):
popSize{20}, maxGen{1000}, maxCstGen{100}, mutationRate{.1} {};
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(GeneticPar,
unsigned int, popSize,
unsigned int, maxGen,
unsigned int, maxCstGen,
double , mutationRate);
};
class GlobalPar: Serializable class GlobalPar: Serializable
{ {
public: public:
GRID_SERIALIZABLE_CLASS_MEMBERS(GlobalPar, GRID_SERIALIZABLE_CLASS_MEMBERS(GlobalPar,
TrajRange, trajCounter, TrajRange, trajCounter,
-GeneticPar, genetic,
+VirtualMachine::GeneticPar, genetic,
std::string, seed); std::string, seed);
}; };
public: public:
@ -100,14 +86,15 @@ public:
void configLoop(void); void configLoop(void);
private: private:
// environment shortcut // environment shortcut
-Environment & env(void) const;
+DEFINE_ENV_ALIAS;
// virtual machine shortcut
DEFINE_VM_ALIAS;
private: private:
long unsigned int locVol_; long unsigned int locVol_;
std::string parameterFileName_{""}; std::string parameterFileName_{""};
GlobalPar par_; GlobalPar par_;
-std::vector<unsigned int> program_;
-Environment::Size memPeak_;
+VirtualMachine::Program program_;
+bool scheduled_{false}, loadedSchedule_{false};
bool scheduled_{false};
}; };
/****************************************************************************** /******************************************************************************
@ -117,14 +104,16 @@ private:
template <typename M> template <typename M>
void Application::createModule(const std::string name) void Application::createModule(const std::string name)
{ {
-env().createModule<M>(name);
+vm().createModule<M>(name);
scheduled_ = false;
} }
template <typename M> template <typename M>
void Application::createModule(const std::string name, void Application::createModule(const std::string name,
const typename M::Par &par) const typename M::Par &par)
{ {
-env().createModule<M>(name, par);
+vm().createModule<M>(name, par);
scheduled_ = false;
} }
END_HADRONS_NAMESPACE END_HADRONS_NAMESPACE

extras/Hadrons/Environment.cc

@ -4,8 +4,7 @@ Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Environment.cc Source file: extras/Hadrons/Environment.cc
-Copyright (C) 2015
-Copyright (C) 2016
+Copyright (C) 2015-2018
Author: Antonin Portelli <antonin.portelli@me.com> Author: Antonin Portelli <antonin.portelli@me.com>
@ -35,6 +34,9 @@ using namespace Grid;
using namespace QCD; using namespace QCD;
using namespace Hadrons; using namespace Hadrons;
#define ERROR_NO_ADDRESS(address)\
HADRON_ERROR(Definition, "no object with address " + std::to_string(address));
/****************************************************************************** /******************************************************************************
* Environment implementation * * Environment implementation *
******************************************************************************/ ******************************************************************************/
@ -56,28 +58,6 @@ Environment::Environment(void)
rng4d_.reset(new GridParallelRNG(grid4d_.get())); rng4d_.reset(new GridParallelRNG(grid4d_.get()));
} }
// dry run /////////////////////////////////////////////////////////////////////
void Environment::dryRun(const bool isDry)
{
dryRun_ = isDry;
}
bool Environment::isDryRun(void) const
{
return dryRun_;
}
// trajectory number ///////////////////////////////////////////////////////////
void Environment::setTrajectory(const unsigned int traj)
{
traj_ = traj;
}
unsigned int Environment::getTrajectory(void) const
{
return traj_;
}
// grids /////////////////////////////////////////////////////////////////////// // grids ///////////////////////////////////////////////////////////////////////
void Environment::createGrid(const unsigned int Ls) void Environment::createGrid(const unsigned int Ls)
{ {
@ -105,7 +85,7 @@ GridCartesian * Environment::getGrid(const unsigned int Ls) const
} }
catch(std::out_of_range &) catch(std::out_of_range &)
{ {
HADRON_ERROR("no grid with Ls= " << Ls); HADRON_ERROR(Definition, "no grid with Ls= " + std::to_string(Ls));
} }
} }
@ -124,7 +104,7 @@ GridRedBlackCartesian * Environment::getRbGrid(const unsigned int Ls) const
} }
catch(std::out_of_range &) catch(std::out_of_range &)
{ {
HADRON_ERROR("no red-black 5D grid with Ls= " << Ls); HADRON_ERROR(Definition, "no red-black 5D grid with Ls= " + std::to_string(Ls));
} }
} }
@ -143,6 +123,11 @@ int Environment::getDim(const unsigned int mu) const
return dim_[mu]; return dim_[mu];
} }
unsigned long int Environment::getLocalVolume(void) const
{
return locVol_;
}
// random number generator ///////////////////////////////////////////////////// // random number generator /////////////////////////////////////////////////////
void Environment::setSeed(const std::vector<int> &seed) void Environment::setSeed(const std::vector<int> &seed)
{ {
@ -154,291 +139,6 @@ GridParallelRNG * Environment::get4dRng(void) const
return rng4d_.get(); return rng4d_.get();
} }
// module management ///////////////////////////////////////////////////////////
void Environment::pushModule(Environment::ModPt &pt)
{
std::string name = pt->getName();
if (!hasModule(name))
{
std::vector<unsigned int> inputAddress;
unsigned int address;
ModuleInfo m;
m.data = std::move(pt);
m.type = typeIdPt(*m.data.get());
m.name = name;
auto input = m.data->getInput();
for (auto &in: input)
{
if (!hasObject(in))
{
addObject(in , -1);
}
m.input.push_back(objectAddress_[in]);
}
auto output = m.data->getOutput();
module_.push_back(std::move(m));
address = static_cast<unsigned int>(module_.size() - 1);
moduleAddress_[name] = address;
for (auto &out: output)
{
if (!hasObject(out))
{
addObject(out, address);
}
else
{
if (object_[objectAddress_[out]].module < 0)
{
object_[objectAddress_[out]].module = address;
}
else
{
HADRON_ERROR("object '" + out
+ "' is already produced by module '"
+ module_[object_[getObjectAddress(out)].module].name
+ "' (while pushing module '" + name + "')");
}
}
}
}
else
{
HADRON_ERROR("module '" + name + "' already exists");
}
}
unsigned int Environment::getNModule(void) const
{
return module_.size();
}
void Environment::createModule(const std::string name, const std::string type,
XmlReader &reader)
{
auto &factory = ModuleFactory::getInstance();
auto pt = factory.create(type, name);
pt->parseParameters(reader, "options");
pushModule(pt);
}
ModuleBase * Environment::getModule(const unsigned int address) const
{
if (hasModule(address))
{
return module_[address].data.get();
}
else
{
HADRON_ERROR("no module with address " + std::to_string(address));
}
}
ModuleBase * Environment::getModule(const std::string name) const
{
return getModule(getModuleAddress(name));
}
unsigned int Environment::getModuleAddress(const std::string name) const
{
if (hasModule(name))
{
return moduleAddress_.at(name);
}
else
{
HADRON_ERROR("no module with name '" + name + "'");
}
}
std::string Environment::getModuleName(const unsigned int address) const
{
if (hasModule(address))
{
return module_[address].name;
}
else
{
HADRON_ERROR("no module with address " + std::to_string(address));
}
}
std::string Environment::getModuleType(const unsigned int address) const
{
if (hasModule(address))
{
return typeName(module_[address].type);
}
else
{
HADRON_ERROR("no module with address " + std::to_string(address));
}
}
std::string Environment::getModuleType(const std::string name) const
{
return getModuleType(getModuleAddress(name));
}
std::string Environment::getModuleNamespace(const unsigned int address) const
{
std::string type = getModuleType(address), ns;
auto pos2 = type.rfind("::");
auto pos1 = type.rfind("::", pos2 - 2);
return type.substr(pos1 + 2, pos2 - pos1 - 2);
}
std::string Environment::getModuleNamespace(const std::string name) const
{
return getModuleNamespace(getModuleAddress(name));
}
bool Environment::hasModule(const unsigned int address) const
{
return (address < module_.size());
}
bool Environment::hasModule(const std::string name) const
{
return (moduleAddress_.find(name) != moduleAddress_.end());
}
Graph<unsigned int> Environment::makeModuleGraph(void) const
{
Graph<unsigned int> moduleGraph;
for (unsigned int i = 0; i < module_.size(); ++i)
{
moduleGraph.addVertex(i);
for (auto &j: module_[i].input)
{
moduleGraph.addEdge(object_[j].module, i);
}
}
return moduleGraph;
}
#define BIG_SEP "==============="
#define SEP "---------------"
#define MEM_MSG(size)\
sizeString((size)*locVol_) << " (" << sizeString(size) << "/site)"
Environment::Size
Environment::executeProgram(const std::vector<unsigned int> &p)
{
Size memPeak = 0, sizeBefore, sizeAfter;
std::vector<std::set<unsigned int>> freeProg;
bool continueCollect, nothingFreed;
// build garbage collection schedule
freeProg.resize(p.size());
for (unsigned int i = 0; i < object_.size(); ++i)
{
auto pred = [i, this](const unsigned int j)
{
auto &in = module_[j].input;
auto it = std::find(in.begin(), in.end(), i);
return (it != in.end()) or (j == object_[i].module);
};
auto it = std::find_if(p.rbegin(), p.rend(), pred);
if (it != p.rend())
{
freeProg[p.rend() - it - 1].insert(i);
}
}
// program execution
for (unsigned int i = 0; i < p.size(); ++i)
{
// execute module
if (!isDryRun())
{
LOG(Message) << SEP << " Measurement step " << i+1 << "/"
<< p.size() << " (module '" << module_[p[i]].name
<< "') " << SEP << std::endl;
}
(*module_[p[i]].data)();
sizeBefore = getTotalSize();
// print used memory after execution
if (!isDryRun())
{
LOG(Message) << "Allocated objects: " << MEM_MSG(sizeBefore)
<< std::endl;
}
if (sizeBefore > memPeak)
{
memPeak = sizeBefore;
}
// garbage collection for step i
if (!isDryRun())
{
LOG(Message) << "Garbage collection..." << std::endl;
}
nothingFreed = true;
do
{
continueCollect = false;
auto toFree = freeProg[i];
for (auto &j: toFree)
{
// continue garbage collection while there are still
// objects without owners
continueCollect = continueCollect or !hasOwners(j);
if(freeObject(j))
{
// if an object has been freed, remove it from
// the garbage collection schedule
freeProg[i].erase(j);
nothingFreed = false;
}
}
} while (continueCollect);
// any remaining objects in step i garbage collection schedule
// is scheduled for step i + 1
if (i + 1 < p.size())
{
for (auto &j: freeProg[i])
{
freeProg[i + 1].insert(j);
}
}
// print used memory after garbage collection if necessary
if (!isDryRun())
{
sizeAfter = getTotalSize();
if (sizeBefore != sizeAfter)
{
LOG(Message) << "Allocated objects: " << MEM_MSG(sizeAfter)
<< std::endl;
}
else
{
LOG(Message) << "Nothing to free" << std::endl;
}
}
}
return memPeak;
}
Environment::Size Environment::executeProgram(const std::vector<std::string> &p)
{
std::vector<unsigned int> pAddress;
for (auto &n: p)
{
pAddress.push_back(getModuleAddress(n));
}
return executeProgram(pAddress);
}
// general memory management /////////////////////////////////////////////////// // general memory management ///////////////////////////////////////////////////
void Environment::addObject(const std::string name, const int moduleAddress) void Environment::addObject(const std::string name, const int moduleAddress)
{ {
@ -448,46 +148,25 @@ void Environment::addObject(const std::string name, const int moduleAddress)
info.name = name; info.name = name;
info.module = moduleAddress; info.module = moduleAddress;
info.data = nullptr;
object_.push_back(std::move(info)); object_.push_back(std::move(info));
objectAddress_[name] = static_cast<unsigned int>(object_.size() - 1); objectAddress_[name] = static_cast<unsigned int>(object_.size() - 1);
} }
else else
{ {
HADRON_ERROR("object '" + name + "' already exists"); HADRON_ERROR(Definition, "object '" + name + "' already exists");
} }
} }
-void Environment::registerObject(const unsigned int address,
-const unsigned int size, const unsigned int Ls)
+void Environment::setObjectModule(const unsigned int objAddress,
+const int modAddress)
{
-if (!hasRegisteredObject(address))
+object_[objAddress].module = modAddress;
{
if (hasObject(address))
{
object_[address].size = size;
object_[address].Ls = Ls;
object_[address].isRegistered = true;
}
else
{
HADRON_ERROR("no object with address " + std::to_string(address));
}
}
else
{
HADRON_ERROR("object with address " + std::to_string(address)
+ " already registered");
}
} }
-void Environment::registerObject(const std::string name,
-const unsigned int size, const unsigned int Ls)
+unsigned int Environment::getMaxAddress(void) const
{
-if (!hasObject(name))
+return object_.size();
{
addObject(name);
}
registerObject(getObjectAddress(name), size, Ls);
} }
unsigned int Environment::getObjectAddress(const std::string name) const unsigned int Environment::getObjectAddress(const std::string name) const
@ -498,7 +177,7 @@ unsigned int Environment::getObjectAddress(const std::string name) const
} }
else else
{ {
HADRON_ERROR("no object with name '" + name + "'"); HADRON_ERROR(Definition, "no object with name '" + name + "'");
} }
} }
@ -510,13 +189,13 @@ std::string Environment::getObjectName(const unsigned int address) const
} }
else else
{ {
HADRON_ERROR("no object with address " + std::to_string(address)); ERROR_NO_ADDRESS(address);
} }
} }
std::string Environment::getObjectType(const unsigned int address) const std::string Environment::getObjectType(const unsigned int address) const
{ {
-if (hasRegisteredObject(address))
+if (hasObject(address))
{ {
if (object_[address].type) if (object_[address].type)
{ {
@ -527,14 +206,9 @@ std::string Environment::getObjectType(const unsigned int address) const
return "<no type>"; return "<no type>";
} }
} }
else if (hasObject(address))
{
HADRON_ERROR("object with address " + std::to_string(address)
+ " exists but is not registered");
}
else else
{ {
HADRON_ERROR("no object with address " + std::to_string(address)); ERROR_NO_ADDRESS(address);
} }
} }
@ -545,18 +219,13 @@ std::string Environment::getObjectType(const std::string name) const
Environment::Size Environment::getObjectSize(const unsigned int address) const Environment::Size Environment::getObjectSize(const unsigned int address) const
{ {
-if (hasRegisteredObject(address))
+if (hasObject(address))
{ {
return object_[address].size; return object_[address].size;
} }
else if (hasObject(address))
{
HADRON_ERROR("object with address " + std::to_string(address)
+ " exists but is not registered");
}
else else
{ {
HADRON_ERROR("no object with address " + std::to_string(address)); ERROR_NO_ADDRESS(address);
} }
} }
@ -565,7 +234,24 @@ Environment::Size Environment::getObjectSize(const std::string name) const
return getObjectSize(getObjectAddress(name)); return getObjectSize(getObjectAddress(name));
} }
-unsigned int Environment::getObjectModule(const unsigned int address) const
+Environment::Storage Environment::getObjectStorage(const unsigned int address) const
{
if (hasObject(address))
{
return object_[address].storage;
}
else
{
ERROR_NO_ADDRESS(address);
}
}
Environment::Storage Environment::getObjectStorage(const std::string name) const
{
return getObjectStorage(getObjectAddress(name));
}
int Environment::getObjectModule(const unsigned int address) const
{ {
if (hasObject(address)) if (hasObject(address))
{ {
@ -573,29 +259,24 @@ unsigned int Environment::getObjectModule(const unsigned int address) const
} }
else else
{ {
HADRON_ERROR("no object with address " + std::to_string(address)); ERROR_NO_ADDRESS(address);
} }
} }
-unsigned int Environment::getObjectModule(const std::string name) const
+int Environment::getObjectModule(const std::string name) const
{ {
return getObjectModule(getObjectAddress(name)); return getObjectModule(getObjectAddress(name));
} }
unsigned int Environment::getObjectLs(const unsigned int address) const unsigned int Environment::getObjectLs(const unsigned int address) const
{ {
-if (hasRegisteredObject(address))
+if (hasObject(address))
{ {
return object_[address].Ls; return object_[address].Ls;
} }
else if (hasObject(address))
{
HADRON_ERROR("object with address " + std::to_string(address)
+ " exists but is not registered");
}
else else
{ {
HADRON_ERROR("no object with address " + std::to_string(address)); ERROR_NO_ADDRESS(address);
} }
} }
@ -616,30 +297,6 @@ bool Environment::hasObject(const std::string name) const
return ((it != objectAddress_.end()) and hasObject(it->second)); return ((it != objectAddress_.end()) and hasObject(it->second));
} }
bool Environment::hasRegisteredObject(const unsigned int address) const
{
if (hasObject(address))
{
return object_[address].isRegistered;
}
else
{
return false;
}
}
bool Environment::hasRegisteredObject(const std::string name) const
{
if (hasObject(name))
{
return hasRegisteredObject(getObjectAddress(name));
}
else
{
return false;
}
}
bool Environment::hasCreatedObject(const unsigned int address) const bool Environment::hasCreatedObject(const unsigned int address) const
{ {
if (hasObject(address)) if (hasObject(address))
@ -679,93 +336,28 @@ Environment::Size Environment::getTotalSize(void) const
Environment::Size size = 0; Environment::Size size = 0;
for (auto &o: object_) for (auto &o: object_)
{
if (o.isRegistered)
{ {
size += o.size; size += o.size;
} }
}
return size; return size;
} }
-void Environment::addOwnership(const unsigned int owner,
-const unsigned int property)
+void Environment::freeObject(const unsigned int address)
{
-if (hasObject(property))
+if (hasCreatedObject(address))
{
object_[property].owners.insert(owner);
}
else
{
HADRON_ERROR("no object with address " + std::to_string(property));
}
if (hasObject(owner))
{
object_[owner].properties.insert(property);
}
else
{
HADRON_ERROR("no object with address " + std::to_string(owner));
}
}
void Environment::addOwnership(const std::string owner,
const std::string property)
{
addOwnership(getObjectAddress(owner), getObjectAddress(property));
}
bool Environment::hasOwners(const unsigned int address) const
{
if (hasObject(address))
{
return (!object_[address].owners.empty());
}
else
{
HADRON_ERROR("no object with address " + std::to_string(address));
}
}
bool Environment::hasOwners(const std::string name) const
{
return hasOwners(getObjectAddress(name));
}
bool Environment::freeObject(const unsigned int address)
{
if (!hasOwners(address))
{
if (!isDryRun() and object_[address].isRegistered)
{ {
LOG(Message) << "Destroying object '" << object_[address].name LOG(Message) << "Destroying object '" << object_[address].name
<< "'" << std::endl; << "'" << std::endl;
} }
for (auto &p: object_[address].properties)
{
object_[p].owners.erase(address);
}
object_[address].size = 0; object_[address].size = 0;
object_[address].Ls = 0;
object_[address].isRegistered = false;
object_[address].type = nullptr; object_[address].type = nullptr;
object_[address].owners.clear();
object_[address].properties.clear();
object_[address].data.reset(nullptr); object_[address].data.reset(nullptr);
return true;
}
else
{
return false;
}
} }
bool Environment::freeObject(const std::string name) void Environment::freeObject(const std::string name)
{ {
return freeObject(getObjectAddress(name)); freeObject(getObjectAddress(name));
} }
void Environment::freeAll(void) void Environment::freeAll(void)
@ -776,18 +368,24 @@ void Environment::freeAll(void)
} }
} }
void Environment::printContent(void) void Environment::protectObjects(const bool protect)
{ {
LOG(Message) << "Modules: " << std::endl; protect_ = protect;
for (unsigned int i = 0; i < module_.size(); ++i)
{
LOG(Message) << std::setw(4) << i << ": "
<< getModuleName(i) << std::endl;
} }
LOG(Message) << "Objects: " << std::endl;
bool Environment::objectsProtected(void) const
{
return protect_;
}
// print environment content ///////////////////////////////////////////////////
void Environment::printContent(void) const
{
LOG(Debug) << "Objects: " << std::endl;
for (unsigned int i = 0; i < object_.size(); ++i) for (unsigned int i = 0; i < object_.size(); ++i)
{ {
LOG(Message) << std::setw(4) << i << ": " LOG(Debug) << std::setw(4) << i << ": "
<< getObjectName(i) << std::endl; << getObjectName(i) << " ("
<< sizeString(getObjectSize(i)) << ")" << std::endl;
} }
} }
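With the ownership/property bookkeeping gone, freeObject destroys an object's data unconditionally and the new protection flag is what guards against accidental re-allocation instead. A minimal usage sketch of the new interface (not part of this commit), assuming the SINGLETON macro still provides getInstance() and using the hypothetical object name "phi":

    auto &env = Environment::getInstance();
    env.protectObjects(false);   // permit objects to be re-created in place
    env.freeObject("phi");       // destroys the data, keeps the catalogue entry
    env.printContent();          // per-object sizes now reported at Debug level
    env.protectObjects(true);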

@ -4,8 +4,7 @@ Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Environment.hpp Source file: extras/Hadrons/Environment.hpp
Copyright (C) 2015 Copyright (C) 2015-2018
Copyright (C) 2016
Author: Antonin Portelli <antonin.portelli@me.com> Author: Antonin Portelli <antonin.portelli@me.com>
@ -31,20 +30,12 @@ See the full license in the file "LICENSE" in the top level distribution directo
#define Hadrons_Environment_hpp_ #define Hadrons_Environment_hpp_
#include <Grid/Hadrons/Global.hpp> #include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Graph.hpp>
#ifndef SITE_SIZE_TYPE
#define SITE_SIZE_TYPE unsigned int
#endif
BEGIN_HADRONS_NAMESPACE BEGIN_HADRONS_NAMESPACE
/****************************************************************************** /******************************************************************************
* Global environment * * Global environment *
******************************************************************************/ ******************************************************************************/
// forward declaration of Module
class ModuleBase;
class Object class Object
{ {
public: public:
@ -66,123 +57,78 @@ private:
std::unique_ptr<T> objPt_{nullptr}; std::unique_ptr<T> objPt_{nullptr};
}; };
#define DEFINE_ENV_ALIAS \
inline Environment & env(void) const\
{\
return Environment::getInstance();\
}
class Environment class Environment
{ {
SINGLETON(Environment); SINGLETON(Environment);
public: public:
typedef SITE_SIZE_TYPE Size; typedef SITE_SIZE_TYPE Size;
typedef std::unique_ptr<ModuleBase> ModPt;
typedef std::unique_ptr<GridCartesian> GridPt; typedef std::unique_ptr<GridCartesian> GridPt;
typedef std::unique_ptr<GridRedBlackCartesian> GridRbPt; typedef std::unique_ptr<GridRedBlackCartesian> GridRbPt;
typedef std::unique_ptr<GridParallelRNG> RngPt; typedef std::unique_ptr<GridParallelRNG> RngPt;
typedef std::unique_ptr<LatticeBase> LatticePt; enum class Storage {object, cache, temporary};
private: private:
struct ModuleInfo
{
const std::type_info *type{nullptr};
std::string name;
ModPt data{nullptr};
std::vector<unsigned int> input;
};
struct ObjInfo struct ObjInfo
{ {
Size size{0}; Size size{0};
Storage storage{Storage::object};
unsigned int Ls{0}; unsigned int Ls{0};
bool isRegistered{false};
const std::type_info *type{nullptr}; const std::type_info *type{nullptr};
std::string name; std::string name;
int module{-1}; int module{-1};
std::set<unsigned int> owners, properties;
std::unique_ptr<Object> data{nullptr}; std::unique_ptr<Object> data{nullptr};
}; };
public: public:
// dry run
void dryRun(const bool isDry);
bool isDryRun(void) const;
// trajectory number
void setTrajectory(const unsigned int traj);
unsigned int getTrajectory(void) const;
// grids // grids
void createGrid(const unsigned int Ls); void createGrid(const unsigned int Ls);
GridCartesian * getGrid(const unsigned int Ls = 1) const; GridCartesian * getGrid(const unsigned int Ls = 1) const;
GridRedBlackCartesian * getRbGrid(const unsigned int Ls = 1) const; GridRedBlackCartesian * getRbGrid(const unsigned int Ls = 1) const;
std::vector<int> getDim(void) const; std::vector<int> getDim(void) const;
int getDim(const unsigned int mu) const; int getDim(const unsigned int mu) const;
unsigned long int getLocalVolume(void) const;
unsigned int getNd(void) const; unsigned int getNd(void) const;
// random number generator // random number generator
void setSeed(const std::vector<int> &seed); void setSeed(const std::vector<int> &seed);
GridParallelRNG * get4dRng(void) const; GridParallelRNG * get4dRng(void) const;
// module management
void pushModule(ModPt &pt);
template <typename M>
void createModule(const std::string name);
template <typename M>
void createModule(const std::string name,
const typename M::Par &par);
void createModule(const std::string name,
const std::string type,
XmlReader &reader);
unsigned int getNModule(void) const;
ModuleBase * getModule(const unsigned int address) const;
ModuleBase * getModule(const std::string name) const;
template <typename M>
M * getModule(const unsigned int address) const;
template <typename M>
M * getModule(const std::string name) const;
unsigned int getModuleAddress(const std::string name) const;
std::string getModuleName(const unsigned int address) const;
std::string getModuleType(const unsigned int address) const;
std::string getModuleType(const std::string name) const;
std::string getModuleNamespace(const unsigned int address) const;
std::string getModuleNamespace(const std::string name) const;
bool hasModule(const unsigned int address) const;
bool hasModule(const std::string name) const;
Graph<unsigned int> makeModuleGraph(void) const;
Size executeProgram(const std::vector<unsigned int> &p);
Size executeProgram(const std::vector<std::string> &p);
// general memory management // general memory management
void addObject(const std::string name, void addObject(const std::string name,
const int moduleAddress = -1); const int moduleAddress = -1);
void registerObject(const unsigned int address, template <typename B, typename T, typename ... Ts>
const unsigned int size, void createDerivedObject(const std::string name,
const unsigned int Ls = 1); const Environment::Storage storage,
void registerObject(const std::string name, const unsigned int Ls,
const unsigned int size, Ts && ... args);
const unsigned int Ls = 1); template <typename T, typename ... Ts>
template <typename T> void createObject(const std::string name,
unsigned int lattice4dSize(void) const; const Environment::Storage storage,
template <typename T> const unsigned int Ls,
void registerLattice(const unsigned int address, Ts && ... args);
const unsigned int Ls = 1); void setObjectModule(const unsigned int objAddress,
template <typename T> const int modAddress);
void registerLattice(const std::string name,
const unsigned int Ls = 1);
template <typename T>
void setObject(const unsigned int address, T *object);
template <typename T>
void setObject(const std::string name, T *object);
template <typename T> template <typename T>
T * getObject(const unsigned int address) const; T * getObject(const unsigned int address) const;
template <typename T> template <typename T>
T * getObject(const std::string name) const; T * getObject(const std::string name) const;
template <typename T> unsigned int getMaxAddress(void) const;
T * createLattice(const unsigned int address);
template <typename T>
T * createLattice(const std::string name);
unsigned int getObjectAddress(const std::string name) const; unsigned int getObjectAddress(const std::string name) const;
std::string getObjectName(const unsigned int address) const; std::string getObjectName(const unsigned int address) const;
std::string getObjectType(const unsigned int address) const; std::string getObjectType(const unsigned int address) const;
std::string getObjectType(const std::string name) const; std::string getObjectType(const std::string name) const;
Size getObjectSize(const unsigned int address) const; Size getObjectSize(const unsigned int address) const;
Size getObjectSize(const std::string name) const; Size getObjectSize(const std::string name) const;
unsigned int getObjectModule(const unsigned int address) const; Storage getObjectStorage(const unsigned int address) const;
unsigned int getObjectModule(const std::string name) const; Storage getObjectStorage(const std::string name) const;
int getObjectModule(const unsigned int address) const;
int getObjectModule(const std::string name) const;
unsigned int getObjectLs(const unsigned int address) const; unsigned int getObjectLs(const unsigned int address) const;
unsigned int getObjectLs(const std::string name) const; unsigned int getObjectLs(const std::string name) const;
bool hasObject(const unsigned int address) const; bool hasObject(const unsigned int address) const;
bool hasObject(const std::string name) const; bool hasObject(const std::string name) const;
bool hasRegisteredObject(const unsigned int address) const;
bool hasRegisteredObject(const std::string name) const;
bool hasCreatedObject(const unsigned int address) const; bool hasCreatedObject(const unsigned int address) const;
bool hasCreatedObject(const std::string name) const; bool hasCreatedObject(const std::string name) const;
bool isObject5d(const unsigned int address) const; bool isObject5d(const unsigned int address) const;
@ -192,20 +138,17 @@ public:
template <typename T> template <typename T>
bool isObjectOfType(const std::string name) const; bool isObjectOfType(const std::string name) const;
Environment::Size getTotalSize(void) const; Environment::Size getTotalSize(void) const;
void addOwnership(const unsigned int owner, void freeObject(const unsigned int address);
const unsigned int property); void freeObject(const std::string name);
void addOwnership(const std::string owner,
const std::string property);
bool hasOwners(const unsigned int address) const;
bool hasOwners(const std::string name) const;
bool freeObject(const unsigned int address);
bool freeObject(const std::string name);
void freeAll(void); void freeAll(void);
void printContent(void); void protectObjects(const bool protect);
bool objectsProtected(void) const;
// print environment content
void printContent(void) const;
private: private:
// general // general
bool dryRun_{false}; unsigned long int locVol_;
unsigned int traj_, locVol_; bool protect_{true};
// grids // grids
std::vector<int> dim_; std::vector<int> dim_;
GridPt grid4d_; GridPt grid4d_;
@ -215,11 +158,6 @@ private:
unsigned int nd_; unsigned int nd_;
// random number generator // random number generator
RngPt rng4d_; RngPt rng4d_;
// module and related maps
std::vector<ModuleInfo> module_;
std::map<std::string, unsigned int> moduleAddress_;
// lattice store
std::map<unsigned int, LatticePt> lattice_;
// object store // object store
std::vector<ObjInfo> object_; std::vector<ObjInfo> object_;
std::map<std::string, unsigned int> objectAddress_; std::map<std::string, unsigned int> objectAddress_;
@ -256,96 +194,64 @@ void Holder<T>::reset(T *pt)
/****************************************************************************** /******************************************************************************
* Environment template implementation * * Environment template implementation *
******************************************************************************/ ******************************************************************************/
// module management /////////////////////////////////////////////////////////// // general memory management ///////////////////////////////////////////////////
template <typename M> template <typename B, typename T, typename ... Ts>
void Environment::createModule(const std::string name) void Environment::createDerivedObject(const std::string name,
const Environment::Storage storage,
const unsigned int Ls,
Ts && ... args)
{ {
ModPt pt(new M(name)); if (!hasObject(name))
{
pushModule(pt); addObject(name);
} }
template <typename M> unsigned int address = getObjectAddress(name);
void Environment::createModule(const std::string name,
const typename M::Par &par)
{
ModPt pt(new M(name));
static_cast<M *>(pt.get())->setPar(par); if (!object_[address].data or !objectsProtected())
pushModule(pt); {
} MemoryStats memStats;
template <typename M> if (!MemoryProfiler::stats)
M * Environment::getModule(const unsigned int address) const
{ {
if (auto *pt = dynamic_cast<M *>(getModule(address))) MemoryProfiler::stats = &memStats;
{
return pt;
} }
else size_t initMem = MemoryProfiler::stats->currentlyAllocated;
{ object_[address].storage = storage;
HADRON_ERROR("module '" + module_[address].name object_[address].Ls = Ls;
+ "' does not have type " + typeid(M).name() object_[address].data.reset(new Holder<B>(new T(std::forward<Ts>(args)...)));
+ "(object type: " + getModuleType(address) + ")"); object_[address].size = MemoryProfiler::stats->maxAllocated - initMem;
}
}
template <typename M>
M * Environment::getModule(const std::string name) const
{
return getModule<M>(getModuleAddress(name));
}
template <typename T>
unsigned int Environment::lattice4dSize(void) const
{
return sizeof(typename T::vector_object)/getGrid()->Nsimd();
}
template <typename T>
void Environment::registerLattice(const unsigned int address,
const unsigned int Ls)
{
createGrid(Ls);
registerObject(address, Ls*lattice4dSize<T>(), Ls);
}
template <typename T>
void Environment::registerLattice(const std::string name, const unsigned int Ls)
{
createGrid(Ls);
registerObject(name, Ls*lattice4dSize<T>(), Ls);
}
template <typename T>
void Environment::setObject(const unsigned int address, T *object)
{
if (hasRegisteredObject(address))
{
object_[address].data.reset(new Holder<T>(object));
object_[address].type = &typeid(T); object_[address].type = &typeid(T);
} if (MemoryProfiler::stats == &memStats)
else if (hasObject(address))
{ {
HADRON_ERROR("object with address " + std::to_string(address) + MemoryProfiler::stats = nullptr;
" exists but is not registered");
} }
else }
// object already exists, no error if it is a cache, error otherwise
else if ((object_[address].storage != Storage::cache) or
(object_[address].storage != storage) or
(object_[address].name != name) or
(object_[address].type != &typeid(T)))
{ {
HADRON_ERROR("no object with address " + std::to_string(address)); HADRON_ERROR(Definition, "object '" + name + "' already allocated");
} }
} }
template <typename T> template <typename T, typename ... Ts>
void Environment::setObject(const std::string name, T *object) void Environment::createObject(const std::string name,
const Environment::Storage storage,
const unsigned int Ls,
Ts && ... args)
{ {
setObject(getObjectAddress(name), object); createDerivedObject<T, T>(name, storage, Ls, std::forward<Ts>(args)...);
} }
template <typename T> template <typename T>
T * Environment::getObject(const unsigned int address) const T * Environment::getObject(const unsigned int address) const
{ {
if (hasRegisteredObject(address)) if (hasObject(address))
{
if (hasCreatedObject(address))
{ {
if (auto h = dynamic_cast<Holder<T> *>(object_[address].data.get())) if (auto h = dynamic_cast<Holder<T> *>(object_[address].data.get()))
{ {
@ -353,19 +259,20 @@ T * Environment::getObject(const unsigned int address) const
} }
else else
{ {
HADRON_ERROR("object with address " + std::to_string(address) + HADRON_ERROR(Definition, "object with address " + std::to_string(address) +
" does not have type '" + typeName(&typeid(T)) + " does not have type '" + typeName(&typeid(T)) +
"' (has type '" + getObjectType(address) + "')"); "' (has type '" + getObjectType(address) + "')");
} }
} }
else if (hasObject(address)) else
{ {
HADRON_ERROR("object with address " + std::to_string(address) + HADRON_ERROR(Definition, "object with address " + std::to_string(address) +
" exists but is not registered"); " is empty");
}
} }
else else
{ {
HADRON_ERROR("no object with address " + std::to_string(address)); HADRON_ERROR(Definition, "no object with address " + std::to_string(address));
} }
} }
@ -375,26 +282,10 @@ T * Environment::getObject(const std::string name) const
return getObject<T>(getObjectAddress(name)); return getObject<T>(getObjectAddress(name));
} }
template <typename T>
T * Environment::createLattice(const unsigned int address)
{
GridCartesian *g = getGrid(getObjectLs(address));
setObject(address, new T(g));
return getObject<T>(address);
}
template <typename T>
T * Environment::createLattice(const std::string name)
{
return createLattice<T>(getObjectAddress(name));
}
template <typename T> template <typename T>
bool Environment::isObjectOfType(const unsigned int address) const bool Environment::isObjectOfType(const unsigned int address) const
{ {
if (hasRegisteredObject(address)) if (hasObject(address))
{ {
if (auto h = dynamic_cast<Holder<T> *>(object_[address].data.get())) if (auto h = dynamic_cast<Holder<T> *>(object_[address].data.get()))
{ {
@ -405,14 +296,9 @@ bool Environment::isObjectOfType(const unsigned int address) const
return false; return false;
} }
} }
else if (hasObject(address))
{
HADRON_ERROR("object with address " + std::to_string(address) +
" exists but is not registered");
}
else else
{ {
HADRON_ERROR("no object with address " + std::to_string(address)); HADRON_ERROR(Definition, "no object with address " + std::to_string(address));
} }
} }
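Taken together, createObject/createDerivedObject replace the old registerObject/setObject/createLattice triple: allocation, size accounting through the memory profiler, and type recording now happen in a single call. A sketch of the bare API outside the module macros (not from the diff; LatticeFermion stands in for any lattice type and "phi" is a hypothetical name):

    Environment &env = Environment::getInstance();
    env.createObject<LatticeFermion>("phi", Environment::Storage::object, 1,
                                     env.getGrid());
    LatticeFermion &phi = *env.getObject<LatticeFermion>("phi");
    LOG(Message) << "phi occupies " << env.getObjectSize("phi")
                 << " bytes" << std::endl;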

@ -0,0 +1,57 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Exceptions.cc
Copyright (C) 2015-2018
Author: Antonin Portelli <antonin.portelli@me.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Hadrons/Exceptions.hpp>
#ifndef ERR_SUFF
#define ERR_SUFF " (" + loc + ")"
#endif
#define CONST_EXC(name, init) \
name::name(std::string msg, std::string loc)\
:init\
{}
using namespace Grid;
using namespace Hadrons;
using namespace Exceptions;
// logic errors
CONST_EXC(Logic, logic_error(msg + ERR_SUFF))
CONST_EXC(Definition, Logic("definition error: " + msg, loc))
CONST_EXC(Implementation, Logic("implementation error: " + msg, loc))
CONST_EXC(Range, Logic("range error: " + msg, loc))
CONST_EXC(Size, Logic("size error: " + msg, loc))
// runtime errors
CONST_EXC(Runtime, runtime_error(msg + ERR_SUFF))
CONST_EXC(Argument, Runtime("argument error: " + msg, loc))
CONST_EXC(Io, Runtime("IO error: " + msg, loc))
CONST_EXC(Memory, Runtime("memory error: " + msg, loc))
CONST_EXC(Parsing, Runtime("parsing error: " + msg, loc))
CONST_EXC(Program, Runtime("program error: " + msg, loc))
CONST_EXC(System, Runtime("system error: " + msg, loc))
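For reference, a hand expansion of one of these lines (illustration only, not additional code in the commit): CONST_EXC(Argument, Runtime("argument error: " + msg, loc)) generates the out-of-line constructor below; ERR_SUFF only enters through the Logic and Runtime bases, which append the source location to the message.

    Argument::Argument(std::string msg, std::string loc)
    : Runtime("argument error: " + msg, loc)
    {}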

@ -0,0 +1,72 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Exceptions.hpp
Copyright (C) 2015-2018
Author: Antonin Portelli <antonin.portelli@me.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_Exceptions_hpp_
#define Hadrons_Exceptions_hpp_
#include <stdexcept>
#ifndef Hadrons_Global_hpp_
#include <Grid/Hadrons/Global.hpp>
#endif
#define SRC_LOC std::string(__FUNCTION__) + " at " + std::string(__FILE__) + ":"\
+ std::to_string(__LINE__)
#define HADRON_ERROR(exc, msg)\
LOG(Error) << msg << std::endl;\
throw(Exceptions::exc(msg, SRC_LOC));
#define DECL_EXC(name, base) \
class name: public base\
{\
public:\
name(std::string msg, std::string loc);\
}
BEGIN_HADRONS_NAMESPACE
namespace Exceptions
{
// logic errors
DECL_EXC(Logic, std::logic_error);
DECL_EXC(Definition, Logic);
DECL_EXC(Implementation, Logic);
DECL_EXC(Range, Logic);
DECL_EXC(Size, Logic);
// runtime errors
DECL_EXC(Runtime, std::runtime_error);
DECL_EXC(Argument, Runtime);
DECL_EXC(Io, Runtime);
DECL_EXC(Memory, Runtime);
DECL_EXC(Parsing, Runtime);
DECL_EXC(Program, Runtime);
DECL_EXC(System, Runtime);
}
END_HADRONS_NAMESPACE
#endif // Hadrons_Exceptions_hpp_
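Since HADRON_ERROR now logs and throws a typed exception rather than calling abort(), call sites can recover from failures. A usage sketch (the call site and the type name 'MySource' are hypothetical):

    try
    {
        HADRON_ERROR(Argument, "object of type 'MySource' unknown");
    }
    catch (Exceptions::Runtime &e)
    {
        LOG(Error) << "recovered: " << e.what() << std::endl;
    }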

@ -4,8 +4,7 @@ Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Factory.hpp Source file: extras/Hadrons/Factory.hpp
Copyright (C) 2015 Copyright (C) 2015-2018
Copyright (C) 2016
Author: Antonin Portelli <antonin.portelli@me.com> Author: Antonin Portelli <antonin.portelli@me.com>
@ -95,7 +94,7 @@ std::unique_ptr<T> Factory<T>::create(const std::string type,
} }
catch (std::out_of_range &) catch (std::out_of_range &)
{ {
HADRON_ERROR("object of type '" + type + "' unknown"); HADRON_ERROR(Argument, "object of type '" + type + "' unknown");
} }
return func(name); return func(name);

@ -4,8 +4,7 @@ Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/GeneticScheduler.hpp Source file: extras/Hadrons/GeneticScheduler.hpp
Copyright (C) 2015 Copyright (C) 2015-2018
Copyright (C) 2016
Author: Antonin Portelli <antonin.portelli@me.com> Author: Antonin Portelli <antonin.portelli@me.com>
@ -38,13 +37,13 @@ BEGIN_HADRONS_NAMESPACE
/****************************************************************************** /******************************************************************************
* Scheduler based on a genetic algorithm * * Scheduler based on a genetic algorithm *
******************************************************************************/ ******************************************************************************/
template <typename T> template <typename V, typename T>
class GeneticScheduler class GeneticScheduler
{ {
public: public:
typedef std::vector<T> Gene; typedef std::vector<T> Gene;
typedef std::pair<Gene *, Gene *> GenePair; typedef std::pair<Gene *, Gene *> GenePair;
typedef std::function<int(const Gene &)> ObjFunc; typedef std::function<V(const Gene &)> ObjFunc;
struct Parameters struct Parameters
{ {
double mutationRate; double mutationRate;
@ -65,7 +64,7 @@ public:
void benchmarkCrossover(const unsigned int nIt); void benchmarkCrossover(const unsigned int nIt);
// print population // print population
friend std::ostream & operator<<(std::ostream &out, friend std::ostream & operator<<(std::ostream &out,
const GeneticScheduler<T> &s) const GeneticScheduler<V, T> &s)
{ {
out << "["; out << "[";
for (auto &p: s.population_) for (auto &p: s.population_)
@ -90,7 +89,7 @@ private:
Graph<T> &graph_; Graph<T> &graph_;
const ObjFunc &func_; const ObjFunc &func_;
const Parameters par_; const Parameters par_;
std::multimap<int, Gene> population_; std::multimap<V, Gene> population_;
std::mt19937 gen_; std::mt19937 gen_;
}; };
@ -98,8 +97,8 @@ private:
* template implementation * * template implementation *
******************************************************************************/ ******************************************************************************/
// constructor ///////////////////////////////////////////////////////////////// // constructor /////////////////////////////////////////////////////////////////
template <typename T> template <typename V, typename T>
GeneticScheduler<T>::GeneticScheduler(Graph<T> &graph, const ObjFunc &func, GeneticScheduler<V, T>::GeneticScheduler(Graph<T> &graph, const ObjFunc &func,
const Parameters &par) const Parameters &par)
: graph_(graph) : graph_(graph)
, func_(func) , func_(func)
@ -109,22 +108,22 @@ GeneticScheduler<T>::GeneticScheduler(Graph<T> &graph, const ObjFunc &func,
} }
// access ////////////////////////////////////////////////////////////////////// // access //////////////////////////////////////////////////////////////////////
template <typename T> template <typename V, typename T>
const typename GeneticScheduler<T>::Gene & const typename GeneticScheduler<V, T>::Gene &
GeneticScheduler<T>::getMinSchedule(void) GeneticScheduler<V, T>::getMinSchedule(void)
{ {
return population_.begin()->second; return population_.begin()->second;
} }
template <typename T> template <typename V, typename T>
int GeneticScheduler<T>::getMinValue(void) int GeneticScheduler<V, T>::getMinValue(void)
{ {
return population_.begin()->first; return population_.begin()->first;
} }
// breed a new generation ////////////////////////////////////////////////////// // breed a new generation //////////////////////////////////////////////////////
template <typename T> template <typename V, typename T>
void GeneticScheduler<T>::nextGeneration(void) void GeneticScheduler<V, T>::nextGeneration(void)
{ {
// random initialization of the population if necessary // random initialization of the population if necessary
if (population_.size() != par_.popSize) if (population_.size() != par_.popSize)
@ -158,8 +157,8 @@ void GeneticScheduler<T>::nextGeneration(void)
} }
// evolution steps ///////////////////////////////////////////////////////////// // evolution steps /////////////////////////////////////////////////////////////
template <typename T> template <typename V, typename T>
void GeneticScheduler<T>::initPopulation(void) void GeneticScheduler<V, T>::initPopulation(void)
{ {
population_.clear(); population_.clear();
for (unsigned int i = 0; i < par_.popSize; ++i) for (unsigned int i = 0; i < par_.popSize; ++i)
@ -170,8 +169,8 @@ void GeneticScheduler<T>::initPopulation(void)
} }
} }
template <typename T> template <typename V, typename T>
void GeneticScheduler<T>::doCrossover(void) void GeneticScheduler<V, T>::doCrossover(void)
{ {
auto p = selectPair(); auto p = selectPair();
Gene &p1 = *(p.first), &p2 = *(p.second); Gene &p1 = *(p.first), &p2 = *(p.second);
@ -185,8 +184,8 @@ void GeneticScheduler<T>::doCrossover(void)
} }
} }
template <typename T> template <typename V, typename T>
void GeneticScheduler<T>::doMutation(void) void GeneticScheduler<V, T>::doMutation(void)
{ {
std::uniform_real_distribution<double> mdis(0., 1.); std::uniform_real_distribution<double> mdis(0., 1.);
std::uniform_int_distribution<unsigned int> pdis(0, population_.size() - 1); std::uniform_int_distribution<unsigned int> pdis(0, population_.size() - 1);
@ -206,40 +205,35 @@ void GeneticScheduler<T>::doMutation(void)
} }
// genetic operators /////////////////////////////////////////////////////////// // genetic operators ///////////////////////////////////////////////////////////
template <typename T> template <typename V, typename T>
typename GeneticScheduler<T>::GenePair GeneticScheduler<T>::selectPair(void) typename GeneticScheduler<V, T>::GenePair GeneticScheduler<V, T>::selectPair(void)
{ {
std::vector<double> prob; std::vector<double> prob;
unsigned int ind; unsigned int ind;
Gene *p1, *p2; Gene *p1, *p2;
const double max = population_.rbegin()->first;
for (auto &c: population_) for (auto &c: population_)
{ {
prob.push_back(1./c.first); prob.push_back(std::exp((c.first-1.)/max));
} }
do
{
double probCpy;
std::discrete_distribution<unsigned int> dis1(prob.begin(), prob.end()); std::discrete_distribution<unsigned int> dis1(prob.begin(), prob.end());
auto rIt = population_.begin(); auto rIt = population_.begin();
ind = dis1(gen_); ind = dis1(gen_);
std::advance(rIt, ind); std::advance(rIt, ind);
p1 = &(rIt->second); p1 = &(rIt->second);
probCpy = prob[ind];
prob[ind] = 0.; prob[ind] = 0.;
std::discrete_distribution<unsigned int> dis2(prob.begin(), prob.end()); std::discrete_distribution<unsigned int> dis2(prob.begin(), prob.end());
rIt = population_.begin(); rIt = population_.begin();
std::advance(rIt, dis2(gen_)); std::advance(rIt, dis2(gen_));
p2 = &(rIt->second); p2 = &(rIt->second);
prob[ind] = probCpy;
} while (p1 == p2);
return std::make_pair(p1, p2); return std::make_pair(p1, p2);
} }
template <typename T> template <typename V, typename T>
void GeneticScheduler<T>::crossover(Gene &c1, Gene &c2, const Gene &p1, void GeneticScheduler<V, T>::crossover(Gene &c1, Gene &c2, const Gene &p1,
const Gene &p2) const Gene &p2)
{ {
Gene buf; Gene buf;
@ -273,8 +267,8 @@ void GeneticScheduler<T>::crossover(Gene &c1, Gene &c2, const Gene &p1,
} }
} }
template <typename T> template <typename V, typename T>
void GeneticScheduler<T>::mutation(Gene &m, const Gene &c) void GeneticScheduler<V, T>::mutation(Gene &m, const Gene &c)
{ {
Gene buf; Gene buf;
std::uniform_int_distribution<unsigned int> dis(0, c.size() - 1); std::uniform_int_distribution<unsigned int> dis(0, c.size() - 1);
@ -303,8 +297,8 @@ void GeneticScheduler<T>::mutation(Gene &m, const Gene &c)
} }
} }
template <typename T> template <typename V, typename T>
void GeneticScheduler<T>::benchmarkCrossover(const unsigned int nIt) void GeneticScheduler<V, T>::benchmarkCrossover(const unsigned int nIt)
{ {
Gene p1, p2, c1, c2; Gene p1, p2, c1, c2;
double neg = 0., eq = 0., pos = 0., total; double neg = 0., eq = 0., pos = 0., total;
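The extra template parameter V is the fitness type returned by the objective function, so the scheduler can now minimise a floating-point cost (for instance a memory high-water mark) instead of an integer one, and pair selection uses the exponential weighting above rather than 1/fitness. An instantiation sketch (not from the diff; 'moduleGraph' and the cost model are hypothetical, and Parameters fields not shown here are left untouched):

    typedef GeneticScheduler<double, unsigned int> Scheduler;

    Graph<unsigned int> moduleGraph;
    Scheduler::ObjFunc  cost = [](const Scheduler::Gene &program) -> double
    {
        return static_cast<double>(program.size()); // placeholder cost model
    };
    Scheduler::Parameters par;
    par.popSize      = 20;
    par.mutationRate = .1;

    Scheduler scheduler(moduleGraph, cost, par);
    scheduler.nextGeneration();
    auto best = scheduler.getMinSchedule();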

@ -4,8 +4,7 @@ Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Global.cc Source file: extras/Hadrons/Global.cc
Copyright (C) 2015 Copyright (C) 2015-2018
Copyright (C) 2016
Author: Antonin Portelli <antonin.portelli@me.com> Author: Antonin Portelli <antonin.portelli@me.com>
@ -39,31 +38,19 @@ HadronsLogger Hadrons::HadronsLogMessage(1,"Message");
HadronsLogger Hadrons::HadronsLogIterative(1,"Iterative"); HadronsLogger Hadrons::HadronsLogIterative(1,"Iterative");
HadronsLogger Hadrons::HadronsLogDebug(1,"Debug"); HadronsLogger Hadrons::HadronsLogDebug(1,"Debug");
// pretty size formatting ////////////////////////////////////////////////////// void Hadrons::initLogger(void)
std::string Hadrons::sizeString(long unsigned int bytes)
{ {
constexpr unsigned int bufSize = 256; auto w = std::string("Hadrons").length();
const char *suffixes[7] = {"", "K", "M", "G", "T", "P", "E"}; GridLogError.setTopWidth(w);
char buf[256]; GridLogWarning.setTopWidth(w);
long unsigned int s = 0; GridLogMessage.setTopWidth(w);
double count = bytes; GridLogIterative.setTopWidth(w);
GridLogDebug.setTopWidth(w);
while (count >= 1024 && s < 7) HadronsLogError.Active(GridLogError.isActive());
{ HadronsLogWarning.Active(GridLogWarning.isActive());
s++; HadronsLogMessage.Active(GridLogMessage.isActive());
count /= 1024; HadronsLogIterative.Active(GridLogIterative.isActive());
} HadronsLogDebug.Active(GridLogDebug.isActive());
if (count - floor(count) == 0.0)
{
snprintf(buf, bufSize, "%d %sB", (int)count, suffixes[s]);
}
else
{
snprintf(buf, bufSize, "%.1f %sB", count, suffixes[s]);
}
return std::string(buf);
} }
// type utilities ////////////////////////////////////////////////////////////// // type utilities //////////////////////////////////////////////////////////////
@ -80,3 +67,10 @@ std::string Hadrons::typeName(const std::type_info *info)
return name; return name;
} }
// default writers/readers /////////////////////////////////////////////////////
#ifdef HAVE_HDF5
const std::string Hadrons::resultFileExt = "h5";
#else
const std::string Hadrons::resultFileExt = "xml";
#endif
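initLogger centralises the logger configuration that every executable used to repeat, which is why the blocks of Active() calls disappear from HadronsXmlRun.cc and HadronsXmlSchedule.cc further down. The intended call sequence, sketched here because the actual call site is not visible in this diff:

    Grid_init(&argc, &argv);
    Hadrons::initLogger();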

@ -4,10 +4,10 @@ Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Global.hpp Source file: extras/Hadrons/Global.hpp
Copyright (C) 2015 Copyright (C) 2015-2018
Copyright (C) 2016
Author: Antonin Portelli <antonin.portelli@me.com> Author: Antonin Portelli <antonin.portelli@me.com>
Author: Lanny91 <andrew.lawson@gmail.com>
This program is free software; you can redistribute it and/or modify This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
@ -35,6 +35,10 @@ See the full license in the file "LICENSE" in the top level distribution directo
#include <Grid/Grid.h> #include <Grid/Grid.h>
#include <cxxabi.h> #include <cxxabi.h>
#ifndef SITE_SIZE_TYPE
#define SITE_SIZE_TYPE size_t
#endif
#define BEGIN_HADRONS_NAMESPACE \ #define BEGIN_HADRONS_NAMESPACE \
namespace Grid {\ namespace Grid {\
using namespace QCD;\ using namespace QCD;\
@ -57,6 +61,9 @@ using Grid::operator<<;
#ifndef SIMPL #ifndef SIMPL
#define SIMPL ScalarImplCR #define SIMPL ScalarImplCR
#endif #endif
#ifndef GIMPL
#define GIMPL GimplTypesR
#endif
BEGIN_HADRONS_NAMESPACE BEGIN_HADRONS_NAMESPACE
@ -65,9 +72,8 @@ BEGIN_HADRONS_NAMESPACE
typedef FermionOperator<FImpl> FMat##suffix; \ typedef FermionOperator<FImpl> FMat##suffix; \
typedef typename FImpl::FermionField FermionField##suffix; \ typedef typename FImpl::FermionField FermionField##suffix; \
typedef typename FImpl::PropagatorField PropagatorField##suffix; \ typedef typename FImpl::PropagatorField PropagatorField##suffix; \
typedef typename FImpl::SitePropagator SitePropagator##suffix; \ typedef typename FImpl::SitePropagator::scalar_object SitePropagator##suffix; \
typedef std::vector<typename FImpl::SitePropagator::scalar_object> \ typedef std::vector<SitePropagator##suffix> SlicedPropagator##suffix;
SlicedPropagator##suffix;
#define GAUGE_TYPE_ALIASES(FImpl, suffix)\ #define GAUGE_TYPE_ALIASES(FImpl, suffix)\
typedef typename FImpl::DoubledGaugeField DoubledGaugeField##suffix; typedef typename FImpl::DoubledGaugeField DoubledGaugeField##suffix;
@ -81,7 +87,8 @@ typedef std::function<void(FermionField##suffix &,\
const FermionField##suffix &)> SolverFn##suffix; const FermionField##suffix &)> SolverFn##suffix;
#define SINK_TYPE_ALIASES(suffix)\ #define SINK_TYPE_ALIASES(suffix)\
typedef std::function<SlicedPropagator##suffix(const PropagatorField##suffix &)> SinkFn##suffix; typedef std::function<SlicedPropagator##suffix\
(const PropagatorField##suffix &)> SinkFn##suffix;
#define FGS_TYPE_ALIASES(FImpl, suffix)\ #define FGS_TYPE_ALIASES(FImpl, suffix)\
FERM_TYPE_ALIASES(FImpl, suffix)\ FERM_TYPE_ALIASES(FImpl, suffix)\
@ -97,11 +104,6 @@ public:
}; };
#define LOG(channel) std::cout << HadronsLog##channel #define LOG(channel) std::cout << HadronsLog##channel
#define HADRON_ERROR(msg)\
LOG(Error) << msg << " (" << __FUNCTION__ << " at " << __FILE__ << ":"\
<< __LINE__ << ")" << std::endl;\
abort();
#define DEBUG_VAR(var) LOG(Debug) << #var << "= " << (var) << std::endl; #define DEBUG_VAR(var) LOG(Debug) << #var << "= " << (var) << std::endl;
extern HadronsLogger HadronsLogError; extern HadronsLogger HadronsLogError;
@ -110,6 +112,8 @@ extern HadronsLogger HadronsLogMessage;
extern HadronsLogger HadronsLogIterative; extern HadronsLogger HadronsLogIterative;
extern HadronsLogger HadronsLogDebug; extern HadronsLogger HadronsLogDebug;
void initLogger(void);
// singleton pattern // singleton pattern
#define SINGLETON(name)\ #define SINGLETON(name)\
public:\ public:\
@ -135,9 +139,6 @@ public:\
private:\ private:\
name(void) = default; name(void) = default;
// pretty size formatting
std::string sizeString(long unsigned int bytes);
// type utilities // type utilities
template <typename T> template <typename T>
const std::type_info * typeIdPt(const T &x) const std::type_info * typeIdPt(const T &x)
@ -166,14 +167,21 @@ std::string typeName(void)
} }
// default writers/readers // default writers/readers
extern const std::string resultFileExt;
#ifdef HAVE_HDF5 #ifdef HAVE_HDF5
typedef Hdf5Reader CorrReader; typedef Hdf5Reader ResultReader;
typedef Hdf5Writer CorrWriter; typedef Hdf5Writer ResultWriter;
#else #else
typedef XmlReader CorrReader; typedef XmlReader ResultReader;
typedef XmlWriter CorrWriter; typedef XmlWriter ResultWriter;
#endif #endif
#define RESULT_FILE_NAME(name) \
name + "." + std::to_string(vm().getTrajectory()) + "." + resultFileExt
END_HADRONS_NAMESPACE END_HADRONS_NAMESPACE
#include <Grid/Hadrons/Exceptions.hpp>
#endif // Hadrons_Global_hpp_ #endif // Hadrons_Global_hpp_
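The CorrReader/CorrWriter typedefs become ResultReader/ResultWriter, and RESULT_FILE_NAME builds the output path from the trajectory number and the backend-dependent extension. A sketch of how a module might use them (hypothetical context: vm() comes from the module aliases, and par().output and result are made up for the example):

    ResultWriter writer(RESULT_FILE_NAME(par().output));
    write(writer, "meson", result);   // writes <output>.<trajectory>.h5 or .xml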

@ -4,8 +4,7 @@ Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Graph.hpp Source file: extras/Hadrons/Graph.hpp
Copyright (C) 2015 Copyright (C) 2015-2018
Copyright (C) 2016
Author: Antonin Portelli <antonin.portelli@me.com> Author: Antonin Portelli <antonin.portelli@me.com>
@ -185,7 +184,7 @@ void Graph<T>::removeVertex(const T &value)
} }
else else
{ {
HADRON_ERROR("vertex " << value << " does not exists"); HADRON_ERROR(Range, "vertex does not exists");
} }
// remove all edges containing the vertex // remove all edges containing the vertex
@ -214,7 +213,7 @@ void Graph<T>::removeEdge(const Edge &e)
} }
else else
{ {
HADRON_ERROR("edge " << e << " does not exists"); HADRON_ERROR(Range, "edge does not exists");
} }
} }
@ -260,7 +259,7 @@ void Graph<T>::mark(const T &value, const bool doMark)
} }
else else
{ {
HADRON_ERROR("vertex " << value << " does not exists"); HADRON_ERROR(Range, "vertex does not exists");
} }
} }
@ -298,7 +297,7 @@ bool Graph<T>::isMarked(const T &value) const
} }
else else
{ {
HADRON_ERROR("vertex " << value << " does not exists"); HADRON_ERROR(Range, "vertex does not exists");
return false; return false;
} }
@ -430,7 +429,7 @@ std::vector<T> Graph<T>::getAdjacentVertices(const T &value) const
{ {
return ((e.first == value) or (e.second == value)); return ((e.first == value) or (e.second == value));
}; };
auto eIt = find_if(edgeSet_.begin(), edgeSet_.end(), pred); auto eIt = std::find_if(edgeSet_.begin(), edgeSet_.end(), pred);
while (eIt != edgeSet_.end()) while (eIt != edgeSet_.end())
{ {
@ -442,7 +441,7 @@ std::vector<T> Graph<T>::getAdjacentVertices(const T &value) const
{ {
adjacentVertex.push_back((*eIt).first); adjacentVertex.push_back((*eIt).first);
} }
eIt = find_if(++eIt, edgeSet_.end(), pred); eIt = std::find_if(++eIt, edgeSet_.end(), pred);
} }
return adjacentVertex; return adjacentVertex;
@ -458,12 +457,12 @@ std::vector<T> Graph<T>::getChildren(const T &value) const
{ {
return (e.first == value); return (e.first == value);
}; };
auto eIt = find_if(edgeSet_.begin(), edgeSet_.end(), pred); auto eIt = std::find_if(edgeSet_.begin(), edgeSet_.end(), pred);
while (eIt != edgeSet_.end()) while (eIt != edgeSet_.end())
{ {
child.push_back((*eIt).second); child.push_back((*eIt).second);
eIt = find_if(++eIt, edgeSet_.end(), pred); eIt = std::find_if(++eIt, edgeSet_.end(), pred);
} }
return child; return child;
@ -479,12 +478,12 @@ std::vector<T> Graph<T>::getParents(const T &value) const
{ {
return (e.second == value); return (e.second == value);
}; };
auto eIt = find_if(edgeSet_.begin(), edgeSet_.end(), pred); auto eIt = std::find_if(edgeSet_.begin(), edgeSet_.end(), pred);
while (eIt != edgeSet_.end()) while (eIt != edgeSet_.end())
{ {
parent.push_back((*eIt).first); parent.push_back((*eIt).first);
eIt = find_if(++eIt, edgeSet_.end(), pred); eIt = std::find_if(++eIt, edgeSet_.end(), pred);
} }
return parent; return parent;
@ -544,7 +543,7 @@ std::vector<T> Graph<T>::topoSort(void)
{ {
if (tmpMarked.at(v)) if (tmpMarked.at(v))
{ {
HADRON_ERROR("cannot topologically sort a cyclic graph"); HADRON_ERROR(Range, "cannot topologically sort a cyclic graph");
} }
if (!isMarked(v)) if (!isMarked(v))
{ {
@ -603,7 +602,7 @@ std::vector<T> Graph<T>::topoSort(Gen &gen)
{ {
if (tmpMarked.at(v)) if (tmpMarked.at(v))
{ {
HADRON_ERROR("cannot topologically sort a cyclic graph"); HADRON_ERROR(Range, "cannot topologically sort a cyclic graph");
} }
if (!isMarked(v)) if (!isMarked(v))
{ {

@ -4,8 +4,7 @@ Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/HadronsXmlRun.cc Source file: extras/Hadrons/HadronsXmlRun.cc
Copyright (C) 2015 Copyright (C) 2015-2018
Copyright (C) 2016
Author: Antonin Portelli <antonin.portelli@me.com> Author: Antonin Portelli <antonin.portelli@me.com>
@ -55,12 +54,6 @@ int main(int argc, char *argv[])
// initialization // initialization
Grid_init(&argc, &argv); Grid_init(&argc, &argv);
HadronsLogError.Active(GridLogError.isActive());
HadronsLogWarning.Active(GridLogWarning.isActive());
HadronsLogMessage.Active(GridLogMessage.isActive());
HadronsLogIterative.Active(GridLogIterative.isActive());
HadronsLogDebug.Active(GridLogDebug.isActive());
LOG(Message) << "Grid initialized" << std::endl;
// execution // execution
Application application(parameterFileName); Application application(parameterFileName);

@ -4,8 +4,7 @@ Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/HadronsXmlSchedule.cc Source file: extras/Hadrons/HadronsXmlSchedule.cc
Copyright (C) 2015 Copyright (C) 2015-2018
Copyright (C) 2016
Author: Antonin Portelli <antonin.portelli@me.com> Author: Antonin Portelli <antonin.portelli@me.com>
@ -49,12 +48,6 @@ int main(int argc, char *argv[])
// initialization // initialization
Grid_init(&argc, &argv); Grid_init(&argc, &argv);
HadronsLogError.Active(GridLogError.isActive());
HadronsLogWarning.Active(GridLogWarning.isActive());
HadronsLogMessage.Active(GridLogMessage.isActive());
HadronsLogIterative.Active(GridLogIterative.isActive());
HadronsLogDebug.Active(GridLogDebug.isActive());
LOG(Message) << "Grid initialized" << std::endl;
// execution // execution
Application application; Application application;

@ -7,20 +7,24 @@ libHadrons_a_SOURCES = \
$(modules_cc) \ $(modules_cc) \
Application.cc \ Application.cc \
Environment.cc \ Environment.cc \
Exceptions.cc \
Global.cc \ Global.cc \
Module.cc Module.cc \
VirtualMachine.cc
libHadrons_adir = $(pkgincludedir)/Hadrons libHadrons_adir = $(pkgincludedir)/Hadrons
nobase_libHadrons_a_HEADERS = \ nobase_libHadrons_a_HEADERS = \
$(modules_hpp) \ $(modules_hpp) \
Application.hpp \ Application.hpp \
Environment.hpp \ Environment.hpp \
Exceptions.hpp \
Factory.hpp \ Factory.hpp \
GeneticScheduler.hpp \ GeneticScheduler.hpp \
Global.hpp \ Global.hpp \
Graph.hpp \ Graph.hpp \
Module.hpp \ Module.hpp \
Modules.hpp \ Modules.hpp \
ModuleFactory.hpp ModuleFactory.hpp \
VirtualMachine.hpp
HadronsXmlRun_SOURCES = HadronsXmlRun.cc HadronsXmlRun_SOURCES = HadronsXmlRun.cc
HadronsXmlRun_LDADD = libHadrons.a -lGrid HadronsXmlRun_LDADD = libHadrons.a -lGrid

@ -4,8 +4,7 @@ Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Module.cc Source file: extras/Hadrons/Module.cc
Copyright (C) 2015 Copyright (C) 2015-2018
Copyright (C) 2016
Author: Antonin Portelli <antonin.portelli@me.com> Author: Antonin Portelli <antonin.portelli@me.com>
@ -39,7 +38,6 @@ using namespace Hadrons;
// constructor ///////////////////////////////////////////////////////////////// // constructor /////////////////////////////////////////////////////////////////
ModuleBase::ModuleBase(const std::string name) ModuleBase::ModuleBase(const std::string name)
: name_(name) : name_(name)
, env_(Environment::getInstance())
{} {}
// access ////////////////////////////////////////////////////////////////////// // access //////////////////////////////////////////////////////////////////////
@ -48,15 +46,10 @@ std::string ModuleBase::getName(void) const
return name_; return name_;
} }
Environment & ModuleBase::env(void) const
{
return env_;
}
// get factory registration name if available // get factory registration name if available
std::string ModuleBase::getRegisteredName(void) std::string ModuleBase::getRegisteredName(void)
{ {
HADRON_ERROR("module '" + getName() + "' has a type not registered" HADRON_ERROR(Definition, "module '" + getName() + "' has no registered type"
+ " in the factory"); + " in the factory");
} }
@ -64,8 +57,5 @@ std::string ModuleBase::getRegisteredName(void)
void ModuleBase::operator()(void) void ModuleBase::operator()(void)
{ {
setup(); setup();
if (!env().isDryRun())
{
execute(); execute();
} }
}

@ -4,8 +4,7 @@ Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Module.hpp Source file: extras/Hadrons/Module.hpp
Copyright (C) 2015 Copyright (C) 2015-2018
Copyright (C) 2016
Author: Antonin Portelli <antonin.portelli@me.com> Author: Antonin Portelli <antonin.portelli@me.com>
@ -31,7 +30,7 @@ See the full license in the file "LICENSE" in the top level distribution directo
#define Hadrons_Module_hpp_ #define Hadrons_Module_hpp_
#include <Grid/Hadrons/Global.hpp> #include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Environment.hpp> #include <Grid/Hadrons/VirtualMachine.hpp>
BEGIN_HADRONS_NAMESPACE BEGIN_HADRONS_NAMESPACE
@ -87,6 +86,56 @@ public:\
static ns##mod##ModuleRegistrar ns##mod##ModuleRegistrarInstance; static ns##mod##ModuleRegistrar ns##mod##ModuleRegistrarInstance;
#define ARG(...) __VA_ARGS__ #define ARG(...) __VA_ARGS__
#define MACRO_REDIRECT(arg1, arg2, arg3, macro, ...) macro
#define envGet(type, name)\
*env().template getObject<type>(name)
#define envGetTmp(type, var)\
type &var = *env().template getObject<type>(getName() + "_tmp_" + #var)
#define envHasType(type, name)\
env().template isObjectOfType<type>(name)
#define envCreate(type, name, Ls, ...)\
env().template createObject<type>(name, Environment::Storage::object, Ls, __VA_ARGS__)
#define envCreateDerived(base, type, name, Ls, ...)\
env().template createDerivedObject<base, type>(name, Environment::Storage::object, Ls, __VA_ARGS__)
#define envCreateLat4(type, name)\
envCreate(type, name, 1, env().getGrid())
#define envCreateLat5(type, name, Ls)\
envCreate(type, name, Ls, env().getGrid(Ls))
#define envCreateLat(...)\
MACRO_REDIRECT(__VA_ARGS__, envCreateLat5, envCreateLat4)(__VA_ARGS__)
#define envCache(type, name, Ls, ...)\
env().template createObject<type>(name, Environment::Storage::cache, Ls, __VA_ARGS__)
#define envCacheLat4(type, name)\
envCache(type, name, 1, env().getGrid())
#define envCacheLat5(type, name, Ls)\
envCache(type, name, Ls, env().getGrid(Ls))
#define envCacheLat(...)\
MACRO_REDIRECT(__VA_ARGS__, envCacheLat5, envCacheLat4)(__VA_ARGS__)
#define envTmp(type, name, Ls, ...)\
env().template createObject<type>(getName() + "_tmp_" + name, \
Environment::Storage::temporary, Ls, __VA_ARGS__)
#define envTmpLat4(type, name)\
envTmp(type, name, 1, env().getGrid())
#define envTmpLat5(type, name, Ls)\
envTmp(type, name, Ls, env().getGrid(Ls))
#define envTmpLat(...)\
MACRO_REDIRECT(__VA_ARGS__, envTmpLat5, envTmpLat4)(__VA_ARGS__)
/****************************************************************************** /******************************************************************************
* Module class * * Module class *
@ -101,23 +150,30 @@ public:
virtual ~ModuleBase(void) = default; virtual ~ModuleBase(void) = default;
// access // access
std::string getName(void) const; std::string getName(void) const;
Environment &env(void) const;
// get factory registration name if available // get factory registration name if available
virtual std::string getRegisteredName(void); virtual std::string getRegisteredName(void);
// dependencies/products // dependencies/products
virtual std::vector<std::string> getInput(void) = 0; virtual std::vector<std::string> getInput(void) = 0;
virtual std::vector<std::string> getReference(void)
{
return std::vector<std::string>(0);
};
virtual std::vector<std::string> getOutput(void) = 0; virtual std::vector<std::string> getOutput(void) = 0;
// parse parameters // parse parameters
virtual void parseParameters(XmlReader &reader, const std::string name) = 0; virtual void parseParameters(XmlReader &reader, const std::string name) = 0;
virtual void saveParameters(XmlWriter &writer, const std::string name) = 0; virtual void saveParameters(XmlWriter &writer, const std::string name) = 0;
// setup // setup
virtual void setup(void) {}; virtual void setup(void) {};
virtual void execute(void) = 0;
// execution // execution
void operator()(void); void operator()(void);
virtual void execute(void) = 0; protected:
// environment shortcut
DEFINE_ENV_ALIAS;
// virtual machine shortcut
DEFINE_VM_ALIAS;
private: private:
std::string name_; std::string name_;
Environment &env_;
}; };
// derived class, templating the parameter class // derived class, templating the parameter class
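The env*/envGet macros above encode the new module discipline: objects are declared (and memory-profiled) in setup(), then only retrieved in execute(). A sketch of a module using them (TExample, the field type aliases and the temporary name "src" are hypothetical):

    template <typename FImpl>
    void TExample<FImpl>::setup(void)
    {
        envCreateLat(PropagatorField, getName());   // named 4d output object
        envTmpLat(FermionField, "src");             // scratch field
    }

    template <typename FImpl>
    void TExample<FImpl>::execute(void)
    {
        auto &out = envGet(PropagatorField, getName());
        envGetTmp(FermionField, src);
        // ... fill src, compute, write the result into out ...
    }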

@ -4,8 +4,7 @@ Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/ModuleFactory.hpp Source file: extras/Hadrons/ModuleFactory.hpp
Copyright (C) 2015 Copyright (C) 2015-2018
Copyright (C) 2016
Author: Antonin Portelli <antonin.portelli@me.com> Author: Antonin Portelli <antonin.portelli@me.com>

@ -1,25 +1,65 @@
#include <Grid/Hadrons/Modules/MAction/DWF.hpp> /*************************************************************************************
#include <Grid/Hadrons/Modules/MAction/Wilson.hpp>
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules.hpp
Copyright (C) 2015-2018
Author: Antonin Portelli <antonin.portelli@me.com>
Author: Lanny91 <andrew.lawson@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Hadrons/Modules/MContraction/Baryon.hpp> #include <Grid/Hadrons/Modules/MContraction/Baryon.hpp>
#include <Grid/Hadrons/Modules/MContraction/DiscLoop.hpp>
#include <Grid/Hadrons/Modules/MContraction/Gamma3pt.hpp>
#include <Grid/Hadrons/Modules/MContraction/Meson.hpp> #include <Grid/Hadrons/Modules/MContraction/Meson.hpp>
#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonian.hpp> #include <Grid/Hadrons/Modules/MContraction/WeakHamiltonian.hpp>
#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonianEye.hpp>
#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonianNonEye.hpp> #include <Grid/Hadrons/Modules/MContraction/WeakHamiltonianNonEye.hpp>
#include <Grid/Hadrons/Modules/MContraction/DiscLoop.hpp>
#include <Grid/Hadrons/Modules/MContraction/WeakNeutral4ptDisc.hpp> #include <Grid/Hadrons/Modules/MContraction/WeakNeutral4ptDisc.hpp>
#include <Grid/Hadrons/Modules/MContraction/Gamma3pt.hpp>
#include <Grid/Hadrons/Modules/MContraction/WardIdentity.hpp>
#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonianEye.hpp>
#include <Grid/Hadrons/Modules/MFermion/GaugeProp.hpp> #include <Grid/Hadrons/Modules/MFermion/GaugeProp.hpp>
#include <Grid/Hadrons/Modules/MGauge/Load.hpp>
#include <Grid/Hadrons/Modules/MGauge/Random.hpp>
#include <Grid/Hadrons/Modules/MGauge/StochEm.hpp>
#include <Grid/Hadrons/Modules/MGauge/Unit.hpp>
#include <Grid/Hadrons/Modules/MLoop/NoiseLoop.hpp>
#include <Grid/Hadrons/Modules/MScalar/ChargedProp.hpp>
#include <Grid/Hadrons/Modules/MScalar/FreeProp.hpp>
#include <Grid/Hadrons/Modules/MScalar/Scalar.hpp>
#include <Grid/Hadrons/Modules/MSink/Point.hpp>
#include <Grid/Hadrons/Modules/MSolver/RBPrecCG.hpp>
#include <Grid/Hadrons/Modules/MSource/Point.hpp>
#include <Grid/Hadrons/Modules/MSource/SeqGamma.hpp> #include <Grid/Hadrons/Modules/MSource/SeqGamma.hpp>
#include <Grid/Hadrons/Modules/MSource/Point.hpp>
#include <Grid/Hadrons/Modules/MSource/Wall.hpp> #include <Grid/Hadrons/Modules/MSource/Wall.hpp>
#include <Grid/Hadrons/Modules/MSource/Z2.hpp> #include <Grid/Hadrons/Modules/MSource/Z2.hpp>
#include <Grid/Hadrons/Modules/MSource/SeqConserved.hpp>
#include <Grid/Hadrons/Modules/MSink/Smear.hpp>
#include <Grid/Hadrons/Modules/MSink/Point.hpp>
#include <Grid/Hadrons/Modules/MSolver/RBPrecCG.hpp>
#include <Grid/Hadrons/Modules/MGauge/Unit.hpp>
#include <Grid/Hadrons/Modules/MGauge/Random.hpp>
#include <Grid/Hadrons/Modules/MGauge/StochEm.hpp>
#include <Grid/Hadrons/Modules/MGauge/FundtoHirep.hpp>
#include <Grid/Hadrons/Modules/MUtilities/TestSeqGamma.hpp>
#include <Grid/Hadrons/Modules/MUtilities/TestSeqConserved.hpp>
#include <Grid/Hadrons/Modules/MLoop/NoiseLoop.hpp>
#include <Grid/Hadrons/Modules/MScalar/FreeProp.hpp>
#include <Grid/Hadrons/Modules/MScalar/Scalar.hpp>
#include <Grid/Hadrons/Modules/MScalar/ChargedProp.hpp>
#include <Grid/Hadrons/Modules/MAction/DWF.hpp>
#include <Grid/Hadrons/Modules/MAction/Wilson.hpp>
#include <Grid/Hadrons/Modules/MAction/WilsonClover.hpp>
#include <Grid/Hadrons/Modules/MScalarSUN/Div.hpp>
#include <Grid/Hadrons/Modules/MScalarSUN/TrMag.hpp>
#include <Grid/Hadrons/Modules/MScalarSUN/TwoPoint.hpp>
#include <Grid/Hadrons/Modules/MScalarSUN/TrPhi.hpp>
#include <Grid/Hadrons/Modules/MIO/LoadNersc.hpp>
#include <Grid/Hadrons/Modules/MIO/LoadBinary.hpp>

@@ -4,10 +4,10 @@ Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MAction/DWF.hpp
- Copyright (C) 2015
- Copyright (C) 2016
+ Copyright (C) 2015-2018
Author: Antonin Portelli <antonin.portelli@me.com>
+ Author: Lanny91 <andrew.lawson@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -65,6 +65,7 @@ public:
// dependency relation
virtual std::vector<std::string> getInput(void);
virtual std::vector<std::string> getOutput(void);
+ protected:
// setup
virtual void setup(void);
// execution
@@ -102,16 +103,6 @@ std::vector<std::string> TDWF<FImpl>::getOutput(void)
// setup ///////////////////////////////////////////////////////////////////////
template <typename FImpl>
void TDWF<FImpl>::setup(void)
- {
-     unsigned int size;
-     size = 2*env().template lattice4dSize<typename FImpl::DoubledGaugeField>();
-     env().registerObject(getName(), size, par().Ls);
- }
- // execution ///////////////////////////////////////////////////////////////////
- template <typename FImpl>
- void TDWF<FImpl>::execute(void)
{
    LOG(Message) << "Setting up domain wall fermion matrix with m= "
                 << par().mass << ", M5= " << par().M5 << " and Ls= "
@@ -119,20 +110,24 @@ void TDWF<FImpl>::execute(void)
                 << std::endl;
    LOG(Message) << "Fermion boundary conditions: " << par().boundary
                 << std::endl;
    env().createGrid(par().Ls);
-   auto &U    = *env().template getObject<LatticeGaugeField>(par().gauge);
+   auto &U    = envGet(LatticeGaugeField, par().gauge);
    auto &g4   = *env().getGrid();
    auto &grb4 = *env().getRbGrid();
    auto &g5   = *env().getGrid(par().Ls);
    auto &grb5 = *env().getRbGrid(par().Ls);
    std::vector<Complex> boundary = strToVec<Complex>(par().boundary);
    typename DomainWallFermion<FImpl>::ImplParams implParams(boundary);
-   FMat *fMatPt = new DomainWallFermion<FImpl>(U, g5, grb5, g4, grb4,
-                                               par().mass, par().M5,
-                                               implParams);
-   env().setObject(getName(), fMatPt);
+   envCreateDerived(FMat, DomainWallFermion<FImpl>, getName(), par().Ls, U, g5,
+                    grb5, g4, grb4, par().mass, par().M5, implParams);
}
+ // execution ///////////////////////////////////////////////////////////////////
+ template <typename FImpl>
+ void TDWF<FImpl>::execute(void)
+ {}
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
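For orientation, a minimal sketch of how the refactored DWF action module is typically wired into a Hadrons application. The parameter names (gauge, Ls, mass, M5, boundary) come from the module above; the application object and the value choices are illustrative assumptions, not part of this commit:

    Application application;                 // assumed set up elsewhere
    MAction::DWF::Par actionPar;
    actionPar.gauge    = "gauge";            // output of a gauge-field module
    actionPar.Ls       = 12;
    actionPar.M5       = 1.8;
    actionPar.mass     = 0.01;
    actionPar.boundary = "1 1 1 -1";         // antiperiodic in time
    application.createModule<MAction::DWF>("DWF_l", actionPar);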


@@ -4,10 +4,10 @@ Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MAction/Wilson.hpp
- Copyright (C) 2015
- Copyright (C) 2016
+ Copyright (C) 2015-2018
Author: Antonin Portelli <antonin.portelli@me.com>
+ Author: Lanny91 <andrew.lawson@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -63,6 +63,7 @@ public:
// dependencies/products
virtual std::vector<std::string> getInput(void);
virtual std::vector<std::string> getOutput(void);
+ protected:
// setup
virtual void setup(void);
// execution
@@ -101,29 +102,24 @@ std::vector<std::string> TWilson<FImpl>::getOutput(void)
template <typename FImpl>
void TWilson<FImpl>::setup(void)
{
-   unsigned int size;
-   size = 2*env().template lattice4dSize<typename FImpl::DoubledGaugeField>();
-   env().registerObject(getName(), size);
+   LOG(Message) << "Setting up TWilson fermion matrix with m= " << par().mass
+                << " using gauge field '" << par().gauge << "'" << std::endl;
+   LOG(Message) << "Fermion boundary conditions: " << par().boundary
+                << std::endl;
+   auto &U      = envGet(LatticeGaugeField, par().gauge);
+   auto &grid   = *env().getGrid();
+   auto &gridRb = *env().getRbGrid();
+   std::vector<Complex> boundary = strToVec<Complex>(par().boundary);
+   typename WilsonFermion<FImpl>::ImplParams implParams(boundary);
+   envCreateDerived(FMat, WilsonFermion<FImpl>, getName(), 1, U, grid, gridRb,
+                    par().mass, implParams);
}
// execution ///////////////////////////////////////////////////////////////////
template <typename FImpl>
void TWilson<FImpl>::execute()
- {
-     LOG(Message) << "Setting up TWilson fermion matrix with m= " << par().mass
-                  << " using gauge field '" << par().gauge << "'" << std::endl;
-     LOG(Message) << "Fermion boundary conditions: " << par().boundary
-                  << std::endl;
-     auto &U      = *env().template getObject<LatticeGaugeField>(par().gauge);
-     auto &grid   = *env().getGrid();
-     auto &gridRb = *env().getRbGrid();
-     std::vector<Complex> boundary = strToVec<Complex>(par().boundary);
-     typename WilsonFermion<FImpl>::ImplParams implParams(boundary);
-     FMat *fMatPt = new WilsonFermion<FImpl>(U, grid, gridRb, par().mass,
-                                             implParams);
-     env().setObject(getName(), fMatPt);
- }
+ {}
END_MODULE_NAMESPACE


@@ -0,0 +1,153 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MAction/Wilson.hpp
Copyright (C) 2015
Copyright (C) 2016
Author: Antonin Portelli <antonin.portelli@me.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_MAction_WilsonClover_hpp_
#define Hadrons_MAction_WilsonClover_hpp_
#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
#include <Grid/Hadrons/ModuleFactory.hpp>
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
* TWilson quark action *
******************************************************************************/
BEGIN_MODULE_NAMESPACE(MAction)
class WilsonCloverPar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(WilsonCloverPar,
std::string, gauge,
double , mass,
double , csw_r,
double , csw_t,
WilsonAnisotropyCoefficients ,clover_anisotropy,
std::string, boundary
);
};
template <typename FImpl>
class TWilsonClover: public Module<WilsonCloverPar>
{
public:
FGS_TYPE_ALIASES(FImpl,);
public:
// constructor
TWilsonClover(const std::string name);
// destructor
virtual ~TWilsonClover(void) = default;
// dependencies/products
virtual std::vector<std::string> getInput(void);
virtual std::vector<std::string> getOutput(void);
// setup
virtual void setup(void);
// execution
virtual void execute(void);
};
MODULE_REGISTER_NS(WilsonClover, TWilsonClover<FIMPL>, MAction);
/******************************************************************************
* TWilsonClover template implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
template <typename FImpl>
TWilsonClover<FImpl>::TWilsonClover(const std::string name)
: Module<WilsonCloverPar>(name)
{}
// dependencies/products ///////////////////////////////////////////////////////
template <typename FImpl>
std::vector<std::string> TWilsonClover<FImpl>::getInput(void)
{
std::vector<std::string> in = {par().gauge};
return in;
}
template <typename FImpl>
std::vector<std::string> TWilsonClover<FImpl>::getOutput(void)
{
std::vector<std::string> out = {getName()};
return out;
}
// setup ///////////////////////////////////////////////////////////////////////
template <typename FImpl>
void TWilsonClover<FImpl>::setup(void)
{
//unsigned int size;
// size = 2*env().template lattice4dSize<typename FImpl::DoubledGaugeField>();
// env().registerObject(getName(), size);
LOG(Message) << "Setting up TWilsonClover fermion matrix with m= " << par().mass
<< " using gauge field '" << par().gauge << "'" << std::endl;
LOG(Message) << "Fermion boundary conditions: " << par().boundary
<< std::endl;
LOG(Message) << "Clover term csw_r: " << par().csw_r
<< " csw_t: " << par().csw_t
<< std::endl;
auto &U = envGet(LatticeGaugeField, par().gauge);
auto &grid = *env().getGrid();
auto &gridRb = *env().getRbGrid();
std::vector<Complex> boundary = strToVec<Complex>(par().boundary);
typename WilsonCloverFermion<FImpl>::ImplParams implParams(boundary);
envCreateDerived(FMat, WilsonCloverFermion<FImpl>, getName(), 1, U, grid, gridRb, par().mass,
par().csw_r,
par().csw_t,
par().clover_anisotropy,
implParams);
//FMat *fMatPt = new WilsonCloverFermion<FImpl>(U, grid, gridRb, par().mass,
// par().csw_r,
// par().csw_t,
// par().clover_anisotropy,
// implParams);
//env().setObject(getName(), fMatPt);
}
// execution ///////////////////////////////////////////////////////////////////
template <typename FImpl>
void TWilsonClover<FImpl>::execute()
{
}
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_WilsonClover_hpp_
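A hedged sketch of how the new clover action could be instantiated from an application, assuming the usual Hadrons createModule API; only the parameter names are taken from the file above, the rest is illustrative:

    MAction::WilsonClover::Par cloverPar;
    cloverPar.gauge    = "gauge";
    cloverPar.mass     = 0.1;
    cloverPar.csw_r    = 1.0;                // spatial clover coefficient
    cloverPar.csw_t    = 1.0;                // temporal clover coefficient
    cloverPar.boundary = "1 1 1 -1";
    // clover_anisotropy is left at its default (isotropic) values here
    application.createModule<MAction::WilsonClover>("clover", cloverPar);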


@@ -4,10 +4,10 @@ Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MContraction/Baryon.hpp
- Copyright (C) 2015
- Copyright (C) 2016
+ Copyright (C) 2015-2018
Author: Antonin Portelli <antonin.portelli@me.com>
+ Author: Lanny91 <andrew.lawson@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -72,6 +72,9 @@ public:
// dependency relation
virtual std::vector<std::string> getInput(void);
virtual std::vector<std::string> getOutput(void);
+ protected:
+ // setup
+ virtual void setup(void);
// execution
virtual void execute(void);
};
@@ -99,11 +102,18 @@ std::vector<std::string> TBaryon<FImpl1, FImpl2, FImpl3>::getInput(void)
template <typename FImpl1, typename FImpl2, typename FImpl3>
std::vector<std::string> TBaryon<FImpl1, FImpl2, FImpl3>::getOutput(void)
{
-   std::vector<std::string> out = {getName()};
+   std::vector<std::string> out = {};
    return out;
}
+ // setup ///////////////////////////////////////////////////////////////////////
+ template <typename FImpl1, typename FImpl2, typename FImpl3>
+ void TBaryon<FImpl1, FImpl2, FImpl3>::setup(void)
+ {
+     envTmpLat(LatticeComplex, "c");
+ }
// execution ///////////////////////////////////////////////////////////////////
template <typename FImpl1, typename FImpl2, typename FImpl3>
void TBaryon<FImpl1, FImpl2, FImpl3>::execute(void)
@@ -112,11 +122,11 @@ void TBaryon<FImpl1, FImpl2, FImpl3>::execute(void)
                 << " quarks '" << par().q1 << "', '" << par().q2 << "', and '"
                 << par().q3 << "'" << std::endl;
-   CorrWriter       writer(par().output);
-   PropagatorField1 &q1 = *env().template getObject<PropagatorField1>(par().q1);
-   PropagatorField2 &q2 = *env().template getObject<PropagatorField2>(par().q2);
-   PropagatorField3 &q3 = *env().template getObject<PropagatorField3>(par().q2);
-   LatticeComplex   c(env().getGrid());
+   ResultWriter writer(RESULT_FILE_NAME(par().output));
+   auto &q1 = envGet(PropagatorField1, par().q1);
+   auto &q2 = envGet(PropagatorField2, par().q2);
+   auto &q3 = envGet(PropagatorField3, par().q2);
+   envGetTmp(LatticeComplex, c);
    Result result;
    // FIXME: do contractions
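The recurring change in these contraction modules is that temporaries are now registered in setup() and bound in execute(), instead of being constructed on the fly. A minimal sketch of the pattern with a hypothetical module name (the envTmpLat/envGet/envGetTmp macros are the ones used throughout this diff):

    template <typename FImpl>
    void TMyContraction<FImpl>::setup(void)
    {
        envTmpLat(LatticeComplex, "c");              // register a lattice temporary
    }

    template <typename FImpl>
    void TMyContraction<FImpl>::execute(void)
    {
        auto &q = envGet(PropagatorField, par().q);  // borrow an existing object
        envGetTmp(LatticeComplex, c);                // bind the registered temporary to 'c'
        c = trace(q*adj(q));                         // then use it as a normal field
    }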


@@ -4,9 +4,10 @@ Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MContraction/DiscLoop.hpp
- Copyright (C) 2017
+ Copyright (C) 2015-2018
- Author: Andrew Lawson <andrew.lawson1991@gmail.com>
+ Author: Antonin Portelli <antonin.portelli@me.com>
+ Author: Lanny91 <andrew.lawson@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -68,6 +69,7 @@ public:
// dependency relation
virtual std::vector<std::string> getInput(void);
virtual std::vector<std::string> getOutput(void);
+ protected:
// setup
virtual void setup(void);
// execution
@@ -97,7 +99,7 @@ std::vector<std::string> TDiscLoop<FImpl>::getInput(void)
template <typename FImpl>
std::vector<std::string> TDiscLoop<FImpl>::getOutput(void)
{
-   std::vector<std::string> out = {getName()};
+   std::vector<std::string> out = {};
    return out;
}
@@ -106,7 +108,7 @@ std::vector<std::string> TDiscLoop<FImpl>::getOutput(void)
template <typename FImpl>
void TDiscLoop<FImpl>::setup(void)
{
+   envTmpLat(LatticeComplex, "c");
}
// execution ///////////////////////////////////////////////////////////////////
@@ -117,13 +119,13 @@ void TDiscLoop<FImpl>::execute(void)
                 << "' using '" << par().q_loop << "' with " << par().gamma
                 << " insertion." << std::endl;
-   CorrWriter      writer(par().output);
-   PropagatorField &q_loop = *env().template getObject<PropagatorField>(par().q_loop);
-   LatticeComplex  c(env().getGrid());
+   ResultWriter writer(RESULT_FILE_NAME(par().output));
+   auto &q_loop = envGet(PropagatorField, par().q_loop);
    Gamma gamma(par().gamma);
    std::vector<TComplex> buf;
    Result result;
+   envGetTmp(LatticeComplex, c);
    c = trace(gamma*q_loop);
    sliceSum(c, buf, Tp);


@@ -4,9 +4,10 @@ Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MContraction/Gamma3pt.hpp
- Copyright (C) 2017
+ Copyright (C) 2015-2018
- Author: Andrew Lawson <andrew.lawson1991@gmail.com>
+ Author: Antonin Portelli <antonin.portelli@me.com>
+ Author: Lanny91 <andrew.lawson@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -51,6 +52,14 @@ BEGIN_HADRONS_NAMESPACE
 *                     q1
 *
 * trace(g5*q1*adj(q2)*g5*gamma*q3)
+ *
+ * options:
+ *  - q1: sink smeared propagator, source at i
+ *  - q2: propagator, source at i
+ *  - q3: propagator, source at f
+ *  - gamma: gamma matrix to insert
+ *  - tSnk: sink position for propagator q1.
+ *
 */
/******************************************************************************
@@ -66,6 +75,7 @@ public:
                                    std::string,    q2,
                                    std::string,    q3,
                                    Gamma::Algebra, gamma,
+                                   unsigned int,   tSnk,
                                    std::string,    output);
};
@@ -90,6 +100,7 @@ public:
// dependency relation
virtual std::vector<std::string> getInput(void);
virtual std::vector<std::string> getOutput(void);
+ protected:
// setup
virtual void setup(void);
// execution
@@ -119,7 +130,7 @@ std::vector<std::string> TGamma3pt<FImpl1, FImpl2, FImpl3>::getInput(void)
template <typename FImpl1, typename FImpl2, typename FImpl3>
std::vector<std::string> TGamma3pt<FImpl1, FImpl2, FImpl3>::getOutput(void)
{
-   std::vector<std::string> out = {getName()};
+   std::vector<std::string> out = {};
    return out;
}
@@ -128,7 +139,7 @@ std::vector<std::string> TGamma3pt<FImpl1, FImpl2, FImpl3>::getOutput(void)
template <typename FImpl1, typename FImpl2, typename FImpl3>
void TGamma3pt<FImpl1, FImpl2, FImpl3>::setup(void)
{
+   envTmpLat(LatticeComplex, "c");
}
// execution ///////////////////////////////////////////////////////////////////
@@ -140,17 +151,22 @@ void TGamma3pt<FImpl1, FImpl2, FImpl3>::execute(void)
                 << par().q3 << "', with " << par().gamma << " insertion."
                 << std::endl;
-   CorrWriter       writer(par().output);
-   PropagatorField1 &q1 = *env().template getObject<PropagatorField1>(par().q1);
-   PropagatorField2 &q2 = *env().template getObject<PropagatorField2>(par().q2);
-   PropagatorField3 &q3 = *env().template getObject<PropagatorField3>(par().q3);
-   LatticeComplex   c(env().getGrid());
+   // Initialise variables. q2 and q3 are normal propagators, q1 may be
+   // sink smeared.
+   ResultWriter writer(RESULT_FILE_NAME(par().output));
+   auto &q1 = envGet(SlicedPropagator1, par().q1);
+   auto &q2 = envGet(PropagatorField2, par().q2);
+   auto &q3 = envGet(PropagatorField2, par().q3);
    Gamma g5(Gamma::Algebra::Gamma5);
    Gamma gamma(par().gamma);
    std::vector<TComplex> buf;
    Result result;
-   c = trace(g5*q1*adj(q2)*(g5*gamma)*q3);
+   // Extract relevant timeslice of sinked propagator q1, then contract &
+   // sum over all spacial positions of gamma insertion.
+   SitePropagator1 q1Snk = q1[par().tSnk];
+   envGetTmp(LatticeComplex, c);
+   c = trace(g5*q1Snk*adj(q2)*(g5*gamma)*q3);
    sliceSum(c, buf, Tp);
    result.gamma = par().gamma;
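Given the options documented above (q1 sink smeared with source at t_i, q2/q3 ordinary propagators, a gamma insertion and the sink timeslice tSnk), a hedged application-level sketch, assuming the FIMPL instantiation registered by the module; all object names are illustrative:

    MContraction::Gamma3pt::Par gamma3ptPar;
    gamma3ptPar.q1     = "Qpt_l_smeared";    // sink-smeared propagator, source at t_i
    gamma3ptPar.q2     = "Qpt_l";            // propagator, source at t_i
    gamma3ptPar.q3     = "Qpt_s_tf";         // propagator, source at t_f
    gamma3ptPar.gamma  = Gamma::Algebra::GammaT;
    gamma3ptPar.tSnk   = 24;                 // sink timeslice of q1
    gamma3ptPar.output = "3pt/gammaT_l_s";
    application.createModule<MContraction::Gamma3pt>("3pt_gammaT", gamma3ptPar);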


@@ -4,12 +4,10 @@ Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MContraction/Meson.hpp
- Copyright (C) 2015
- Copyright (C) 2016
- Copyright (C) 2017
+ Copyright (C) 2015-2018
Author: Antonin Portelli <antonin.portelli@me.com>
-         Andrew Lawson <andrew.lawson1991@gmail.com>
+ Author: Lanny91 <andrew.lawson@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -51,8 +49,7 @@ BEGIN_HADRONS_NAMESPACE
 in a sequence (e.g. "<Gamma5 Gamma5><Gamma5 GammaT>").
 Special values: "all" - perform all possible contractions.
- - mom: momentum insertion, space-separated float sequence (e.g ".1 .2 1. 0."),
-   given as multiples of (2*pi) / L.
+ - sink: module to compute the sink to use in contraction (string).
*/
/******************************************************************************
@@ -98,6 +95,9 @@ public:
virtual std::vector<std::string> getInput(void);
virtual std::vector<std::string> getOutput(void);
virtual void parseGammaString(std::vector<GammaPair> &gammaList);
+ protected:
+ // execution
+ virtual void setup(void);
// execution
virtual void execute(void);
};
@@ -125,7 +125,7 @@ std::vector<std::string> TMeson<FImpl1, FImpl2>::getInput(void)
template <typename FImpl1, typename FImpl2>
std::vector<std::string> TMeson<FImpl1, FImpl2>::getOutput(void)
{
-   std::vector<std::string> output = {getName()};
+   std::vector<std::string> output = {};
    return output;
}
@@ -154,6 +154,12 @@ void TMeson<FImpl1, FImpl2>::parseGammaString(std::vector<GammaPair> &gammaList)
    }
}
+ // execution ///////////////////////////////////////////////////////////////////
+ template <typename FImpl1, typename FImpl2>
+ void TMeson<FImpl1, FImpl2>::setup(void)
+ {
+     envTmpLat(LatticeComplex, "c");
+ }
// execution ///////////////////////////////////////////////////////////////////
#define mesonConnected(q1, q2, gSnk, gSrc) \
@@ -166,7 +172,7 @@ void TMeson<FImpl1, FImpl2>::execute(void)
                 << " quarks '" << par().q1 << "' and '" << par().q2 << "'"
                 << std::endl;
-   CorrWriter writer(par().output);
+   ResultWriter writer(RESULT_FILE_NAME(par().output));
    std::vector<TComplex> buf;
    std::vector<Result>   result;
    Gamma                 g5(Gamma::Algebra::Gamma5);
@@ -181,11 +187,11 @@ void TMeson<FImpl1, FImpl2>::execute(void)
        result[i].gamma_src = gammaList[i].second;
        result[i].corr.resize(nt);
    }
-   if (env().template isObjectOfType<SlicedPropagator1>(par().q1) and
-       env().template isObjectOfType<SlicedPropagator2>(par().q2))
+   if (envHasType(SlicedPropagator1, par().q1) and
+       envHasType(SlicedPropagator2, par().q2))
    {
-       SlicedPropagator1 &q1 = *env().template getObject<SlicedPropagator1>(par().q1);
-       SlicedPropagator2 &q2 = *env().template getObject<SlicedPropagator2>(par().q2);
+       auto &q1 = envGet(SlicedPropagator1, par().q1);
+       auto &q2 = envGet(SlicedPropagator2, par().q2);
        LOG(Message) << "(propagator already sinked)" << std::endl;
        for (unsigned int i = 0; i < result.size(); ++i)
@@ -201,10 +207,10 @@ void TMeson<FImpl1, FImpl2>::execute(void)
    }
    else
    {
-       PropagatorField1 &q1 = *env().template getObject<PropagatorField1>(par().q1);
-       PropagatorField2 &q2 = *env().template getObject<PropagatorField2>(par().q2);
-       LatticeComplex   c(env().getGrid());
+       auto &q1 = envGet(PropagatorField1, par().q1);
+       auto &q2 = envGet(PropagatorField2, par().q2);
+       envGetTmp(LatticeComplex, c);
        LOG(Message) << "(using sink '" << par().sink << "')" << std::endl;
        for (unsigned int i = 0; i < result.size(); ++i)
        {
@@ -212,18 +218,17 @@ void TMeson<FImpl1, FImpl2>::execute(void)
            Gamma gSrc(gammaList[i].second);
            std::string ns;
-           ns = env().getModuleNamespace(env().getObjectModule(par().sink));
+           ns = vm().getModuleNamespace(env().getObjectModule(par().sink));
            if (ns == "MSource")
            {
-               PropagatorField1 &sink =
-                   *env().template getObject<PropagatorField1>(par().sink);
+               PropagatorField1 &sink = envGet(PropagatorField1, par().sink);
                c = trace(mesonConnected(q1, q2, gSnk, gSrc)*sink);
                sliceSum(c, buf, Tp);
            }
            else if (ns == "MSink")
            {
-               SinkFnScalar &sink = *env().template getObject<SinkFnScalar>(par().sink);
+               SinkFnScalar &sink = envGet(SinkFnScalar, par().sink);
                c = trace(mesonConnected(q1, q2, gSnk, gSrc));
                buf = sink(c);
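The meson module now takes a sink module instead of a raw momentum string. A sketch of the corresponding application wiring, assuming the parameter names q1/q2/gammas/sink/output and a point-sink module with a mom parameter; object names are illustrative:

    MSink::Point::Par sinkPar;
    sinkPar.mom = "0 0 0";
    application.createModule<MSink::Point>("sink_p000", sinkPar);

    MContraction::Meson::Par mesPar;
    mesPar.q1     = "Qpt_l";
    mesPar.q2     = "Qpt_l";
    mesPar.gammas = "<Gamma5 Gamma5><Gamma5 GammaT>"; // or "all"
    mesPar.sink   = "sink_p000";
    mesPar.output = "mesons/pt_ll";
    application.createModule<MContraction::Meson>("meson_pt_ll", mesPar);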


@@ -0,0 +1,224 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MContraction/WardIdentity.hpp
Copyright (C) 2015-2018
Author: Antonin Portelli <antonin.portelli@me.com>
Author: Lanny91 <andrew.lawson@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_MContraction_WardIdentity_hpp_
#define Hadrons_MContraction_WardIdentity_hpp_
#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
#include <Grid/Hadrons/ModuleFactory.hpp>
BEGIN_HADRONS_NAMESPACE
/*
Ward Identity contractions
-----------------------------
* options:
- q: propagator, 5D if available (string)
- action: action module used for propagator solution (string)
- mass: mass of quark (double)
- test_axial: whether or not to test PCAC relation.
*/
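/* Aside (illustrative, not part of the committed file): written out, the checks
   below assume the standard conserved-current relations with a backward lattice
   derivative Delta^-_mu,

       sum_mu Delta^-_mu <V_mu(x)> = 0
       sum_mu Delta^-_mu <A_mu(x)> = 2 m <P(x)> + 2 <J_5q(x)>

   where P is the pseudoscalar density and J_5q is the fifth-dimension midpoint
   contribution, which vanishes for a purely 4D action (Ls = 1). */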
/******************************************************************************
* WardIdentity *
******************************************************************************/
BEGIN_MODULE_NAMESPACE(MContraction)
class WardIdentityPar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(WardIdentityPar,
std::string, q,
std::string, action,
double, mass,
bool, test_axial);
};
template <typename FImpl>
class TWardIdentity: public Module<WardIdentityPar>
{
public:
FERM_TYPE_ALIASES(FImpl,);
public:
// constructor
TWardIdentity(const std::string name);
// destructor
virtual ~TWardIdentity(void) = default;
// dependency relation
virtual std::vector<std::string> getInput(void);
virtual std::vector<std::string> getOutput(void);
protected:
// setup
virtual void setup(void);
// execution
virtual void execute(void);
private:
unsigned int Ls_;
};
MODULE_REGISTER_NS(WardIdentity, TWardIdentity<FIMPL>, MContraction);
/******************************************************************************
* TWardIdentity implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
template <typename FImpl>
TWardIdentity<FImpl>::TWardIdentity(const std::string name)
: Module<WardIdentityPar>(name)
{}
// dependencies/products ///////////////////////////////////////////////////////
template <typename FImpl>
std::vector<std::string> TWardIdentity<FImpl>::getInput(void)
{
std::vector<std::string> in = {par().q, par().action};
return in;
}
template <typename FImpl>
std::vector<std::string> TWardIdentity<FImpl>::getOutput(void)
{
std::vector<std::string> out = {};
return out;
}
// setup ///////////////////////////////////////////////////////////////////////
template <typename FImpl>
void TWardIdentity<FImpl>::setup(void)
{
Ls_ = env().getObjectLs(par().q);
if (Ls_ != env().getObjectLs(par().action))
{
HADRON_ERROR(Size, "Ls mismatch between quark action and propagator");
}
envTmpLat(PropagatorField, "tmp");
envTmpLat(PropagatorField, "vector_WI");
if (par().test_axial)
{
envTmpLat(PropagatorField, "psi");
envTmpLat(LatticeComplex, "PP");
envTmpLat(LatticeComplex, "axial_defect");
envTmpLat(LatticeComplex, "PJ5q");
}
}
// execution ///////////////////////////////////////////////////////////////////
template <typename FImpl>
void TWardIdentity<FImpl>::execute(void)
{
LOG(Message) << "Performing Ward Identity checks for quark '" << par().q
<< "'." << std::endl;
auto &q = envGet(PropagatorField, par().q);
auto &act = envGet(FMat, par().action);
Gamma g5(Gamma::Algebra::Gamma5);
// Compute D_mu V_mu, D here is backward derivative.
envGetTmp(PropagatorField, tmp);
envGetTmp(PropagatorField, vector_WI);
vector_WI = zero;
for (unsigned int mu = 0; mu < Nd; ++mu)
{
act.ContractConservedCurrent(q, q, tmp, Current::Vector, mu);
tmp -= Cshift(tmp, mu, -1);
vector_WI += tmp;
}
// Test ward identity D_mu V_mu = 0;
LOG(Message) << "Vector Ward Identity check Delta_mu V_mu = "
<< norm2(vector_WI) << std::endl;
if (par().test_axial)
{
envGetTmp(PropagatorField, psi);
envGetTmp(LatticeComplex, PP);
envGetTmp(LatticeComplex, axial_defect);
envGetTmp(LatticeComplex, PJ5q);
std::vector<TComplex> axial_buf;
// Compute <P|D_mu A_mu>, D is backwards derivative.
axial_defect = zero;
for (unsigned int mu = 0; mu < Nd; ++mu)
{
act.ContractConservedCurrent(q, q, tmp, Current::Axial, mu);
tmp -= Cshift(tmp, mu, -1);
axial_defect += trace(g5*tmp);
}
// Get <P|J5q> for 5D (zero for 4D) and <P|P>.
PJ5q = zero;
if (Ls_ > 1)
{
// <P|P>
ExtractSlice(tmp, q, 0, 0);
psi = 0.5 * (tmp - g5*tmp);
ExtractSlice(tmp, q, Ls_ - 1, 0);
psi += 0.5 * (tmp + g5*tmp);
PP = trace(adj(psi)*psi);
// <P|5Jq>
ExtractSlice(tmp, q, Ls_/2 - 1, 0);
psi = 0.5 * (tmp + g5*tmp);
ExtractSlice(tmp, q, Ls_/2, 0);
psi += 0.5 * (tmp - g5*tmp);
PJ5q = trace(adj(psi)*psi);
}
else
{
PP = trace(adj(q)*q);
}
// Test ward identity <P|D_mu A_mu> = 2m<P|P> + 2<P|J5q>
LOG(Message) << "|D_mu A_mu|^2 = " << norm2(axial_defect) << std::endl;
LOG(Message) << "|PP|^2 = " << norm2(PP) << std::endl;
LOG(Message) << "|PJ5q|^2 = " << norm2(PJ5q) << std::endl;
LOG(Message) << "Axial Ward Identity defect Delta_mu A_mu = "
<< norm2(axial_defect) << std::endl;
// Axial defect by timeslice.
axial_defect -= 2.*(par().mass*PP + PJ5q);
LOG(Message) << "Check Axial defect by timeslice" << std::endl;
sliceSum(axial_defect, axial_buf, Tp);
for (int t = 0; t < axial_buf.size(); ++t)
{
LOG(Message) << "t = " << t << ": "
<< TensorRemove(axial_buf[t]) << std::endl;
}
}
}
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_WardIdentity_hpp_


@@ -4,9 +4,10 @@ Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MContraction/WeakHamiltonian.hpp
- Copyright (C) 2017
+ Copyright (C) 2015-2018
- Author: Andrew Lawson <andrew.lawson1991@gmail.com>
+ Author: Antonin Portelli <antonin.portelli@me.com>
+ Author: Lanny91 <andrew.lawson@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -76,6 +77,7 @@ public:
                                    std::string,  q2,
                                    std::string,  q3,
                                    std::string,  q4,
+                                   unsigned int, tSnk,
                                    std::string,  output);
};
@@ -99,11 +101,13 @@ public:\
    /* dependency relation */ \
    virtual std::vector<std::string> getInput(void);\
    virtual std::vector<std::string> getOutput(void);\
+ public:\
+     std::vector<std::string> VA_label = {"V", "A"};\
+ protected:\
    /* setup */ \
    virtual void setup(void);\
    /* execution */ \
    virtual void execute(void);\
-     std::vector<std::string> VA_label = {"V", "A"};\
};\
MODULE_REGISTER_NS(modname, T##modname, MContraction);


@@ -4,9 +4,10 @@ Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MContraction/WeakHamiltonianEye.cc
- Copyright (C) 2017
+ Copyright (C) 2015-2018
- Author: Andrew Lawson <andrew.lawson1991@gmail.com>
+ Author: Antonin Portelli <antonin.portelli@me.com>
+ Author: Lanny91 <andrew.lawson@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -54,6 +55,8 @@ using namespace MContraction;
 *
 * S: trace(q3*g5*q1*adj(q2)*g5*gL[mu][p_1]*q4*gL[mu][p_2])
 * E: trace(q3*g5*q1*adj(q2)*g5*gL[mu][p_1])*trace(q4*gL[mu][p_2])
+ *
+ * Note q1 must be sink smeared.
 */
/******************************************************************************
@@ -74,7 +77,7 @@ std::vector<std::string> TWeakHamiltonianEye::getInput(void)
std::vector<std::string> TWeakHamiltonianEye::getOutput(void)
{
-   std::vector<std::string> out = {getName()};
+   std::vector<std::string> out = {};
    return out;
}
@@ -82,7 +85,15 @@ std::vector<std::string> TWeakHamiltonianEye::getOutput(void)
// setup ///////////////////////////////////////////////////////////////////////
void TWeakHamiltonianEye::setup(void)
{
+   unsigned int ndim = env().getNd();
+   envTmpLat(LatticeComplex,  "expbuf");
+   envTmpLat(PropagatorField, "tmp1");
+   envTmpLat(LatticeComplex,  "tmp2");
+   envTmp(std::vector<PropagatorField>, "S_body", 1, ndim, PropagatorField(env().getGrid()));
+   envTmp(std::vector<PropagatorField>, "S_loop", 1, ndim, PropagatorField(env().getGrid()));
+   envTmp(std::vector<LatticeComplex>,  "E_body", 1, ndim, LatticeComplex(env().getGrid()));
+   envTmp(std::vector<LatticeComplex>,  "E_loop", 1, ndim, LatticeComplex(env().getGrid()));
}
// execution ///////////////////////////////////////////////////////////////////
@@ -93,28 +104,31 @@ void TWeakHamiltonianEye::execute(void)
                 << par().q2 << ", '" << par().q3 << "' and '" << par().q4
                 << "'." << std::endl;
-   CorrWriter            writer(par().output);
-   PropagatorField       &q1 = *env().template getObject<PropagatorField>(par().q1);
-   PropagatorField       &q2 = *env().template getObject<PropagatorField>(par().q2);
-   PropagatorField       &q3 = *env().template getObject<PropagatorField>(par().q3);
-   PropagatorField       &q4 = *env().template getObject<PropagatorField>(par().q4);
+   ResultWriter writer(RESULT_FILE_NAME(par().output));
+   auto &q1 = envGet(SlicedPropagator, par().q1);
+   auto &q2 = envGet(PropagatorField, par().q2);
+   auto &q3 = envGet(PropagatorField, par().q3);
+   auto &q4 = envGet(PropagatorField, par().q4);
    Gamma g5 = Gamma(Gamma::Algebra::Gamma5);
-   LatticeComplex        expbuf(env().getGrid());
    std::vector<TComplex> corrbuf;
    std::vector<Result>   result(n_eye_diag);
    unsigned int ndim = env().getNd();
-   PropagatorField tmp1(env().getGrid());
-   LatticeComplex  tmp2(env().getGrid());
-   std::vector<PropagatorField> S_body(ndim, tmp1);
-   std::vector<PropagatorField> S_loop(ndim, tmp1);
-   std::vector<LatticeComplex>  E_body(ndim, tmp2);
-   std::vector<LatticeComplex>  E_loop(ndim, tmp2);
+   envGetTmp(LatticeComplex, expbuf);
+   envGetTmp(PropagatorField, tmp1);
+   envGetTmp(LatticeComplex, tmp2);
+   envGetTmp(std::vector<PropagatorField>, S_body);
+   envGetTmp(std::vector<PropagatorField>, S_loop);
+   envGetTmp(std::vector<LatticeComplex>,  E_body);
+   envGetTmp(std::vector<LatticeComplex>,  E_loop);
+   // Get sink timeslice of q1.
+   SitePropagator q1Snk = q1[par().tSnk];
    // Setup for S-type contractions.
    for (int mu = 0; mu < ndim; ++mu)
    {
-       S_body[mu] = MAKE_SE_BODY(q1, q2, q3, GammaL(Gamma::gmu[mu]));
+       S_body[mu] = MAKE_SE_BODY(q1Snk, q2, q3, GammaL(Gamma::gmu[mu]));
        S_loop[mu] = MAKE_SE_LOOP(q4, GammaL(Gamma::gmu[mu]));
    }


@@ -4,9 +4,10 @@ Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MContraction/WeakHamiltonianEye.hpp
- Copyright (C) 2017
+ Copyright (C) 2015-2018
- Author: Andrew Lawson <andrew.lawson1991@gmail.com>
+ Author: Antonin Portelli <antonin.portelli@me.com>
+ Author: Lanny91 <andrew.lawson@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by


@@ -4,9 +4,10 @@ Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MContraction/WeakHamiltonianNonEye.cc
- Copyright (C) 2017
+ Copyright (C) 2015-2018
- Author: Andrew Lawson <andrew.lawson1991@gmail.com>
+ Author: Antonin Portelli <antonin.portelli@me.com>
+ Author: Lanny91 <andrew.lawson@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -76,7 +77,7 @@ std::vector<std::string> TWeakHamiltonianNonEye::getInput(void)
std::vector<std::string> TWeakHamiltonianNonEye::getOutput(void)
{
-   std::vector<std::string> out = {getName()};
+   std::vector<std::string> out = {};
    return out;
}
@@ -84,7 +85,15 @@ std::vector<std::string> TWeakHamiltonianNonEye::getOutput(void)
// setup ///////////////////////////////////////////////////////////////////////
void TWeakHamiltonianNonEye::setup(void)
{
+   unsigned int ndim = env().getNd();
+   envTmpLat(LatticeComplex,  "expbuf");
+   envTmpLat(PropagatorField, "tmp1");
+   envTmpLat(LatticeComplex,  "tmp2");
+   envTmp(std::vector<PropagatorField>, "C_i_side_loop", 1, ndim, PropagatorField(env().getGrid()));
+   envTmp(std::vector<PropagatorField>, "C_f_side_loop", 1, ndim, PropagatorField(env().getGrid()));
+   envTmp(std::vector<LatticeComplex>,  "W_i_side_loop", 1, ndim, LatticeComplex(env().getGrid()));
+   envTmp(std::vector<LatticeComplex>,  "W_f_side_loop", 1, ndim, LatticeComplex(env().getGrid()));
}
// execution ///////////////////////////////////////////////////////////////////
@@ -95,23 +104,23 @@ void TWeakHamiltonianNonEye::execute(void)
                 << par().q2 << ", '" << par().q3 << "' and '" << par().q4
                 << "'." << std::endl;
-   CorrWriter            writer(par().output);
-   PropagatorField       &q1 = *env().template getObject<PropagatorField>(par().q1);
-   PropagatorField       &q2 = *env().template getObject<PropagatorField>(par().q2);
-   PropagatorField       &q3 = *env().template getObject<PropagatorField>(par().q3);
-   PropagatorField       &q4 = *env().template getObject<PropagatorField>(par().q4);
+   ResultWriter writer(RESULT_FILE_NAME(par().output));
+   auto &q1 = envGet(PropagatorField, par().q1);
+   auto &q2 = envGet(PropagatorField, par().q2);
+   auto &q3 = envGet(PropagatorField, par().q3);
+   auto &q4 = envGet(PropagatorField, par().q4);
    Gamma g5 = Gamma(Gamma::Algebra::Gamma5);
-   LatticeComplex        expbuf(env().getGrid());
    std::vector<TComplex> corrbuf;
    std::vector<Result>   result(n_noneye_diag);
    unsigned int ndim = env().getNd();
-   PropagatorField tmp1(env().getGrid());
-   LatticeComplex  tmp2(env().getGrid());
-   std::vector<PropagatorField> C_i_side_loop(ndim, tmp1);
-   std::vector<PropagatorField> C_f_side_loop(ndim, tmp1);
-   std::vector<LatticeComplex>  W_i_side_loop(ndim, tmp2);
-   std::vector<LatticeComplex>  W_f_side_loop(ndim, tmp2);
+   envGetTmp(LatticeComplex, expbuf);
+   envGetTmp(PropagatorField, tmp1);
+   envGetTmp(LatticeComplex, tmp2);
+   envGetTmp(std::vector<PropagatorField>, C_i_side_loop);
+   envGetTmp(std::vector<PropagatorField>, C_f_side_loop);
+   envGetTmp(std::vector<LatticeComplex>,  W_i_side_loop);
+   envGetTmp(std::vector<LatticeComplex>,  W_f_side_loop);
    // Setup for C-type contractions.
    for (int mu = 0; mu < ndim; ++mu)


@@ -4,9 +4,10 @@ Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MContraction/WeakHamiltonianNonEye.hpp
- Copyright (C) 2017
+ Copyright (C) 2015-2018
- Author: Andrew Lawson <andrew.lawson1991@gmail.com>
+ Author: Antonin Portelli <antonin.portelli@me.com>
+ Author: Lanny91 <andrew.lawson@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by


@@ -4,9 +4,10 @@ Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MContraction/WeakNeutral4ptDisc.cc
- Copyright (C) 2017
+ Copyright (C) 2015-2018
- Author: Andrew Lawson <andrew.lawson1991@gmail.com>
+ Author: Antonin Portelli <antonin.portelli@me.com>
+ Author: Lanny91 <andrew.lawson@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -78,7 +79,7 @@ std::vector<std::string> TWeakNeutral4ptDisc::getInput(void)
std::vector<std::string> TWeakNeutral4ptDisc::getOutput(void)
{
-   std::vector<std::string> out = {getName()};
+   std::vector<std::string> out = {};
    return out;
}
@@ -86,7 +87,13 @@ std::vector<std::string> TWeakNeutral4ptDisc::getOutput(void)
// setup ///////////////////////////////////////////////////////////////////////
void TWeakNeutral4ptDisc::setup(void)
{
+   unsigned int ndim = env().getNd();
+   envTmpLat(LatticeComplex,  "expbuf");
+   envTmpLat(PropagatorField, "tmp");
+   envTmpLat(LatticeComplex,  "curr");
+   envTmp(std::vector<PropagatorField>, "meson", 1, ndim, PropagatorField(env().getGrid()));
+   envTmp(std::vector<PropagatorField>, "loop", 1, ndim, PropagatorField(env().getGrid()));
}
// execution ///////////////////////////////////////////////////////////////////
@@ -97,21 +104,21 @@ void TWeakNeutral4ptDisc::execute(void)
                 << par().q2 << ", '" << par().q3 << "' and '" << par().q4
                 << "'." << std::endl;
-   CorrWriter            writer(par().output);
-   PropagatorField       &q1 = *env().template getObject<PropagatorField>(par().q1);
-   PropagatorField       &q2 = *env().template getObject<PropagatorField>(par().q2);
-   PropagatorField       &q3 = *env().template getObject<PropagatorField>(par().q3);
-   PropagatorField       &q4 = *env().template getObject<PropagatorField>(par().q4);
+   ResultWriter writer(RESULT_FILE_NAME(par().output));
+   auto &q1 = envGet(PropagatorField, par().q1);
+   auto &q2 = envGet(PropagatorField, par().q2);
+   auto &q3 = envGet(PropagatorField, par().q3);
+   auto &q4 = envGet(PropagatorField, par().q4);
    Gamma g5 = Gamma(Gamma::Algebra::Gamma5);
-   LatticeComplex        expbuf(env().getGrid());
    std::vector<TComplex> corrbuf;
    std::vector<Result>   result(n_neut_disc_diag);
    unsigned int ndim = env().getNd();
-   PropagatorField tmp(env().getGrid());
-   std::vector<PropagatorField> meson(ndim, tmp);
-   std::vector<PropagatorField> loop(ndim, tmp);
-   LatticeComplex curr(env().getGrid());
+   envGetTmp(LatticeComplex, expbuf);
+   envGetTmp(PropagatorField, tmp);
+   envGetTmp(LatticeComplex, curr);
+   envGetTmp(std::vector<PropagatorField>, meson);
+   envGetTmp(std::vector<PropagatorField>, loop);
    // Setup for type 1 contractions.
    for (int mu = 0; mu < ndim; ++mu)


@@ -4,9 +4,10 @@ Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MContraction/WeakNeutral4ptDisc.hpp
- Copyright (C) 2017
+ Copyright (C) 2015-2018
- Author: Andrew Lawson <andrew.lawson1991@gmail.com>
+ Author: Antonin Portelli <antonin.portelli@me.com>
+ Author: Lanny91 <andrew.lawson@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by


@@ -1,3 +1,32 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MFermion/GaugeProp.hpp
Copyright (C) 2015-2018
Author: Antonin Portelli <antonin.portelli@me.com>
Author: Lanny91 <andrew.lawson@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_MFermion_GaugeProp_hpp_
#define Hadrons_MFermion_GaugeProp_hpp_
@@ -7,6 +36,27 @@
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
* 5D -> 4D and 4D -> 5D conversions. *
******************************************************************************/
template<class vobj> // Note that 5D object is modified.
inline void make_4D(Lattice<vobj> &in_5d, Lattice<vobj> &out_4d, int Ls)
{
axpby_ssp_pminus(in_5d, 0., in_5d, 1., in_5d, 0, 0);
axpby_ssp_pplus(in_5d, 1., in_5d, 1., in_5d, 0, Ls-1);
ExtractSlice(out_4d, in_5d, 0, 0);
}
template<class vobj>
inline void make_5D(Lattice<vobj> &in_4d, Lattice<vobj> &out_5d, int Ls)
{
out_5d = zero;
InsertSlice(in_4d, out_5d, 0, 0);
InsertSlice(in_4d, out_5d, Ls-1, 0);
axpby_ssp_pplus(out_5d, 0., out_5d, 1., out_5d, 0, 0);
axpby_ssp_pminus(out_5d, 0., out_5d, 1., out_5d, Ls-1, Ls-1);
}
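/* Aside (illustrative, not part of the committed file): a minimal usage sketch
   of the helpers above, assuming 4D/5D grids g4/g5, an Ls value and a 5D solver
   are defined elsewhere (illustrative names only):

       LatticeFermion src4(g4), sol4(g4);   // physical 4D source and solution
       LatticeFermion src5(g5), sol5(g5);   // 5D fields with Ls slices

       make_5D(src4, src5, Ls);   // inject chiral projections at s = 0 and s = Ls-1
       solver(sol5, src5);        // any solver acting on the 5D fields
       make_4D(sol5, sol4, Ls);   // project back; note sol5 is modified in place

   This is the same pattern TGaugeProp<FImpl>::execute() follows below. */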
/******************************************************************************
 *                               GaugeProp                                    *
 ******************************************************************************/
@@ -33,6 +83,7 @@ public:
// dependency relation
virtual std::vector<std::string> getInput(void);
virtual std::vector<std::string> getOutput(void);
+ protected:
// setup
virtual void setup(void);
// execution
@@ -43,7 +94,6 @@ private:
};
MODULE_REGISTER_NS(GaugeProp, TGaugeProp<FIMPL>, MFermion);
/******************************************************************************
 *                      TGaugeProp implementation                             *
 ******************************************************************************/
@@ -75,10 +125,13 @@ template <typename FImpl>
void TGaugeProp<FImpl>::setup(void)
{
    Ls_ = env().getObjectLs(par().solver);
-   env().template registerLattice<PropagatorField>(getName());
+   envCreateLat(PropagatorField, getName());
+   envTmpLat(FermionField, "source", Ls_);
+   envTmpLat(FermionField, "sol", Ls_);
+   envTmpLat(FermionField, "tmp");
    if (Ls_ > 1)
    {
-       env().template registerLattice<PropagatorField>(getName() + "_5d", Ls_);
+       envCreateLat(PropagatorField, getName() + "_5d", Ls_);
    }
}
@@ -89,21 +142,18 @@ void TGaugeProp<FImpl>::execute(void)
    LOG(Message) << "Computing quark propagator '" << getName() << "'"
                 << std::endl;
-   FermionField source(env().getGrid(Ls_)), sol(env().getGrid(Ls_)),
-                tmp(env().getGrid());
    std::string propName = (Ls_ == 1) ? getName() : (getName() + "_5d");
-   PropagatorField &prop    = *env().template createLattice<PropagatorField>(propName);
-   PropagatorField &fullSrc = *env().template getObject<PropagatorField>(par().source);
-   SolverFn        &solver  = *env().template getObject<SolverFn>(par().solver);
-   if (Ls_ > 1)
-   {
-       env().template createLattice<PropagatorField>(getName());
-   }
+   auto &prop    = envGet(PropagatorField, propName);
+   auto &fullSrc = envGet(PropagatorField, par().source);
+   auto &solver  = envGet(SolverFn, par().solver);
+   envGetTmp(FermionField, source);
+   envGetTmp(FermionField, sol);
+   envGetTmp(FermionField, tmp);
    LOG(Message) << "Inverting using solver '" << par().solver
                 << "' on source '" << par().source << "'" << std::endl;
    for (unsigned int s = 0; s < Ns; ++s)
-   for (unsigned int c = 0; c < Nc; ++c)
+   for (unsigned int c = 0; c < FImpl::Dimension; ++c)
    {
        LOG(Message) << "Inversion for spin= " << s << ", color= " << c
                     << std::endl;
@@ -112,16 +162,12 @@ void TGaugeProp<FImpl>::execute(void)
        {
            if (Ls_ == 1)
            {
-               PropToFerm(source, fullSrc, s, c);
+               PropToFerm<FImpl>(source, fullSrc, s, c);
            }
            else
            {
-               source = zero;
-               PropToFerm(tmp, fullSrc, s, c);
-               InsertSlice(tmp, source, 0, 0);
-               InsertSlice(tmp, source, Ls_-1, 0);
-               axpby_ssp_pplus(source, 0., source, 1., source, 0, 0);
-               axpby_ssp_pminus(source, 0., source, 1., source, Ls_-1, Ls_-1);
+               PropToFerm<FImpl>(tmp, fullSrc, s, c);
+               make_5D(tmp, source, Ls_);
            }
        }
        // source conversion for 5D sources
@@ -129,26 +175,22 @@ void TGaugeProp<FImpl>::execute(void)
        {
            if (Ls_ != env().getObjectLs(par().source))
            {
-               HADRON_ERROR("Ls mismatch between quark action and source");
+               HADRON_ERROR(Size, "Ls mismatch between quark action and source");
            }
            else
            {
-               PropToFerm(source, fullSrc, s, c);
+               PropToFerm<FImpl>(source, fullSrc, s, c);
            }
        }
        sol = zero;
        solver(sol, source);
-       FermToProp(prop, sol, s, c);
+       FermToProp<FImpl>(prop, sol, s, c);
        // create 4D propagators from 5D one if necessary
        if (Ls_ > 1)
        {
-           PropagatorField &p4d =
-               *env().template getObject<PropagatorField>(getName());
-           axpby_ssp_pminus(sol, 0., sol, 1., sol, 0, 0);
-           axpby_ssp_pplus(sol, 1., sol, 1., sol, 0, Ls_-1);
-           ExtractSlice(tmp, sol, 0, 0);
-           FermToProp(p4d, tmp, s, c);
+           PropagatorField &p4d = envGet(PropagatorField, getName());
+           make_4D(sol, tmp, Ls_);
+           FermToProp<FImpl>(p4d, tmp, s, c);
        }
    }
}


@@ -0,0 +1,75 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MGauge/FundtoHirep.cc
Copyright (C) 2015
Copyright (C) 2016
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Hadrons/Modules/MGauge/FundtoHirep.hpp>
using namespace Grid;
using namespace Hadrons;
using namespace MGauge;
// constructor /////////////////////////////////////////////////////////////////
template <class Rep>
TFundtoHirep<Rep>::TFundtoHirep(const std::string name)
: Module<FundtoHirepPar>(name)
{}
// dependencies/products ///////////////////////////////////////////////////////
template <class Rep>
std::vector<std::string> TFundtoHirep<Rep>::getInput(void)
{
std::vector<std::string> in;
return in;
}
template <class Rep>
std::vector<std::string> TFundtoHirep<Rep>::getOutput(void)
{
std::vector<std::string> out = {getName()};
return out;
}
// setup ///////////////////////////////////////////////////////////////////////
template <typename Rep>
void TFundtoHirep<Rep>::setup(void)
{
envCreateLat(typename Rep::LatticeField, getName());
}
// execution ///////////////////////////////////////////////////////////////////
template <class Rep>
void TFundtoHirep<Rep>::execute(void)
{
auto &U = *env().template getObject<LatticeGaugeField>(par().gaugeconf);
LOG(Message) << "Transforming Representation" << std::endl;
Rep TargetRepresentation(U._grid);
TargetRepresentation.update_representation(U);
auto &URep = envGet(typename Rep::LatticeField, getName());
URep = TargetRepresentation.U;
}


@@ -0,0 +1,77 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MGauge/FundtoHirep.hpp
Copyright (C) 2015
Copyright (C) 2016
Author: David Preti <david.preti@to.infn.it>
Guido Cossu <guido.cossu@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_MGauge_FundtoHirep_hpp_
#define Hadrons_MGauge_FundtoHirep_hpp_
#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
#include <Grid/Hadrons/ModuleFactory.hpp>
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
* Load a NERSC configuration *
******************************************************************************/
BEGIN_MODULE_NAMESPACE(MGauge)
class FundtoHirepPar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(FundtoHirepPar,
std::string, gaugeconf);
};
template <class Rep>
class TFundtoHirep: public Module<FundtoHirepPar>
{
public:
// constructor
TFundtoHirep(const std::string name);
// destructor
virtual ~TFundtoHirep(void) = default;
// dependency relation
virtual std::vector<std::string> getInput(void);
virtual std::vector<std::string> getOutput(void);
// setup
void setup(void);
// execution
void execute(void);
};
//MODULE_REGISTER_NS(FundtoAdjoint, TFundtoHirep<AdjointRepresentation>, MGauge);
//MODULE_REGISTER_NS(FundtoTwoIndexSym, TFundtoHirep<TwoIndexSymmetricRepresentation>, MGauge);
//MODULE_REGISTER_NS(FundtoTwoIndexAsym, TFundtoHirep<TwoIndexAntiSymmetricRepresentation>, MGauge);
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_MGauge_FundtoHirep_hpp_

View File

@ -4,8 +4,7 @@ Grid physics library, www.github.com/paboyle/Grid
 Source file: extras/Hadrons/Modules/MGauge/Random.cc
-Copyright (C) 2015
-Copyright (C) 2016
+Copyright (C) 2015-2018
 Author: Antonin Portelli <antonin.portelli@me.com>
@ -44,7 +43,9 @@ TRandom::TRandom(const std::string name)
 // dependencies/products ///////////////////////////////////////////////////////
 std::vector<std::string> TRandom::getInput(void)
 {
-    return std::vector<std::string>();
+    std::vector<std::string> in;
+    return in;
 }
 std::vector<std::string> TRandom::getOutput(void)
@ -57,13 +58,14 @@ std::vector<std::string> TRandom::getOutput(void)
 // setup ///////////////////////////////////////////////////////////////////////
 void TRandom::setup(void)
 {
-    env().registerLattice<LatticeGaugeField>(getName());
+    envCreateLat(LatticeGaugeField, getName());
 }
 // execution ///////////////////////////////////////////////////////////////////
 void TRandom::execute(void)
 {
     LOG(Message) << "Generating random gauge configuration" << std::endl;
-    LatticeGaugeField &U = *env().createLattice<LatticeGaugeField>(getName());
+    auto &U = envGet(LatticeGaugeField, getName());
     SU3::HotConfiguration(*env().get4dRng(), U);
 }
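
A recurring change across the hunks in this commit is the replacement of explicit env().registerLattice<...>(), env().createLattice<...>() and env().getObject<...>() calls by the envCreateLat/envCacheLat/envGet convenience macros. As a rough, self-contained orientation only, the sketch below shows the generic named-object-registry pattern that such macros wrap; the Environment, create and get names are illustrative assumptions and not the Hadrons API.

// Minimal sketch (not the Hadrons implementation) of a named object registry:
// modules create objects by name during setup() and fetch them by name in execute().
#include <iostream>
#include <map>
#include <memory>
#include <string>

struct Environment
{
    // create (or replace) an object of type T under the given name (setup phase)
    template <typename T, typename... Args>
    T &create(const std::string &name, Args &&... args)
    {
        objects_[name] = std::make_shared<Holder<T>>(std::forward<Args>(args)...);
        return get<T>(name);
    }
    // retrieve a previously created object by name (execution phase)
    template <typename T>
    T &get(const std::string &name)
    {
        return std::static_pointer_cast<Holder<T>>(objects_.at(name))->value;
    }
private:
    struct HolderBase { virtual ~HolderBase() = default; };
    template <typename T>
    struct Holder: HolderBase
    {
        template <typename... Args>
        Holder(Args &&... args): value(std::forward<Args>(args)...) {}
        T value;
    };
    std::map<std::string, std::shared_ptr<HolderBase>> objects_;
};

int main(void)
{
    Environment env;

    env.create<double>("gauge", 1.0);   // analogue of envCreateLat(..., getName())
    auto &U = env.get<double>("gauge"); // analogue of envGet(..., getName())
    std::cout << U << std::endl;

    return 0;
}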

View File

@ -4,8 +4,7 @@ Grid physics library, www.github.com/paboyle/Grid
 Source file: extras/Hadrons/Modules/MGauge/Random.hpp
-Copyright (C) 2015
-Copyright (C) 2016
+Copyright (C) 2015-2018
 Author: Antonin Portelli <antonin.portelli@me.com>
@ -51,6 +50,7 @@ public:
     // dependency relation
     virtual std::vector<std::string> getInput(void);
     virtual std::vector<std::string> getOutput(void);
+protected:
     // setup
     virtual void setup(void);
     // execution

View File

@ -4,9 +4,9 @@ Grid physics library, www.github.com/paboyle/Grid
 Source file: extras/Hadrons/Modules/MGauge/StochEm.cc
-Copyright (C) 2015
-Copyright (C) 2016
+Copyright (C) 2015-2018
+Author: Antonin Portelli <antonin.portelli@me.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@ -57,32 +57,28 @@ std::vector<std::string> TStochEm::getOutput(void)
 // setup ///////////////////////////////////////////////////////////////////////
 void TStochEm::setup(void)
 {
-    if (!env().hasRegisteredObject("_" + getName() + "_weight"))
+    if (!env().hasCreatedObject("_" + getName() + "_weight"))
     {
-        env().registerLattice<EmComp>("_" + getName() + "_weight");
+        envCacheLat(EmComp, "_" + getName() + "_weight");
     }
-    env().registerLattice<EmField>(getName());
+    envCreateLat(EmField, getName());
 }
 // execution ///////////////////////////////////////////////////////////////////
 void TStochEm::execute(void)
 {
+    LOG(Message) << "Generating stochatic EM potential..." << std::endl;
     PhotonR photon(par().gauge, par().zmScheme);
-    EmField &a = *env().createLattice<EmField>(getName());
-    EmComp  *w;
+    auto    &a = envGet(EmField, getName());
+    auto    &w = envGet(EmComp, "_" + getName() + "_weight");
     if (!env().hasCreatedObject("_" + getName() + "_weight"))
     {
         LOG(Message) << "Caching stochatic EM potential weight (gauge: "
                      << par().gauge << ", zero-mode scheme: "
                      << par().zmScheme << ")..." << std::endl;
-        w = env().createLattice<EmComp>("_" + getName() + "_weight");
-        photon.StochasticWeight(*w);
+        photon.StochasticWeight(w);
     }
-    else
-    {
-        w = env().getObject<EmComp>("_" + getName() + "_weight");
-    }
-    LOG(Message) << "Generating stochatic EM potential..." << std::endl;
-    photon.StochasticField(a, *env().get4dRng(), *w);
+    photon.StochasticField(a, *env().get4dRng(), w);
 }

View File

@ -4,9 +4,9 @@ Grid physics library, www.github.com/paboyle/Grid
 Source file: extras/Hadrons/Modules/MGauge/StochEm.hpp
-Copyright (C) 2015
-Copyright (C) 2016
+Copyright (C) 2015-2018
+Author: Antonin Portelli <antonin.portelli@me.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@ -60,6 +60,7 @@ public:
     // dependency relation
     virtual std::vector<std::string> getInput(void);
     virtual std::vector<std::string> getOutput(void);
+protected:
     // setup
     virtual void setup(void);
     // execution

View File

@ -4,8 +4,7 @@ Grid physics library, www.github.com/paboyle/Grid
 Source file: extras/Hadrons/Modules/MGauge/Unit.cc
-Copyright (C) 2015
-Copyright (C) 2016
+Copyright (C) 2015-2018
 Author: Antonin Portelli <antonin.portelli@me.com>
@ -57,13 +56,14 @@ std::vector<std::string> TUnit::getOutput(void)
 // setup ///////////////////////////////////////////////////////////////////////
 void TUnit::setup(void)
 {
-    env().registerLattice<LatticeGaugeField>(getName());
+    envCreateLat(LatticeGaugeField, getName());
 }
 // execution ///////////////////////////////////////////////////////////////////
 void TUnit::execute(void)
 {
     LOG(Message) << "Creating unit gauge configuration" << std::endl;
-    LatticeGaugeField &U = *env().createLattice<LatticeGaugeField>(getName());
+    auto &U = envGet(LatticeGaugeField, getName());
     SU3::ColdConfiguration(*env().get4dRng(), U);
 }

View File

@ -4,8 +4,7 @@ Grid physics library, www.github.com/paboyle/Grid
 Source file: extras/Hadrons/Modules/MGauge/Unit.hpp
-Copyright (C) 2015
-Copyright (C) 2016
+Copyright (C) 2015-2018
 Author: Antonin Portelli <antonin.portelli@me.com>
@ -51,6 +50,7 @@ public:
     // dependencies/products
     virtual std::vector<std::string> getInput(void);
     virtual std::vector<std::string> getOutput(void);
+protected:
     // setup
     virtual void setup(void);
     // execution

View File

@ -0,0 +1,140 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MIO/LoadBinary.hpp
Copyright (C) 2015-2018
Author: Antonin Portelli <antonin.portelli@me.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_MIO_LoadBinary_hpp_
#define Hadrons_MIO_LoadBinary_hpp_
#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
#include <Grid/Hadrons/ModuleFactory.hpp>
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
* Load a binary configurations *
******************************************************************************/
BEGIN_MODULE_NAMESPACE(MIO)
class LoadBinaryPar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(LoadBinaryPar,
std::string, file,
std::string, format);
};
template <typename Impl>
class TLoadBinary: public Module<LoadBinaryPar>
{
public:
typedef typename Impl::Field Field;
typedef typename Impl::Simd Simd;
typedef typename Field::vector_object vobj;
typedef typename vobj::scalar_object sobj;
typedef typename sobj::DoublePrecision sobj_double;
typedef BinarySimpleMunger<sobj_double, sobj> Munger;
public:
// constructor
TLoadBinary(const std::string name);
// destructor
virtual ~TLoadBinary(void) = default;
// dependency relation
virtual std::vector<std::string> getInput(void);
virtual std::vector<std::string> getOutput(void);
// setup
virtual void setup(void);
// execution
virtual void execute(void);
};
MODULE_REGISTER_NS(LoadBinary, TLoadBinary<GIMPL>, MIO);
MODULE_REGISTER_NS(LoadBinaryScalarSU2, TLoadBinary<ScalarNxNAdjImplR<2>>, MIO);
MODULE_REGISTER_NS(LoadBinaryScalarSU3, TLoadBinary<ScalarNxNAdjImplR<3>>, MIO);
MODULE_REGISTER_NS(LoadBinaryScalarSU4, TLoadBinary<ScalarNxNAdjImplR<4>>, MIO);
MODULE_REGISTER_NS(LoadBinaryScalarSU5, TLoadBinary<ScalarNxNAdjImplR<5>>, MIO);
MODULE_REGISTER_NS(LoadBinaryScalarSU6, TLoadBinary<ScalarNxNAdjImplR<6>>, MIO);
/******************************************************************************
* TLoadBinary implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
template <typename Impl>
TLoadBinary<Impl>::TLoadBinary(const std::string name)
: Module<LoadBinaryPar>(name)
{}
// dependencies/products ///////////////////////////////////////////////////////
template <typename Impl>
std::vector<std::string> TLoadBinary<Impl>::getInput(void)
{
std::vector<std::string> in;
return in;
}
template <typename Impl>
std::vector<std::string> TLoadBinary<Impl>::getOutput(void)
{
std::vector<std::string> out = {getName()};
return out;
}
// setup ///////////////////////////////////////////////////////////////////////
template <typename Impl>
void TLoadBinary<Impl>::setup(void)
{
envCreateLat(Field, getName());
}
// execution ///////////////////////////////////////////////////////////////////
template <typename Impl>
void TLoadBinary<Impl>::execute(void)
{
Munger munge;
uint32_t nersc_csum, scidac_csuma, scidac_csumb;
auto &U = envGet(Field, getName());
std::string filename = par().file + "."
+ std::to_string(vm().getTrajectory());
LOG(Message) << "Loading " << par().format
<< " binary configuration from file '" << filename
<< "'" << std::endl;
BinaryIO::readLatticeObject<vobj, sobj_double>(U, filename, munge, 0,
par().format, nersc_csum,
scidac_csuma, scidac_csumb);
LOG(Message) << "Checksums:" << std::endl;
LOG(Message) << " NERSC " << nersc_csum << std::endl;
LOG(Message) << " SciDAC A " << scidac_csuma << std::endl;
LOG(Message) << " SciDAC B " << scidac_csumb << std::endl;
}
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_MIO_LoadBinary_hpp_
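
As a small illustration only: TLoadBinary (like LoadNersc below) derives the file it reads from the stem given in its parameters plus the current trajectory number, following par().file + "." + std::to_string(vm().getTrajectory()). The concrete values used below are hypothetical.

// Illustrative sketch of the trajectory-suffixed filename construction.
#include <iostream>
#include <string>

int main(void)
{
    const std::string file       = "ckpoint_lat"; // hypothetical value of par().file
    const int         trajectory = 1500;          // hypothetical vm().getTrajectory()

    std::string filename = file + "." + std::to_string(trajectory);
    std::cout << filename << std::endl;           // prints ckpoint_lat.1500
    return 0;
}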

View File

@ -2,10 +2,9 @@
 Grid physics library, www.github.com/paboyle/Grid
-Source file: extras/Hadrons/Modules/MGauge/Load.cc
-Copyright (C) 2015
-Copyright (C) 2016
+Source file: extras/Hadrons/Modules/MIO/LoadNersc.cc
+Copyright (C) 2015-2018
 Author: Antonin Portelli <antonin.portelli@me.com>
@ -26,30 +25,29 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /* END LEGAL */
-#include <Grid/Hadrons/Modules/MGauge/Load.hpp>
+#include <Grid/Hadrons/Modules/MIO/LoadNersc.hpp>
 using namespace Grid;
 using namespace Hadrons;
-using namespace MGauge;
+using namespace MIO;
 /******************************************************************************
-*                        TLoad implementation                                 *
+*                        TLoadNersc implementation                            *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
-TLoad::TLoad(const std::string name)
-: Module<LoadPar>(name)
+TLoadNersc::TLoadNersc(const std::string name)
+: Module<LoadNerscPar>(name)
 {}
 // dependencies/products ///////////////////////////////////////////////////////
-std::vector<std::string> TLoad::getInput(void)
+std::vector<std::string> TLoadNersc::getInput(void)
 {
     std::vector<std::string> in;
     return in;
 }
-std::vector<std::string> TLoad::getOutput(void)
+std::vector<std::string> TLoadNersc::getOutput(void)
 {
     std::vector<std::string> out = {getName()};
@ -57,21 +55,21 @@ std::vector<std::string> TLoad::getOutput(void)
 }
 // setup ///////////////////////////////////////////////////////////////////////
-void TLoad::setup(void)
+void TLoadNersc::setup(void)
 {
-    env().registerLattice<LatticeGaugeField>(getName());
+    envCreateLat(LatticeGaugeField, getName());
 }
 // execution ///////////////////////////////////////////////////////////////////
-void TLoad::execute(void)
+void TLoadNersc::execute(void)
 {
     FieldMetaData header;
     std::string   fileName = par().file + "."
-                             + std::to_string(env().getTrajectory());
+                             + std::to_string(vm().getTrajectory());
     LOG(Message) << "Loading NERSC configuration from file '" << fileName
                  << "'" << std::endl;
-    LatticeGaugeField &U = *env().createLattice<LatticeGaugeField>(getName());
+    auto &U = envGet(LatticeGaugeField, getName());
     NerscIO::readConfiguration(U, header, fileName);
     LOG(Message) << "NERSC header:" << std::endl;
     dump_meta_data(header, LOG(Message));

View File

@ -2,10 +2,9 @@
 Grid physics library, www.github.com/paboyle/Grid
-Source file: extras/Hadrons/Modules/MGauge/Load.hpp
-Copyright (C) 2015
-Copyright (C) 2016
+Source file: extras/Hadrons/Modules/MIO/LoadNersc.hpp
+Copyright (C) 2015-2018
 Author: Antonin Portelli <antonin.portelli@me.com>
@ -26,9 +25,8 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /* END LEGAL */
-#ifndef Hadrons_MGauge_Load_hpp_
-#define Hadrons_MGauge_Load_hpp_
+#ifndef Hadrons_MIO_LoadNersc_hpp_
+#define Hadrons_MIO_LoadNersc_hpp_
 #include <Grid/Hadrons/Global.hpp>
 #include <Grid/Hadrons/Module.hpp>
@ -39,22 +37,22 @@ BEGIN_HADRONS_NAMESPACE
 /******************************************************************************
 *                         Load a NERSC configuration                          *
 ******************************************************************************/
-BEGIN_MODULE_NAMESPACE(MGauge)
-class LoadPar: Serializable
+BEGIN_MODULE_NAMESPACE(MIO)
+class LoadNerscPar: Serializable
 {
 public:
-    GRID_SERIALIZABLE_CLASS_MEMBERS(LoadPar,
+    GRID_SERIALIZABLE_CLASS_MEMBERS(LoadNerscPar,
                                     std::string, file);
 };
-class TLoad: public Module<LoadPar>
+class TLoadNersc: public Module<LoadNerscPar>
 {
 public:
     // constructor
-    TLoad(const std::string name);
+    TLoadNersc(const std::string name);
     // destructor
-    virtual ~TLoad(void) = default;
+    virtual ~TLoadNersc(void) = default;
     // dependency relation
     virtual std::vector<std::string> getInput(void);
     virtual std::vector<std::string> getOutput(void);
@ -64,10 +62,10 @@ public:
     virtual void execute(void);
 };
-MODULE_REGISTER_NS(Load, TLoad, MGauge);
+MODULE_REGISTER_NS(LoadNersc, TLoadNersc, MIO);
 END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
-#endif // Hadrons_MGauge_Load_hpp_
+#endif // Hadrons_MIO_LoadNersc_hpp_

View File

@ -4,9 +4,10 @@ Grid physics library, www.github.com/paboyle/Grid
 Source file: extras/Hadrons/Modules/MLoop/NoiseLoop.hpp
-Copyright (C) 2016
-Author: Andrew Lawson <andrew.lawson1991@gmail.com>
+Copyright (C) 2015-2018
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Lanny91 <andrew.lawson@gmail.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@ -74,6 +75,7 @@ public:
     // dependency relation
     virtual std::vector<std::string> getInput(void);
     virtual std::vector<std::string> getOutput(void);
+protected:
     // setup
     virtual void setup(void);
     // execution
@ -112,16 +114,16 @@ std::vector<std::string> TNoiseLoop<FImpl>::getOutput(void)
 template <typename FImpl>
 void TNoiseLoop<FImpl>::setup(void)
 {
-    env().template registerLattice<PropagatorField>(getName());
+    envCreateLat(PropagatorField, getName());
 }
 // execution ///////////////////////////////////////////////////////////////////
 template <typename FImpl>
 void TNoiseLoop<FImpl>::execute(void)
 {
-    PropagatorField &loop = *env().template createLattice<PropagatorField>(getName());
-    PropagatorField &q    = *env().template getObject<PropagatorField>(par().q);
-    PropagatorField &eta  = *env().template getObject<PropagatorField>(par().eta);
+    auto &loop = envGet(PropagatorField, getName());
+    auto &q    = envGet(PropagatorField, par().q);
+    auto &eta  = envGet(PropagatorField, par().eta);
     loop = q*adj(eta);
 }

View File

@ -1,3 +1,31 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MScalar/ChargedProp.cc
Copyright (C) 2015-2018
Author: Antonin Portelli <antonin.portelli@me.com>
Author: James Harrison <jch1g10@soton.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
 #include <Grid/Hadrons/Modules/MScalar/ChargedProp.hpp>
 #include <Grid/Hadrons/Modules/MScalar/Scalar.hpp>
@ -37,89 +65,43 @@ void TChargedProp::setup(void)
     {
         phaseName_.push_back("_shiftphase_" + std::to_string(mu));
     }
-    GFSrcName_ = "_" + getName() + "_DinvSrc";
-    if (!env().hasRegisteredObject(freeMomPropName_))
-    {
-        env().registerLattice<ScalarField>(freeMomPropName_);
-    }
-    if (!env().hasRegisteredObject(phaseName_[0]))
-    {
-        for (unsigned int mu = 0; mu < env().getNd(); ++mu)
-        {
-            env().registerLattice<ScalarField>(phaseName_[mu]);
-        }
-    }
-    if (!env().hasRegisteredObject(GFSrcName_))
-    {
-        env().registerLattice<ScalarField>(GFSrcName_);
-    }
-    env().registerLattice<ScalarField>(getName());
+    GFSrcName_ = getName() + "_DinvSrc";
+    fftName_   = getName() + "_fft";
+    freeMomPropDone_ = env().hasCreatedObject(freeMomPropName_);
+    GFSrcDone_       = env().hasCreatedObject(GFSrcName_);
+    phasesDone_      = env().hasCreatedObject(phaseName_[0]);
+    envCacheLat(ScalarField, freeMomPropName_);
+    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
+    {
+        envCacheLat(ScalarField, phaseName_[mu]);
+    }
+    envCacheLat(ScalarField, GFSrcName_);
+    envCreateLat(ScalarField, getName());
+    envTmpLat(ScalarField, "buf");
+    envTmpLat(ScalarField, "result");
+    envTmpLat(ScalarField, "Amu");
+    envCache(FFT, fftName_, 1, env().getGrid());
 }
 // execution ///////////////////////////////////////////////////////////////////
 void TChargedProp::execute(void)
 {
     // CACHING ANALYTIC EXPRESSIONS
-    ScalarField &source = *env().getObject<ScalarField>(par().source);
-    Complex     ci(0.0,1.0);
-    FFT         fft(env().getGrid());
-    // cache free scalar propagator
-    if (!env().hasCreatedObject(freeMomPropName_))
-    {
-        LOG(Message) << "Caching momentum space free scalar propagator"
-                     << " (mass= " << par().mass << ")..." << std::endl;
-        freeMomProp_ = env().createLattice<ScalarField>(freeMomPropName_);
-        SIMPL::MomentumSpacePropagator(*freeMomProp_, par().mass);
-    }
-    else
-    {
-        freeMomProp_ = env().getObject<ScalarField>(freeMomPropName_);
-    }
-    // cache G*F*src
-    if (!env().hasCreatedObject(GFSrcName_))
-    {
-        GFSrc_ = env().createLattice<ScalarField>(GFSrcName_);
-        fft.FFT_all_dim(*GFSrc_, source, FFT::forward);
-        *GFSrc_ = (*freeMomProp_)*(*GFSrc_);
-    }
-    else
-    {
-        GFSrc_ = env().getObject<ScalarField>(GFSrcName_);
-    }
-    // cache phases
-    if (!env().hasCreatedObject(phaseName_[0]))
-    {
-        std::vector<int> &l = env().getGrid()->_fdimensions;
-        LOG(Message) << "Caching shift phases..." << std::endl;
-        for (unsigned int mu = 0; mu < env().getNd(); ++mu)
-        {
-            Real twoPiL = M_PI*2./l[mu];
-            phase_.push_back(env().createLattice<ScalarField>(phaseName_[mu]));
-            LatticeCoordinate(*(phase_[mu]), mu);
-            *(phase_[mu]) = exp(ci*twoPiL*(*(phase_[mu])));
-        }
-    }
-    else
-    {
-        for (unsigned int mu = 0; mu < env().getNd(); ++mu)
-        {
-            phase_.push_back(env().getObject<ScalarField>(phaseName_[mu]));
-        }
-    }
+    makeCaches();
     // PROPAGATOR CALCULATION
     LOG(Message) << "Computing charged scalar propagator"
                  << " (mass= " << par().mass
                  << ", charge= " << par().charge << ")..." << std::endl;
-    ScalarField &prop   = *env().createLattice<ScalarField>(getName());
-    ScalarField buf(env().getGrid());
-    ScalarField &GFSrc = *GFSrc_, &G = *freeMomProp_;
+    auto   &prop  = envGet(ScalarField, getName());
+    auto   &GFSrc = envGet(ScalarField, GFSrcName_);
+    auto   &G     = envGet(ScalarField, freeMomPropName_);
+    auto   &fft   = envGet(FFT, fftName_);
     double q = par().charge;
+    envGetTmp(ScalarField, result);
+    envGetTmp(ScalarField, buf);
     // G*F*Src
     prop = GFSrc;
@ -146,12 +128,12 @@ void TChargedProp::execute(void)
     if (!par().output.empty())
     {
         std::string filename = par().output + "." +
-                               std::to_string(env().getTrajectory());
+                               std::to_string(vm().getTrajectory());
         LOG(Message) << "Saving zero-momentum projection to '"
                      << filename << "'..." << std::endl;
-        CorrWriter            writer(filename);
+        ResultWriter          writer(RESULT_FILE_NAME(par().output));
         std::vector<TComplex> vecBuf;
        std::vector<Complex>  result;
@ -166,15 +148,55 @@ void TChargedProp::execute(void)
     }
 }
-void TChargedProp::momD1(ScalarField &s, FFT &fft)
+void TChargedProp::makeCaches(void)
 {
-    EmField     &A = *env().getObject<EmField>(par().emField);
-    ScalarField buf(env().getGrid()), result(env().getGrid()),
-                Amu(env().getGrid());
-    Complex     ci(0.0,1.0);
-    result = zero;
+    auto &freeMomProp = envGet(ScalarField, freeMomPropName_);
+    auto &GFSrc       = envGet(ScalarField, GFSrcName_);
+    auto &fft         = envGet(FFT, fftName_);
+    if (!freeMomPropDone_)
+    {
+        LOG(Message) << "Caching momentum space free scalar propagator"
+                     << " (mass= " << par().mass << ")..." << std::endl;
+        SIMPL::MomentumSpacePropagator(freeMomProp, par().mass);
+    }
+    if (!GFSrcDone_)
+    {
+        FFT  fft(env().getGrid());
+        auto &source = envGet(ScalarField, par().source);
+        LOG(Message) << "Caching G*F*src..." << std::endl;
+        fft.FFT_all_dim(GFSrc, source, FFT::forward);
+        GFSrc = freeMomProp*GFSrc;
+    }
+    if (!phasesDone_)
+    {
+        std::vector<int> &l = env().getGrid()->_fdimensions;
+        Complex          ci(0.0,1.0);
+        LOG(Message) << "Caching shift phases..." << std::endl;
+        for (unsigned int mu = 0; mu < env().getNd(); ++mu)
+        {
+            Real twoPiL = M_PI*2./l[mu];
+            auto &phmu  = envGet(ScalarField, phaseName_[mu]);
+            LatticeCoordinate(phmu, mu);
+            phmu = exp(ci*twoPiL*phmu);
+            phase_.push_back(&phmu);
+        }
+    }
+}
+void TChargedProp::momD1(ScalarField &s, FFT &fft)
+{
+    auto &A = envGet(EmField, par().emField);
+    Complex ci(0.0,1.0);
+    envGetTmp(ScalarField, buf);
+    envGetTmp(ScalarField, result);
+    envGetTmp(ScalarField, Amu);
+    result = zero;
     for (unsigned int mu = 0; mu < env().getNd(); ++mu)
     {
         Amu = peekLorentz(A, mu);
@ -198,12 +220,13 @@ void TChargedProp::momD1(ScalarField &s, FFT &fft)
 void TChargedProp::momD2(ScalarField &s, FFT &fft)
 {
-    EmField     &A = *env().getObject<EmField>(par().emField);
-    ScalarField buf(env().getGrid()), result(env().getGrid()),
-                Amu(env().getGrid());
+    auto &A = envGet(EmField, par().emField);
+    envGetTmp(ScalarField, buf);
+    envGetTmp(ScalarField, result);
+    envGetTmp(ScalarField, Amu);
     result = zero;
     for (unsigned int mu = 0; mu < env().getNd(); ++mu)
     {
         Amu = peekLorentz(A, mu);

View File

@ -1,3 +1,30 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MScalar/ChargedProp.hpp
Copyright (C) 2015-2018
Author: Antonin Portelli <antonin.portelli@me.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
 #ifndef Hadrons_MScalar_ChargedProp_hpp_
 #define Hadrons_MScalar_ChargedProp_hpp_
@ -37,19 +64,20 @@ public:
     // dependency relation
     virtual std::vector<std::string> getInput(void);
     virtual std::vector<std::string> getOutput(void);
+protected:
     // setup
     virtual void setup(void);
     // execution
     virtual void execute(void);
 private:
+    void makeCaches(void);
     void momD1(ScalarField &s, FFT &fft);
     void momD2(ScalarField &s, FFT &fft);
 private:
-    std::string                freeMomPropName_, GFSrcName_;
+    bool                       freeMomPropDone_, GFSrcDone_, phasesDone_;
+    std::string                freeMomPropName_, GFSrcName_, fftName_;
     std::vector<std::string>   phaseName_;
-    ScalarField                *freeMomProp_, *GFSrc_;
     std::vector<ScalarField *> phase_;
-    EmField                    *A;
 };
 MODULE_REGISTER_NS(ChargedProp, TChargedProp, MScalar);

View File

@ -1,3 +1,30 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MScalar/FreeProp.cc
Copyright (C) 2015-2018
Author: Antonin Portelli <antonin.portelli@me.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
 #include <Grid/Hadrons/Modules/MScalar/FreeProp.hpp>
 #include <Grid/Hadrons/Modules/MScalar/Scalar.hpp>
@ -33,38 +60,31 @@ void TFreeProp::setup(void)
 {
     freeMomPropName_ = FREEMOMPROP(par().mass);
-    if (!env().hasRegisteredObject(freeMomPropName_))
-    {
-        env().registerLattice<ScalarField>(freeMomPropName_);
-    }
-    env().registerLattice<ScalarField>(getName());
+    freePropDone_ = env().hasCreatedObject(freeMomPropName_);
+    envCacheLat(ScalarField, freeMomPropName_);
+    envCreateLat(ScalarField, getName());
 }
 // execution ///////////////////////////////////////////////////////////////////
 void TFreeProp::execute(void)
 {
-    ScalarField &prop   = *env().createLattice<ScalarField>(getName());
-    ScalarField &source = *env().getObject<ScalarField>(par().source);
-    ScalarField *freeMomProp;
-    if (!env().hasCreatedObject(freeMomPropName_))
+    auto &freeMomProp = envGet(ScalarField, freeMomPropName_);
+    auto &prop        = envGet(ScalarField, getName());
+    auto &source      = envGet(ScalarField, par().source);
+    if (!freePropDone_)
     {
         LOG(Message) << "Caching momentum space free scalar propagator"
                      << " (mass= " << par().mass << ")..." << std::endl;
-        freeMomProp = env().createLattice<ScalarField>(freeMomPropName_);
-        SIMPL::MomentumSpacePropagator(*freeMomProp, par().mass);
-    }
-    else
-    {
-        freeMomProp = env().getObject<ScalarField>(freeMomPropName_);
+        SIMPL::MomentumSpacePropagator(freeMomProp, par().mass);
     }
     LOG(Message) << "Computing free scalar propagator..." << std::endl;
-    SIMPL::FreePropagator(source, prop, *freeMomProp);
+    SIMPL::FreePropagator(source, prop, freeMomProp);
     if (!par().output.empty())
     {
         TextWriter            writer(par().output + "." +
-                                     std::to_string(env().getTrajectory()));
+                                     std::to_string(vm().getTrajectory()));
         std::vector<TComplex> buf;
         std::vector<Complex>  result;

View File

@ -1,3 +1,30 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MScalar/FreeProp.hpp
Copyright (C) 2015-2018
Author: Antonin Portelli <antonin.portelli@me.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
 #ifndef Hadrons_MScalar_FreeProp_hpp_
 #define Hadrons_MScalar_FreeProp_hpp_
@ -33,12 +60,14 @@ public:
     // dependency relation
     virtual std::vector<std::string> getInput(void);
     virtual std::vector<std::string> getOutput(void);
+protected:
     // setup
     virtual void setup(void);
     // execution
     virtual void execute(void);
 private:
     std::string freeMomPropName_;
+    bool        freePropDone_;
 };
 MODULE_REGISTER_NS(FreeProp, TFreeProp, MScalar);

View File

@ -1,3 +1,30 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MScalar/Scalar.hpp
Copyright (C) 2015-2018
Author: Antonin Portelli <antonin.portelli@me.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
 #ifndef Hadrons_Scalar_hpp_
 #define Hadrons_Scalar_hpp_

View File

@ -0,0 +1,166 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MScalarSUN/Div.hpp
Copyright (C) 2015-2018
Author: Antonin Portelli <antonin.portelli@me.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_MScalarSUN_Div_hpp_
#define Hadrons_MScalarSUN_Div_hpp_
#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
#include <Grid/Hadrons/ModuleFactory.hpp>
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
* Div *
******************************************************************************/
BEGIN_MODULE_NAMESPACE(MScalarSUN)
class DivPar: Serializable
{
public:
GRID_SERIALIZABLE_ENUM(DiffType, undef, forward, 1, backward, 2, central, 3);
GRID_SERIALIZABLE_CLASS_MEMBERS(DivPar,
std::vector<std::string>, op,
DiffType, type,
std::string, output);
};
template <typename SImpl>
class TDiv: public Module<DivPar>
{
public:
typedef typename SImpl::Field Field;
typedef typename SImpl::ComplexField ComplexField;
class Result: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(Result,
DivPar::DiffType, type,
Complex, value);
};
public:
// constructor
TDiv(const std::string name);
// destructor
virtual ~TDiv(void) = default;
// dependency relation
virtual std::vector<std::string> getInput(void);
virtual std::vector<std::string> getOutput(void);
// setup
virtual void setup(void);
// execution
virtual void execute(void);
};
MODULE_REGISTER_NS(DivSU2, TDiv<ScalarNxNAdjImplR<2>>, MScalarSUN);
MODULE_REGISTER_NS(DivSU3, TDiv<ScalarNxNAdjImplR<3>>, MScalarSUN);
MODULE_REGISTER_NS(DivSU4, TDiv<ScalarNxNAdjImplR<4>>, MScalarSUN);
MODULE_REGISTER_NS(DivSU5, TDiv<ScalarNxNAdjImplR<5>>, MScalarSUN);
MODULE_REGISTER_NS(DivSU6, TDiv<ScalarNxNAdjImplR<6>>, MScalarSUN);
/******************************************************************************
* TDiv implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
template <typename SImpl>
TDiv<SImpl>::TDiv(const std::string name)
: Module<DivPar>(name)
{}
// dependencies/products ///////////////////////////////////////////////////////
template <typename SImpl>
std::vector<std::string> TDiv<SImpl>::getInput(void)
{
return par().op;
}
template <typename SImpl>
std::vector<std::string> TDiv<SImpl>::getOutput(void)
{
std::vector<std::string> out = {getName()};
return out;
}
// setup ///////////////////////////////////////////////////////////////////////
template <typename SImpl>
void TDiv<SImpl>::setup(void)
{
if (par().op.size() != env().getNd())
{
HADRON_ERROR(Size, "the number of components differs from number of dimensions");
}
envCreateLat(ComplexField, getName());
}
// execution ///////////////////////////////////////////////////////////////////
template <typename SImpl>
void TDiv<SImpl>::execute(void)
{
const auto nd = env().getNd();
LOG(Message) << "Computing the " << par().type << " divergence of [";
for (unsigned int mu = 0; mu < nd; ++mu)
{
std::cout << par().op[mu] << ((mu == nd - 1) ? "]" : ", ");
}
std::cout << std::endl;
auto &div = envGet(ComplexField, getName());
div = zero;
for (unsigned int mu = 0; mu < nd; ++mu)
{
auto &op = envGet(ComplexField, par().op[mu]);
switch(par().type)
{
case DivPar::DiffType::backward:
div += op - Cshift(op, mu, -1);
break;
case DivPar::DiffType::forward:
div += Cshift(op, mu, 1) - op;
break;
case DivPar::DiffType::central:
div += 0.5*(Cshift(op, mu, 1) - Cshift(op, mu, -1));
break;
}
}
if (!par().output.empty())
{
Result r;
ResultWriter writer(RESULT_FILE_NAME(par().output));
r.type = par().type;
r.value = TensorRemove(sum(div));
write(writer, "div", r);
}
}
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_MScalarSUN_Div_hpp_
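
TDiv::execute sums one of three lattice finite differences over all directions. A minimal stand-alone analogue, reduced to one periodic dimension with plain doubles, is sketched below; the helper name shift and the sample data are assumptions for illustration only, and shift(op, +/-1) plays the role of Cshift(op, mu, +/-1).

// Illustrative sketch of the forward/backward/central differences used by TDiv.
#include <cstdio>
#include <vector>

// periodic shift by +/- 1 site, mimicking Cshift(op, mu, +/-1)
std::vector<double> shift(const std::vector<double> &f, int dir)
{
    const int n = static_cast<int>(f.size());
    std::vector<double> g(n);
    for (int x = 0; x < n; ++x) g[x] = f[((x + dir) % n + n) % n];
    return g;
}

int main(void)
{
    std::vector<double> op = {0., 1., 4., 9., 16., 25.};
    auto fwd = shift(op, +1), bwd = shift(op, -1);
    for (unsigned int x = 0; x < op.size(); ++x)
    {
        double forward  = fwd[x] - op[x];         // Cshift(op, mu, 1) - op
        double backward = op[x]  - bwd[x];        // op - Cshift(op, mu, -1)
        double central  = 0.5*(fwd[x] - bwd[x]);  // 0.5*(Cshift(+1) - Cshift(-1))
        std::printf("x=%u forward=%g backward=%g central=%g\n",
                    x, forward, backward, central);
    }
    return 0;
}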

View File

@ -0,0 +1,146 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MScalarSUN/TrMag.hpp
Copyright (C) 2015-2018
Author: Antonin Portelli <antonin.portelli@me.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_MScalarSUN_TrMag_hpp_
#define Hadrons_MScalarSUN_TrMag_hpp_
#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
#include <Grid/Hadrons/ModuleFactory.hpp>
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
* Module to compute tr(mag^n) *
******************************************************************************/
BEGIN_MODULE_NAMESPACE(MScalarSUN)
class TrMagPar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(TrMagPar,
std::string, field,
unsigned int, maxPow,
std::string, output);
};
template <typename SImpl>
class TTrMag: public Module<TrMagPar>
{
public:
typedef typename SImpl::Field Field;
typedef typename SImpl::ComplexField ComplexField;
class Result: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(Result,
std::string, op,
Real, value);
};
public:
// constructor
TTrMag(const std::string name);
// destructor
virtual ~TTrMag(void) = default;
// dependency relation
virtual std::vector<std::string> getInput(void);
virtual std::vector<std::string> getOutput(void);
// setup
virtual void setup(void);
// execution
virtual void execute(void);
};
MODULE_REGISTER_NS(TrMagSU2, TTrMag<ScalarNxNAdjImplR<2>>, MScalarSUN);
MODULE_REGISTER_NS(TrMagSU3, TTrMag<ScalarNxNAdjImplR<3>>, MScalarSUN);
MODULE_REGISTER_NS(TrMagSU4, TTrMag<ScalarNxNAdjImplR<4>>, MScalarSUN);
MODULE_REGISTER_NS(TrMagSU5, TTrMag<ScalarNxNAdjImplR<5>>, MScalarSUN);
MODULE_REGISTER_NS(TrMagSU6, TTrMag<ScalarNxNAdjImplR<6>>, MScalarSUN);
/******************************************************************************
* TTrMag implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
template <typename SImpl>
TTrMag<SImpl>::TTrMag(const std::string name)
: Module<TrMagPar>(name)
{}
// dependencies/products ///////////////////////////////////////////////////////
template <typename SImpl>
std::vector<std::string> TTrMag<SImpl>::getInput(void)
{
std::vector<std::string> in = {par().field};
return in;
}
template <typename SImpl>
std::vector<std::string> TTrMag<SImpl>::getOutput(void)
{
std::vector<std::string> out = {};
return out;
}
// setup ///////////////////////////////////////////////////////////////////////
template <typename SImpl>
void TTrMag<SImpl>::setup(void)
{}
// execution ///////////////////////////////////////////////////////////////////
template <typename SImpl>
void TTrMag<SImpl>::execute(void)
{
LOG(Message) << "Computing tr(mag^n) for n even up to " << par().maxPow
<< "..." << std::endl;
std::vector<Result> result;
ResultWriter writer(RESULT_FILE_NAME(par().output));
auto &phi = envGet(Field, par().field);
auto m2 = sum(phi), mn = m2;
m2 = -m2*m2;
mn = 1.;
for (unsigned int n = 2; n <= par().maxPow; n += 2)
{
Result r;
mn = mn*m2;
r.op = "tr(mag^" + std::to_string(n) + ")";
r.value = TensorRemove(trace(mn)).real();
result.push_back(r);
}
write(writer, "trmag", result);
}
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_MScalarSUN_TrMag_hpp_
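
TTrMag::execute builds the even powers of the summed field incrementally: a running product is multiplied by (-mag^2) at every even n instead of recomputing mag^n from scratch. A minimal sketch of just that accumulation, with a plain double standing in for sum(phi), is given below; all names and values are assumptions.

// Illustrative sketch of the even-power accumulation in TTrMag::execute().
#include <cstdio>

int main(void)
{
    const double       mag    = 0.3;       // stands in for sum(phi)
    const double       mag2   = -mag*mag;  // analogue of m2 = -m2*m2 in the module
    const unsigned int maxPow = 8;
    double             running = 1.;       // analogue of mn = 1.

    for (unsigned int n = 2; n <= maxPow; n += 2)
    {
        running *= mag2;                   // running == (-mag^2)^(n/2)
        std::printf("n=%u value=%g\n", n, running);
    }
    return 0;
}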

View File

@ -0,0 +1,182 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MScalarSUN/TrPhi.hpp
Copyright (C) 2015-2018
Author: Antonin Portelli <antonin.portelli@me.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_MScalarSUN_TrPhi_hpp_
#define Hadrons_MScalarSUN_TrPhi_hpp_
#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
#include <Grid/Hadrons/ModuleFactory.hpp>
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
* Module to compute tr(phi^n) *
******************************************************************************/
BEGIN_MODULE_NAMESPACE(MScalarSUN)
class TrPhiPar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(TrPhiPar,
std::string, field,
unsigned int, maxPow,
std::string, output);
};
template <typename SImpl>
class TTrPhi: public Module<TrPhiPar>
{
public:
typedef typename SImpl::Field Field;
typedef typename SImpl::ComplexField ComplexField;
class Result: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(Result,
std::string, op,
Real, value);
};
public:
// constructor
TTrPhi(const std::string name);
// destructor
virtual ~TTrPhi(void) = default;
// dependency relation
virtual std::vector<std::string> getInput(void);
virtual std::vector<std::string> getOutput(void);
// setup
virtual void setup(void);
// execution
virtual void execute(void);
private:
// output name generator
std::string outName(const unsigned int n);
};
MODULE_REGISTER_NS(TrPhiSU2, TTrPhi<ScalarNxNAdjImplR<2>>, MScalarSUN);
MODULE_REGISTER_NS(TrPhiSU3, TTrPhi<ScalarNxNAdjImplR<3>>, MScalarSUN);
MODULE_REGISTER_NS(TrPhiSU4, TTrPhi<ScalarNxNAdjImplR<4>>, MScalarSUN);
MODULE_REGISTER_NS(TrPhiSU5, TTrPhi<ScalarNxNAdjImplR<5>>, MScalarSUN);
MODULE_REGISTER_NS(TrPhiSU6, TTrPhi<ScalarNxNAdjImplR<6>>, MScalarSUN);
/******************************************************************************
* TTrPhi implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
template <typename SImpl>
TTrPhi<SImpl>::TTrPhi(const std::string name)
: Module<TrPhiPar>(name)
{}
// dependencies/products ///////////////////////////////////////////////////////
template <typename SImpl>
std::vector<std::string> TTrPhi<SImpl>::getInput(void)
{
std::vector<std::string> in = {par().field};
return in;
}
template <typename SImpl>
std::vector<std::string> TTrPhi<SImpl>::getOutput(void)
{
std::vector<std::string> out;
for (unsigned int n = 2; n <= par().maxPow; n += 2)
{
out.push_back(outName(n));
}
return out;
}
// setup ///////////////////////////////////////////////////////////////////////
template <typename SImpl>
void TTrPhi<SImpl>::setup(void)
{
if (par().maxPow < 2)
{
HADRON_ERROR(Size, "'maxPow' should be at least equal to 2");
}
envTmpLat(Field, "phi2");
envTmpLat(Field, "buf");
for (unsigned int n = 2; n <= par().maxPow; n += 2)
{
envCreateLat(ComplexField, outName(n));
}
}
// execution ///////////////////////////////////////////////////////////////////
template <typename SImpl>
void TTrPhi<SImpl>::execute(void)
{
LOG(Message) << "Computing tr(phi^n) for n even up to " << par().maxPow
<< "..." << std::endl;
std::vector<Result> result;
auto &phi = envGet(Field, par().field);
envGetTmp(Field, phi2);
envGetTmp(Field, buf);
buf = 1.;
phi2 = -phi*phi;
for (unsigned int n = 2; n <= par().maxPow; n += 2)
{
auto &phin = envGet(ComplexField, outName(n));
buf = buf*phi2;
phin = trace(buf);
if (!par().output.empty())
{
Result r;
r.op = "tr(phi^" + std::to_string(n) + ")";
r.value = TensorRemove(sum(phin)).real();
result.push_back(r);
}
}
if (result.size() > 0)
{
ResultWriter writer(RESULT_FILE_NAME(par().output));
write(writer, "trphi", result);
}
}
// output name generator ///////////////////////////////////////////////////////
template <typename SImpl>
std::string TTrPhi<SImpl>::outName(const unsigned int n)
{
return getName() + "_" + std::to_string(n);
}
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_MScalarSUN_TrPhi_hpp_
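
Unlike TTrMag, TTrPhi creates one ComplexField output per even power, named through outName as getName() + "_" + n; downstream modules (for example the 2-pt module below) can then refer to these fields by name in their op list. The small sketch below only reproduces that naming convention; the module name used is hypothetical.

// Illustrative only: operator names produced by TTrPhi for maxPow = 6.
#include <iostream>
#include <string>
#include <vector>

int main(void)
{
    const std::string  moduleName = "trphi"; // hypothetical module name
    const unsigned int maxPow     = 6;
    std::vector<std::string> ops;

    for (unsigned int n = 2; n <= maxPow; n += 2)
    {
        ops.push_back(moduleName + "_" + std::to_string(n)); // mirrors TTrPhi::outName
    }
    for (auto &o: ops) std::cout << o << std::endl;          // trphi_2 trphi_4 trphi_6
    return 0;
}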

View File

@ -0,0 +1,184 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MScalarSUN/TwoPoint.hpp
Copyright (C) 2015-2018
Author: Antonin Portelli <antonin.portelli@me.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_MScalarSUN_TwoPoint_hpp_
#define Hadrons_MScalarSUN_TwoPoint_hpp_
#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
#include <Grid/Hadrons/ModuleFactory.hpp>
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
* 2-pt functions for a given set of operators *
******************************************************************************/
BEGIN_MODULE_NAMESPACE(MScalarSUN)
class TwoPointPar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(TwoPointPar,
std::vector<std::string>, op,
std::string, output);
};
template <typename SImpl>
class TTwoPoint: public Module<TwoPointPar>
{
public:
typedef typename SImpl::Field Field;
typedef typename SImpl::ComplexField ComplexField;
class Result: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(Result,
std::string, sink,
std::string, source,
std::vector<Complex>, data);
};
public:
// constructor
TTwoPoint(const std::string name);
// destructor
virtual ~TTwoPoint(void) = default;
// dependency relation
virtual std::vector<std::string> getInput(void);
virtual std::vector<std::string> getOutput(void);
// setup
virtual void setup(void);
// execution
virtual void execute(void);
private:
// make 2-pt function
template <class SinkSite, class SourceSite>
std::vector<Complex> makeTwoPoint(const std::vector<SinkSite> &sink,
const std::vector<SourceSite> &source);
};
MODULE_REGISTER_NS(TwoPointSU2, TTwoPoint<ScalarNxNAdjImplR<2>>, MScalarSUN);
MODULE_REGISTER_NS(TwoPointSU3, TTwoPoint<ScalarNxNAdjImplR<3>>, MScalarSUN);
MODULE_REGISTER_NS(TwoPointSU4, TTwoPoint<ScalarNxNAdjImplR<4>>, MScalarSUN);
MODULE_REGISTER_NS(TwoPointSU5, TTwoPoint<ScalarNxNAdjImplR<5>>, MScalarSUN);
MODULE_REGISTER_NS(TwoPointSU6, TTwoPoint<ScalarNxNAdjImplR<6>>, MScalarSUN);
/******************************************************************************
* TTwoPoint implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
template <typename SImpl>
TTwoPoint<SImpl>::TTwoPoint(const std::string name)
: Module<TwoPointPar>(name)
{}
// dependencies/products ///////////////////////////////////////////////////////
template <typename SImpl>
std::vector<std::string> TTwoPoint<SImpl>::getInput(void)
{
return par().op;
}
template <typename SImpl>
std::vector<std::string> TTwoPoint<SImpl>::getOutput(void)
{
std::vector<std::string> out = {};
return out;
}
// setup ///////////////////////////////////////////////////////////////////////
template <typename SImpl>
void TTwoPoint<SImpl>::setup(void)
{
const unsigned int nt = env().getDim().back();
envTmp(std::vector<std::vector<TComplex>>, "slicedOp", 1, par().op.size(),
std::vector<TComplex>(nt));
}
// execution ///////////////////////////////////////////////////////////////////
template <typename SImpl>
void TTwoPoint<SImpl>::execute(void)
{
LOG(Message) << "Computing 2-point functions for operators:" << std::endl;
for (auto &o: par().op)
{
LOG(Message) << " '" << o << "'" << std::endl;
}
ResultWriter writer(RESULT_FILE_NAME(par().output));
const unsigned int nd = env().getDim().size();
std::vector<Result> result;
envGetTmp(std::vector<std::vector<TComplex>>, slicedOp);
for (unsigned int i = 0; i < par().op.size(); ++i)
{
auto &op = envGet(ComplexField, par().op[i]);
sliceSum(op, slicedOp[i], nd - 1);
}
for (unsigned int i = 0; i < par().op.size(); ++i)
for (unsigned int j = 0; j < par().op.size(); ++j)
{
Result r;
r.sink = par().op[i];
r.source = par().op[j];
r.data = makeTwoPoint(slicedOp[i], slicedOp[j]);
result.push_back(r);
}
write(writer, "twopt", result);
}
// make 2-pt function //////////////////////////////////////////////////////////
template <class SImpl>
template <class SinkSite, class SourceSite>
std::vector<Complex> TTwoPoint<SImpl>::makeTwoPoint(
const std::vector<SinkSite> &sink,
const std::vector<SourceSite> &source)
{
assert(sink.size() == source.size());
unsigned int nt = sink.size();
std::vector<Complex> res(nt, 0.);
for (unsigned int dt = 0; dt < nt; ++dt)
{
for (unsigned int t = 0; t < nt; ++t)
{
res[dt] += TensorRemove(trace(sink[(t+dt)%nt]*source[t]));
}
res[dt] *= 1./static_cast<double>(nt);
}
return res;
}
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_MScalarSUN_TwoPoint_hpp_
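TTwoPoint::makeTwoPoint averages the product of sink and source slices over the source time, C(dt) = (1/nt) * sum_t tr(sink[(t+dt) mod nt] * source[t]). The standalone sketch below (not part of this commit) reproduces that average with plain complex numbers standing in for the already-traced, time-sliced operators.

#include <complex>
#include <iostream>
#include <vector>

using Cplx = std::complex<double>;

// same time average as TTwoPoint::makeTwoPoint, on scalar slice sums
std::vector<Cplx> makeTwoPoint(const std::vector<Cplx> &sink,
                               const std::vector<Cplx> &source)
{
    const unsigned int nt = sink.size();
    std::vector<Cplx>  res(nt, 0.);

    for (unsigned int dt = 0; dt < nt; ++dt)
    {
        for (unsigned int t = 0; t < nt; ++t)
        {
            res[dt] += sink[(t + dt) % nt]*source[t];
        }
        res[dt] *= 1./static_cast<double>(nt);
    }
    return res;
}

int main(void)
{
    // toy "slice sums" on a lattice with nt = 4 timeslices
    std::vector<Cplx> op = {{1., 0.}, {0., .5}, {-1., 0.}, {0., -.5}};
    for (auto &c: makeTwoPoint(op, op))
    {
        std::cout << c << std::endl;
    }
    return 0;
}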

@ -1,3 +1,32 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MSink/Point.hpp
Copyright (C) 2015-2018
Author: Antonin Portelli <antonin.portelli@me.com>
Author: Lanny91 <andrew.lawson@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_MSink_Point_hpp_
#define Hadrons_MSink_Point_hpp_
@ -33,10 +62,14 @@ public:
// dependency relation
virtual std::vector<std::string> getInput(void);
virtual std::vector<std::string> getOutput(void);
+ protected:
// setup
virtual void setup(void);
// execution
virtual void execute(void);
+ private:
+ bool hasPhase_{false};
+ std::string momphName_;
};
MODULE_REGISTER_NS(Point, TPoint<FIMPL>, MSink);
@ -49,6 +82,7 @@ MODULE_REGISTER_NS(ScalarPoint, TPoint<ScalarImplCR>, MSink);
template <typename FImpl>
TPoint<FImpl>::TPoint(const std::string name)
: Module<PointPar>(name)
+ , momphName_ (name + "_momph")
{}
// dependencies/products ///////////////////////////////////////////////////////
@ -72,22 +106,27 @@ std::vector<std::string> TPoint<FImpl>::getOutput(void)
template <typename FImpl>
void TPoint<FImpl>::setup(void)
{
- unsigned int size;
- size = env().template lattice4dSize<LatticeComplex>();
- env().registerObject(getName(), size);
+ envTmpLat(LatticeComplex, "coor");
+ envCacheLat(LatticeComplex, momphName_);
+ envCreate(SinkFn, getName(), 1, nullptr);
}
// execution ///////////////////////////////////////////////////////////////////
template <typename FImpl>
void TPoint<FImpl>::execute(void)
{
- std::vector<Real> p = strToVec<Real>(par().mom);
- LatticeComplex ph(env().getGrid()), coor(env().getGrid());
- Complex i(0.0,1.0);
LOG(Message) << "Setting up point sink function for momentum ["
<< par().mom << "]" << std::endl;
+ auto &ph = envGet(LatticeComplex, momphName_);
+ if (!hasPhase_)
+ {
+ Complex i(0.0,1.0);
+ std::vector<Real> p;
+ envGetTmp(LatticeComplex, coor);
+ p = strToVec<Real>(par().mom);
ph = zero;
for(unsigned int mu = 0; mu < env().getNd(); mu++)
{
@ -95,7 +134,9 @@ void TPoint<FImpl>::execute(void)
ph = ph + (p[mu]/env().getGrid()->_fdimensions[mu])*coor;
}
ph = exp((Real)(2*M_PI)*i*ph);
- auto sink = [ph](const PropagatorField &field)
+ hasPhase_ = true;
+ }
+ auto sink = [&ph](const PropagatorField &field)
{
SlicedPropagator res;
PropagatorField tmp = ph*field;
@ -104,7 +145,7 @@ void TPoint<FImpl>::execute(void)
return res;
};
- env().setObject(getName(), new SinkFn(sink));
+ envGet(SinkFn, getName()) = sink;
}
END_MODULE_NAMESPACE
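The refactored sink builds its plane-wave phase exp(2*pi*i * sum_mu p[mu]*x[mu]/L[mu]) once, caches it under momphName_, and skips the recomputation whenever hasPhase_ is already set. The standalone sketch below (not part of this commit) evaluates the same phase for a single site of a hypothetical 4^3 x 8 lattice.

#include <cmath>
#include <complex>
#include <iostream>
#include <vector>

// exp(2*pi*i * sum_mu p[mu]*x[mu]/L[mu]) for one site x of a lattice of extent L
std::complex<double> momentumPhase(const std::vector<double> &p,
                                   const std::vector<int>    &x,
                                   const std::vector<int>    &L)
{
    double arg = 0.;
    for (unsigned int mu = 0; mu < p.size(); ++mu)
    {
        arg += p[mu]*x[mu]/static_cast<double>(L[mu]);
    }
    return std::exp(std::complex<double>(0., 2.*M_PI*arg));
}

int main(void)
{
    // hypothetical 4^3 x 8 lattice, one unit of momentum in the x direction
    std::vector<double> p = {1., 0., 0., 0.};
    std::vector<int>    x = {2, 0, 0, 3};
    std::vector<int>    L = {4, 4, 4, 8};
    std::cout << momentumPhase(p, x, L) << std::endl;
    return 0;
}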

@ -0,0 +1,127 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MSink/Smear.hpp
Copyright (C) 2015-2018
Author: Antonin Portelli <antonin.portelli@me.com>
Author: Lanny91 <andrew.lawson@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_MSink_Smear_hpp_
#define Hadrons_MSink_Smear_hpp_
#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
#include <Grid/Hadrons/ModuleFactory.hpp>
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
* Smear *
******************************************************************************/
BEGIN_MODULE_NAMESPACE(MSink)
class SmearPar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(SmearPar,
std::string, q,
std::string, sink);
};
template <typename FImpl>
class TSmear: public Module<SmearPar>
{
public:
FERM_TYPE_ALIASES(FImpl,);
SINK_TYPE_ALIASES();
public:
// constructor
TSmear(const std::string name);
// destructor
virtual ~TSmear(void) = default;
// dependency relation
virtual std::vector<std::string> getInput(void);
virtual std::vector<std::string> getOutput(void);
protected:
// setup
virtual void setup(void);
// execution
virtual void execute(void);
};
MODULE_REGISTER_NS(Smear, TSmear<FIMPL>, MSink);
/******************************************************************************
* TSmear implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
template <typename FImpl>
TSmear<FImpl>::TSmear(const std::string name)
: Module<SmearPar>(name)
{}
// dependencies/products ///////////////////////////////////////////////////////
template <typename FImpl>
std::vector<std::string> TSmear<FImpl>::getInput(void)
{
std::vector<std::string> in = {par().q, par().sink};
return in;
}
template <typename FImpl>
std::vector<std::string> TSmear<FImpl>::getOutput(void)
{
std::vector<std::string> out = {getName()};
return out;
}
// setup ///////////////////////////////////////////////////////////////////////
template <typename FImpl>
void TSmear<FImpl>::setup(void)
{
envCreate(SlicedPropagator, getName(), 1, env().getDim(Tp));
}
// execution ///////////////////////////////////////////////////////////////////
template <typename FImpl>
void TSmear<FImpl>::execute(void)
{
LOG(Message) << "Sink smearing propagator '" << par().q
<< "' using sink function '" << par().sink << "'."
<< std::endl;
auto &sink = envGet(SinkFn, par().sink);
auto &q = envGet(PropagatorField, par().q);
auto &out = envGet(SlicedPropagator, getName());
out = sink(q);
}
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_MSink_Smear_hpp_

@ -4,8 +4,7 @@ Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MSolver/RBPrecCG.hpp
- Copyright (C) 2015
- Copyright (C) 2016
+ Copyright (C) 2015-2018
Author: Antonin Portelli <antonin.portelli@me.com>
@ -61,7 +60,9 @@ public:
virtual ~TRBPrecCG(void) = default;
// dependencies/products
virtual std::vector<std::string> getInput(void);
+ virtual std::vector<std::string> getReference(void);
virtual std::vector<std::string> getOutput(void);
+ protected:
// setup
virtual void setup(void);
// execution
@ -83,11 +84,19 @@ TRBPrecCG<FImpl>::TRBPrecCG(const std::string name)
template <typename FImpl>
std::vector<std::string> TRBPrecCG<FImpl>::getInput(void)
{
- std::vector<std::string> in = {par().action};
+ std::vector<std::string> in = {};
return in;
}
+ template <typename FImpl>
+ std::vector<std::string> TRBPrecCG<FImpl>::getReference(void)
+ {
+ std::vector<std::string> ref = {par().action};
+ return ref;
+ }
template <typename FImpl>
std::vector<std::string> TRBPrecCG<FImpl>::getOutput(void)
{
@ -100,17 +109,12 @@ std::vector<std::string> TRBPrecCG<FImpl>::getOutput(void)
template <typename FImpl>
void TRBPrecCG<FImpl>::setup(void)
{
+ LOG(Message) << "setting up Schur red-black preconditioned CG for"
+ << " action '" << par().action << "' with residual "
+ << par().residual << std::endl;
auto Ls = env().getObjectLs(par().action);
- env().registerObject(getName(), 0, Ls);
- env().addOwnership(getName(), par().action);
- }
- // execution ///////////////////////////////////////////////////////////////////
- template <typename FImpl>
- void TRBPrecCG<FImpl>::execute(void)
- {
- auto &mat = *(env().template getObject<FMat>(par().action));
+ auto &mat = envGet(FMat, par().action);
auto solver = [&mat, this](FermionField &sol, const FermionField &source)
{
ConjugateGradient<FermionField> cg(par().residual, 10000);
@ -118,13 +122,14 @@ void TRBPrecCG<FImpl>::execute(void)
schurSolver(mat, source, sol);
};
- LOG(Message) << "setting up Schur red-black preconditioned CG for"
- << " action '" << par().action << "' with residual "
- << par().residual << std::endl;
- env().setObject(getName(), new SolverFn(sink));
+ envCreate(SolverFn, getName(), Ls, solver);
}
+ // execution ///////////////////////////////////////////////////////////////////
+ template <typename FImpl>
+ void TRBPrecCG<FImpl>::execute(void)
+ {}
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
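The solver is now built in setup(): a lambda capturing the fermion matrix by reference is stored under the module name as a SolverFn, and execute() is left empty. The standalone sketch below (not part of this commit) shows the same closure-and-registry pattern with a trivial 1x1 "matrix" and std::function standing in for FMat and SolverFn.

#include <functional>
#include <iostream>
#include <map>
#include <string>

using Field    = double;
using SolverFn = std::function<void(Field &, const Field &)>;

int main(void)
{
    double mat = 4.;                              // stands in for the fermion matrix
    std::map<std::string, SolverFn> registry;     // stands in for the object environment

    // capture the operator by reference, as the module's lambda does with `mat`
    registry["cg_solver"] = [&mat](Field &sol, const Field &src)
    {
        sol = src/mat;                            // stands in for the Schur red-black CG solve
    };

    Field sol = 0., src = 2.;
    registry.at("cg_solver")(sol, src);
    std::cout << "sol = " << sol << std::endl;    // prints 0.5
    return 0;
}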

@ -4,10 +4,10 @@ Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MSource/Point.hpp
- Copyright (C) 2015
- Copyright (C) 2016
+ Copyright (C) 2015-2018
Author: Antonin Portelli <antonin.portelli@me.com>
+ Author: Lanny91 <andrew.lawson@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -72,6 +72,7 @@ public:
// dependency relation
virtual std::vector<std::string> getInput(void);
virtual std::vector<std::string> getOutput(void);
+ protected:
// setup
virtual void setup(void);
// execution
@ -111,19 +112,20 @@ std::vector<std::string> TPoint<FImpl>::getOutput(void)
template <typename FImpl>
void TPoint<FImpl>::setup(void)
{
- env().template registerLattice<PropagatorField>(getName());
+ envCreateLat(PropagatorField, getName());
}
// execution ///////////////////////////////////////////////////////////////////
template <typename FImpl>
void TPoint<FImpl>::execute(void)
{
- std::vector<int> position = strToVec<int>(par().position);
- typename SitePropagator::scalar_object id;
LOG(Message) << "Creating point source at position [" << par().position
<< "]" << std::endl;
- PropagatorField &src = *env().template createLattice<PropagatorField>(getName());
+ std::vector<int> position = strToVec<int>(par().position);
+ auto &src = envGet(PropagatorField, getName());
+ SitePropagator id;
id = 1.;
src = zero;
pokeSite(id, src, position);

@ -0,0 +1,160 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MSource/SeqConserved.hpp
Copyright (C) 2015-2018
Author: Antonin Portelli <antonin.portelli@me.com>
Author: Lanny91 <andrew.lawson@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_MSource_SeqConserved_hpp_
#define Hadrons_MSource_SeqConserved_hpp_
#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
#include <Grid/Hadrons/ModuleFactory.hpp>
BEGIN_HADRONS_NAMESPACE
/*
Sequential source
-----------------------------
* src_x = q_x * theta(x_3 - tA) * theta(tB - x_3) * J_mu * exp(i x.mom)
* options:
- q: input propagator (string)
- action: fermion action used for propagator q (string)
- tA: begin timeslice (integer)
- tB: end timeslice (integer)
- curr_type: type of conserved current to insert (Current)
- mu: Lorentz index of current to insert (integer)
- mom: momentum insertion, space-separated float sequence (e.g. ".1 .2 1. 0.")
*/
/******************************************************************************
* SeqConserved *
******************************************************************************/
BEGIN_MODULE_NAMESPACE(MSource)
class SeqConservedPar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(SeqConservedPar,
std::string, q,
std::string, action,
unsigned int, tA,
unsigned int, tB,
Current, curr_type,
unsigned int, mu,
std::string, mom);
};
template <typename FImpl>
class TSeqConserved: public Module<SeqConservedPar>
{
public:
FERM_TYPE_ALIASES(FImpl,);
public:
// constructor
TSeqConserved(const std::string name);
// destructor
virtual ~TSeqConserved(void) = default;
// dependency relation
virtual std::vector<std::string> getInput(void);
virtual std::vector<std::string> getOutput(void);
protected:
// setup
virtual void setup(void);
// execution
virtual void execute(void);
};
MODULE_REGISTER_NS(SeqConserved, TSeqConserved<FIMPL>, MSource);
/******************************************************************************
* TSeqConserved implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
template <typename FImpl>
TSeqConserved<FImpl>::TSeqConserved(const std::string name)
: Module<SeqConservedPar>(name)
{}
// dependencies/products ///////////////////////////////////////////////////////
template <typename FImpl>
std::vector<std::string> TSeqConserved<FImpl>::getInput(void)
{
std::vector<std::string> in = {par().q, par().action};
return in;
}
template <typename FImpl>
std::vector<std::string> TSeqConserved<FImpl>::getOutput(void)
{
std::vector<std::string> out = {getName()};
return out;
}
// setup ///////////////////////////////////////////////////////////////////////
template <typename FImpl>
void TSeqConserved<FImpl>::setup(void)
{
auto Ls_ = env().getObjectLs(par().action);
envCreateLat(PropagatorField, getName(), Ls_);
}
// execution ///////////////////////////////////////////////////////////////////
template <typename FImpl>
void TSeqConserved<FImpl>::execute(void)
{
if (par().tA == par().tB)
{
LOG(Message) << "Generating sequential source with conserved "
<< par().curr_type << " current insertion (mu = "
<< par().mu << ") at " << "t = " << par().tA << std::endl;
}
else
{
LOG(Message) << "Generating sequential source with conserved "
<< par().curr_type << " current insertion (mu = "
<< par().mu << ") for " << par().tA << " <= t <= "
<< par().tB << std::endl;
}
auto &src = envGet(PropagatorField, getName());
auto &q = envGet(PropagatorField, par().q);
auto &mat = envGet(FMat, par().action);
std::vector<Real> mom = strToVec<Real>(par().mom);
mat.SeqConservedCurrent(q, src, par().curr_type, par().mu,
mom, par().tA, par().tB);
}
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_MSource_SeqConserved_hpp_
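The sequential insertion is restricted to the window theta(x_3 - tA)*theta(tB - x_3), i.e. the source is non-zero only for tA <= t <= tB. The standalone sketch below (not part of this commit) applies the same window mask to a toy time-only array, in place of the where(...) selection on a lattice.

#include <iostream>
#include <vector>

int main(void)
{
    const unsigned int  nt = 8, tA = 2, tB = 4;
    std::vector<double> q(nt, 1.), src(nt, 0.);

    for (unsigned int t = 0; t < nt; ++t)
    {
        src[t] = (t >= tA and t <= tB) ? q[t] : 0.;   // window mask
    }
    for (unsigned int t = 0; t < nt; ++t)
    {
        std::cout << "t = " << t << "  src = " << src[t] << std::endl;
    }
    return 0;
}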

@ -4,11 +4,10 @@ Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MSource/SeqGamma.hpp
- Copyright (C) 2015
- Copyright (C) 2016
- Copyright (C) 2017
+ Copyright (C) 2015-2018
Author: Antonin Portelli <antonin.portelli@me.com>
+ Author: Lanny91 <andrew.lawson@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -81,10 +80,14 @@ public:
// dependency relation
virtual std::vector<std::string> getInput(void);
virtual std::vector<std::string> getOutput(void);
+ protected:
// setup
virtual void setup(void);
// execution
virtual void execute(void);
+ private:
+ bool hasPhase_{false};
+ std::string momphName_, tName_;
};
MODULE_REGISTER_NS(SeqGamma, TSeqGamma<FIMPL>, MSource);
@ -96,6 +99,8 @@ MODULE_REGISTER_NS(SeqGamma, TSeqGamma<FIMPL>, MSource);
template <typename FImpl>
TSeqGamma<FImpl>::TSeqGamma(const std::string name)
: Module<SeqGammaPar>(name)
+ , momphName_ (name + "_momph")
+ , tName_ (name + "_t")
{}
// dependencies/products ///////////////////////////////////////////////////////
@ -119,7 +124,10 @@ std::vector<std::string> TSeqGamma<FImpl>::getOutput(void)
template <typename FImpl>
void TSeqGamma<FImpl>::setup(void)
{
- env().template registerLattice<PropagatorField>(getName());
+ envCreateLat(PropagatorField, getName());
+ envCacheLat(Lattice<iScalar<vInteger>>, tName_);
+ envCacheLat(LatticeComplex, momphName_);
+ envTmpLat(LatticeComplex, "coor");
}
// execution ///////////////////////////////////////////////////////////////////
@ -137,23 +145,29 @@ void TSeqGamma<FImpl>::execute(void)
<< " sequential source for "
<< par().tA << " <= t <= " << par().tB << std::endl;
}
- PropagatorField &src = *env().template createLattice<PropagatorField>(getName());
- PropagatorField &q = *env().template getObject<PropagatorField>(par().q);
- Lattice<iScalar<vInteger>> t(env().getGrid());
- LatticeComplex ph(env().getGrid()), coor(env().getGrid());
+ auto &src = envGet(PropagatorField, getName());
+ auto &q = envGet(PropagatorField, par().q);
+ auto &ph = envGet(LatticeComplex, momphName_);
+ auto &t = envGet(Lattice<iScalar<vInteger>>, tName_);
Gamma g(par().gamma);
- std::vector<Real> p;
- Complex i(0.0,1.0);
+ if (!hasPhase_)
+ {
+ Complex i(0.0,1.0);
+ std::vector<Real> p;
+ envGetTmp(LatticeComplex, coor);
p = strToVec<Real>(par().mom);
ph = zero;
for(unsigned int mu = 0; mu < env().getNd(); mu++)
{
LatticeCoordinate(coor, mu);
- ph = ph + p[mu]*coor*((1./(env().getGrid()->_fdimensions[mu])));
+ ph = ph + (p[mu]/env().getGrid()->_fdimensions[mu])*coor;
}
ph = exp((Real)(2*M_PI)*i*ph);
LatticeCoordinate(t, Tp);
+ hasPhase_ = true;
+ }
src = where((t >= par().tA) and (t <= par().tB), ph*(g*q), 0.*q);
}

@ -4,9 +4,10 @@ Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MSource/Wall.hpp
- Copyright (C) 2017
+ Copyright (C) 2015-2018
- Author: Andrew Lawson <andrew.lawson1991@gmail.com>
+ Author: Antonin Portelli <antonin.portelli@me.com>
+ Author: Lanny91 <andrew.lawson@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -73,10 +74,14 @@ public:
// dependency relation
virtual std::vector<std::string> getInput(void);
virtual std::vector<std::string> getOutput(void);
+ protected:
// setup
virtual void setup(void);
// execution
virtual void execute(void);
+ private:
+ bool hasPhase_{false};
+ std::string momphName_, tName_;
};
MODULE_REGISTER_NS(Wall, TWall<FIMPL>, MSource);
@ -88,13 +93,15 @@ MODULE_REGISTER_NS(Wall, TWall<FIMPL>, MSource);
template <typename FImpl>
TWall<FImpl>::TWall(const std::string name)
: Module<WallPar>(name)
+ , momphName_ (name + "_momph")
+ , tName_ (name + "_t")
{}
// dependencies/products ///////////////////////////////////////////////////////
template <typename FImpl>
std::vector<std::string> TWall<FImpl>::getInput(void)
{
- std::vector<std::string> in;
+ std::vector<std::string> in = {};
return in;
}
@ -111,7 +118,7 @@ std::vector<std::string> TWall<FImpl>::getOutput(void)
template <typename FImpl>
void TWall<FImpl>::setup(void)
{
- env().template registerLattice<PropagatorField>(getName());
+ envCreateLat(PropagatorField, getName());
}
// execution ///////////////////////////////////////////////////////////////////
@ -121,21 +128,28 @@ void TWall<FImpl>::execute(void)
LOG(Message) << "Generating wall source at t = " << par().tW
<< " with momentum " << par().mom << std::endl;
- PropagatorField &src = *env().template createLattice<PropagatorField>(getName());
- Lattice<iScalar<vInteger>> t(env().getGrid());
- LatticeComplex ph(env().getGrid()), coor(env().getGrid());
- std::vector<Real> p;
- Complex i(0.0,1.0);
+ auto &src = envGet(PropagatorField, getName());
+ auto &ph = envGet(LatticeComplex, momphName_);
+ auto &t = envGet(Lattice<iScalar<vInteger>>, tName_);
+ if (!hasPhase_)
+ {
+ Complex i(0.0,1.0);
+ std::vector<Real> p;
+ envGetTmp(LatticeComplex, coor);
p = strToVec<Real>(par().mom);
ph = zero;
- for(unsigned int mu = 0; mu < Nd; mu++)
+ for(unsigned int mu = 0; mu < env().getNd(); mu++)
{
LatticeCoordinate(coor, mu);
- ph = ph + p[mu]*coor*((1./(env().getGrid()->_fdimensions[mu])));
+ ph = ph + (p[mu]/env().getGrid()->_fdimensions[mu])*coor;
}
ph = exp((Real)(2*M_PI)*i*ph);
LatticeCoordinate(t, Tp);
+ hasPhase_ = true;
+ }
src = 1.;
src = where((t == par().tW), src*ph, 0.*src);
}

@ -4,8 +4,7 @@ Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MSource/Z2.hpp
- Copyright (C) 2015
- Copyright (C) 2016
+ Copyright (C) 2015-2018
Author: Antonin Portelli <antonin.portelli@me.com>
@ -76,10 +75,14 @@ public:
// dependency relation
virtual std::vector<std::string> getInput(void);
virtual std::vector<std::string> getOutput(void);
+ protected:
// setup
virtual void setup(void);
// execution
virtual void execute(void);
+ private:
+ bool hasT_{false};
+ std::string tName_;
};
MODULE_REGISTER_NS(Z2, TZ2<FIMPL>, MSource);
@ -92,6 +95,7 @@ MODULE_REGISTER_NS(ScalarZ2, TZ2<ScalarImplCR>, MSource);
template <typename FImpl>
TZ2<FImpl>::TZ2(const std::string name)
: Module<Z2Par>(name)
+ , tName_ (name + "_t")
{}
// dependencies/products ///////////////////////////////////////////////////////
@ -115,17 +119,15 @@ std::vector<std::string> TZ2<FImpl>::getOutput(void)
template <typename FImpl>
void TZ2<FImpl>::setup(void)
{
- env().template registerLattice<PropagatorField>(getName());
+ envCreateLat(PropagatorField, getName());
+ envCacheLat(Lattice<iScalar<vInteger>>, tName_);
+ envTmpLat(LatticeComplex, "eta");
}
// execution ///////////////////////////////////////////////////////////////////
template <typename FImpl>
void TZ2<FImpl>::execute(void)
{
- Lattice<iScalar<vInteger>> t(env().getGrid());
- LatticeComplex eta(env().getGrid());
- Complex shift(1., 1.);
if (par().tA == par().tB)
{
LOG(Message) << "Generating Z_2 wall source at t= " << par().tA
@ -136,8 +138,17 @@ void TZ2<FImpl>::execute(void)
LOG(Message) << "Generating Z_2 band for " << par().tA << " <= t <= "
<< par().tB << std::endl;
}
- PropagatorField &src = *env().template createLattice<PropagatorField>(getName());
+ auto &src = envGet(PropagatorField, getName());
+ auto &t = envGet(Lattice<iScalar<vInteger>>, tName_);
+ Complex shift(1., 1.);
+ if (!hasT_)
+ {
LatticeCoordinate(t, Tp);
+ hasT_ = true;
+ }
+ envGetTmp(LatticeComplex, eta);
bernoulli(*env().get4dRng(), eta);
eta = (2.*eta - shift)*(1./::sqrt(2.));
eta = where((t >= par().tA) and (t <= par().tB), eta, 0.*eta);
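The Z_2 source maps the Bernoulli field eta through (2*eta - (1+i))/sqrt(2); assuming bernoulli fills the real and imaginary parts independently with values in {0, 1}, each component of the result is +-1/sqrt(2) and the noise has unit modulus. The standalone sketch below (not part of this commit) tabulates the four possible values.

#include <cmath>
#include <complex>
#include <iostream>

int main(void)
{
    const std::complex<double> shift(1., 1.);
    // the four possible Bernoulli draws for (Re eta, Im eta)
    for (int re = 0; re <= 1; ++re)
    for (int im = 0; im <= 1; ++im)
    {
        std::complex<double> eta(re, im);
        std::complex<double> z = (2.*eta - shift)*(1./std::sqrt(2.));
        std::cout << "eta = " << eta << "  ->  " << z
                  << "  |z| = " << std::abs(z) << std::endl;
    }
    return 0;
}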

@ -0,0 +1,186 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MUtilities/TestSeqConserved.hpp
Copyright (C) 2015-2018
Author: Antonin Portelli <antonin.portelli@me.com>
Author: Lanny91 <andrew.lawson@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_MUtilities_TestSeqConserved_hpp_
#define Hadrons_MUtilities_TestSeqConserved_hpp_
#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
#include <Grid/Hadrons/ModuleFactory.hpp>
BEGIN_HADRONS_NAMESPACE
/*
Ward Identity contractions using sequential propagators.
-----------------------------
* options:
- q: point source propagator, 5D if available (string)
- qSeq: result of sequential insertion of conserved current using q (string)
- action: action used for computation of q (string)
- origin: string giving point source origin of q (string)
- t_J: time at which sequential current is inserted (int)
- mu: Lorentz index of current inserted (int)
- curr: current type, e.g. vector/axial (Current)
*/
/******************************************************************************
* TestSeqConserved *
******************************************************************************/
BEGIN_MODULE_NAMESPACE(MUtilities)
class TestSeqConservedPar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(TestSeqConservedPar,
std::string, q,
std::string, qSeq,
std::string, action,
std::string, origin,
unsigned int, t_J,
unsigned int, mu,
Current, curr);
};
template <typename FImpl>
class TTestSeqConserved: public Module<TestSeqConservedPar>
{
public:
FERM_TYPE_ALIASES(FImpl,);
public:
// constructor
TTestSeqConserved(const std::string name);
// destructor
virtual ~TTestSeqConserved(void) = default;
// dependency relation
virtual std::vector<std::string> getInput(void);
virtual std::vector<std::string> getOutput(void);
protected:
// setup
virtual void setup(void);
// execution
virtual void execute(void);
};
MODULE_REGISTER_NS(TestSeqConserved, TTestSeqConserved<FIMPL>, MUtilities);
/******************************************************************************
* TTestSeqConserved implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
template <typename FImpl>
TTestSeqConserved<FImpl>::TTestSeqConserved(const std::string name)
: Module<TestSeqConservedPar>(name)
{}
// dependencies/products ///////////////////////////////////////////////////////
template <typename FImpl>
std::vector<std::string> TTestSeqConserved<FImpl>::getInput(void)
{
std::vector<std::string> in = {par().q, par().qSeq, par().action};
return in;
}
template <typename FImpl>
std::vector<std::string> TTestSeqConserved<FImpl>::getOutput(void)
{
std::vector<std::string> out = {getName()};
return out;
}
// setup ///////////////////////////////////////////////////////////////////////
template <typename FImpl>
void TTestSeqConserved<FImpl>::setup(void)
{
auto Ls = env().getObjectLs(par().q);
if (Ls != env().getObjectLs(par().action))
{
HADRON_ERROR(Size, "Ls mismatch between quark action and propagator");
}
envTmpLat(PropagatorField, "tmp");
envTmpLat(LatticeComplex, "c");
}
// execution ///////////////////////////////////////////////////////////////////
template <typename FImpl>
void TTestSeqConserved<FImpl>::execute(void)
{
// Check sequential insertion of current gives same result as conserved
// current sink upon contraction. Assume q uses a point source.
auto &q = envGet(PropagatorField, par().q);
auto &qSeq = envGet(PropagatorField, par().qSeq);
auto &act = envGet(FMat, par().action);
Gamma g5(Gamma::Algebra::Gamma5);
Gamma::Algebra gA = (par().curr == Current::Axial) ?
Gamma::Algebra::Gamma5 :
Gamma::Algebra::Identity;
Gamma g(gA);
SitePropagator qSite;
Complex test_S, test_V, check_S, check_V;
std::vector<TComplex> check_buf;
std::vector<int> siteCoord;
envGetTmp(PropagatorField, tmp);
envGetTmp(LatticeComplex, c);
siteCoord = strToVec<int>(par().origin);
peekSite(qSite, qSeq, siteCoord);
test_S = trace(qSite*g);
test_V = trace(qSite*g*Gamma::gmu[par().mu]);
act.ContractConservedCurrent(q, q, tmp, par().curr, par().mu);
c = trace(tmp*g);
sliceSum(c, check_buf, Tp);
check_S = TensorRemove(check_buf[par().t_J]);
c = trace(tmp*g*Gamma::gmu[par().mu]);
sliceSum(c, check_buf, Tp);
check_V = TensorRemove(check_buf[par().t_J]);
LOG(Message) << "Test S = " << abs(test_S) << std::endl;
LOG(Message) << "Test V = " << abs(test_V) << std::endl;
LOG(Message) << "Check S = " << abs(check_S) << std::endl;
LOG(Message) << "Check V = " << abs(check_V) << std::endl;
// Check difference = 0
check_S -= test_S;
check_V -= test_V;
LOG(Message) << "Consistency check for sequential conserved "
<< par().curr << " current insertion: " << std::endl;
LOG(Message) << "Diff S = " << abs(check_S) << std::endl;
LOG(Message) << "Diff V = " << abs(check_V) << std::endl;
}
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_MUtilities_TestSeqConserved_hpp_

@ -0,0 +1,150 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MUtilities/TestSeqGamma.hpp
Copyright (C) 2015-2018
Author: Antonin Portelli <antonin.portelli@me.com>
Author: Lanny91 <andrew.lawson@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_MUtilities_TestSeqGamma_hpp_
#define Hadrons_MUtilities_TestSeqGamma_hpp_
#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
#include <Grid/Hadrons/ModuleFactory.hpp>
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
* TestSeqGamma *
******************************************************************************/
BEGIN_MODULE_NAMESPACE(MUtilities)
class TestSeqGammaPar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(TestSeqGammaPar,
std::string, q,
std::string, qSeq,
std::string, origin,
Gamma::Algebra, gamma,
unsigned int, t_g);
};
template <typename FImpl>
class TTestSeqGamma: public Module<TestSeqGammaPar>
{
public:
FERM_TYPE_ALIASES(FImpl,);
public:
// constructor
TTestSeqGamma(const std::string name);
// destructor
virtual ~TTestSeqGamma(void) = default;
// dependency relation
virtual std::vector<std::string> getInput(void);
virtual std::vector<std::string> getOutput(void);
protected:
// setup
virtual void setup(void);
// execution
virtual void execute(void);
};
MODULE_REGISTER_NS(TestSeqGamma, TTestSeqGamma<FIMPL>, MUtilities);
/******************************************************************************
* TTestSeqGamma implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
template <typename FImpl>
TTestSeqGamma<FImpl>::TTestSeqGamma(const std::string name)
: Module<TestSeqGammaPar>(name)
{}
// dependencies/products ///////////////////////////////////////////////////////
template <typename FImpl>
std::vector<std::string> TTestSeqGamma<FImpl>::getInput(void)
{
std::vector<std::string> in = {par().q, par().qSeq};
return in;
}
template <typename FImpl>
std::vector<std::string> TTestSeqGamma<FImpl>::getOutput(void)
{
std::vector<std::string> out = {getName()};
return out;
}
// setup ///////////////////////////////////////////////////////////////////////
template <typename FImpl>
void TTestSeqGamma<FImpl>::setup(void)
{
envTmpLat(LatticeComplex, "c");
}
// execution ///////////////////////////////////////////////////////////////////
template <typename FImpl>
void TTestSeqGamma<FImpl>::execute(void)
{
auto &q = envGet(PropagatorField, par().q);
auto &qSeq = envGet(PropagatorField, par().qSeq);
Gamma g5(Gamma::Algebra::Gamma5);
Gamma g(par().gamma);
SitePropagator qSite;
Complex test, check;
std::vector<TComplex> check_buf;
std::vector<int> siteCoord;
// Check sequential insertion of gamma matrix gives same result as
// insertion of gamma at sink upon contraction. Assume q uses a point
// source.
envGetTmp(LatticeComplex, c);
siteCoord = strToVec<int>(par().origin);
peekSite(qSite, qSeq, siteCoord);
test = trace(g*qSite);
c = trace(adj(g)*g5*adj(q)*g5*g*q);
sliceSum(c, check_buf, Tp);
check = TensorRemove(check_buf[par().t_g]);
LOG(Message) << "Seq Result = " << abs(test) << std::endl;
LOG(Message) << "Reference = " << abs(check) << std::endl;
// Check difference = 0
check -= test;
LOG(Message) << "Consistency check for sequential " << par().gamma
<< " insertion = " << abs(check) << std::endl;
}
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_MUtilities_TestSeqGamma_hpp_

@ -0,0 +1,622 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/VirtualMachine.cc
Copyright (C) 2015-2018
Author: Antonin Portelli <antonin.portelli@me.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Hadrons/VirtualMachine.hpp>
#include <Grid/Hadrons/GeneticScheduler.hpp>
#include <Grid/Hadrons/ModuleFactory.hpp>
using namespace Grid;
using namespace QCD;
using namespace Hadrons;
/******************************************************************************
* VirtualMachine implementation *
******************************************************************************/
// trajectory counter //////////////////////////////////////////////////////////
void VirtualMachine::setTrajectory(const unsigned int traj)
{
traj_ = traj;
}
unsigned int VirtualMachine::getTrajectory(void) const
{
return traj_;
}
// module management ///////////////////////////////////////////////////////////
void VirtualMachine::pushModule(VirtualMachine::ModPt &pt)
{
std::string name = pt->getName();
if (!hasModule(name))
{
std::vector<unsigned int> inputAddress;
unsigned int address;
ModuleInfo m;
// module registration -------------------------------------------------
m.data = std::move(pt);
m.type = typeIdPt(*m.data.get());
m.name = name;
// input dependencies
for (auto &in: m.data->getInput())
{
if (!env().hasObject(in))
{
// if object does not exist, add it with no creator module
env().addObject(in , -1);
}
m.input.push_back(env().getObjectAddress(in));
}
// reference dependencies
for (auto &ref: m.data->getReference())
{
if (!env().hasObject(ref))
{
// if object does not exist, add it with no creator module
env().addObject(ref , -1);
}
m.input.push_back(env().getObjectAddress(ref));
}
auto inCopy = m.input;
// if module has inputs with references, they need to be added as
// an input
for (auto &in: inCopy)
{
int inm = env().getObjectModule(in);
if (inm > 0)
{
if (getModule(inm)->getReference().size() > 0)
{
for (auto &rin: getModule(inm)->getReference())
{
m.input.push_back(env().getObjectAddress(rin));
}
}
}
}
module_.push_back(std::move(m));
address = static_cast<unsigned int>(module_.size() - 1);
moduleAddress_[name] = address;
// connecting outputs to potential inputs ------------------------------
for (auto &out: getModule(address)->getOutput())
{
if (!env().hasObject(out))
{
// output does not exist, add it
env().addObject(out, address);
}
else
{
if (env().getObjectModule(env().getObjectAddress(out)) < 0)
{
// output exists but without creator, correct it
env().setObjectModule(env().getObjectAddress(out), address);
}
else
{
// output already fully registered, error
HADRON_ERROR(Definition, "object '" + out
+ "' is already produced by module '"
+ module_[env().getObjectModule(out)].name
+ "' (while pushing module '" + name + "')");
}
if (getModule(address)->getReference().size() > 0)
{
// module has references, dependency should be propagated
// to children modules; find module with `out` as an input
// and add references to their input
auto pred = [this, out](const ModuleInfo &n)
{
auto &in = n.input;
auto it = std::find(in.begin(), in.end(),
env().getObjectAddress(out));
return (it != in.end());
};
auto it = std::find_if(module_.begin(), module_.end(), pred);
while (it != module_.end())
{
for (auto &ref: getModule(address)->getReference())
{
it->input.push_back(env().getObjectAddress(ref));
}
it = std::find_if(++it, module_.end(), pred);
}
}
}
}
graphOutdated_ = true;
memoryProfileOutdated_ = true;
}
else
{
HADRON_ERROR(Definition, "module '" + name + "' already exists");
}
}
unsigned int VirtualMachine::getNModule(void) const
{
return module_.size();
}
void VirtualMachine::createModule(const std::string name, const std::string type,
XmlReader &reader)
{
auto &factory = ModuleFactory::getInstance();
auto pt = factory.create(type, name);
pt->parseParameters(reader, "options");
pushModule(pt);
}
ModuleBase * VirtualMachine::getModule(const unsigned int address) const
{
if (hasModule(address))
{
return module_[address].data.get();
}
else
{
HADRON_ERROR(Definition, "no module with address " + std::to_string(address));
}
}
ModuleBase * VirtualMachine::getModule(const std::string name) const
{
return getModule(getModuleAddress(name));
}
unsigned int VirtualMachine::getModuleAddress(const std::string name) const
{
if (hasModule(name))
{
return moduleAddress_.at(name);
}
else
{
HADRON_ERROR(Definition, "no module with name '" + name + "'");
}
}
std::string VirtualMachine::getModuleName(const unsigned int address) const
{
if (hasModule(address))
{
return module_[address].name;
}
else
{
HADRON_ERROR(Definition, "no module with address " + std::to_string(address));
}
}
std::string VirtualMachine::getModuleType(const unsigned int address) const
{
if (hasModule(address))
{
return typeName(module_[address].type);
}
else
{
HADRON_ERROR(Definition, "no module with address " + std::to_string(address));
}
}
std::string VirtualMachine::getModuleType(const std::string name) const
{
return getModuleType(getModuleAddress(name));
}
std::string VirtualMachine::getModuleNamespace(const unsigned int address) const
{
std::string type = getModuleType(address), ns;
auto pos2 = type.rfind("::");
auto pos1 = type.rfind("::", pos2 - 2);
return type.substr(pos1 + 2, pos2 - pos1 - 2);
}
std::string VirtualMachine::getModuleNamespace(const std::string name) const
{
return getModuleNamespace(getModuleAddress(name));
}
bool VirtualMachine::hasModule(const unsigned int address) const
{
return (address < module_.size());
}
bool VirtualMachine::hasModule(const std::string name) const
{
return (moduleAddress_.find(name) != moduleAddress_.end());
}
// print VM content ////////////////////////////////////////////////////////////
void VirtualMachine::printContent(void) const
{
LOG(Debug) << "Modules: " << std::endl;
for (unsigned int i = 0; i < module_.size(); ++i)
{
LOG(Debug) << std::setw(4) << i << ": "
<< getModuleName(i) << std::endl;
}
}
// module graph ////////////////////////////////////////////////////////////////
Graph<unsigned int> VirtualMachine::getModuleGraph(void)
{
if (graphOutdated_)
{
makeModuleGraph();
graphOutdated_ = false;
}
return graph_;
}
void VirtualMachine::makeModuleGraph(void)
{
Graph<unsigned int> graph;
// create vertices
for (unsigned int m = 0; m < module_.size(); ++m)
{
graph.addVertex(m);
}
// create edges
for (unsigned int m = 0; m < module_.size(); ++m)
{
for (auto &in: module_[m].input)
{
graph.addEdge(env().getObjectModule(in), m);
}
}
graph_ = graph;
}
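makeModuleGraph draws, for every input object of a module, an edge from the module that produces that object to the consuming module. The standalone sketch below (not part of this commit) builds the same edge list with plain integers standing in for module and object addresses.

#include <iostream>
#include <map>
#include <vector>

int main(void)
{
    // object address -> producing module (as recorded when modules are pushed)
    std::map<int, int> producer = {{0, 0}, {1, 1}, {2, 2}};
    // per-module list of input object addresses
    std::vector<std::vector<int>> input = {{}, {0}, {0, 1}, {2}};

    std::vector<std::vector<int>> edges(input.size());
    for (unsigned int m = 0; m < input.size(); ++m)
    {
        for (auto &in: input[m])
        {
            edges[producer.at(in)].push_back(m);    // producer -> consumer
        }
    }
    for (unsigned int m = 0; m < edges.size(); ++m)
    {
        for (auto &c: edges[m])
        {
            std::cout << m << " -> " << c << std::endl;
        }
    }
    return 0;
}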
// memory profile //////////////////////////////////////////////////////////////
const VirtualMachine::MemoryProfile & VirtualMachine::getMemoryProfile(void)
{
if (memoryProfileOutdated_)
{
makeMemoryProfile();
memoryProfileOutdated_ = false;
}
return profile_;
}
void VirtualMachine::makeMemoryProfile(void)
{
bool protect = env().objectsProtected();
bool hmsg = HadronsLogMessage.isActive();
bool gmsg = GridLogMessage.isActive();
bool err = HadronsLogError.isActive();
auto program = getModuleGraph().topoSort();
resetProfile();
profile_.module.resize(getNModule());
env().protectObjects(false);
GridLogMessage.Active(false);
HadronsLogMessage.Active(false);
HadronsLogError.Active(false);
for (auto it = program.rbegin(); it != program.rend(); ++it)
{
auto a = *it;
if (profile_.module[a].empty())
{
LOG(Debug) << "Profiling memory for module '" << module_[a].name
<< "' (" << a << ")..." << std::endl;
memoryProfile(a);
env().freeAll();
}
}
env().protectObjects(protect);
GridLogMessage.Active(gmsg);
HadronsLogMessage.Active(hmsg);
HadronsLogError.Active(err);
LOG(Debug) << "Memory profile:" << std::endl;
LOG(Debug) << "----------------" << std::endl;
for (unsigned int a = 0; a < profile_.module.size(); ++a)
{
LOG(Debug) << getModuleName(a) << " (" << a << ")" << std::endl;
for (auto &o: profile_.module[a])
{
LOG(Debug) << "|__ " << env().getObjectName(o.first) << " ("
<< sizeString(o.second) << ")" << std::endl;
}
LOG(Debug) << std::endl;
}
LOG(Debug) << "----------------" << std::endl;
}
void VirtualMachine::resetProfile(void)
{
profile_.module.clear();
profile_.object.clear();
}
void VirtualMachine::resizeProfile(void)
{
if (env().getMaxAddress() > profile_.object.size())
{
MemoryPrint empty;
empty.size = 0;
empty.module = -1;
profile_.object.resize(env().getMaxAddress(), empty);
}
}
void VirtualMachine::updateProfile(const unsigned int address)
{
resizeProfile();
for (unsigned int a = 0; a < env().getMaxAddress(); ++a)
{
if (env().hasCreatedObject(a) and (profile_.object[a].module == -1))
{
profile_.object[a].size = env().getObjectSize(a);
profile_.object[a].storage = env().getObjectStorage(a);
profile_.object[a].module = address;
profile_.module[address][a] = profile_.object[a].size;
if (env().getObjectModule(a) < 0)
{
env().setObjectModule(a, address);
}
}
}
}
void VirtualMachine::cleanEnvironment(void)
{
resizeProfile();
for (unsigned int a = 0; a < env().getMaxAddress(); ++a)
{
if (env().hasCreatedObject(a) and (profile_.object[a].module == -1))
{
env().freeObject(a);
}
}
}
void VirtualMachine::memoryProfile(const unsigned int address)
{
auto m = getModule(address);
LOG(Debug) << "Setting up module '" << m->getName()
<< "' (" << address << ")..." << std::endl;
try
{
m->setup();
updateProfile(address);
}
catch (Exceptions::Definition &)
{
cleanEnvironment();
for (auto &in: m->getInput())
{
memoryProfile(env().getObjectModule(in));
}
for (auto &ref: m->getReference())
{
memoryProfile(env().getObjectModule(ref));
}
m->setup();
updateProfile(address);
}
}
void VirtualMachine::memoryProfile(const std::string name)
{
memoryProfile(getModuleAddress(name));
}
// garbage collector ///////////////////////////////////////////////////////////
VirtualMachine::GarbageSchedule
VirtualMachine::makeGarbageSchedule(const Program &p) const
{
GarbageSchedule freeProg;
freeProg.resize(p.size());
for (unsigned int a = 0; a < env().getMaxAddress(); ++a)
{
if (env().getObjectStorage(a) == Environment::Storage::temporary)
{
auto it = std::find(p.begin(), p.end(), env().getObjectModule(a));
if (it != p.end())
{
freeProg[std::distance(p.begin(), it)].insert(a);
}
}
else if (env().getObjectStorage(a) == Environment::Storage::object)
{
auto pred = [a, this](const unsigned int b)
{
auto &in = module_[b].input;
auto it = std::find(in.begin(), in.end(), a);
return (it != in.end()) or (b == env().getObjectModule(a));
};
auto it = std::find_if(p.rbegin(), p.rend(), pred);
if (it != p.rend())
{
freeProg[std::distance(it, p.rend()) - 1].insert(a);
}
}
}
return freeProg;
}
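For objects with "object" storage, the garbage schedule frees each object right after the last program step that either consumes it or created it. The standalone sketch below (not part of this commit) computes that last-use step with integers standing in for module and object addresses.

#include <algorithm>
#include <iostream>
#include <set>
#include <vector>

int main(void)
{
    // program: module executed at each step
    std::vector<int> program = {0, 1, 2};
    // per-module input objects, and object -> producing module
    std::vector<std::vector<int>> input    = {{}, {0}, {0}};
    std::vector<int>              producer = {0, 1};

    std::vector<std::set<int>> freeSchedule(program.size());
    for (unsigned int o = 0; o < producer.size(); ++o)
    {
        int last = -1;
        for (unsigned int i = 0; i < program.size(); ++i)
        {
            const auto &in     = input[program[i]];
            bool        usesIt = (std::find(in.begin(), in.end(), (int)o) != in.end());
            if (usesIt or (program[i] == producer[o]))
            {
                last = i;
            }
        }
        if (last >= 0)
        {
            freeSchedule[last].insert(o);
        }
    }
    for (unsigned int i = 0; i < freeSchedule.size(); ++i)
    {
        for (auto &o: freeSchedule[i])
        {
            std::cout << "free object " << o << " after step " << i << std::endl;
        }
    }
    return 0;
}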
// high-water memory function //////////////////////////////////////////////////
VirtualMachine::Size VirtualMachine::memoryNeeded(const Program &p)
{
const MemoryProfile &profile = getMemoryProfile();
GarbageSchedule freep = makeGarbageSchedule(p);
Size current = 0, max = 0;
for (unsigned int i = 0; i < p.size(); ++i)
{
for (auto &o: profile.module[p[i]])
{
current += o.second;
}
max = std::max(current, max);
for (auto &o: freep[i])
{
current -= profile.object[o].size;
}
}
return max;
}
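memoryNeeded walks the program once, adding the sizes allocated by each module, recording the running maximum, and subtracting whatever the garbage schedule frees at that step. The standalone sketch below (not part of this commit) does the same arithmetic on hard-coded per-step sizes.

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

int main(void)
{
    // per-step allocations and deallocations, in bytes
    std::vector<std::size_t> alloc = {100, 300, 50};
    std::vector<std::size_t> freed = {0, 250, 200};

    std::size_t current = 0, peak = 0;
    for (unsigned int i = 0; i < alloc.size(); ++i)
    {
        current += alloc[i];
        peak     = std::max(current, peak);
        current -= freed[i];
    }
    std::cout << "peak allocation: " << peak << " B" << std::endl;   // 400 B
    return 0;
}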
// genetic scheduler ///////////////////////////////////////////////////////////
VirtualMachine::Program VirtualMachine::schedule(const GeneticPar &par)
{
typedef GeneticScheduler<Size, unsigned int> Scheduler;
auto graph = getModuleGraph();
// constrained topological sort using a genetic algorithm
LOG(Message) << "Scheduling computation..." << std::endl;
LOG(Message) << " #module= " << graph.size() << std::endl;
LOG(Message) << " population size= " << par.popSize << std::endl;
LOG(Message) << " max. generation= " << par.maxGen << std::endl;
LOG(Message) << " max. cst. generation= " << par.maxCstGen << std::endl;
LOG(Message) << " mutation rate= " << par.mutationRate << std::endl;
unsigned int k = 0, gen, prevPeak, nCstPeak = 0;
std::random_device rd;
Scheduler::Parameters gpar;
gpar.popSize = par.popSize;
gpar.mutationRate = par.mutationRate;
gpar.seed = rd();
CartesianCommunicator::BroadcastWorld(0, &(gpar.seed), sizeof(gpar.seed));
Scheduler::ObjFunc memPeak = [this](const Program &p)->Size
{
return memoryNeeded(p);
};
Scheduler scheduler(graph, memPeak, gpar);
gen = 0;
do
{
LOG(Debug) << "Generation " << gen << ":" << std::endl;
scheduler.nextGeneration();
if (gen != 0)
{
if (prevPeak == scheduler.getMinValue())
{
nCstPeak++;
}
else
{
nCstPeak = 0;
}
}
prevPeak = scheduler.getMinValue();
if (gen % 10 == 0)
{
LOG(Iterative) << "Generation " << gen << ": "
<< sizeString(scheduler.getMinValue()) << std::endl;
}
gen++;
} while ((gen < par.maxGen) and (nCstPeak < par.maxCstGen));
return scheduler.getMinSchedule();
}
// general execution ///////////////////////////////////////////////////////////
#define BIG_SEP "==============="
#define SEP "---------------"
#define MEM_MSG(size) sizeString(size)
void VirtualMachine::executeProgram(const Program &p) const
{
Size memPeak = 0, sizeBefore, sizeAfter;
GarbageSchedule freeProg;
// build garbage collection schedule
LOG(Debug) << "Building garbage collection schedule..." << std::endl;
freeProg = makeGarbageSchedule(p);
// program execution
LOG(Debug) << "Executing program..." << std::endl;
for (unsigned int i = 0; i < p.size(); ++i)
{
// execute module
LOG(Message) << SEP << " Measurement step " << i + 1 << "/"
<< p.size() << " (module '" << module_[p[i]].name
<< "') " << SEP << std::endl;
(*module_[p[i]].data)();
sizeBefore = env().getTotalSize();
// print used memory after execution
LOG(Message) << "Allocated objects: " << MEM_MSG(sizeBefore)
<< std::endl;
if (sizeBefore > memPeak)
{
memPeak = sizeBefore;
}
// garbage collection for step i
LOG(Message) << "Garbage collection..." << std::endl;
for (auto &j: freeProg[i])
{
env().freeObject(j);
}
// print used memory after garbage collection if necessary
sizeAfter = env().getTotalSize();
if (sizeBefore != sizeAfter)
{
LOG(Message) << "Allocated objects: " << MEM_MSG(sizeAfter)
<< std::endl;
}
else
{
LOG(Message) << "Nothing to free" << std::endl;
}
}
}
void VirtualMachine::executeProgram(const std::vector<std::string> &p) const
{
Program pAddress;
for (auto &n: p)
{
pAddress.push_back(getModuleAddress(n));
}
executeProgram(pAddress);
}
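The string overload only resolves names to module addresses before delegating; a short sketch with hypothetical module names:
std::vector<std::string> byName = {"gauge", "quark_prop", "meson_2pt"}; // hypothetical names
VirtualMachine::getInstance().executeProgram(byName);                   // identical to the address-based form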

View File

@ -0,0 +1,207 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/VirtualMachine.hpp
Copyright (C) 2015-2018
Author: Antonin Portelli <antonin.portelli@me.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_VirtualMachine_hpp_
#define Hadrons_VirtualMachine_hpp_
#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Graph.hpp>
#include <Grid/Hadrons/Environment.hpp>
BEGIN_HADRONS_NAMESPACE
#define DEFINE_VM_ALIAS \
inline VirtualMachine & vm(void) const\
{\
return VirtualMachine::getInstance();\
}
/******************************************************************************
* Virtual machine for module execution *
******************************************************************************/
// forward declaration of Module
class ModuleBase;
class VirtualMachine
{
SINGLETON_DEFCTOR(VirtualMachine);
public:
typedef SITE_SIZE_TYPE Size;
typedef std::unique_ptr<ModuleBase> ModPt;
typedef std::vector<std::set<unsigned int>> GarbageSchedule;
typedef std::vector<unsigned int> Program;
struct MemoryPrint
{
Size size;
Environment::Storage storage;
int module;
};
struct MemoryProfile
{
std::vector<std::map<unsigned int, Size>> module;
std::vector<MemoryPrint> object;
};
class GeneticPar: Serializable
{
public:
GeneticPar(void):
popSize{20}, maxGen{1000}, maxCstGen{100}, mutationRate{.1} {};
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(GeneticPar,
unsigned int, popSize,
unsigned int, maxGen,
unsigned int, maxCstGen,
double , mutationRate);
};
private:
struct ModuleInfo
{
const std::type_info *type{nullptr};
std::string name;
ModPt data{nullptr};
std::vector<unsigned int> input;
size_t maxAllocated;
};
public:
// trajectory counter
void setTrajectory(const unsigned int traj);
unsigned int getTrajectory(void) const;
// module management
void pushModule(ModPt &pt);
template <typename M>
void createModule(const std::string name);
template <typename M>
void createModule(const std::string name,
const typename M::Par &par);
void createModule(const std::string name,
const std::string type,
XmlReader &reader);
unsigned int getNModule(void) const;
ModuleBase * getModule(const unsigned int address) const;
ModuleBase * getModule(const std::string name) const;
template <typename M>
M * getModule(const unsigned int address) const;
template <typename M>
M * getModule(const std::string name) const;
unsigned int getModuleAddress(const std::string name) const;
std::string getModuleName(const unsigned int address) const;
std::string getModuleType(const unsigned int address) const;
std::string getModuleType(const std::string name) const;
std::string getModuleNamespace(const unsigned int address) const;
std::string getModuleNamespace(const std::string name) const;
bool hasModule(const unsigned int address) const;
bool hasModule(const std::string name) const;
// print VM content
void printContent(void) const;
// module graph (could be a const reference if topoSort was const)
Graph<unsigned int> getModuleGraph(void);
// memory profile
const MemoryProfile &getMemoryProfile(void);
// garbage collector
GarbageSchedule makeGarbageSchedule(const Program &p) const;
// high-water memory function
Size memoryNeeded(const Program &p);
// genetic scheduler
Program schedule(const GeneticPar &par);
// general execution
void executeProgram(const Program &p) const;
void executeProgram(const std::vector<std::string> &p) const;
private:
// environment shortcut
DEFINE_ENV_ALIAS;
// module graph
void makeModuleGraph(void);
// memory profile
void makeMemoryProfile(void);
void resetProfile(void);
void resizeProfile(void);
void updateProfile(const unsigned int address);
void cleanEnvironment(void);
void memoryProfile(const std::string name);
void memoryProfile(const unsigned int address);
private:
// general
unsigned int traj_;
// module and related maps
std::vector<ModuleInfo> module_;
std::map<std::string, unsigned int> moduleAddress_;
std::string currentModule_{""};
// module graph
bool graphOutdated_{true};
Graph<unsigned int> graph_;
// memory profile
bool memoryProfileOutdated_{true};
MemoryProfile profile_;
};
/******************************************************************************
* VirtualMachine template implementation *
******************************************************************************/
// module management ///////////////////////////////////////////////////////////
template <typename M>
void VirtualMachine::createModule(const std::string name)
{
ModPt pt(new M(name));
pushModule(pt);
}
template <typename M>
void VirtualMachine::createModule(const std::string name,
const typename M::Par &par)
{
ModPt pt(new M(name));
static_cast<M *>(pt.get())->setPar(par);
pushModule(pt);
}
template <typename M>
M * VirtualMachine::getModule(const unsigned int address) const
{
if (auto *pt = dynamic_cast<M *>(getModule(address)))
{
return pt;
}
else
{
HADRON_ERROR(Definition, "module '" + module_[address].name
+ "' does not have type " + typeid(M).name()
+ "(has type: " + getModuleType(address) + ")");
}
}
template <typename M>
M * VirtualMachine::getModule(const std::string name) const
{
return getModule<M>(getModuleAddress(name));
}
END_HADRONS_NAMESPACE
#endif // Hadrons_VirtualMachine_hpp_
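To illustrate the templated module interface declared above, a hedged sketch with a hypothetical module type MyMod (its Par fields are placeholders):
auto &vm = VirtualMachine::getInstance();
MyMod::Par par;                          // fill module-specific parameters here
vm.createModule<MyMod>("my_mod", par);   // registers and owns the module
auto *m = vm.getModule<MyMod>("my_mod"); // typed access; raises a Definition error on type mismatch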

View File

@ -1,38 +1,53 @@
modules_cc =\ modules_cc =\
Modules/MContraction/WeakHamiltonianEye.cc \
Modules/MContraction/WeakHamiltonianNonEye.cc \
Modules/MContraction/WeakNeutral4ptDisc.cc \
Modules/MGauge/Load.cc \
Modules/MGauge/Random.cc \
Modules/MGauge/StochEm.cc \
Modules/MGauge/Unit.cc \
Modules/MScalar/ChargedProp.cc \ Modules/MScalar/ChargedProp.cc \
Modules/MScalar/FreeProp.cc Modules/MScalar/FreeProp.cc \
Modules/MContraction/WeakHamiltonianEye.cc \
Modules/MContraction/WeakNeutral4ptDisc.cc \
Modules/MContraction/WeakHamiltonianNonEye.cc \
Modules/MGauge/Unit.cc \
Modules/MGauge/StochEm.cc \
Modules/MGauge/Random.cc \
Modules/MGauge/FundtoHirep.cc \
Modules/MScalar/FreeProp.cc \
Modules/MScalar/ChargedProp.cc \
Modules/MIO/LoadNersc.cc
modules_hpp =\ modules_hpp =\
Modules/MAction/DWF.hpp \
Modules/MAction/Wilson.hpp \
Modules/MContraction/Baryon.hpp \ Modules/MContraction/Baryon.hpp \
Modules/MContraction/DiscLoop.hpp \
Modules/MContraction/Gamma3pt.hpp \
Modules/MContraction/Meson.hpp \ Modules/MContraction/Meson.hpp \
Modules/MContraction/WeakHamiltonian.hpp \ Modules/MContraction/WeakHamiltonian.hpp \
Modules/MContraction/WeakHamiltonianEye.hpp \
Modules/MContraction/WeakHamiltonianNonEye.hpp \ Modules/MContraction/WeakHamiltonianNonEye.hpp \
Modules/MContraction/DiscLoop.hpp \
Modules/MContraction/WeakNeutral4ptDisc.hpp \ Modules/MContraction/WeakNeutral4ptDisc.hpp \
Modules/MContraction/Gamma3pt.hpp \
Modules/MContraction/WardIdentity.hpp \
Modules/MContraction/WeakHamiltonianEye.hpp \
Modules/MFermion/GaugeProp.hpp \ Modules/MFermion/GaugeProp.hpp \
Modules/MGauge/Load.hpp \ Modules/MSource/SeqGamma.hpp \
Modules/MGauge/Random.hpp \ Modules/MSource/Point.hpp \
Modules/MGauge/StochEm.hpp \ Modules/MSource/Wall.hpp \
Modules/MGauge/Unit.hpp \ Modules/MSource/Z2.hpp \
Modules/MLoop/NoiseLoop.hpp \ Modules/MSource/SeqConserved.hpp \
Modules/MScalar/ChargedProp.hpp \ Modules/MSink/Smear.hpp \
Modules/MScalar/FreeProp.hpp \
Modules/MScalar/Scalar.hpp \
Modules/MSink/Point.hpp \ Modules/MSink/Point.hpp \
Modules/MSolver/RBPrecCG.hpp \ Modules/MSolver/RBPrecCG.hpp \
Modules/MSource/Point.hpp \ Modules/MGauge/Unit.hpp \
Modules/MSource/SeqGamma.hpp \ Modules/MGauge/Random.hpp \
Modules/MSource/Wall.hpp \ Modules/MGauge/StochEm.hpp \
Modules/MSource/Z2.hpp Modules/MGauge/FundtoHirep.hpp \
Modules/MUtilities/TestSeqGamma.hpp \
Modules/MUtilities/TestSeqConserved.hpp \
Modules/MLoop/NoiseLoop.hpp \
Modules/MScalar/FreeProp.hpp \
Modules/MScalar/Scalar.hpp \
Modules/MScalar/ChargedProp.hpp \
Modules/MAction/DWF.hpp \
Modules/MAction/Wilson.hpp \
Modules/MAction/WilsonClover.hpp \
Modules/MScalarSUN/Div.hpp \
Modules/MScalarSUN/TrMag.hpp \
Modules/MScalarSUN/TwoPoint.hpp \
Modules/MScalarSUN/TrPhi.hpp \
Modules/MIO/LoadNersc.hpp \
Modules/MIO/LoadBinary.hpp

View File

@ -1,28 +1,18 @@
extra_sources= extra_sources=
extra_headers= extra_headers=
if BUILD_COMMS_MPI
extra_sources+=communicator/Communicator_mpi.cc
extra_sources+=communicator/Communicator_base.cc
endif
if BUILD_COMMS_MPI3 if BUILD_COMMS_MPI3
extra_sources+=communicator/Communicator_mpi3.cc extra_sources+=communicator/Communicator_mpi3.cc
extra_sources+=communicator/Communicator_base.cc extra_sources+=communicator/Communicator_base.cc
endif extra_sources+=communicator/SharedMemoryMPI.cc
extra_sources+=communicator/SharedMemory.cc
if BUILD_COMMS_MPIT
extra_sources+=communicator/Communicator_mpit.cc
extra_sources+=communicator/Communicator_base.cc
endif
if BUILD_COMMS_SHMEM
extra_sources+=communicator/Communicator_shmem.cc
extra_sources+=communicator/Communicator_base.cc
endif endif
if BUILD_COMMS_NONE if BUILD_COMMS_NONE
extra_sources+=communicator/Communicator_none.cc extra_sources+=communicator/Communicator_none.cc
extra_sources+=communicator/Communicator_base.cc extra_sources+=communicator/Communicator_base.cc
extra_sources+=communicator/SharedMemoryNone.cc
extra_sources+=communicator/SharedMemory.cc
endif endif
if BUILD_HDF5 if BUILD_HDF5

View File

@ -39,10 +39,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/algorithms/approx/MultiShiftFunction.h> #include <Grid/algorithms/approx/MultiShiftFunction.h>
#include <Grid/algorithms/approx/Forecast.h> #include <Grid/algorithms/approx/Forecast.h>
#include <Grid/algorithms/densematrix/DenseMatrix.h> #include <Grid/algorithms/iterative/Deflation.h>
#include <Grid/algorithms/densematrix/Francis.h>
#include <Grid/algorithms/densematrix/Householder.h>
#include <Grid/algorithms/iterative/ConjugateGradient.h> #include <Grid/algorithms/iterative/ConjugateGradient.h>
#include <Grid/algorithms/iterative/ConjugateResidual.h> #include <Grid/algorithms/iterative/ConjugateResidual.h>
#include <Grid/algorithms/iterative/NormalEquations.h> #include <Grid/algorithms/iterative/NormalEquations.h>

View File

@ -184,10 +184,12 @@ namespace Grid {
virtual RealD MpcDag (const Field &in, Field &out) =0; virtual RealD MpcDag (const Field &in, Field &out) =0;
virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) { virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) {
Field tmp(in._grid); Field tmp(in._grid);
tmp.checkerboard = in.checkerboard;
ni=Mpc(in,tmp); ni=Mpc(in,tmp);
no=MpcDag(tmp,out); no=MpcDag(tmp,out);
} }
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
out.checkerboard = in.checkerboard;
MpcDagMpc(in,out,n1,n2); MpcDagMpc(in,out,n1,n2);
} }
virtual void HermOp(const Field &in, Field &out){ virtual void HermOp(const Field &in, Field &out){
@ -216,12 +218,14 @@ namespace Grid {
SchurDiagMooeeOperator (Matrix &Mat): _Mat(Mat){}; SchurDiagMooeeOperator (Matrix &Mat): _Mat(Mat){};
virtual RealD Mpc (const Field &in, Field &out) { virtual RealD Mpc (const Field &in, Field &out) {
Field tmp(in._grid); Field tmp(in._grid);
tmp.checkerboard = !in.checkerboard;
//std::cout <<"grid pointers: in._grid="<< in._grid << " out._grid=" << out._grid << " _Mat.Grid=" << _Mat.Grid() << " _Mat.RedBlackGrid=" << _Mat.RedBlackGrid() << std::endl; //std::cout <<"grid pointers: in._grid="<< in._grid << " out._grid=" << out._grid << " _Mat.Grid=" << _Mat.Grid() << " _Mat.RedBlackGrid=" << _Mat.RedBlackGrid() << std::endl;
_Mat.Meooe(in,tmp); _Mat.Meooe(in,tmp);
_Mat.MooeeInv(tmp,out); _Mat.MooeeInv(tmp,out);
_Mat.Meooe(out,tmp); _Mat.Meooe(out,tmp);
//std::cout << "cb in " << in.checkerboard << " cb out " << out.checkerboard << std::endl;
_Mat.Mooee(in,out); _Mat.Mooee(in,out);
return axpy_norm(out,-1.0,tmp,out); return axpy_norm(out,-1.0,tmp,out);
} }

View File

@ -0,0 +1,101 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_DEFLATION_H
#define GRID_DEFLATION_H
namespace Grid {
struct ZeroGuesser {
public:
template<class Field>
void operator()(const Field &src,Field &guess) { guess = Zero(); };
};
struct SourceGuesser {
public:
template<class Field>
void operator()(const Field &src,Field &guess) { guess = src; };
};
////////////////////////////////
// Fine grid deflation
////////////////////////////////
template<class Field>
struct DeflatedGuesser {
private:
const std::vector<Field> &evec;
const std::vector<RealD> &eval;
public:
DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval) : evec(_evec), eval(_eval) {};
void operator()(const Field &src,Field &guess) {
guess = zero;
assert(evec.size()==eval.size());
auto N = evec.size();
for (int i=0;i<N;i++) {
const Field& tmp = evec[i];
axpy(guess,TensorRemove(innerProduct(tmp,src)) / eval[i],tmp,guess);
}
}
};
template<class FineField, class CoarseField>
class LocalCoherenceDeflatedGuesser {
private:
const std::vector<FineField> &subspace;
const std::vector<CoarseField> &evec_coarse;
const std::vector<RealD> &eval_coarse;
public:
LocalCoherenceDeflatedGuesser(const std::vector<FineField> &_subspace,
const std::vector<CoarseField> &_evec_coarse,
const std::vector<RealD> &_eval_coarse)
: subspace(_subspace),
evec_coarse(_evec_coarse),
eval_coarse(_eval_coarse)
{
}
void operator()(const FineField &src,FineField &guess) {
int N = (int)evec_coarse.size();
CoarseField src_coarse(evec_coarse[0]._grid);
CoarseField guess_coarse(evec_coarse[0]._grid); guess_coarse = zero;
blockProject(src_coarse,src,subspace);
for (int i=0;i<N;i++) {
const CoarseField & tmp = evec_coarse[i];
axpy(guess_coarse,TensorRemove(innerProduct(tmp,src_coarse)) / eval_coarse[i],tmp,guess_coarse);
}
blockPromote(guess_coarse,guess,subspace);
};
};
}
#endif
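The guessers above are light-weight callables that set a solver's starting vector; a hedged sketch of deflating a conjugate-gradient start with precomputed eigenpairs (the grid FGrid, the operator HermOp and the fields src, evec, eval are placeholders):
DeflatedGuesser<LatticeFermion> guesser(evec, eval);
LatticeFermion psi(FGrid);
guesser(src, psi);                        // psi = sum_i evec_i <evec_i|src> / eval_i
ConjugateGradient<LatticeFermion> CG(1.0e-8, 10000);
CG(HermOp, src, psi);                     // CG then starts from the deflated guess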

View File

@ -149,19 +149,6 @@ void basisSortInPlace(std::vector<Field> & _v,std::vector<RealD>& sort_vals, boo
basisReorderInPlace(_v,sort_vals,idx); basisReorderInPlace(_v,sort_vals,idx);
} }
// PAB: faster to compute the inner products first then fuse loops.
// If performance critical can improve.
template<class Field>
void basisDeflate(const std::vector<Field> &_v,const std::vector<RealD>& eval,const Field& src_orig,Field& result) {
result = zero;
assert(_v.size()==eval.size());
int N = (int)_v.size();
for (int i=0;i<N;i++) {
Field& tmp = _v[i];
axpy(result,TensorRemove(innerProduct(tmp,src_orig)) / eval[i],tmp,result);
}
}
///////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////
// Implicitly restarted lanczos // Implicitly restarted lanczos
///////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////
@ -181,6 +168,7 @@ enum IRLdiagonalisation {
template<class Field> class ImplicitlyRestartedLanczosHermOpTester : public ImplicitlyRestartedLanczosTester<Field> template<class Field> class ImplicitlyRestartedLanczosHermOpTester : public ImplicitlyRestartedLanczosTester<Field>
{ {
public: public:
LinearFunction<Field> &_HermOp; LinearFunction<Field> &_HermOp;
ImplicitlyRestartedLanczosHermOpTester(LinearFunction<Field> &HermOp) : _HermOp(HermOp) { }; ImplicitlyRestartedLanczosHermOpTester(LinearFunction<Field> &HermOp) : _HermOp(HermOp) { };
int ReconstructEval(int j,RealD resid,Field &B, RealD &eval,RealD evalMaxApprox) int ReconstructEval(int j,RealD resid,Field &B, RealD &eval,RealD evalMaxApprox)
@ -243,6 +231,7 @@ class ImplicitlyRestartedLanczos {
///////////////////////// /////////////////////////
public: public:
////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////
// PAB: // PAB:
////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////

View File

@ -28,7 +28,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_LOCAL_COHERENCE_IRL_H #ifndef GRID_LOCAL_COHERENCE_IRL_H
#define GRID_LOCAL_COHERENCE_IRL_H #define GRID_LOCAL_COHERENCE_IRL_H
namespace Grid { namespace Grid {
struct LanczosParams : Serializable { struct LanczosParams : Serializable {
public: public:
GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParams, GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParams,
@ -70,21 +73,24 @@ public:
typedef Lattice<Fobj> FineField; typedef Lattice<Fobj> FineField;
LinearOperatorBase<FineField> &_Linop; LinearOperatorBase<FineField> &_Linop;
Aggregation<Fobj,CComplex,nbasis> &_Aggregate; std::vector<FineField> &subspace;
ProjectedHermOp(LinearOperatorBase<FineField>& linop, Aggregation<Fobj,CComplex,nbasis> &aggregate) : ProjectedHermOp(LinearOperatorBase<FineField>& linop, std::vector<FineField> & _subspace) :
_Linop(linop), _Linop(linop), subspace(_subspace)
_Aggregate(aggregate) { }; {
assert(subspace.size() >0);
};
void operator()(const CoarseField& in, CoarseField& out) { void operator()(const CoarseField& in, CoarseField& out) {
GridBase *FineGrid = subspace[0]._grid;
int checkerboard = subspace[0].checkerboard;
GridBase *FineGrid = _Aggregate.FineGrid; FineField fin (FineGrid); fin.checkerboard= checkerboard;
FineField fin(FineGrid); FineField fout(FineGrid); fout.checkerboard = checkerboard;
FineField fout(FineGrid);
_Aggregate.PromoteFromSubspace(in,fin); std::cout<<GridLogIRL<<"ProjectedHermop : Promote to fine"<<std::endl; blockPromote(in,fin,subspace); std::cout<<GridLogIRL<<"ProjectedHermop : Promote to fine"<<std::endl;
_Linop.HermOp(fin,fout); std::cout<<GridLogIRL<<"ProjectedHermop : HermOp (fine) "<<std::endl; _Linop.HermOp(fin,fout); std::cout<<GridLogIRL<<"ProjectedHermop : HermOp (fine) "<<std::endl;
_Aggregate.ProjectToSubspace(out,fout); std::cout<<GridLogIRL<<"ProjectedHermop : Project to coarse "<<std::endl; blockProject(out,fout,subspace); std::cout<<GridLogIRL<<"ProjectedHermop : Project to coarse "<<std::endl;
} }
}; };
@ -99,24 +105,27 @@ public:
OperatorFunction<FineField> & _poly; OperatorFunction<FineField> & _poly;
LinearOperatorBase<FineField> &_Linop; LinearOperatorBase<FineField> &_Linop;
Aggregation<Fobj,CComplex,nbasis> &_Aggregate; std::vector<FineField> &subspace;
ProjectedFunctionHermOp(OperatorFunction<FineField> & poly,LinearOperatorBase<FineField>& linop, ProjectedFunctionHermOp(OperatorFunction<FineField> & poly,
Aggregation<Fobj,CComplex,nbasis> &aggregate) : LinearOperatorBase<FineField>& linop,
std::vector<FineField> & _subspace) :
_poly(poly), _poly(poly),
_Linop(linop), _Linop(linop),
_Aggregate(aggregate) { }; subspace(_subspace)
{ };
void operator()(const CoarseField& in, CoarseField& out) { void operator()(const CoarseField& in, CoarseField& out) {
GridBase *FineGrid = _Aggregate.FineGrid; GridBase *FineGrid = subspace[0]._grid;
int checkerboard = subspace[0].checkerboard;
FineField fin(FineGrid) ;fin.checkerboard =_Aggregate.checkerboard; FineField fin (FineGrid); fin.checkerboard =checkerboard;
FineField fout(FineGrid);fout.checkerboard =_Aggregate.checkerboard; FineField fout(FineGrid);fout.checkerboard =checkerboard;
_Aggregate.PromoteFromSubspace(in,fin); std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Promote to fine"<<std::endl; blockPromote(in,fin,subspace); std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Promote to fine"<<std::endl;
_poly(_Linop,fin,fout); std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Poly "<<std::endl; _poly(_Linop,fin,fout); std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Poly "<<std::endl;
_Aggregate.ProjectToSubspace(out,fout); std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Project to coarse "<<std::endl; blockProject(out,fout,subspace); std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Project to coarse "<<std::endl;
} }
}; };
@ -132,19 +141,23 @@ class ImplicitlyRestartedLanczosSmoothedTester : public ImplicitlyRestartedLanc
LinearFunction<CoarseField> & _Poly; LinearFunction<CoarseField> & _Poly;
OperatorFunction<FineField> & _smoother; OperatorFunction<FineField> & _smoother;
LinearOperatorBase<FineField> &_Linop; LinearOperatorBase<FineField> &_Linop;
Aggregation<Fobj,CComplex,nbasis> &_Aggregate;
RealD _coarse_relax_tol; RealD _coarse_relax_tol;
std::vector<FineField> &_subspace;
ImplicitlyRestartedLanczosSmoothedTester(LinearFunction<CoarseField> &Poly, ImplicitlyRestartedLanczosSmoothedTester(LinearFunction<CoarseField> &Poly,
OperatorFunction<FineField> &smoother, OperatorFunction<FineField> &smoother,
LinearOperatorBase<FineField> &Linop, LinearOperatorBase<FineField> &Linop,
Aggregation<Fobj,CComplex,nbasis> &Aggregate, std::vector<FineField> &subspace,
RealD coarse_relax_tol=5.0e3) RealD coarse_relax_tol=5.0e3)
: _smoother(smoother), _Linop(Linop),_Aggregate(Aggregate), _Poly(Poly), _coarse_relax_tol(coarse_relax_tol) { }; : _smoother(smoother), _Linop(Linop), _Poly(Poly), _subspace(subspace),
_coarse_relax_tol(coarse_relax_tol)
{ };
int TestConvergence(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox) int TestConvergence(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
{ {
CoarseField v(B); CoarseField v(B);
RealD eval_poly = eval; RealD eval_poly = eval;
// Apply operator // Apply operator
_Poly(B,v); _Poly(B,v);
@ -168,14 +181,13 @@ class ImplicitlyRestartedLanczosSmoothedTester : public ImplicitlyRestartedLanc
} }
int ReconstructEval(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox) int ReconstructEval(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
{ {
GridBase *FineGrid = _Aggregate.FineGrid; GridBase *FineGrid = _subspace[0]._grid;
int checkerboard = _subspace[0].checkerboard;
int checkerboard = _Aggregate.checkerboard;
FineField fB(FineGrid);fB.checkerboard =checkerboard; FineField fB(FineGrid);fB.checkerboard =checkerboard;
FineField fv(FineGrid);fv.checkerboard =checkerboard; FineField fv(FineGrid);fv.checkerboard =checkerboard;
_Aggregate.PromoteFromSubspace(B,fv); blockPromote(B,fv,_subspace);
_smoother(_Linop,fv,fB); _smoother(_Linop,fv,fB);
RealD eval_poly = eval; RealD eval_poly = eval;
@ -217,27 +229,65 @@ protected:
int _checkerboard; int _checkerboard;
LinearOperatorBase<FineField> & _FineOp; LinearOperatorBase<FineField> & _FineOp;
// FIXME replace Aggregation with vector of fine; the code reuse is too small for std::vector<RealD> &evals_fine;
// the hassle and complexity of cross coupling. std::vector<RealD> &evals_coarse;
Aggregation<Fobj,CComplex,nbasis> _Aggregate; std::vector<FineField> &subspace;
std::vector<RealD> evals_fine; std::vector<CoarseField> &evec_coarse;
std::vector<RealD> evals_coarse;
std::vector<CoarseField> evec_coarse; private:
std::vector<RealD> _evals_fine;
std::vector<RealD> _evals_coarse;
std::vector<FineField> _subspace;
std::vector<CoarseField> _evec_coarse;
public: public:
LocalCoherenceLanczos(GridBase *FineGrid, LocalCoherenceLanczos(GridBase *FineGrid,
GridBase *CoarseGrid, GridBase *CoarseGrid,
LinearOperatorBase<FineField> &FineOp, LinearOperatorBase<FineField> &FineOp,
int checkerboard) : int checkerboard) :
_CoarseGrid(CoarseGrid), _CoarseGrid(CoarseGrid),
_FineGrid(FineGrid), _FineGrid(FineGrid),
_Aggregate(CoarseGrid,FineGrid,checkerboard),
_FineOp(FineOp), _FineOp(FineOp),
_checkerboard(checkerboard) _checkerboard(checkerboard),
evals_fine (_evals_fine),
evals_coarse(_evals_coarse),
subspace (_subspace),
evec_coarse(_evec_coarse)
{ {
evals_fine.resize(0); evals_fine.resize(0);
evals_coarse.resize(0); evals_coarse.resize(0);
}; };
void Orthogonalise(void ) { _Aggregate.Orthogonalise(); } //////////////////////////////////////////////////////////////////////////
// Alternate constructor, external storage for use by Hadrons module
//////////////////////////////////////////////////////////////////////////
LocalCoherenceLanczos(GridBase *FineGrid,
GridBase *CoarseGrid,
LinearOperatorBase<FineField> &FineOp,
int checkerboard,
std::vector<FineField> &ext_subspace,
std::vector<CoarseField> &ext_coarse,
std::vector<RealD> &ext_eval_fine,
std::vector<RealD> &ext_eval_coarse
) :
_CoarseGrid(CoarseGrid),
_FineGrid(FineGrid),
_FineOp(FineOp),
_checkerboard(checkerboard),
evals_fine (ext_eval_fine),
evals_coarse(ext_eval_coarse),
subspace (ext_subspace),
evec_coarse (ext_coarse)
{
evals_fine.resize(0);
evals_coarse.resize(0);
};
void Orthogonalise(void ) {
CoarseScalar InnerProd(_CoarseGrid);
blockOrthogonalise(InnerProd,subspace);std::cout << GridLogMessage <<" Gram-Schmidt pass 1"<<std::endl;
blockOrthogonalise(InnerProd,subspace);std::cout << GridLogMessage <<" Gram-Schmidt pass 2"<<std::endl;
};
template<typename T> static RealD normalise(T& v) template<typename T> static RealD normalise(T& v)
{ {
@ -246,43 +296,44 @@ public:
v = v * (1.0/nn); v = v * (1.0/nn);
return nn; return nn;
} }
/*
void fakeFine(void) void fakeFine(void)
{ {
int Nk = nbasis; int Nk = nbasis;
_Aggregate.subspace.resize(Nk,_FineGrid); subspace.resize(Nk,_FineGrid);
_Aggregate.subspace[0]=1.0; subspace[0]=1.0;
_Aggregate.subspace[0].checkerboard=_checkerboard; subspace[0].checkerboard=_checkerboard;
normalise(_Aggregate.subspace[0]); normalise(subspace[0]);
PlainHermOp<FineField> Op(_FineOp); PlainHermOp<FineField> Op(_FineOp);
for(int k=1;k<Nk;k++){ for(int k=1;k<Nk;k++){
_Aggregate.subspace[k].checkerboard=_checkerboard; subspace[k].checkerboard=_checkerboard;
Op(_Aggregate.subspace[k-1],_Aggregate.subspace[k]); Op(subspace[k-1],subspace[k]);
normalise(_Aggregate.subspace[k]); normalise(subspace[k]);
} }
} }
*/
void testFine(RealD resid) void testFine(RealD resid)
{ {
assert(evals_fine.size() == nbasis); assert(evals_fine.size() == nbasis);
assert(_Aggregate.subspace.size() == nbasis); assert(subspace.size() == nbasis);
PlainHermOp<FineField> Op(_FineOp); PlainHermOp<FineField> Op(_FineOp);
ImplicitlyRestartedLanczosHermOpTester<FineField> SimpleTester(Op); ImplicitlyRestartedLanczosHermOpTester<FineField> SimpleTester(Op);
for(int k=0;k<nbasis;k++){ for(int k=0;k<nbasis;k++){
assert(SimpleTester.ReconstructEval(k,resid,_Aggregate.subspace[k],evals_fine[k],1.0)==1); assert(SimpleTester.ReconstructEval(k,resid,subspace[k],evals_fine[k],1.0)==1);
} }
} }
void testCoarse(RealD resid,ChebyParams cheby_smooth,RealD relax) void testCoarse(RealD resid,ChebyParams cheby_smooth,RealD relax)
{ {
assert(evals_fine.size() == nbasis); assert(evals_fine.size() == nbasis);
assert(_Aggregate.subspace.size() == nbasis); assert(subspace.size() == nbasis);
////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////
// create a smoother and see if we can get a cheap convergence test and smooth inside the IRL // create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////
Chebyshev<FineField> ChebySmooth(cheby_smooth); Chebyshev<FineField> ChebySmooth(cheby_smooth);
ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (ChebySmooth,_FineOp,_Aggregate); ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (ChebySmooth,_FineOp,_subspace);
ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,_Aggregate,relax); ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax);
for(int k=0;k<evec_coarse.size();k++){ for(int k=0;k<evec_coarse.size();k++){
if ( k < nbasis ) { if ( k < nbasis ) {
@ -302,34 +353,34 @@ public:
PlainHermOp<FineField> Op(_FineOp); PlainHermOp<FineField> Op(_FineOp);
evals_fine.resize(Nm); evals_fine.resize(Nm);
_Aggregate.subspace.resize(Nm,_FineGrid); subspace.resize(Nm,_FineGrid);
ImplicitlyRestartedLanczos<FineField> IRL(ChebyOp,Op,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes); ImplicitlyRestartedLanczos<FineField> IRL(ChebyOp,Op,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes);
FineField src(_FineGrid); src=1.0; src.checkerboard = _checkerboard; FineField src(_FineGrid); src=1.0; src.checkerboard = _checkerboard;
int Nconv; int Nconv;
IRL.calc(evals_fine,_Aggregate.subspace,src,Nconv,false); IRL.calc(evals_fine,subspace,src,Nconv,false);
// Shrink down to number saved // Shrink down to number saved
assert(Nstop>=nbasis); assert(Nstop>=nbasis);
assert(Nconv>=nbasis); assert(Nconv>=nbasis);
evals_fine.resize(nbasis); evals_fine.resize(nbasis);
_Aggregate.subspace.resize(nbasis,_FineGrid); subspace.resize(nbasis,_FineGrid);
} }
void calcCoarse(ChebyParams cheby_op,ChebyParams cheby_smooth,RealD relax, void calcCoarse(ChebyParams cheby_op,ChebyParams cheby_smooth,RealD relax,
int Nstop, int Nk, int Nm,RealD resid, int Nstop, int Nk, int Nm,RealD resid,
RealD MaxIt, RealD betastp, int MinRes) RealD MaxIt, RealD betastp, int MinRes)
{ {
Chebyshev<FineField> Cheby(cheby_op); Chebyshev<FineField> Cheby(cheby_op);
ProjectedHermOp<Fobj,CComplex,nbasis> Op(_FineOp,_Aggregate); ProjectedHermOp<Fobj,CComplex,nbasis> Op(_FineOp,_subspace);
ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,_Aggregate); ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,_subspace);
////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////
// create a smoother and see if we can get a cheap convergence test and smooth inside the IRL // create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////
Chebyshev<FineField> ChebySmooth(cheby_smooth); Chebyshev<FineField> ChebySmooth(cheby_smooth);
ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,_Aggregate,relax); ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,_subspace,relax);
evals_coarse.resize(Nm); evals_coarse.resize(Nm);
evec_coarse.resize(Nm,_CoarseGrid); evec_coarse.resize(Nm,_CoarseGrid);
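The new second constructor lets the caller (here, a Hadrons module) own the fine subspace and coarse eigenvector storage instead of the Lanczos object. A hedged construction sketch, where FineField/CoarseField stand for the class's fine and coarse field typedefs and the grids and operator are placeholders:
std::vector<FineField>   subspace;       // fine basis vectors, owned by the caller
std::vector<CoarseField> evecCoarse;     // coarse eigenvectors, owned by the caller
std::vector<RealD>       evalFine, evalCoarse;
LocalCoherenceLanczos<Fobj, CComplex, nbasis>
    LCL(FineGrid, CoarseGrid, HermOp, Odd,
        subspace, evecCoarse, evalFine, evalCoarse);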

View File

@ -108,6 +108,11 @@ namespace Grid {
template<class Matrix> template<class Matrix>
void operator() (Matrix & _Matrix,const Field &in, Field &out){ void operator() (Matrix & _Matrix,const Field &in, Field &out){
ZeroGuesser guess;
(*this)(_Matrix,in,out,guess);
}
template<class Matrix, class Guesser>
void operator() (Matrix & _Matrix,const Field &in, Field &out, Guesser &guess){
// FIXME CGdiagonalMee not implemented virtual function // FIXME CGdiagonalMee not implemented virtual function
// FIXME use CBfactorise to control schur decomp // FIXME use CBfactorise to control schur decomp
@ -129,7 +134,6 @@ namespace Grid {
pickCheckerboard(Odd ,src_o,in); pickCheckerboard(Odd ,src_o,in);
pickCheckerboard(Even,sol_e,out); pickCheckerboard(Even,sol_e,out);
pickCheckerboard(Odd ,sol_o,out); pickCheckerboard(Odd ,sol_o,out);
std::cout << GridLogMessage << " SchurRedBlackStaggeredSolve checkerboards picked" <<std::endl; std::cout << GridLogMessage << " SchurRedBlackStaggeredSolve checkerboards picked" <<std::endl;
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
@ -146,6 +150,7 @@ namespace Grid {
// Call the red-black solver // Call the red-black solver
////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////
std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver calling the Mpc solver" <<std::endl; std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver calling the Mpc solver" <<std::endl;
guess(src_o,sol_o);
_HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.checkerboard==Odd); _HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.checkerboard==Odd);
std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver called the Mpc solver" <<std::endl; std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver called the Mpc solver" <<std::endl;
@ -190,6 +195,11 @@ namespace Grid {
}; };
template<class Matrix> template<class Matrix>
void operator() (Matrix & _Matrix,const Field &in, Field &out){ void operator() (Matrix & _Matrix,const Field &in, Field &out){
ZeroGuesser guess;
(*this)(_Matrix,in,out,guess);
}
template<class Matrix, class Guesser>
void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){
// FIXME CGdiagonalMee not implemented virtual function // FIXME CGdiagonalMee not implemented virtual function
// FIXME use CBfactorise to control schur decomp // FIXME use CBfactorise to control schur decomp
@ -225,6 +235,7 @@ namespace Grid {
// Call the red-black solver // Call the red-black solver
////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////
std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl; std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
guess(src_o,sol_o);
_HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.checkerboard==Odd); _HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.checkerboard==Odd);
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
@ -269,6 +280,11 @@ namespace Grid {
template<class Matrix> template<class Matrix>
void operator() (Matrix & _Matrix,const Field &in, Field &out){ void operator() (Matrix & _Matrix,const Field &in, Field &out){
ZeroGuesser guess;
(*this)(_Matrix,in,out,guess);
}
template<class Matrix,class Guesser>
void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){
// FIXME CGdiagonalMee not implemented virtual function // FIXME CGdiagonalMee not implemented virtual function
// FIXME use CBfactorise to control schur decomp // FIXME use CBfactorise to control schur decomp
@ -305,6 +321,7 @@ namespace Grid {
////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////
std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl; std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
// _HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.checkerboard==Odd); // _HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.checkerboard==Odd);
guess(src_o,tmp);
_HermitianRBSolver(_HermOpEO,src_o,tmp); assert(tmp.checkerboard==Odd); _HermitianRBSolver(_HermOpEO,src_o,tmp); assert(tmp.checkerboard==Odd);
_Matrix.MooeeInv(tmp,sol_o); assert( sol_o.checkerboard ==Odd); _Matrix.MooeeInv(tmp,sol_o); assert( sol_o.checkerboard ==Odd);
@ -348,6 +365,11 @@ namespace Grid {
template<class Matrix> template<class Matrix>
void operator() (Matrix & _Matrix,const Field &in, Field &out){ void operator() (Matrix & _Matrix,const Field &in, Field &out){
ZeroGuesser guess;
(*this)(_Matrix,in,out,guess);
}
template<class Matrix, class Guesser>
void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){
// FIXME CGdiagonalMee not implemented virtual function // FIXME CGdiagonalMee not implemented virtual function
// FIXME use CBfactorise to control schur decomp // FIXME use CBfactorise to control schur decomp
@ -385,6 +407,7 @@ namespace Grid {
std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl; std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
// _HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.checkerboard==Odd); // _HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.checkerboard==Odd);
// _HermitianRBSolver(_HermOpEO,src_o,tmp); assert(tmp.checkerboard==Odd); // _HermitianRBSolver(_HermOpEO,src_o,tmp); assert(tmp.checkerboard==Odd);
guess(src_o,tmp);
_HermitianRBSolver(src_o,tmp); assert(tmp.checkerboard==Odd); _HermitianRBSolver(src_o,tmp); assert(tmp.checkerboard==Odd);
_Matrix.MooeeInv(tmp,sol_o); assert( sol_o.checkerboard ==Odd); _Matrix.MooeeInv(tmp,sol_o); assert( sol_o.checkerboard ==Odd);
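Each operator() now accepts an optional initial-guess functor, defaulting to ZeroGuesser. A hedged sketch combining the red-black solver with the deflated guesser introduced above (the Dirac operator Dw, the eigenpairs and the fields are placeholders):
ConjugateGradient<LatticeFermion> CG(1.0e-8, 10000);
SchurRedBlackDiagMooeeSolve<LatticeFermion> solver(CG);
DeflatedGuesser<LatticeFermion> guesser(evec, eval);  // from Deflation.h above
solver(Dw, src, sol, guesser);                        // guess(src_o, sol_o) runs before the CG iteration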

View File

@ -3,6 +3,9 @@
namespace Grid { namespace Grid {
MemoryStats *MemoryProfiler::stats = nullptr;
bool MemoryProfiler::debug = false;
int PointerCache::victim; int PointerCache::victim;
PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::Ncache]; PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::Ncache];
@ -94,4 +97,29 @@ void check_huge_pages(void *Buf,uint64_t BYTES)
#endif #endif
} }
std::string sizeString(const size_t bytes)
{
constexpr unsigned int bufSize = 256;
const char *suffixes[7] = {"", "K", "M", "G", "T", "P", "E"};
char buf[256];
size_t s = 0;
double count = bytes;
while (count >= 1024 && s < 7)
{
s++;
count /= 1024;
}
if (count - floor(count) == 0.0)
{
snprintf(buf, bufSize, "%d %sB", (int)count, suffixes[s]);
}
else
{
snprintf(buf, bufSize, "%.1f %sB", count, suffixes[s]);
}
return std::string(buf);
}
} }
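sizeString formats a byte count with binary prefixes, keeping one decimal place only when the value is not integral; for instance (illustrative values):
sizeString(1024);          // "1 KB"
sizeString(1536);          // "1.5 KB"
sizeString(3UL << 30);     // "3 GB"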

View File

@ -64,6 +64,64 @@ namespace Grid {
}; };
std::string sizeString(size_t bytes);
struct MemoryStats
{
size_t totalAllocated{0}, maxAllocated{0},
currentlyAllocated{0}, totalFreed{0};
};
class MemoryProfiler
{
public:
static MemoryStats *stats;
static bool debug;
};
#define memString(bytes) std::to_string(bytes) + " (" + sizeString(bytes) + ")"
#define profilerDebugPrint \
if (MemoryProfiler::stats)\
{\
auto s = MemoryProfiler::stats;\
std::cout << GridLogDebug << "[Memory debug] Stats " << MemoryProfiler::stats << std::endl;\
std::cout << GridLogDebug << "[Memory debug] total : " << memString(s->totalAllocated) \
<< std::endl;\
std::cout << GridLogDebug << "[Memory debug] max : " << memString(s->maxAllocated) \
<< std::endl;\
std::cout << GridLogDebug << "[Memory debug] current: " << memString(s->currentlyAllocated) \
<< std::endl;\
std::cout << GridLogDebug << "[Memory debug] freed : " << memString(s->totalFreed) \
<< std::endl;\
}
#define profilerAllocate(bytes)\
if (MemoryProfiler::stats)\
{\
auto s = MemoryProfiler::stats;\
s->totalAllocated += (bytes);\
s->currentlyAllocated += (bytes);\
s->maxAllocated = std::max(s->maxAllocated, s->currentlyAllocated);\
}\
if (MemoryProfiler::debug)\
{\
std::cout << GridLogDebug << "[Memory debug] allocating " << memString(bytes) << std::endl;\
profilerDebugPrint;\
}
#define profilerFree(bytes)\
if (MemoryProfiler::stats)\
{\
auto s = MemoryProfiler::stats;\
s->totalFreed += (bytes);\
s->currentlyAllocated -= (bytes);\
}\
if (MemoryProfiler::debug)\
{\
std::cout << GridLogDebug << "[Memory debug] freeing " << memString(bytes) << std::endl;\
profilerDebugPrint;\
}
void check_huge_pages(void *Buf,uint64_t BYTES); void check_huge_pages(void *Buf,uint64_t BYTES);
//////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////
@ -92,6 +150,7 @@ public:
pointer allocate(size_type __n, const void* _p= 0) pointer allocate(size_type __n, const void* _p= 0)
{ {
size_type bytes = __n*sizeof(_Tp); size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes);
_Tp *ptr = (_Tp *) PointerCache::Lookup(bytes); _Tp *ptr = (_Tp *) PointerCache::Lookup(bytes);
// if ( ptr != NULL ) // if ( ptr != NULL )
@ -122,6 +181,8 @@ public:
void deallocate(pointer __p, size_type __n) { void deallocate(pointer __p, size_type __n) {
size_type bytes = __n * sizeof(_Tp); size_type bytes = __n * sizeof(_Tp);
profilerFree(bytes);
pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes); pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes);
#ifdef HAVE_MM_MALLOC_H #ifdef HAVE_MM_MALLOC_H
@ -172,10 +233,13 @@ public:
#ifdef GRID_COMMS_SHMEM #ifdef GRID_COMMS_SHMEM
pointer allocate(size_type __n, const void* _p= 0) pointer allocate(size_type __n, const void* _p= 0)
{ {
size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes);
#ifdef CRAY #ifdef CRAY
_Tp *ptr = (_Tp *) shmem_align(__n*sizeof(_Tp),64); _Tp *ptr = (_Tp *) shmem_align(bytes,64);
#else #else
_Tp *ptr = (_Tp *) shmem_align(64,__n*sizeof(_Tp)); _Tp *ptr = (_Tp *) shmem_align(64,bytes);
#endif #endif
#ifdef PARANOID_SYMMETRIC_HEAP #ifdef PARANOID_SYMMETRIC_HEAP
static void * bcast; static void * bcast;
@ -193,18 +257,23 @@ public:
#endif #endif
return ptr; return ptr;
} }
void deallocate(pointer __p, size_type) { void deallocate(pointer __p, size_type __n) {
size_type bytes = __n*sizeof(_Tp);
profilerFree(bytes);
shmem_free((void *)__p); shmem_free((void *)__p);
} }
#else #else
pointer allocate(size_type __n, const void* _p= 0) pointer allocate(size_type __n, const void* _p= 0)
{ {
#ifdef HAVE_MM_MALLOC_H
_Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),GRID_ALLOC_ALIGN);
#else
_Tp * ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,__n*sizeof(_Tp));
#endif
size_type bytes = __n*sizeof(_Tp); size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes);
#ifdef HAVE_MM_MALLOC_H
_Tp * ptr = (_Tp *) _mm_malloc(bytes, GRID_ALLOC_ALIGN);
#else
_Tp * ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN, bytes);
#endif
uint8_t *cp = (uint8_t *)ptr; uint8_t *cp = (uint8_t *)ptr;
if ( ptr ) { if ( ptr ) {
// One touch per 4k page, static OMP loop to catch same loop order // One touch per 4k page, static OMP loop to catch same loop order
@ -215,7 +284,10 @@ public:
} }
return ptr; return ptr;
} }
void deallocate(pointer __p, size_type) { void deallocate(pointer __p, size_type __n) {
size_type bytes = __n*sizeof(_Tp);
profilerFree(bytes);
#ifdef HAVE_MM_MALLOC_H #ifdef HAVE_MM_MALLOC_H
_mm_free((void *)__p); _mm_free((void *)__p);
#else #else
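A hedged sketch of switching the new allocator instrumentation on from user code (where to place the calls is up to the application):
MemoryStats stats;
MemoryProfiler::stats = &stats;   // allocator hooks now accumulate into 'stats'
MemoryProfiler::debug = true;     // log every allocate/free at GridLogDebug level
// ... allocate and free Grid fields ...
std::cout << "peak allocation: " << sizeString(stats.maxAllocated) << std::endl;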

View File

@ -59,6 +59,7 @@ public:
virtual ~GridBase() = default; virtual ~GridBase() = default;
// Physics Grid information. // Physics Grid information.
std::vector<int> _simd_layout;// Which dimensions get relayed out over simd lanes. std::vector<int> _simd_layout;// Which dimensions get relayed out over simd lanes.
std::vector<int> _fdimensions;// (full) Global dimensions of array prior to cb removal std::vector<int> _fdimensions;// (full) Global dimensions of array prior to cb removal
@ -78,6 +79,8 @@ public:
std::vector<int> _lstart; // local start of array in gcoors _processor_coor[d]*_ldimensions[d] std::vector<int> _lstart; // local start of array in gcoors _processor_coor[d]*_ldimensions[d]
std::vector<int> _lend ; // local end of array in gcoors _processor_coor[d]*_ldimensions[d]+_ldimensions_[d]-1 std::vector<int> _lend ; // local end of array in gcoors _processor_coor[d]*_ldimensions[d]+_ldimensions_[d]-1
bool _isCheckerBoarded;
public: public:
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////

View File

@ -97,6 +97,7 @@ public:
/////////////////////// ///////////////////////
// Grid information // Grid information
/////////////////////// ///////////////////////
_isCheckerBoarded = false;
_ndimension = dimensions.size(); _ndimension = dimensions.size();
_fdimensions.resize(_ndimension); _fdimensions.resize(_ndimension);
@ -122,6 +123,7 @@ public:
// Use a reduced simd grid // Use a reduced simd grid
_ldimensions[d] = _gdimensions[d] / _processors[d]; //local dimensions _ldimensions[d] = _gdimensions[d] / _processors[d]; //local dimensions
//std::cout << _ldimensions[d] << " " << _gdimensions[d] << " " << _processors[d] << std::endl;
assert(_ldimensions[d] * _processors[d] == _gdimensions[d]); assert(_ldimensions[d] * _processors[d] == _gdimensions[d]);
_rdimensions[d] = _ldimensions[d] / _simd_layout[d]; //overdecomposition _rdimensions[d] = _ldimensions[d] / _simd_layout[d]; //overdecomposition
@ -166,6 +168,7 @@ public:
block = block * _rdimensions[d]; block = block * _rdimensions[d];
} }
}; };
}; };
} }
#endif #endif

View File

@ -171,9 +171,8 @@ public:
const std::vector<int> &checker_dim_mask, const std::vector<int> &checker_dim_mask,
int checker_dim) int checker_dim)
{ {
///////////////////////
// Grid information _isCheckerBoarded = true;
///////////////////////
_checker_dim = checker_dim; _checker_dim = checker_dim;
assert(checker_dim_mask[checker_dim] == 1); assert(checker_dim_mask[checker_dim] == 1);
_ndimension = dimensions.size(); _ndimension = dimensions.size();

View File

@ -28,6 +28,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_COMMUNICATOR_H #ifndef GRID_COMMUNICATOR_H
#define GRID_COMMUNICATOR_H #define GRID_COMMUNICATOR_H
#include <Grid/communicator/SharedMemory.h>
#include <Grid/communicator/Communicator_base.h> #include <Grid/communicator/Communicator_base.h>
#endif #endif

View File

@ -36,33 +36,9 @@ namespace Grid {
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
// Info that is setup once and indept of cartesian layout // Info that is setup once and indept of cartesian layout
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
void * CartesianCommunicator::ShmCommBuf;
uint64_t CartesianCommunicator::MAX_MPI_SHM_BYTES = 1024LL*1024LL*1024LL;
CartesianCommunicator::CommunicatorPolicy_t CartesianCommunicator::CommunicatorPolicy_t
CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent; CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent;
int CartesianCommunicator::nCommThreads = -1; int CartesianCommunicator::nCommThreads = -1;
int CartesianCommunicator::Hugepages = 0;
/////////////////////////////////
// Alloc, free shmem region
/////////////////////////////////
void *CartesianCommunicator::ShmBufferMalloc(size_t bytes){
// bytes = (bytes+sizeof(vRealD))&(~(sizeof(vRealD)-1));// align up bytes
void *ptr = (void *)heap_top;
heap_top += bytes;
heap_bytes+= bytes;
if (heap_bytes >= MAX_MPI_SHM_BYTES) {
std::cout<< " ShmBufferMalloc exceeded shared heap size -- try increasing with --shm <MB> flag" <<std::endl;
std::cout<< " Parameter specified in units of MB (megabytes) " <<std::endl;
std::cout<< " Current value is " << (MAX_MPI_SHM_BYTES/(1024*1024)) <<std::endl;
assert(heap_bytes<MAX_MPI_SHM_BYTES);
}
return ptr;
}
void CartesianCommunicator::ShmBufferFreeAll(void) {
heap_top =(size_t)ShmBufferSelf();
heap_bytes=0;
}
///////////////////////////////// /////////////////////////////////
// Grid information queries // Grid information queries
@ -96,281 +72,5 @@ void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
GlobalSumVector((double *)c,2*N); GlobalSumVector((double *)c,2*N);
} }
#if defined( GRID_COMMS_MPI) || defined (GRID_COMMS_MPIT) || defined (GRID_COMMS_MPI3)
void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes)
{
std::vector<int> row(_ndimension,1);
assert(dim>=0 && dim<_ndimension);
// Split the communicator
row[dim] = _processors[dim];
int me;
CartesianCommunicator Comm(row,*this,me);
Comm.AllToAll(in,out,words,bytes);
}
void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t bytes)
{
// MPI is a pain and uses "int" arguments
// 64*64*64*128*16 == 500Million elements of data.
// When 24*4 bytes multiples get 50x 10^9 >>> 2x10^9 Y2K bug.
// (Turns up on 32^3 x 64 Gparity too)
MPI_Datatype object;
int iwords;
int ibytes;
iwords = words;
ibytes = bytes;
assert(words == iwords); // safe to cast to int ?
assert(bytes == ibytes); // safe to cast to int ?
MPI_Type_contiguous(ibytes,MPI_BYTE,&object);
MPI_Type_commit(&object);
MPI_Alltoall(in,iwords,object,out,iwords,object,communicator);
MPI_Type_free(&object);
}
#endif
#if defined( GRID_COMMS_MPI) || defined (GRID_COMMS_MPIT)
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent,int &srank)
{
_ndimension = processors.size();
int parent_ndimension = parent._ndimension; assert(_ndimension >= parent._ndimension);
std::vector<int> parent_processor_coor(_ndimension,0);
std::vector<int> parent_processors (_ndimension,1);
// Can make 5d grid from 4d etc...
int pad = _ndimension-parent_ndimension;
for(int d=0;d<parent_ndimension;d++){
parent_processor_coor[pad+d]=parent._processor_coor[d];
parent_processors [pad+d]=parent._processors[d];
}
//////////////////////////////////////////////////////////////////////////////////////////////////////
// split the communicator
//////////////////////////////////////////////////////////////////////////////////////////////////////
int Nparent;
MPI_Comm_size(parent.communicator,&Nparent);
int childsize=1;
for(int d=0;d<processors.size();d++) {
childsize *= processors[d];
}
int Nchild = Nparent/childsize;
assert (childsize * Nchild == Nparent);
std::vector<int> ccoor(_ndimension); // coor within subcommunicator
std::vector<int> scoor(_ndimension); // coor of split within parent
std::vector<int> ssize(_ndimension); // coor of split within parent
std::vector<int> pcoor(_ndimension,0);
std::vector<int> pdims(_ndimension,1);
if(parent._processors.size()==4 && _ndimension==5){
for(int i=0;i<4;i++) pcoor[i+1]=parent._processor_coor[i];
for(int i=0;i<4;i++) pdims[i+1]=parent._processors[i];
} else {
assert(_ndimension == parent._ndimension);
for(int i=0;i<_ndimension;i++) pcoor[i]=parent._processor_coor[i];
for(int i=0;i<_ndimension;i++) pdims[i]=parent._processors[i];
}
for(int d=0;d<_ndimension;d++){
ccoor[d] = pcoor[d] % processors[d];
scoor[d] = pcoor[d] / processors[d];
ssize[d] = pdims[d] / processors[d];
}
int crank; // rank within subcomm ; srank is rank of subcomm within blocks of subcomms
// Mpi uses the reverse Lexico convention to us
Lexicographic::IndexFromCoorReversed(ccoor,crank,processors);
Lexicographic::IndexFromCoorReversed(scoor,srank,ssize);
MPI_Comm comm_split;
if ( Nchild > 1 ) {
if(0){
std::cout << GridLogMessage<<"Child communicator of "<< std::hex << parent.communicator << std::dec<<std::endl;
std::cout << GridLogMessage<<" parent grid["<< parent._ndimension<<"] ";
for(int d=0;d<parent._ndimension;d++) std::cout << parent._processors[d] << " ";
std::cout<<std::endl;
std::cout << GridLogMessage<<" child grid["<< _ndimension <<"] ";
for(int d=0;d<processors.size();d++) std::cout << processors[d] << " ";
std::cout<<std::endl;
std::cout << GridLogMessage<<" old rank "<< parent._processor<<" coor ["<< parent._ndimension <<"] ";
for(int d=0;d<parent._ndimension;d++) std::cout << parent._processor_coor[d] << " ";
std::cout<<std::endl;
std::cout << GridLogMessage<<" new split "<< srank<<" scoor ["<< _ndimension <<"] ";
for(int d=0;d<processors.size();d++) std::cout << scoor[d] << " ";
std::cout<<std::endl;
std::cout << GridLogMessage<<" new rank "<< crank<<" coor ["<< _ndimension <<"] ";
for(int d=0;d<processors.size();d++) std::cout << ccoor[d] << " ";
std::cout<<std::endl;
}
int ierr= MPI_Comm_split(parent.communicator,srank,crank,&comm_split);
assert(ierr==0);
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Declare victory
//////////////////////////////////////////////////////////////////////////////////////////////////////
// std::cout << GridLogMessage<<"Divided communicator "<< parent._Nprocessors<<" into "
// << Nchild <<" communicators with " << childsize << " ranks"<<std::endl;
} else {
comm_split=parent.communicator;
srank = 0;
}
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Set up from the new split communicator
//////////////////////////////////////////////////////////////////////////////////////////////////////
InitFromMPICommunicator(processors,comm_split);
if(0){
std::cout << " ndim " <<_ndimension<<" " << parent._ndimension << std::endl;
for(int d=0;d<processors.size();d++){
std::cout << d<< " " << _processor_coor[d] <<" " << ccoor[d]<<std::endl;
}
}
for(int d=0;d<processors.size();d++){
assert(_processor_coor[d] == ccoor[d] );
}
}
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Take an MPI_Comm and self assemble
//////////////////////////////////////////////////////////////////////////////////////////////////////
void CartesianCommunicator::InitFromMPICommunicator(const std::vector<int> &processors, MPI_Comm communicator_base)
{
_ndimension = processors.size();
_processor_coor.resize(_ndimension);
/////////////////////////////////
// Count the requested nodes
/////////////////////////////////
_Nprocessors=1;
_processors = processors;
for(int i=0;i<_ndimension;i++){
_Nprocessors*=_processors[i];
}
std::vector<int> periodic(_ndimension,1);
MPI_Cart_create(communicator_base, _ndimension,&_processors[0],&periodic[0],0,&communicator);
MPI_Comm_rank(communicator,&_processor);
MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);
if ( 0 && (communicator_base != communicator_world) ) {
std::cout << "InitFromMPICommunicator Cartesian communicator created with a non-world communicator"<<std::endl;
std::cout << " new communicator rank "<<_processor<< " coor ["<<_ndimension<<"] ";
for(int d=0;d<_processors.size();d++){
std::cout << _processor_coor[d]<<" ";
}
std::cout << std::endl;
}
int Size;
MPI_Comm_size(communicator,&Size);
#if defined(GRID_COMMS_MPIT) || defined (GRID_COMMS_MPI3)
communicator_halo.resize (2*_ndimension);
for(int i=0;i<_ndimension*2;i++){
MPI_Comm_dup(communicator,&communicator_halo[i]);
}
#endif
assert(Size==_Nprocessors);
}
#endif
#if defined( GRID_COMMS_MPI) || defined (GRID_COMMS_MPIT)
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{
InitFromMPICommunicator(processors,communicator_world);
}
#endif
#if !defined( GRID_COMMS_MPI3)
int CartesianCommunicator::NodeCount(void) { return ProcessorCount();};
int CartesianCommunicator::RankCount(void) { return ProcessorCount();};
#endif
#if !defined( GRID_COMMS_MPI3) && !defined (GRID_COMMS_MPIT)
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes, int dir)
{
std::vector<CommsRequest_t> list;
// Discard the "dir"
SendToRecvFromBegin (list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
SendToRecvFromComplete(list);
return 2.0*bytes;
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes, int dir)
{
// Discard the "dir"
SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
return 2.0*bytes;
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
{
SendToRecvFromComplete(waitall);
}
#endif
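// In these fallback builds the "dir" argument is ignored: the Stencil calls
// reduce to the plain SendToRecvFrom pair above, and the 2.0*bytes return
// value counts send plus receive traffic (presumably for bandwidth accounting
// by the caller).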
#if !defined( GRID_COMMS_MPI3)
void CartesianCommunicator::StencilBarrier(void){};
commVector<uint8_t> CartesianCommunicator::ShmBufStorageVector;
void *CartesianCommunicator::ShmBufferSelf(void) { return ShmCommBuf; }
void *CartesianCommunicator::ShmBuffer(int rank) {
return NULL;
}
void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) {
return NULL;
}
void CartesianCommunicator::ShmInitGeneric(void){
#if 1
int mmap_flag =0;
#ifdef MAP_ANONYMOUS
mmap_flag = mmap_flag| MAP_SHARED | MAP_ANONYMOUS;
#endif
#ifdef MAP_ANON
mmap_flag = mmap_flag| MAP_SHARED | MAP_ANON;
#endif
#ifdef MAP_HUGETLB
if ( Hugepages ) mmap_flag |= MAP_HUGETLB;
#endif
ShmCommBuf =(void *) mmap(NULL, MAX_MPI_SHM_BYTES, PROT_READ | PROT_WRITE, mmap_flag, -1, 0);
if (ShmCommBuf == (void *)MAP_FAILED) {
perror("mmap failed ");
exit(EXIT_FAILURE);
}
#ifdef MADV_HUGEPAGE
if (!Hugepages ) madvise(ShmCommBuf,MAX_MPI_SHM_BYTES,MADV_HUGEPAGE);
#endif
#else
ShmBufStorageVector.resize(MAX_MPI_SHM_BYTES);
ShmCommBuf=(void *)&ShmBufStorageVector[0];
#endif
bzero(ShmCommBuf,MAX_MPI_SHM_BYTES);
}
#endif
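// Illustrative sketch (an assumption, not library code): stencil code carves
// its send/receive scratch out of ShmCommBuf through the buffer-management
// calls declared in the header, which from heap_top/heap_bytes look like a
// simple arena allocator:
//   void *xmit = Comm.ShmBufferMalloc(bytes);
//   ... fill and communicate ...
//   Comm.ShmBufferFreeAll();    // reset the arena between uses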
} }

View File

@ -32,117 +32,33 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
/////////////////////////////////// ///////////////////////////////////
// Processor layout information // Processor layout information
/////////////////////////////////// ///////////////////////////////////
#ifdef GRID_COMMS_MPI #include <Grid/communicator/SharedMemory.h>
#include <mpi.h>
#endif
#ifdef GRID_COMMS_MPI3
#include <mpi.h>
#endif
#ifdef GRID_COMMS_MPIT
#include <mpi.h>
#endif
#ifdef GRID_COMMS_SHMEM
#include <mpp/shmem.h>
#endif
namespace Grid { namespace Grid {
class CartesianCommunicator { class CartesianCommunicator : public SharedMemory {
public: public:
//////////////////////////////////////////// ////////////////////////////////////////////
// Isend/Irecv/Wait, or Sendrecv blocking // Policies
//////////////////////////////////////////// ////////////////////////////////////////////
enum CommunicatorPolicy_t { CommunicatorPolicyConcurrent, CommunicatorPolicySequential }; enum CommunicatorPolicy_t { CommunicatorPolicyConcurrent, CommunicatorPolicySequential };
static CommunicatorPolicy_t CommunicatorPolicy; static CommunicatorPolicy_t CommunicatorPolicy;
static void SetCommunicatorPolicy(CommunicatorPolicy_t policy ) { CommunicatorPolicy = policy; } static void SetCommunicatorPolicy(CommunicatorPolicy_t policy ) { CommunicatorPolicy = policy; }
///////////////////////////////////////////
// Up to 65536 ranks per node adequate for now
// 128MB shared memory for comms enough for 48^4 local vol comms
// Give external control (command line override?) of this
///////////////////////////////////////////
static const int MAXLOG2RANKSPERNODE = 16;
static uint64_t MAX_MPI_SHM_BYTES;
static int nCommThreads; static int nCommThreads;
// use explicit huge pages
static int Hugepages;
////////////////////////////////////////////
// Communicator should know nothing of the physics grid, only processor grid. // Communicator should know nothing of the physics grid, only processor grid.
////////////////////////////////////////////
int _Nprocessors; // How many in all int _Nprocessors; // How many in all
std::vector<int> _processors; // Which dimensions get relayed out over processors lanes. std::vector<int> _processors; // Which dimensions get relayed out over processors lanes.
int _processor; // linear processor rank int _processor; // linear processor rank
std::vector<int> _processor_coor; // linear processor coordinate std::vector<int> _processor_coor; // linear processor coordinate
unsigned long _ndimension; unsigned long _ndimension;
static Grid_MPI_Comm communicator_world;
#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPIT) Grid_MPI_Comm communicator;
static MPI_Comm communicator_world; std::vector<Grid_MPI_Comm> communicator_halo;
MPI_Comm communicator;
std::vector<MPI_Comm> communicator_halo;
typedef MPI_Request CommsRequest_t;
#else
typedef int CommsRequest_t;
#endif
////////////////////////////////////////////////////////////////////
// Helper functionality for SHM Windows common to all other impls
////////////////////////////////////////////////////////////////////
// Longer term; drop this in favour of a master / slave model with
// cartesian communicator on a subset of ranks, slave ranks controlled
// by group leader with data xfer via shared memory
////////////////////////////////////////////////////////////////////
#ifdef GRID_COMMS_MPI3
static int ShmRank;
static int ShmSize;
static int GroupRank;
static int GroupSize;
static int WorldRank;
static int WorldSize;
std::vector<int> WorldDims;
std::vector<int> GroupDims;
std::vector<int> ShmDims;
std::vector<int> GroupCoor;
std::vector<int> ShmCoor;
std::vector<int> WorldCoor;
static std::vector<int> GroupRanks;
static std::vector<int> MyGroup;
static int ShmSetup;
static MPI_Win ShmWindow;
static MPI_Comm ShmComm;
std::vector<int> LexicographicToWorldRank;
static std::vector<void *> ShmCommBufs;
#else
static void ShmInitGeneric(void);
static commVector<uint8_t> ShmBufStorageVector;
#endif
/////////////////////////////////
// Grid information and queries
// Implemented in Communicator_base.C
/////////////////////////////////
static void * ShmCommBuf;
size_t heap_top;
size_t heap_bytes;
void *ShmBufferSelf(void);
void *ShmBuffer(int rank);
void *ShmBufferTranslate(int rank,void * local_p);
void *ShmBufferMalloc(size_t bytes);
void ShmBufferFreeAll(void) ;
//////////////////////////////////////////////// ////////////////////////////////////////////////
// Must call in Grid startup // Must call in Grid startup
@ -158,15 +74,16 @@ class CartesianCommunicator {
virtual ~CartesianCommunicator(); virtual ~CartesianCommunicator();
private: private:
#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPIT) || defined (GRID_COMMS_MPI3)
//////////////////////////////////////////////// ////////////////////////////////////////////////
// Private initialise from an MPI communicator // Private initialise from an MPI communicator
// Can use after an MPI_Comm_split, but hidden from user so private // Can use after an MPI_Comm_split, but hidden from user so private
//////////////////////////////////////////////// ////////////////////////////////////////////////
void InitFromMPICommunicator(const std::vector<int> &processors, MPI_Comm communicator_base); void InitFromMPICommunicator(const std::vector<int> &processors, Grid_MPI_Comm communicator_base);
#endif
public: public:
//////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////
// Wraps MPI_Cart routines, or implements equivalent on other impls // Wraps MPI_Cart routines, or implements equivalent on other impls
//////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////
@ -181,8 +98,6 @@ class CartesianCommunicator {
const std::vector<int> & ThisProcessorCoor(void) ; const std::vector<int> & ThisProcessorCoor(void) ;
const std::vector<int> & ProcessorGrid(void) ; const std::vector<int> & ProcessorGrid(void) ;
int ProcessorCount(void) ; int ProcessorCount(void) ;
int NodeCount(void) ;
int RankCount(void) ;
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// very VERY rarely (Log, serial RNG) we need world without a grid // very VERY rarely (Log, serial RNG) we need world without a grid
@ -270,15 +185,10 @@ class CartesianCommunicator {
template<class T> void AllToAll(int dim,std::vector<T> &in, std::vector<T> &out){ template<class T> void AllToAll(int dim,std::vector<T> &in, std::vector<T> &out){
assert(dim>=0); assert(dim>=0);
assert(dim<_ndimension); assert(dim<_ndimension);
int numnode = _processors[dim];
// std::cerr << " AllToAll in.size() "<<in.size()<<std::endl;
// std::cerr << " AllToAll out.size() "<<out.size()<<std::endl;
assert(in.size()==out.size()); assert(in.size()==out.size());
int numnode = _processors[dim];
uint64_t bytes=sizeof(T); uint64_t bytes=sizeof(T);
uint64_t words=in.size()/numnode; uint64_t words=in.size()/numnode;
// std:: cout << "AllToAll buffer size "<< in.size()*sizeof(T)<<std::endl;
// std:: cout << "AllToAll datum bytes "<< bytes<<std::endl;
// std:: cout << "AllToAll datum count "<< words<<std::endl;
assert(numnode * words == in.size()); assert(numnode * words == in.size());
assert(words < (1ULL<<31)); assert(words < (1ULL<<31));
AllToAll(dim,(void *)&in[0],(void *)&out[0],words,bytes); AllToAll(dim,(void *)&in[0],(void *)&out[0],words,bytes);

View File

@ -1,222 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/communicator/Communicator_mpi.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/GridCore.h>
#include <Grid/GridQCDcore.h>
#include <Grid/qcd/action/ActionCore.h>
#include <mpi.h>
namespace Grid {
///////////////////////////////////////////////////////////////////////////////////////////////////
// Info that is set up once and independent of cartesian layout
///////////////////////////////////////////////////////////////////////////////////////////////////
MPI_Comm CartesianCommunicator::communicator_world;
// Should error check all MPI calls.
void CartesianCommunicator::Init(int *argc, char ***argv) {
int flag;
int provided;
MPI_Initialized(&flag); // needed to coexist with other libs apparently
if ( !flag ) {
MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
if ( provided != MPI_THREAD_MULTIPLE ) {
QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute;
}
}
MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
ShmInitGeneric();
}
CartesianCommunicator::~CartesianCommunicator()
{
int MPI_is_finalised;
MPI_Finalized(&MPI_is_finalised);
if (communicator && !MPI_is_finalised)
MPI_Comm_free(&communicator);
}
void CartesianCommunicator::GlobalSum(uint32_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(uint64_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalXOR(uint32_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalXOR(uint64_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(float &f){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(float *f,int N)
{
int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(double &d)
{
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(double *d,int N)
{
int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
{
int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest);
assert(ierr==0);
}
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
{
int rank;
int ierr=MPI_Cart_rank (communicator, &coor[0], &rank);
assert(ierr==0);
return rank;
}
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
{
coor.resize(_ndimension);
int ierr=MPI_Cart_coords (communicator, rank, _ndimension,&coor[0]);
assert(ierr==0);
}
// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFrom(void *xmit,
int dest,
void *recv,
int from,
int bytes)
{
std::vector<CommsRequest_t> reqs(0);
SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
SendToRecvFromComplete(reqs);
}
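// Illustrative sketch (not library code): one halo exchange along dimension mu
// built from the two primitives above; Comm, mu, sendbuf, recvbuf and bytes
// are assumptions for the example only.
#if 0
static void ExampleHaloExchange(CartesianCommunicator &Comm,int mu,
                                void *sendbuf,void *recvbuf,int bytes)
{
  int source,dest;
  Comm.ShiftedRanks(mu,1,source,dest);             // neighbour ranks (recv-from, send-to)
  Comm.SendToRecvFrom(sendbuf,dest,recvbuf,source,bytes);
}
#endif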
void CartesianCommunicator::SendRecvPacket(void *xmit,
void *recv,
int sender,
int receiver,
int bytes)
{
MPI_Status stat;
assert(sender != receiver);
int tag = sender;
if ( _processor == sender ) {
MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
}
if ( _processor == receiver ) {
MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
}
}
// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
int from,
int bytes)
{
int myrank = _processor;
int ierr;
if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {
MPI_Request xrq;
MPI_Request rrq;
ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
assert(ierr==0);
list.push_back(xrq);
list.push_back(rrq);
} else {
// Give the CPU to MPI immediately; can use threads to overlap optionally
ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
recv,bytes,MPI_CHAR,from, from,
communicator,MPI_STATUS_IGNORE);
assert(ierr==0);
}
}
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{
if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {
int nreq=list.size();
std::vector<MPI_Status> status(nreq);
int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
assert(ierr==0);
}
}
void CartesianCommunicator::Barrier(void)
{
int ierr = MPI_Barrier(communicator);
assert(ierr==0);
}
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
{
int ierr=MPI_Bcast(data,
bytes,
MPI_BYTE,
root,
communicator);
assert(ierr==0);
}
///////////////////////////////////////////////////////
// Should only be used prior to Grid Init finished.
// Check for this?
///////////////////////////////////////////////////////
int CartesianCommunicator::RankWorld(void){
int r;
MPI_Comm_rank(communicator_world,&r);
return r;
}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
{
int ierr= MPI_Bcast(data,
bytes,
MPI_BYTE,
root,
communicator_world);
assert(ierr==0);
}
}

View File

@ -26,580 +26,246 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
#include <Grid/communicator/SharedMemory.h>
#include <mpi.h>
#include <semaphore.h>
#include <fcntl.h>
#include <unistd.h>
#include <limits.h>
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/shm.h>
#include <sys/mman.h>
#include <zlib.h>
#ifdef HAVE_NUMAIF_H
#include <numaif.h>
#endif
namespace Grid { namespace Grid {
/////////////////////////////////////////////////////////////////////////////////////////////////// Grid_MPI_Comm CartesianCommunicator::communicator_world;
// Info that is set up once and independent of cartesian layout
///////////////////////////////////////////////////////////////////////////////////////////////////
int CartesianCommunicator::ShmSetup = 0;
int CartesianCommunicator::ShmRank; ////////////////////////////////////////////
int CartesianCommunicator::ShmSize; // First initialise of comms system
int CartesianCommunicator::GroupRank; ////////////////////////////////////////////
int CartesianCommunicator::GroupSize; void CartesianCommunicator::Init(int *argc, char ***argv)
int CartesianCommunicator::WorldRank;
int CartesianCommunicator::WorldSize;
MPI_Comm CartesianCommunicator::communicator_world;
MPI_Comm CartesianCommunicator::ShmComm;
MPI_Win CartesianCommunicator::ShmWindow;
std::vector<int> CartesianCommunicator::GroupRanks;
std::vector<int> CartesianCommunicator::MyGroup;
std::vector<void *> CartesianCommunicator::ShmCommBufs;
int CartesianCommunicator::NodeCount(void) { return GroupSize;};
int CartesianCommunicator::RankCount(void) { return WorldSize;};
#undef FORCE_COMMS
void *CartesianCommunicator::ShmBufferSelf(void)
{ {
return ShmCommBufs[ShmRank];
}
void *CartesianCommunicator::ShmBuffer(int rank)
{
int gpeer = GroupRanks[rank];
#ifdef FORCE_COMMS
return NULL;
#endif
if (gpeer == MPI_UNDEFINED){
return NULL;
} else {
return ShmCommBufs[gpeer];
}
}
void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p)
{
static int count =0;
int gpeer = GroupRanks[rank];
assert(gpeer!=ShmRank); // never send to self
assert(rank!=WorldRank);// never send to self
#ifdef FORCE_COMMS
return NULL;
#endif
if (gpeer == MPI_UNDEFINED){
return NULL;
} else {
uint64_t offset = (uint64_t)local_p - (uint64_t)ShmCommBufs[ShmRank];
uint64_t remote = (uint64_t)ShmCommBufs[gpeer]+offset;
return (void *) remote;
}
}
void CartesianCommunicator::Init(int *argc, char ***argv) {
int flag; int flag;
int provided; int provided;
// mtrace();
MPI_Initialized(&flag); // needed to coexist with other libs apparently MPI_Initialized(&flag); // needed to coexist with other libs apparently
if ( !flag ) { if ( !flag ) {
MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided); MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
assert (provided == MPI_THREAD_MULTIPLE); //If only 1 comms thread we require any threading mode other than SINGLE, but for multiple comms threads we need MULTIPLE
if( (nCommThreads == 1 && provided == MPI_THREAD_SINGLE) ||
(nCommThreads > 1 && provided != MPI_THREAD_MULTIPLE) )
assert(0);
} }
Grid_quiesce_nodes(); Grid_quiesce_nodes();
// Never clean up as done once.
MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world); MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
MPI_Comm_rank(communicator_world,&WorldRank);
MPI_Comm_size(communicator_world,&WorldSize);
if ( WorldRank == 0 ) { GlobalSharedMemory::Init(communicator_world);
std::cout << GridLogMessage<< "Initialising MPI "<< WorldRank <<"/"<<WorldSize <<std::endl; GlobalSharedMemory::SharedMemoryAllocate(
GlobalSharedMemory::MAX_MPI_SHM_BYTES,
GlobalSharedMemory::Hugepages);
} }
///////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// Split into groups that can share memory // Use cartesian communicators now even in MPI3
///////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
MPI_Comm_split_type(communicator_world, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&ShmComm); void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
MPI_Comm_rank(ShmComm ,&ShmRank);
MPI_Comm_size(ShmComm ,&ShmSize);
GroupSize = WorldSize/ShmSize;
/////////////////////////////////////////////////////////////////////
// find world ranks in our SHM group (i.e. which ranks are on our node)
/////////////////////////////////////////////////////////////////////
MPI_Group WorldGroup, ShmGroup;
MPI_Comm_group (communicator_world, &WorldGroup);
MPI_Comm_group (ShmComm, &ShmGroup);
std::vector<int> world_ranks(WorldSize);
GroupRanks.resize(WorldSize);
for(int r=0;r<WorldSize;r++) world_ranks[r]=r;
MPI_Group_translate_ranks (WorldGroup,WorldSize,&world_ranks[0],ShmGroup, &GroupRanks[0]);
///////////////////////////////////////////////////////////////////
// Identify who is in my group and nominate the leader
///////////////////////////////////////////////////////////////////
int g=0;
MyGroup.resize(ShmSize);
for(int rank=0;rank<WorldSize;rank++){
if(GroupRanks[rank]!=MPI_UNDEFINED){
assert(g<ShmSize);
MyGroup[g++] = rank;
}
}
std::sort(MyGroup.begin(),MyGroup.end(),std::less<int>());
int myleader = MyGroup[0];
std::vector<int> leaders_1hot(WorldSize,0);
std::vector<int> leaders_group(GroupSize,0);
leaders_1hot [ myleader ] = 1;
///////////////////////////////////////////////////////////////////
// global sum leaders over comm world
///////////////////////////////////////////////////////////////////
int ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,communicator_world);
assert(ierr==0);
///////////////////////////////////////////////////////////////////
// find the group leaders world rank
///////////////////////////////////////////////////////////////////
int group=0;
for(int l=0;l<WorldSize;l++){
if(leaders_1hot[l]){
leaders_group[group++] = l;
}
}
///////////////////////////////////////////////////////////////////
// Identify the rank of the group in which I (and my leader) live
///////////////////////////////////////////////////////////////////
GroupRank=-1;
for(int g=0;g<GroupSize;g++){
if (myleader == leaders_group[g]){
GroupRank=g;
}
}
assert(GroupRank!=-1);
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// allocate the shared window for our group
//////////////////////////////////////////////////////////////////////////////////////////////////////////
MPI_Barrier(ShmComm);
ShmCommBuf = 0;
ShmCommBufs.resize(ShmSize);
////////////////////////////////////////////////////////////////////////////////////////////
// Hugetlbfs and similar filesystems are mappable as huge pages
////////////////////////////////////////////////////////////////////////////////////////////
#ifdef GRID_MPI3_SHMMMAP
char shm_name [NAME_MAX];
for(int r=0;r<ShmSize;r++){
size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES;
sprintf(shm_name,GRID_SHM_PATH "/Grid_mpi3_shm_%d_%d",GroupRank,r);
//sprintf(shm_name,"/var/lib/hugetlbfs/group/wheel/pagesize-2MB/" "Grid_mpi3_shm_%d_%d",GroupRank,r);
// printf("Opening file %s \n",shm_name);
int fd=open(shm_name,O_RDWR|O_CREAT,0666);
if ( fd == -1) {
printf("open %s failed\n",shm_name);
perror("open hugetlbfs");
exit(0);
}
int mmap_flag = MAP_SHARED ;
#ifdef MAP_POPULATE
mmap_flag|=MAP_POPULATE;
#endif
#ifdef MAP_HUGETLB
if ( Hugepages ) mmap_flag |= MAP_HUGETLB;
#endif
void *ptr = (void *) mmap(NULL, MAX_MPI_SHM_BYTES, PROT_READ | PROT_WRITE, mmap_flag,fd, 0);
if ( ptr == (void *)MAP_FAILED ) {
printf("mmap %s failed\n",shm_name);
perror("failed mmap"); assert(0);
}
assert(((uint64_t)ptr&0x3F)==0);
ShmCommBufs[r] =ptr;
}
#endif
////////////////////////////////////////////////////////////////////////////////////////////
// POSIX SHMOPEN; as far as I know Linux does not allow EXPLICIT HugePages with this case.
// tmpfs (Larry Meadows says) does not support explicit huge pages, and it backs
// the posix shm virtual file system
////////////////////////////////////////////////////////////////////////////////////////////
#ifdef GRID_MPI3_SHMOPEN
char shm_name [NAME_MAX];
if ( ShmRank == 0 ) {
for(int r=0;r<ShmSize;r++){
size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES;
sprintf(shm_name,"/Grid_mpi3_shm_%d_%d",GroupRank,r);
shm_unlink(shm_name);
int fd=shm_open(shm_name,O_RDWR|O_CREAT,0666);
if ( fd < 0 ) { perror("failed shm_open"); assert(0); }
ftruncate(fd, size);
int mmap_flag = MAP_SHARED;
#ifdef MAP_POPULATE
mmap_flag |= MAP_POPULATE;
#endif
#ifdef MAP_HUGETLB
if (Hugepages) mmap_flag |= MAP_HUGETLB;
#endif
void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0);
if ( ptr == (void * )MAP_FAILED ) { perror("failed mmap"); assert(0); }
assert(((uint64_t)ptr&0x3F)==0);
// Experimental: try to force numa domain on the shm segment if we have numaif.h
#if 0
//#ifdef HAVE_NUMAIF_H
int status;
int flags=MPOL_MF_MOVE;
#ifdef KNL
int nodes=1; // numa domain == MCDRAM
// Find out if in SNC2,SNC4 mode ?
#else
int nodes=r; // numa domain == MPI ID
#endif
unsigned long count=1;
for(uint64_t page=0;page<size;page+=4096){
void *pages = (void *) ( page + (uint64_t)ptr );
uint64_t *cow_it = (uint64_t *)pages; *cow_it = 1;
ierr= move_pages(0,count, &pages,&nodes,&status,flags);
if (ierr && (page==0)) perror("numa relocate command failed");
}
#endif
ShmCommBufs[r] =ptr;
}
}
MPI_Barrier(ShmComm);
if ( ShmRank != 0 ) {
for(int r=0;r<ShmSize;r++){
size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES ;
sprintf(shm_name,"/Grid_mpi3_shm_%d_%d",GroupRank,r);
int fd=shm_open(shm_name,O_RDWR,0666);
if ( fd<0 ) { perror("failed shm_open"); assert(0); }
void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
if ( ptr == MAP_FAILED ) { perror("failed mmap"); assert(0); }
assert(((uint64_t)ptr&0x3F)==0);
ShmCommBufs[r] =ptr;
}
}
#endif
////////////////////////////////////////////////////////////////////////////////////////////
// SHMGET SHMAT and SHM_HUGETLB flag
////////////////////////////////////////////////////////////////////////////////////////////
#ifdef GRID_MPI3_SHMGET
std::vector<int> shmids(ShmSize);
if ( ShmRank == 0 ) {
for(int r=0;r<ShmSize;r++){
size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES;
key_t key = IPC_PRIVATE;
int flags = IPC_CREAT | SHM_R | SHM_W;
#ifdef SHM_HUGETLB
if (Hugepages) flags|=SHM_HUGETLB;
#endif
if ((shmids[r]= shmget(key,size, flags)) ==-1) {
int errsv = errno;
printf("Errno %d\n",errsv);
printf("key %d\n",key);
printf("size %lld\n",size);
printf("flags %d\n",flags);
perror("shmget");
exit(1);
} else {
printf("shmid: 0x%x\n", shmids[r]);
}
}
}
MPI_Barrier(ShmComm);
MPI_Bcast(&shmids[0],ShmSize*sizeof(int),MPI_BYTE,0,ShmComm);
MPI_Barrier(ShmComm);
for(int r=0;r<ShmSize;r++){
ShmCommBufs[r] = (uint64_t *)shmat(shmids[r], NULL,0);
if (ShmCommBufs[r] == (uint64_t *)-1) {
perror("Shared memory attach failure");
shmctl(shmids[r], IPC_RMID, NULL);
exit(2);
}
printf("shmaddr: %p\n", ShmCommBufs[r]);
}
MPI_Barrier(ShmComm);
// Mark for clean up
for(int r=0;r<ShmSize;r++){
shmctl(shmids[r], IPC_RMID,(struct shmid_ds *)NULL);
}
MPI_Barrier(ShmComm);
#endif
ShmCommBuf = ShmCommBufs[ShmRank];
MPI_Barrier(ShmComm);
if ( ShmRank == 0 ) {
for(int r=0;r<ShmSize;r++){
uint64_t * check = (uint64_t *) ShmCommBufs[r];
check[0] = GroupRank;
check[1] = r;
check[2] = 0x5A5A5A;
}
}
MPI_Barrier(ShmComm);
for(int r=0;r<ShmSize;r++){
uint64_t * check = (uint64_t *) ShmCommBufs[r];
assert(check[0]==GroupRank);
assert(check[1]==r);
assert(check[2]==0x5A5A5A);
}
MPI_Barrier(ShmComm);
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// Verbose for now
//////////////////////////////////////////////////////////////////////////////////////////////////////////
if (WorldRank == 0){
std::cout<<GridLogMessage<< "Grid MPI-3 configuration: detected ";
std::cout<< WorldSize << " Ranks " ;
std::cout<< GroupSize << " Nodes " ;
std::cout<< " with "<< ShmSize << " ranks-per-node "<<std::endl;
std::cout<<GridLogMessage <<"Grid MPI-3 configuration: allocated shared memory region of size ";
std::cout<<std::hex << MAX_MPI_SHM_BYTES <<" ShmCommBuf address = "<<ShmCommBuf << std::dec<<std::endl;
for(int g=0;g<GroupSize;g++){
std::cout<<GridLogMessage<<" Node "<<g<<" led by MPI rank "<<leaders_group[g]<<std::endl;
}
std::cout<<GridLogMessage<<" Boss Node Shm Pointers are {";
for(int g=0;g<ShmSize;g++){
std::cout<<std::hex<<ShmCommBufs[g]<<std::dec;
if(g!=ShmSize-1) std::cout<<",";
else std::cout<<"}"<<std::endl;
}
}
for(int g=0;g<GroupSize;g++){
if ( (ShmRank == 0) && (GroupRank==g) ) std::cout<<GridLogMessage<<"["<<g<<"] Node Group "<<g<<" is ranks {";
for(int r=0;r<ShmSize;r++){
if ( (ShmRank == 0) && (GroupRank==g) ) {
std::cout<<MyGroup[r];
if(r<ShmSize-1) std::cout<<",";
else std::cout<<"}"<<std::endl<<std::flush;
}
MPI_Barrier(communicator_world);
}
}
assert(ShmSetup==0); ShmSetup=1;
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Want to implement some magic ... Group sub-cubes into those on same node
////////////////////////////////////////////////////////////////////////////////////////////////////////////
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &dest,int &source)
{ {
std::vector<int> coor = _processor_coor; // my coord int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest);
assert(std::abs(shift) <_processors[dim]); assert(ierr==0);
}
coor[dim] = (_processor_coor[dim] + shift + _processors[dim])%_processors[dim];
Lexicographic::IndexFromCoor(coor,source,_processors);
source = LexicographicToWorldRank[source];
coor[dim] = (_processor_coor[dim] - shift + _processors[dim])%_processors[dim];
Lexicographic::IndexFromCoor(coor,dest,_processors);
dest = LexicographicToWorldRank[dest];
}// rank is world rank.
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor) int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
{ {
int rank; int rank;
Lexicographic::IndexFromCoor(coor,rank,_processors); int ierr=MPI_Cart_rank (communicator, &coor[0], &rank);
rank = LexicographicToWorldRank[rank]; assert(ierr==0);
return rank; return rank;
}// rank is world rank }
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor) void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
{ {
int lr=-1; coor.resize(_ndimension);
for(int r=0;r<WorldSize;r++){// map world Rank to lexico and then to coor int ierr=MPI_Cart_coords (communicator, rank, _ndimension,&coor[0]);
if( LexicographicToWorldRank[r]==rank) lr = r; assert(ierr==0);
} }
assert(lr!=-1);
Lexicographic::CoorFromIndex(coor,lr,_processors); ////////////////////////////////////////////////////////////////////////////////////////////////////////
// Initialises from communicator_world
////////////////////////////////////////////////////////////////////////////////////////////////////////
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{
MPI_Comm optimal_comm;
////////////////////////////////////////////////////
// Remap using the shared memory optimising routine
// The remap creates a comm which must be freed
////////////////////////////////////////////////////
GlobalSharedMemory::OptimalCommunicator (processors,optimal_comm);
InitFromMPICommunicator(processors,optimal_comm);
SetCommunicator(optimal_comm);
///////////////////////////////////////////////////
// Free the temp communicator
///////////////////////////////////////////////////
MPI_Comm_free(&optimal_comm);
} }
////////////////////////////////// //////////////////////////////////
// Try to subdivide communicator // Try to subdivide communicator
////////////////////////////////// //////////////////////////////////
/*
* Use default in MPI compile
*/
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent,int &srank) CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent,int &srank)
: CartesianCommunicator(processors)
{ {
std::cout << "Attempts to split MPI3 communicators will fail until implemented" <<std::endl; _ndimension = processors.size();
int parent_ndimension = parent._ndimension; assert(_ndimension >= parent._ndimension);
std::vector<int> parent_processor_coor(_ndimension,0);
std::vector<int> parent_processors (_ndimension,1);
// Can make 5d grid from 4d etc...
int pad = _ndimension-parent_ndimension;
for(int d=0;d<parent_ndimension;d++){
parent_processor_coor[pad+d]=parent._processor_coor[d];
parent_processors [pad+d]=parent._processors[d];
} }
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors) //////////////////////////////////////////////////////////////////////////////////////////////////////
{ // split the communicator
int ierr; //////////////////////////////////////////////////////////////////////////////////////////////////////
communicator=communicator_world; // int Nparent = parent._processors ;
// std::cout << " splitting from communicator "<<parent.communicator <<std::endl;
int Nparent;
MPI_Comm_size(parent.communicator,&Nparent);
// std::cout << " Parent size "<<Nparent <<std::endl;
int childsize=1;
for(int d=0;d<processors.size();d++) {
childsize *= processors[d];
}
int Nchild = Nparent/childsize;
assert (childsize * Nchild == Nparent);
// std::cout << " child size "<<childsize <<std::endl;
std::vector<int> ccoor(_ndimension); // coor within subcommunicator
std::vector<int> scoor(_ndimension); // coor of split within parent
std::vector<int> ssize(_ndimension); // coor of split within parent
for(int d=0;d<_ndimension;d++){
ccoor[d] = parent_processor_coor[d] % processors[d];
scoor[d] = parent_processor_coor[d] / processors[d];
ssize[d] = parent_processors[d] / processors[d];
}
// rank within subcomm ; srank is rank of subcomm within blocks of subcomms
int crank;
// MPI uses the reverse lexicographic convention to ours, so the reversed routines are called
Lexicographic::IndexFromCoorReversed(ccoor,crank,processors); // processors is the split grid dimensions
Lexicographic::IndexFromCoorReversed(scoor,srank,ssize); // ssize is the number of split grids
MPI_Comm comm_split;
if ( Nchild > 1 ) {
if(0){
std::cout << GridLogMessage<<"Child communicator of "<< std::hex << parent.communicator << std::dec<<std::endl;
std::cout << GridLogMessage<<" parent grid["<< parent._ndimension<<"] ";
for(int d=0;d<parent._ndimension;d++) std::cout << parent._processors[d] << " ";
std::cout<<std::endl;
std::cout << GridLogMessage<<" child grid["<< _ndimension <<"] ";
for(int d=0;d<processors.size();d++) std::cout << processors[d] << " ";
std::cout<<std::endl;
std::cout << GridLogMessage<<" old rank "<< parent._processor<<" coor ["<< parent._ndimension <<"] ";
for(int d=0;d<parent._ndimension;d++) std::cout << parent._processor_coor[d] << " ";
std::cout<<std::endl;
std::cout << GridLogMessage<<" new split "<< srank<<" scoor ["<< _ndimension <<"] ";
for(int d=0;d<processors.size();d++) std::cout << scoor[d] << " ";
std::cout<<std::endl;
std::cout << GridLogMessage<<" new rank "<< crank<<" coor ["<< _ndimension <<"] ";
for(int d=0;d<processors.size();d++) std::cout << ccoor[d] << " ";
std::cout<<std::endl;
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Declare victory
//////////////////////////////////////////////////////////////////////////////////////////////////////
std::cout << GridLogMessage<<"Divided communicator "<< parent._Nprocessors<<" into "
<< Nchild <<" communicators with " << childsize << " ranks"<<std::endl;
std::cout << " Split communicator " <<comm_split <<std::endl;
}
////////////////////////////////////////////////////////////////
// Split the communicator
////////////////////////////////////////////////////////////////
int ierr= MPI_Comm_split(parent.communicator,srank,crank,&comm_split);
assert(ierr==0);
} else {
srank = 0;
int ierr = MPI_Comm_dup (parent.communicator,&comm_split);
assert(ierr==0);
}
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Set up from the new split communicator
//////////////////////////////////////////////////////////////////////////////////////////////////////
InitFromMPICommunicator(processors,comm_split);
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Take the right SHM buffers
//////////////////////////////////////////////////////////////////////////////////////////////////////
SetCommunicator(comm_split);
///////////////////////////////////////////////
// Free the temp communicator
///////////////////////////////////////////////
MPI_Comm_free(&comm_split);
if(0){
std::cout << " ndim " <<_ndimension<<" " << parent._ndimension << std::endl;
for(int d=0;d<processors.size();d++){
std::cout << d<< " " << _processor_coor[d] <<" " << ccoor[d]<<std::endl;
}
}
for(int d=0;d<processors.size();d++){
assert(_processor_coor[d] == ccoor[d] );
}
}
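// Worked example of the split arithmetic above (illustrative only): a parent
// grid of processors {4,4} split with processors = {2,2} gives ssize = {2,2},
// i.e. Nchild = 4 sub-communicators of childsize = 4 ranks each. A parent rank
// at coordinate {3,1} lands in the child with scoor = {1,0} (srank from
// Lexicographic::IndexFromCoorReversed) at local coordinate ccoor = {1,1}.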
void CartesianCommunicator::InitFromMPICommunicator(const std::vector<int> &processors, MPI_Comm communicator_base)
{
////////////////////////////////////////////////////
// Creates communicator, and the communicator_halo
////////////////////////////////////////////////////
_ndimension = processors.size(); _ndimension = processors.size();
_processor_coor.resize(_ndimension);
/////////////////////////////////
// Count the requested nodes
/////////////////////////////////
_Nprocessors=1;
_processors = processors;
for(int i=0;i<_ndimension;i++){
_Nprocessors*=_processors[i];
}
std::vector<int> periodic(_ndimension,1);
MPI_Cart_create(communicator_base, _ndimension,&_processors[0],&periodic[0],0,&communicator);
MPI_Comm_rank(communicator,&_processor);
MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);
if ( 0 && (communicator_base != communicator_world) ) {
std::cout << "InitFromMPICommunicator Cartesian communicator created with a non-world communicator"<<std::endl;
std::cout << " new communicator rank "<<_processor<< " coor ["<<_ndimension<<"] ";
for(int d=0;d<_processors.size();d++){
std::cout << _processor_coor[d]<<" ";
}
std::cout << std::endl;
}
int Size;
MPI_Comm_size(communicator,&Size);
communicator_halo.resize (2*_ndimension); communicator_halo.resize (2*_ndimension);
for(int i=0;i<_ndimension*2;i++){ for(int i=0;i<_ndimension*2;i++){
MPI_Comm_dup(communicator,&communicator_halo[i]); MPI_Comm_dup(communicator,&communicator_halo[i]);
} }
assert(Size==_Nprocessors);
////////////////////////////////////////////////////////////////
// Assert power of two shm_size.
////////////////////////////////////////////////////////////////
int log2size = -1;
for(int i=0;i<=MAXLOG2RANKSPERNODE;i++){
if ( (0x1<<i) == ShmSize ) {
log2size = i;
break;
}
}
assert(log2size != -1);
////////////////////////////////////////////////////////////////
// Identify subblock of ranks on node spreading across dims
// in a maximally symmetrical way
////////////////////////////////////////////////////////////////
std::vector<int> WorldDims = processors;
ShmDims.resize (_ndimension,1);
GroupDims.resize(_ndimension);
ShmCoor.resize (_ndimension);
GroupCoor.resize(_ndimension);
WorldCoor.resize(_ndimension);
int dim = 0;
for(int l2=0;l2<log2size;l2++){
while ( (WorldDims[dim] / ShmDims[dim]) <= 1 ) dim=(dim+1)%_ndimension;
ShmDims[dim]*=2;
dim=(dim+1)%_ndimension;
} }
////////////////////////////////////////////////////////////////
// Establish torus of processes and nodes with sub-blockings
////////////////////////////////////////////////////////////////
for(int d=0;d<_ndimension;d++){
GroupDims[d] = WorldDims[d]/ShmDims[d];
}
////////////////////////////////////////////////////////////////
// Verbose
////////////////////////////////////////////////////////////////
#if 0
std::cout<< GridLogMessage << "MPI-3 usage "<<std::endl;
std::cout<< GridLogMessage << "SHM ";
for(int d=0;d<_ndimension;d++){
std::cout<< ShmDims[d] <<" ";
}
std::cout<< std::endl;
std::cout<< GridLogMessage << "Group ";
for(int d=0;d<_ndimension;d++){
std::cout<< GroupDims[d] <<" ";
}
std::cout<< std::endl;
std::cout<< GridLogMessage<<"World ";
for(int d=0;d<_ndimension;d++){
std::cout<< WorldDims[d] <<" ";
}
std::cout<< std::endl;
#endif
////////////////////////////////////////////////////////////////
// Check processor counts match
////////////////////////////////////////////////////////////////
_Nprocessors=1;
_processors = processors;
_processor_coor.resize(_ndimension);
for(int i=0;i<_ndimension;i++){
_Nprocessors*=_processors[i];
}
assert(WorldSize==_Nprocessors);
////////////////////////////////////////////////////////////////
// Establish mapping between lexico physics coord and WorldRank
////////////////////////////////////////////////////////////////
Lexicographic::CoorFromIndex(GroupCoor,GroupRank,GroupDims);
Lexicographic::CoorFromIndex(ShmCoor,ShmRank,ShmDims);
for(int d=0;d<_ndimension;d++){
WorldCoor[d] = GroupCoor[d]*ShmDims[d]+ShmCoor[d];
}
_processor_coor = WorldCoor;
_processor = WorldRank;
///////////////////////////////////////////////////////////////////
// global sum Lexico to World mapping
///////////////////////////////////////////////////////////////////
int lexico;
LexicographicToWorldRank.resize(WorldSize,0);
Lexicographic::IndexFromCoor(WorldCoor,lexico,WorldDims);
LexicographicToWorldRank[lexico] = WorldRank;
ierr=MPI_Allreduce(MPI_IN_PLACE,&LexicographicToWorldRank[0],WorldSize,MPI_INT,MPI_SUM,communicator);
assert(ierr==0);
for(int i=0;i<WorldSize;i++){
int wr = LexicographicToWorldRank[i];
// int wr = i;
std::vector<int> coor(_ndimension);
ProcessorCoorFromRank(wr,coor); // from world rank
int ck = RankFromProcessorCoor(coor);
assert(ck==wr);
if ( wr == WorldRank ) {
for(int j=0;j<coor.size();j++) {
assert(coor[j] == _processor_coor[j]);
}
}
/*
std::cout << GridLogMessage<< " Lexicographic "<<i;
std::cout << " MPI rank "<<wr;
std::cout << " Coor ";
for(int j=0;j<coor.size();j++) std::cout << coor[j];
std::cout<< std::endl;
*/
/////////////////////////////////////////////////////
// Check everyone agrees on everyone elses coords
/////////////////////////////////////////////////////
std::vector<int> mcoor = coor;
this->Broadcast(0,(void *)&mcoor[0],mcoor.size()*sizeof(int));
for(int d = 0 ; d< _ndimension; d++) {
assert(coor[d] == mcoor[d]);
}
}
};
CartesianCommunicator::~CartesianCommunicator() CartesianCommunicator::~CartesianCommunicator()
{ {
int MPI_is_finalised; int MPI_is_finalised;
@ -734,19 +400,15 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
MPI_Request rrq; MPI_Request rrq;
int ierr; int ierr;
int gdest = GroupRanks[dest]; int gdest = ShmRanks[dest];
int gfrom = GroupRanks[from]; int gfrom = ShmRanks[from];
int gme = GroupRanks[_processor]; int gme = ShmRanks[_processor];
assert(dest != _processor); assert(dest != _processor);
assert(from != _processor); assert(from != _processor);
assert(gme == ShmRank); assert(gme == ShmRank);
double off_node_bytes=0.0; double off_node_bytes=0.0;
#ifdef FORCE_COMMS
gdest = MPI_UNDEFINED;
gfrom = MPI_UNDEFINED;
#endif
if ( gfrom ==MPI_UNDEFINED) { if ( gfrom ==MPI_UNDEFINED) {
ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator_halo[commdir],&rrq); ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator_halo[commdir],&rrq);
assert(ierr==0); assert(ierr==0);
@ -815,5 +477,38 @@ void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
assert(ierr==0); assert(ierr==0);
} }
void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes)
{
std::vector<int> row(_ndimension,1);
assert(dim>=0 && dim<_ndimension);
// Split the communicator
row[dim] = _processors[dim];
int me;
CartesianCommunicator Comm(row,*this,me);
Comm.AllToAll(in,out,words,bytes);
}
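// Illustrative use (not library code): all-to-all along dimension 0 only, via
// the templated overload in the header; "local" is an assumed element count
// that must divide evenly among the ranks of that dimension.
//   std::vector<double> in(local), out(local);
//   Comm.AllToAll(0,in,out);     // words and bytes are derived from sizeof(T)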
void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t bytes)
{
// MPI is a pain and uses "int" arguments.
// 64*64*64*128*16 == ~500 million elements of data;
// at 24*4 = 96 bytes per element that is ~50x10^9 bytes, far beyond the
// ~2x10^9 range of a signed int (a Y2K-style overflow).
// (Turns up on 32^3 x 64 Gparity too.)
MPI_Datatype object;
int iwords;
int ibytes;
iwords = words;
ibytes = bytes;
assert(words == iwords); // safe to cast to int ?
assert(bytes == ibytes); // safe to cast to int ?
MPI_Type_contiguous(ibytes,MPI_BYTE,&object);
MPI_Type_commit(&object);
MPI_Alltoall(in,iwords,object,out,iwords,object,communicator);
MPI_Type_free(&object);
}
} }

View File

@ -1,988 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/communicator/Communicator_mpi.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include "Grid.h"
#include <mpi.h>
//#include <numaif.h>
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// Workarounds:
/// i) bloody Mac OS doesn't implement unnamed semaphores since they are "optional" in POSIX.
///    Darwin dispatch semaphores don't seem to be multiprocess.
///
/// ii) openmpi under --mca shmem posix works with two squadrons per node;
/// openmpi under default mca settings (I think --mca shmem mmap) on MacOS makes two squadrons map the SAME
/// memory as each other, despite their living on different communicators. This appears to be a bug in OpenMPI.
///
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#include <semaphore.h>
#include <fcntl.h>
#include <unistd.h>
#include <limits.h>
typedef sem_t *Grid_semaphore;
#error /*This is deprecated*/
#if 0
#define SEM_INIT(S) S = sem_open(sem_name,0,0600,0); assert ( S != SEM_FAILED );
#define SEM_INIT_EXCL(S) sem_unlink(sem_name); S = sem_open(sem_name,O_CREAT|O_EXCL,0600,0); assert ( S != SEM_FAILED );
#define SEM_POST(S) assert ( sem_post(S) == 0 );
#define SEM_WAIT(S) assert ( sem_wait(S) == 0 );
#else
#define SEM_INIT(S) ;
#define SEM_INIT_EXCL(S) ;
#define SEM_POST(S) ;
#define SEM_WAIT(S) ;
#endif
#include <sys/mman.h>
namespace Grid {
enum { COMMAND_ISEND, COMMAND_IRECV, COMMAND_WAITALL, COMMAND_SENDRECV };
struct Descriptor {
uint64_t buf;
size_t bytes;
int rank;
int tag;
int command;
uint64_t xbuf;
uint64_t rbuf;
int xtag;
int rtag;
int src;
int dest;
MPI_Request request;
};
const int pool = 48;
class SlaveState {
public:
volatile int head;
volatile int start;
volatile int tail;
volatile Descriptor Descrs[pool];
};
class Slave {
public:
Grid_semaphore sem_head;
Grid_semaphore sem_tail;
SlaveState *state;
MPI_Comm squadron;
uint64_t base;
int universe_rank;
int vertical_rank;
char sem_name [NAME_MAX];
////////////////////////////////////////////////////////////
// Descriptor circular pointers
////////////////////////////////////////////////////////////
Slave() {};
void Init(SlaveState * _state,MPI_Comm _squadron,int _universe_rank,int _vertical_rank);
void SemInit(void) {
sprintf(sem_name,"/Grid_mpi3_sem_head_%d",universe_rank);
SEM_INIT(sem_head);
sprintf(sem_name,"/Grid_mpi3_sem_tail_%d",universe_rank);
SEM_INIT(sem_tail);
}
void SemInitExcl(void) {
sprintf(sem_name,"/Grid_mpi3_sem_head_%d",universe_rank);
SEM_INIT_EXCL(sem_head);
sprintf(sem_name,"/Grid_mpi3_sem_tail_%d",universe_rank);
SEM_INIT_EXCL(sem_tail);
}
void WakeUpDMA(void) {
SEM_POST(sem_head);
};
void WakeUpCompute(void) {
SEM_POST(sem_tail);
};
void WaitForCommand(void) {
SEM_WAIT(sem_head);
};
void WaitForComplete(void) {
SEM_WAIT(sem_tail);
};
void EventLoop (void) {
// std::cout<< " Entering event loop "<<std::endl;
while(1){
WaitForCommand();
// std::cout << "Getting command "<<std::endl;
#if 0
_mm_monitor((void *)&state->head,0,0);
int s=state->start;
if ( s != state->head ) {
_mm_mwait(0,0);
}
#endif
Event();
}
}
int Event (void) ;
uint64_t QueueCommand(int command,void *buf, int bytes, int hashtag, MPI_Comm comm,int u_rank) ;
void QueueSendRecv(void *xbuf, void *rbuf, int bytes, int xtag, int rtag, MPI_Comm comm,int dest,int src) ;
void WaitAll() {
// std::cout << "Queueing WAIT command "<<std::endl;
QueueCommand(COMMAND_WAITALL,0,0,0,squadron,0);
// std::cout << "Waking up DMA "<<std::endl;
WakeUpDMA();
// std::cout << "Waiting from semaphore "<<std::endl;
WaitForComplete();
// std::cout << "Checking FIFO is empty "<<std::endl;
while ( state->tail != state->head );
}
};
////////////////////////////////////////////////////////////////////////
// One instance of a data mover.
// Master and Slave must agree on location in shared memory
////////////////////////////////////////////////////////////////////////
class MPIoffloadEngine {
public:
static std::vector<Slave> Slaves;
static int ShmSetup;
static int UniverseRank;
static int UniverseSize;
static MPI_Comm communicator_universe;
static MPI_Comm communicator_cached;
static MPI_Comm HorizontalComm;
static int HorizontalRank;
static int HorizontalSize;
static MPI_Comm VerticalComm;
static MPI_Win VerticalWindow;
static int VerticalSize;
static int VerticalRank;
static std::vector<void *> VerticalShmBufs;
static std::vector<std::vector<int> > UniverseRanks;
static std::vector<int> UserCommunicatorToWorldRanks;
static MPI_Group WorldGroup, CachedGroup;
static void CommunicatorInit (MPI_Comm &communicator_world,
MPI_Comm &ShmComm,
void * &ShmCommBuf);
static void MapCommRankToWorldRank(int &hashtag, int & comm_world_peer,int tag, MPI_Comm comm,int commrank);
/////////////////////////////////////////////////////////
// routines for master proc must handle any communicator
/////////////////////////////////////////////////////////
static void QueueSend(int slave,void *buf, int bytes, int tag, MPI_Comm comm,int rank) {
// std::cout<< " Queueing send "<< bytes<< " slave "<< slave << " to comm "<<rank <<std::endl;
Slaves[slave].QueueCommand(COMMAND_ISEND,buf,bytes,tag,comm,rank);
// std::cout << "Queued send command to rank "<< rank<< " via "<<slave <<std::endl;
Slaves[slave].WakeUpDMA();
// std::cout << "Waking up DMA "<< slave<<std::endl;
};
static void QueueSendRecv(int slave,void *xbuf, void *rbuf, int bytes, int xtag, int rtag, MPI_Comm comm,int dest,int src)
{
Slaves[slave].QueueSendRecv(xbuf,rbuf,bytes,xtag,rtag,comm,dest,src);
Slaves[slave].WakeUpDMA();
}
static void QueueRecv(int slave, void *buf, int bytes, int tag, MPI_Comm comm,int rank) {
// std::cout<< " Queueing recv "<< bytes<< " slave "<< slave << " from comm "<<rank <<std::endl;
Slaves[slave].QueueCommand(COMMAND_IRECV,buf,bytes,tag,comm,rank);
// std::cout << "Queued recv command from rank "<< rank<< " via "<<slave <<std::endl;
Slaves[slave].WakeUpDMA();
// std::cout << "Waking up DMA "<< slave<<std::endl;
};
static void WaitAll() {
for(int s=1;s<VerticalSize;s++) {
// std::cout << "Waiting for slave "<< s<<std::endl;
Slaves[s].WaitAll();
}
// std::cout << " Wait all Complete "<<std::endl;
};
static void GetWork(int nwork, int me, int & mywork, int & myoff,int units){
int basework = nwork/units;
int backfill = units-(nwork%units);
if ( me >= units ) {
mywork = myoff = 0;
} else {
mywork = (nwork+me)/units;
myoff = basework * me;
if ( me > backfill )
myoff+= (me-backfill);
}
return;
};
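// Worked example (illustrative): GetWork(10,me,mywork,myoff,4) assigns
//   me=0 -> 2 items at offset 0,   me=1 -> 2 at offset 2,
//   me=2 -> 3 at offset 4,         me=3 -> 3 at offset 7,
// covering all 10 items, with the remainder pushed onto the later workers.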
static void QueueRoundRobinSendRecv(void *xbuf, void *rbuf, int bytes, int xtag, int rtag, MPI_Comm comm,int dest,int src) {
uint8_t * cxbuf = (uint8_t *) xbuf;
uint8_t * crbuf = (uint8_t *) rbuf;
static int rrp=0;
int procs = VerticalSize-1;
int myoff=0;
int mywork=bytes;
QueueSendRecv(rrp+1,&cxbuf[myoff],&crbuf[myoff],mywork,xtag,rtag,comm,dest,src);
rrp = rrp+1;
if ( rrp == (VerticalSize-1) ) rrp = 0;
}
static void QueueMultiplexedSendRecv(void *xbuf, void *rbuf, int bytes, int xtag, int rtag, MPI_Comm comm,int dest,int src) {
uint8_t * cxbuf = (uint8_t *) xbuf;
uint8_t * crbuf = (uint8_t *) rbuf;
int mywork, myoff, procs;
procs = VerticalSize-1;
for(int s=0;s<procs;s++) {
GetWork(bytes,s,mywork,myoff,procs);
QueueSendRecv(s+1,&cxbuf[myoff],&crbuf[myoff],mywork,xtag,rtag,comm,dest,src);
}
};
static void QueueMultiplexedSend(void *buf, int bytes, int tag, MPI_Comm comm,int rank) {
uint8_t * cbuf = (uint8_t *) buf;
int mywork, myoff, procs;
procs = VerticalSize-1;
for(int s=0;s<procs;s++) {
GetWork(bytes,s,mywork,myoff,procs);
QueueSend(s+1,&cbuf[myoff],mywork,tag,comm,rank);
}
};
static void QueueMultiplexedRecv(void *buf, int bytes, int tag, MPI_Comm comm,int rank) {
uint8_t * cbuf = (uint8_t *) buf;
int mywork, myoff, procs;
procs = VerticalSize-1;
for(int s=0;s<procs;s++) {
GetWork(bytes,s,mywork,myoff,procs);
QueueRecv(s+1,&cbuf[myoff],mywork,tag,comm,rank);
}
};
};
///////////////////////////////////////////////////////////////////////////////////////////////////
// Info that is set up once and independent of cartesian layout
///////////////////////////////////////////////////////////////////////////////////////////////////
std::vector<Slave> MPIoffloadEngine::Slaves;
int MPIoffloadEngine::UniverseRank;
int MPIoffloadEngine::UniverseSize;
MPI_Comm MPIoffloadEngine::communicator_universe;
MPI_Comm MPIoffloadEngine::communicator_cached;
MPI_Group MPIoffloadEngine::WorldGroup;
MPI_Group MPIoffloadEngine::CachedGroup;
MPI_Comm MPIoffloadEngine::HorizontalComm;
int MPIoffloadEngine::HorizontalRank;
int MPIoffloadEngine::HorizontalSize;
MPI_Comm MPIoffloadEngine::VerticalComm;
int MPIoffloadEngine::VerticalSize;
int MPIoffloadEngine::VerticalRank;
MPI_Win MPIoffloadEngine::VerticalWindow;
std::vector<void *> MPIoffloadEngine::VerticalShmBufs;
std::vector<std::vector<int> > MPIoffloadEngine::UniverseRanks;
std::vector<int> MPIoffloadEngine::UserCommunicatorToWorldRanks;
int CartesianCommunicator::NodeCount(void) { return HorizontalSize;};
int MPIoffloadEngine::ShmSetup = 0;
void MPIoffloadEngine::CommunicatorInit (MPI_Comm &communicator_world,
MPI_Comm &ShmComm,
void * &ShmCommBuf)
{
int flag;
assert(ShmSetup==0);
//////////////////////////////////////////////////////////////////////
// Universe is all nodes prior to squadron grouping
//////////////////////////////////////////////////////////////////////
MPI_Comm_dup (MPI_COMM_WORLD,&communicator_universe);
MPI_Comm_rank(communicator_universe,&UniverseRank);
MPI_Comm_size(communicator_universe,&UniverseSize);
/////////////////////////////////////////////////////////////////////
// Split into groups that can share memory (Verticals)
/////////////////////////////////////////////////////////////////////
#undef MPI_SHARED_MEM_DEBUG
#ifdef MPI_SHARED_MEM_DEBUG
MPI_Comm_split(communicator_universe,(UniverseRank/4),UniverseRank,&VerticalComm);
#else
MPI_Comm_split_type(communicator_universe, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&VerticalComm);
#endif
MPI_Comm_rank(VerticalComm ,&VerticalRank);
MPI_Comm_size(VerticalComm ,&VerticalSize);
//////////////////////////////////////////////////////////////////////
// Split into horizontal groups by rank in squadron
//////////////////////////////////////////////////////////////////////
MPI_Comm_split(communicator_universe,VerticalRank,UniverseRank,&HorizontalComm);
MPI_Comm_rank(HorizontalComm,&HorizontalRank);
MPI_Comm_size(HorizontalComm,&HorizontalSize);
assert(HorizontalSize*VerticalSize==UniverseSize);
////////////////////////////////////////////////////////////////////////////////
// What is my place in the world
////////////////////////////////////////////////////////////////////////////////
int WorldRank=0;
if(VerticalRank==0) WorldRank = HorizontalRank;
int ierr=MPI_Allreduce(MPI_IN_PLACE,&WorldRank,1,MPI_INT,MPI_SUM,VerticalComm);
assert(ierr==0);
////////////////////////////////////////////////////////////////////////////////
// Where is the world in the universe?
////////////////////////////////////////////////////////////////////////////////
UniverseRanks = std::vector<std::vector<int> >(HorizontalSize,std::vector<int>(VerticalSize,0));
UniverseRanks[WorldRank][VerticalRank] = UniverseRank;
for(int w=0;w<HorizontalSize;w++){
ierr=MPI_Allreduce(MPI_IN_PLACE,&UniverseRanks[w][0],VerticalSize,MPI_INT,MPI_SUM,communicator_universe);
assert(ierr==0);
}
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// allocate the shared window for our group, pass back Shm info to CartesianCommunicator
//////////////////////////////////////////////////////////////////////////////////////////////////////////
VerticalShmBufs.resize(VerticalSize);
#undef MPI_SHARED_MEM
#ifdef MPI_SHARED_MEM
ierr = MPI_Win_allocate_shared(CartesianCommunicator::MAX_MPI_SHM_BYTES,1,MPI_INFO_NULL,VerticalComm,&ShmCommBuf,&VerticalWindow);
ierr|= MPI_Win_lock_all (MPI_MODE_NOCHECK, VerticalWindow);
assert(ierr==0);
// std::cout<<"SHM "<<ShmCommBuf<<std::endl;
for(int r=0;r<VerticalSize;r++){
MPI_Aint sz;
int dsp_unit;
MPI_Win_shared_query (VerticalWindow, r, &sz, &dsp_unit, &VerticalShmBufs[r]);
// std::cout<<"SHM "<<r<<" " <<VerticalShmBufs[r]<<std::endl;
}
#else
char shm_name [NAME_MAX];
MPI_Barrier(VerticalComm);
if ( VerticalRank == 0 ) {
for(int r=0;r<VerticalSize;r++){
size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES;
if ( r>0 ) size = sizeof(SlaveState);
sprintf(shm_name,"/Grid_mpi3_shm_%d_%d",WorldRank,r);
shm_unlink(shm_name);
int fd=shm_open(shm_name,O_RDWR|O_CREAT,0600);
if ( fd < 0 ) {
perror("failed shm_open");
assert(0);
}
ftruncate(fd, size);
VerticalShmBufs[r] = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
if ( VerticalShmBufs[r] == MAP_FAILED ) {
perror("failed mmap");
assert(0);
}
/*
for(uint64_t page=0;page<size;page+=4096){
void *pages = (void *) ( page + (uint64_t)VerticalShmBufs[r] );
int status;
int flags=MPOL_MF_MOVE_ALL;
int nodes=1; // numa domain == MCDRAM
unsigned long count=1;
ierr= move_pages(0,count, &pages,&nodes,&status,flags);
if (ierr && (page==0)) perror("numa relocate command failed");
}
*/
uint64_t * check = (uint64_t *) VerticalShmBufs[r];
check[0] = WorldRank;
check[1] = r;
// std::cout<<"SHM "<<r<<" " <<VerticalShmBufs[r]<<std::endl;
}
}
MPI_Barrier(VerticalComm);
if ( VerticalRank != 0 ) {
for(int r=0;r<VerticalSize;r++){
size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES ;
if ( r>0 ) size = sizeof(SlaveState);
sprintf(shm_name,"/Grid_mpi3_shm_%d_%d",WorldRank,r);
int fd=shm_open(shm_name,O_RDWR|O_CREAT,0600);
if ( fd<0 ) {
perror("failed shm_open");
assert(0);
}
VerticalShmBufs[r] = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
uint64_t * check = (uint64_t *) VerticalShmBufs[r];
assert(check[0]== WorldRank);
assert(check[1]== r);
// std::cerr<<"SHM "<<r<<" " <<VerticalShmBufs[r]<<std::endl;
}
}
#endif
MPI_Barrier(VerticalComm);
//////////////////////////////////////////////////////////////////////
// Map the rank of each node's leader in the new world to its
// rank in this vertical plane's horizontal communicator
//////////////////////////////////////////////////////////////////////
communicator_world = HorizontalComm;
ShmComm = VerticalComm;
ShmCommBuf = VerticalShmBufs[0];
MPI_Comm_group (communicator_world, &WorldGroup);
///////////////////////////////////////////////////////////
// Start the slave data movers
///////////////////////////////////////////////////////////
if ( VerticalRank != 0 ) {
Slave indentured;
indentured.Init( (SlaveState *) VerticalShmBufs[VerticalRank], VerticalComm, UniverseRank,VerticalRank);
indentured.SemInitExcl();// init semaphore in shared memory
MPI_Barrier(VerticalComm);
MPI_Barrier(VerticalComm);
indentured.EventLoop();
assert(0);
} else {
Slaves.resize(VerticalSize);
for(int i=1;i<VerticalSize;i++){
Slaves[i].Init((SlaveState *)VerticalShmBufs[i],VerticalComm, UniverseRanks[HorizontalRank][i],i);
}
MPI_Barrier(VerticalComm);
for(int i=1;i<VerticalSize;i++){
Slaves[i].SemInit();// init semaphore in shared memory
}
MPI_Barrier(VerticalComm);
}
///////////////////////////////////////////////////////////
// Verbose for now
///////////////////////////////////////////////////////////
ShmSetup=1;
if (UniverseRank == 0){
std::cout<<GridLogMessage << "Grid MPI-3 configuration: detected ";
std::cout<<UniverseSize << " Ranks " ;
std::cout<<HorizontalSize << " Nodes " ;
std::cout<<"with "<<VerticalSize << " ranks-per-node"<<std::endl;
std::cout<<GridLogMessage << "Grid MPI-3 configuration: using one lead process per node " << std::endl;
std::cout<<GridLogMessage << "Grid MPI-3 configuration: reduced communicator has size " << HorizontalSize << std::endl;
for(int g=0;g<HorizontalSize;g++){
std::cout<<GridLogMessage<<" Node "<<g<<" led by MPI rank "<< UniverseRanks[g][0]<<std::endl;
}
for(int g=0;g<HorizontalSize;g++){
std::cout<<GridLogMessage<<" { ";
for(int s=0;s<VerticalSize;s++){
std::cout<< UniverseRanks[g][s];
if ( s<VerticalSize-1 ) {
std::cout<<",";
}
}
std::cout<<" } "<<std::endl;
}
}
};
///////////////////////////////////////////////////////////////////////////////////////////////
// Map the communicator into communicator_world, and find the neighbour.
// Cache the mappings; cache size is 1.
///////////////////////////////////////////////////////////////////////////////////////////////
void MPIoffloadEngine::MapCommRankToWorldRank(int &hashtag, int & comm_world_peer,int tag, MPI_Comm comm,int rank) {
if ( comm == HorizontalComm ) {
comm_world_peer = rank;
// std::cout << " MapCommRankToWorldRank horiz " <<rank<<"->"<<comm_world_peer<<std::endl;
} else if ( comm == communicator_cached ) {
comm_world_peer = UserCommunicatorToWorldRanks[rank];
// std::cout << " MapCommRankToWorldRank cached " <<rank<<"->"<<comm_world_peer<<std::endl;
} else {
int size;
MPI_Comm_size(comm,&size);
UserCommunicatorToWorldRanks.resize(size);
std::vector<int> cached_ranks(size);
for(int r=0;r<size;r++) {
cached_ranks[r]=r;
}
communicator_cached=comm;
MPI_Comm_group(communicator_cached, &CachedGroup);
MPI_Group_translate_ranks(CachedGroup,size,&cached_ranks[0],WorldGroup, &UserCommunicatorToWorldRanks[0]);
comm_world_peer = UserCommunicatorToWorldRanks[rank];
// std::cout << " MapCommRankToWorldRank cache miss " <<rank<<"->"<<comm_world_peer<<std::endl;
assert(comm_world_peer != MPI_UNDEFINED);
}
assert( (tag & (~0xFFFFL)) ==0);
uint64_t icomm = (uint64_t)comm;
int comm_hash = ((icomm>>0 )&0xFFFF)^((icomm>>16)&0xFFFF)
^ ((icomm>>32)&0xFFFF)^((icomm>>48)&0xFFFF);
// hashtag = (comm_hash<<15) | tag;
hashtag = tag;
};
void Slave::Init(SlaveState * _state,MPI_Comm _squadron,int _universe_rank,int _vertical_rank)
{
squadron=_squadron;
universe_rank=_universe_rank;
vertical_rank=_vertical_rank;
state =_state;
// std::cout << "state "<<_state<<" comm "<<_squadron<<" universe_rank"<<universe_rank <<std::endl;
state->head = state->tail = state->start = 0;
base = (uint64_t)MPIoffloadEngine::VerticalShmBufs[0];
int rank; MPI_Comm_rank(_squadron,&rank);
}
#define PERI_PLUS(A) ( (A+1)%pool )
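// The SlaveState descriptor queue is a fixed-size ring of "pool" entries.
// "head"  : where the leader rank enqueues the next command,
// "start" : the next descriptor the slave has yet to hand to MPI,
// "tail"  : the oldest descriptor not yet retired by a COMMAND_WAITALL.
// The ring is full when advancing head would run into tail.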
int Slave::Event (void) {
static int tail_last;
static int head_last;
static int start_last;
int ierr;
MPI_Status stat;
static int i=0;
////////////////////////////////////////////////////
// Try to advance the start pointers
////////////////////////////////////////////////////
int s=state->start;
if ( s != state->head ) {
switch ( state->Descrs[s].command ) {
case COMMAND_ISEND:
ierr = MPI_Isend((void *)(state->Descrs[s].buf+base),
state->Descrs[s].bytes,
MPI_CHAR,
state->Descrs[s].rank,
state->Descrs[s].tag,
MPIoffloadEngine::communicator_universe,
(MPI_Request *)&state->Descrs[s].request);
assert(ierr==0);
state->start = PERI_PLUS(s);
return 1;
break;
case COMMAND_IRECV:
ierr=MPI_Irecv((void *)(state->Descrs[s].buf+base),
state->Descrs[s].bytes,
MPI_CHAR,
state->Descrs[s].rank,
state->Descrs[s].tag,
MPIoffloadEngine::communicator_universe,
(MPI_Request *)&state->Descrs[s].request);
// std::cout<< " Request is "<<state->Descrs[s].request<<std::endl;
// std::cout<< " Request0 is "<<state->Descrs[0].request<<std::endl;
assert(ierr==0);
state->start = PERI_PLUS(s);
return 1;
break;
case COMMAND_SENDRECV:
// fprintf(stderr,"Sendrecv ->%d %d : <-%d %d \n",state->Descrs[s].dest, state->Descrs[s].xtag+i*10,state->Descrs[s].src, state->Descrs[s].rtag+i*10);
ierr=MPI_Sendrecv((void *)(state->Descrs[s].xbuf+base), state->Descrs[s].bytes, MPI_CHAR, state->Descrs[s].dest, state->Descrs[s].xtag+i*10,
(void *)(state->Descrs[s].rbuf+base), state->Descrs[s].bytes, MPI_CHAR, state->Descrs[s].src , state->Descrs[s].rtag+i*10,
MPIoffloadEngine::communicator_universe,MPI_STATUS_IGNORE);
assert(ierr==0);
// fprintf(stderr,"Sendrecv done %d %d\n",ierr,i);
// MPI_Barrier(MPIoffloadEngine::HorizontalComm);
// fprintf(stderr,"Barrier\n");
i++;
state->start = PERI_PLUS(s);
return 1;
break;
case COMMAND_WAITALL:
for(int t=state->tail;t!=s; t=PERI_PLUS(t) ){
if ( state->Descrs[t].command != COMMAND_SENDRECV ) {
MPI_Wait((MPI_Request *)&state->Descrs[t].request,MPI_STATUS_IGNORE);
}
};
s=PERI_PLUS(s);
state->start = s;
state->tail = s;
WakeUpCompute();
return 1;
break;
default:
assert(0);
break;
}
}
return 0;
}
//////////////////////////////////////////////////////////////////////////////
// External interaction with the queue
//////////////////////////////////////////////////////////////////////////////
void Slave::QueueSendRecv(void *xbuf, void *rbuf, int bytes, int xtag, int rtag, MPI_Comm comm,int dest,int src)
{
int head =state->head;
int next = PERI_PLUS(head);
// Set up descriptor
int worldrank;
int hashtag;
MPI_Comm communicator;
MPI_Request request;
uint64_t relative;
relative = (uint64_t)xbuf - base;
state->Descrs[head].xbuf = relative;
relative= (uint64_t)rbuf - base;
state->Descrs[head].rbuf = relative;
state->Descrs[head].bytes = bytes;
MPIoffloadEngine::MapCommRankToWorldRank(hashtag,worldrank,xtag,comm,dest);
state->Descrs[head].dest = MPIoffloadEngine::UniverseRanks[worldrank][vertical_rank];
state->Descrs[head].xtag = hashtag;
MPIoffloadEngine::MapCommRankToWorldRank(hashtag,worldrank,rtag,comm,src);
state->Descrs[head].src = MPIoffloadEngine::UniverseRanks[worldrank][vertical_rank];
state->Descrs[head].rtag = hashtag;
state->Descrs[head].command= COMMAND_SENDRECV;
// Block until FIFO has space
while( state->tail==next );
// Msync on weak order architectures
// Advance pointer
state->head = next;
};
uint64_t Slave::QueueCommand(int command,void *buf, int bytes, int tag, MPI_Comm comm,int commrank)
{
/////////////////////////////////////////
// Spin while the FIFO is full; only proceed once a slot is free
/////////////////////////////////////////
int head =state->head;
int next = PERI_PLUS(head);
// Set up descriptor
int worldrank;
int hashtag;
MPI_Comm communicator;
MPI_Request request;
MPIoffloadEngine::MapCommRankToWorldRank(hashtag,worldrank,tag,comm,commrank);
uint64_t relative= (uint64_t)buf - base;
state->Descrs[head].buf = relative;
state->Descrs[head].bytes = bytes;
state->Descrs[head].rank = MPIoffloadEngine::UniverseRanks[worldrank][vertical_rank];
state->Descrs[head].tag = hashtag;
state->Descrs[head].command= command;
/*
if ( command == COMMAND_ISEND ) {
std::cout << "QueueSend from "<< universe_rank <<" to commrank " << commrank
<< " to worldrank " << worldrank <<std::endl;
std::cout << " via VerticalRank "<< vertical_rank <<" to universerank " << MPIoffloadEngine::UniverseRanks[worldrank][vertical_rank]<<std::endl;
std::cout << " QueueCommand "<<buf<<"["<<bytes<<"]" << std::endl;
}
if ( command == COMMAND_IRECV ) {
std::cout << "QueueRecv on "<< universe_rank <<" from commrank " << commrank
<< " from worldrank " << worldrank <<std::endl;
std::cout << " via VerticalRank "<< vertical_rank <<" from universerank " << MPIoffloadEngine::UniverseRanks[worldrank][vertical_rank]<<std::endl;
std::cout << " QueueSend "<<buf<<"["<<bytes<<"]" << std::endl;
}
*/
// Block until FIFO has space
while( state->tail==next );
// Msync on weak order architectures
// Advance pointer
state->head = next;
return 0;
}
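// Illustrative call sequence (not part of this file; sketch only): a halo
// exchange driven through the offload engine from the leader rank looks
// roughly like
//
//   Slaves[i].QueueCommand(COMMAND_ISEND, xbuf, bytes, tag, comm, dest);
//   Slaves[i].QueueCommand(COMMAND_IRECV, rbuf, bytes, tag, comm, src );
//   ...
//   Slaves[i].QueueCommand(COMMAND_WAITALL, ... );
//
// while every non-leader rank sits in EventLoop(), repeatedly calling
// Event() to pop descriptors off the ring and issue the matching MPI calls
// on communicator_universe.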
///////////////////////////////////////////////////////////////////////////////////////////////////
// Info that is set up once and is independent of the Cartesian layout
///////////////////////////////////////////////////////////////////////////////////////////////////
MPI_Comm CartesianCommunicator::communicator_world;
void CartesianCommunicator::Init(int *argc, char ***argv)
{
int flag;
MPI_Initialized(&flag); // needed to coexist with other libs apparently
if ( !flag ) {
MPI_Init(argc,argv);
}
communicator_world = MPI_COMM_WORLD;
MPI_Comm ShmComm;
MPIoffloadEngine::CommunicatorInit (communicator_world,ShmComm,ShmCommBuf);
}
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
{
int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest);
assert(ierr==0);
}
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
{
int rank;
int ierr=MPI_Cart_rank (communicator, &coor[0], &rank);
assert(ierr==0);
return rank;
}
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
{
coor.resize(_ndimension);
int ierr=MPI_Cart_coords (communicator, rank, _ndimension,&coor[0]);
assert(ierr==0);
}
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{
_ndimension = processors.size();
std::vector<int> periodic(_ndimension,1);
_Nprocessors=1;
_processors = processors;
for(int i=0;i<_ndimension;i++){
_Nprocessors*=_processors[i];
}
int Size;
MPI_Comm_size(communicator_world,&Size);
assert(Size==_Nprocessors);
_processor_coor.resize(_ndimension);
MPI_Cart_create(communicator_world, _ndimension,&_processors[0],&periodic[0],1,&communicator);
MPI_Comm_rank (communicator,&_processor);
MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);
};
void CartesianCommunicator::GlobalSum(uint32_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(uint64_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(float &f){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(float *f,int N)
{
int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(double &d)
{
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(double *d,int N)
{
int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
}
// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFrom(void *xmit,
int dest,
void *recv,
int from,
int bytes)
{
std::vector<CommsRequest_t> reqs(0);
SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
SendToRecvFromComplete(reqs);
}
void CartesianCommunicator::SendRecvPacket(void *xmit,
void *recv,
int sender,
int receiver,
int bytes)
{
MPI_Status stat;
assert(sender != receiver);
int tag = sender;
if ( _processor == sender ) {
MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
}
if ( _processor == receiver ) {
MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
}
}
// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
int from,
int bytes)
{
MPI_Request xrq;
MPI_Request rrq;
int rank = _processor;
int ierr;
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
ierr|=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
assert(ierr==0);
list.push_back(xrq);
list.push_back(rrq);
}
void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
int from,
int bytes)
{
uint64_t xmit_i = (uint64_t) xmit;
uint64_t recv_i = (uint64_t) recv;
uint64_t shm = (uint64_t) ShmCommBuf;
// assert xmit and recv lie in shared memory region
assert( (xmit_i >= shm) && (xmit_i+bytes <= shm+MAX_MPI_SHM_BYTES) );
assert( (recv_i >= shm) && (recv_i+bytes <= shm+MAX_MPI_SHM_BYTES) );
assert(from!=_processor);
assert(dest!=_processor);
MPIoffloadEngine::QueueMultiplexedSendRecv(xmit,recv,bytes,_processor,from,communicator,dest,from);
//MPIoffloadEngine::QueueRoundRobinSendRecv(xmit,recv,bytes,_processor,from,communicator,dest,from);
//MPIoffloadEngine::QueueMultiplexedSend(xmit,bytes,_processor,communicator,dest);
//MPIoffloadEngine::QueueMultiplexedRecv(recv,bytes,from,communicator,from);
}
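// Note: both buffers must lie inside the shared window (asserted above),
// because the descriptors carry offsets relative to VerticalShmBufs[0];
// see the "buf - base" arithmetic in Slave::QueueSendRecv / QueueCommand.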
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{
MPIoffloadEngine::WaitAll();
//this->Barrier();
}
void CartesianCommunicator::StencilBarrier(void) { }
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{
int nreq=list.size();
std::vector<MPI_Status> status(nreq);
int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
assert(ierr==0);
}
void CartesianCommunicator::Barrier(void)
{
int ierr = MPI_Barrier(communicator);
assert(ierr==0);
}
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
{
int ierr=MPI_Bcast(data,
bytes,
MPI_BYTE,
root,
communicator);
assert(ierr==0);
}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
{
int ierr= MPI_Bcast(data,
bytes,
MPI_BYTE,
root,
communicator_world);
assert(ierr==0);
}
void *CartesianCommunicator::ShmBufferSelf(void) { return ShmCommBuf; }
void *CartesianCommunicator::ShmBuffer(int rank) {
  // Peer windows are not exposed in this offload-engine implementation;
  // stencil traffic instead goes through the slave descriptor queues.
  return NULL;
}
void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) {
return NULL;
}
};

View File

@ -1,273 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/communicator/Communicator_mpi.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/GridCore.h>
#include <Grid/GridQCDcore.h>
#include <Grid/qcd/action/ActionCore.h>
#include <mpi.h>
namespace Grid {
///////////////////////////////////////////////////////////////////////////////////////////////////
// Info that is set up once and is independent of the Cartesian layout
///////////////////////////////////////////////////////////////////////////////////////////////////
MPI_Comm CartesianCommunicator::communicator_world;
// Should error check all MPI calls.
void CartesianCommunicator::Init(int *argc, char ***argv) {
int flag;
int provided;
MPI_Initialized(&flag); // needed to coexist with other libs apparently
if ( !flag ) {
MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
if ( provided != MPI_THREAD_MULTIPLE ) {
QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute;
}
}
MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
ShmInitGeneric();
}
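// Note: when the MPI library cannot grant MPI_THREAD_MULTIPLE, the Wilson
// kernels are switched above to the serialised CommsThenCompute policy,
// since overlapping communications with compute threads requires full
// thread support from MPI.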
CartesianCommunicator::~CartesianCommunicator()
{
int MPI_is_finalised;
MPI_Finalized(&MPI_is_finalised);
if (communicator && !MPI_is_finalised){
MPI_Comm_free(&communicator);
for(int i=0;i< communicator_halo.size();i++){
MPI_Comm_free(&communicator_halo[i]);
}
}
}
void CartesianCommunicator::GlobalSum(uint32_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(uint64_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalXOR(uint32_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalXOR(uint64_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(float &f){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(float *f,int N)
{
int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(double &d)
{
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(double *d,int N)
{
int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
{
int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest);
assert(ierr==0);
}
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
{
int rank;
int ierr=MPI_Cart_rank (communicator, &coor[0], &rank);
assert(ierr==0);
return rank;
}
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
{
coor.resize(_ndimension);
int ierr=MPI_Cart_coords (communicator, rank, _ndimension,&coor[0]);
assert(ierr==0);
}
// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFrom(void *xmit,
int dest,
void *recv,
int from,
int bytes)
{
std::vector<CommsRequest_t> reqs(0);
SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
SendToRecvFromComplete(reqs);
}
void CartesianCommunicator::SendRecvPacket(void *xmit,
void *recv,
int sender,
int receiver,
int bytes)
{
MPI_Status stat;
assert(sender != receiver);
int tag = sender;
if ( _processor == sender ) {
MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
}
if ( _processor == receiver ) {
MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
}
}
// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
int from,
int bytes)
{
int myrank = _processor;
int ierr;
if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {
MPI_Request xrq;
MPI_Request rrq;
ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
assert(ierr==0);
list.push_back(xrq);
list.push_back(rrq);
} else {
// Give the CPU to MPI immediately; can use threads to overlap optionally
ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
recv,bytes,MPI_CHAR,from, from,
communicator,MPI_STATUS_IGNORE);
assert(ierr==0);
}
}
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{
if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {
int nreq=list.size();
std::vector<MPI_Status> status(nreq);
int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
assert(ierr==0);
}
}
void CartesianCommunicator::Barrier(void)
{
int ierr = MPI_Barrier(communicator);
assert(ierr==0);
}
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
{
int ierr=MPI_Bcast(data,
bytes,
MPI_BYTE,
root,
communicator);
assert(ierr==0);
}
///////////////////////////////////////////////////////
// Should only be used before Grid initialisation has completed.
// Check for this?
///////////////////////////////////////////////////////
int CartesianCommunicator::RankWorld(void){
int r;
MPI_Comm_rank(communicator_world,&r);
return r;
}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
{
int ierr= MPI_Bcast(data,
bytes,
MPI_BYTE,
root,
communicator_world);
assert(ierr==0);
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes,int dir)
{
int myrank = _processor;
int ierr;
int ncomm =communicator_halo.size();
int commdir=dir%ncomm;
// std::cout << " sending on communicator "<<dir<<" " <<communicator_halo[dir]<<std::endl;
// Give the CPU to MPI immediately; can use threads to overlap optionally
MPI_Request req[2];
MPI_Irecv(recv,bytes,MPI_CHAR,recv_from_rank,recv_from_rank, communicator_halo[commdir],&req[1]);
MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank ,myrank , communicator_halo[commdir],&req[0]);
list.push_back(req[0]);
list.push_back(req[1]);
return 2.0*bytes;
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
{
int nreq=waitall.size();
MPI_Waitall(nreq, &waitall[0], MPI_STATUSES_IGNORE);
}
double CartesianCommunicator::StencilSendToRecvFrom(void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes,int dir)
{
int myrank = _processor;
int ierr;
// std::cout << " sending on communicator "<<dir<<" " <<communicator_halo.size()<< <std::endl;
int ncomm =communicator_halo.size();
int commdir=dir%ncomm;
// Give the CPU to MPI immediately; can use threads to overlap optionally
MPI_Request req[2];
MPI_Irecv(recv,bytes,MPI_CHAR,recv_from_rank,recv_from_rank, communicator_halo[commdir],&req[1]);
MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank ,myrank , communicator_halo[commdir],&req[0]);
MPI_Waitall(2, req, MPI_STATUSES_IGNORE);
return 2.0*bytes;
}
}

View File

@ -32,14 +32,22 @@ namespace Grid {
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 // Info that is setup once and indept of cartesian layout
 ///////////////////////////////////////////////////////////////////////////////////////////////////
+Grid_MPI_Comm CartesianCommunicator::communicator_world;
+
 void CartesianCommunicator::Init(int *argc, char *** arv)
 {
-  ShmInitGeneric();
+  GlobalSharedMemory::Init(communicator_world);
+  GlobalSharedMemory::SharedMemoryAllocate(
+		   GlobalSharedMemory::MAX_MPI_SHM_BYTES,
+		   GlobalSharedMemory::Hugepages);
 }
 
 CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent,int &srank)
-  : CartesianCommunicator(processors) { srank=0;}
+  : CartesianCommunicator(processors)
+{
+  srank=0;
+  SetCommunicator(communicator_world);
+}
 
 CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
 {
@ -54,6 +62,7 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
     assert(_processors[d]==1);
     _processor_coor[d] = 0;
   }
+  SetCommunicator(communicator_world);
 }
 
 CartesianCommunicator::~CartesianCommunicator(){}
@ -121,6 +130,36 @@ void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest
   dest=0;
 }
+
+double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
+						     int xmit_to_rank,
+						     void *recv,
+						     int recv_from_rank,
+						     int bytes, int dir)
+{
+  std::vector<CommsRequest_t> list;
+  // Discard the "dir"
+  SendToRecvFromBegin   (list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
+  SendToRecvFromComplete(list);
+  return 2.0*bytes;
+}
+double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+							  void *xmit,
+							  int xmit_to_rank,
+							  void *recv,
+							  int recv_from_rank,
+							  int bytes, int dir)
+{
+  // Discard the "dir"
+  SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
+  return 2.0*bytes;
+}
+void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
+{
+  SendToRecvFromComplete(waitall);
+}
+void CartesianCommunicator::StencilBarrier(void){};
+
 }

View File

@ -1,357 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/communicator/Communicator_shmem.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <mpp/shmem.h>
#include <array>
namespace Grid {
// Should error check all MPI calls.
#define SHMEM_VET(addr)
#define SHMEM_VET_DEBUG(addr) { \
if ( ! shmem_addr_accessible(addr,_processor) ) {\
std::fprintf(stderr,"%d Inaccessible shmem address %lx %s %s\n",_processor,addr,__FUNCTION__,#addr); \
BACKTRACEFILE(); \
}\
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Info that is set up once and is independent of the Cartesian layout
///////////////////////////////////////////////////////////////////////////////////////////////////
typedef struct HandShake_t {
uint64_t seq_local;
uint64_t seq_remote;
} HandShake;
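// The HandShake pair implements a one-sided rendezvous: each side keeps a
// local sequence number and exposes a "remote" counter that its partner
// advances with shmem_putmem.  SendRecvPacket below spins on these counters
// so the sender only puts data once the receiver has posted, and the
// receiver only proceeds once the payload has landed.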
std::array<long,_SHMEM_REDUCE_SYNC_SIZE> make_psync_init(void) {
std::array<long,_SHMEM_REDUCE_SYNC_SIZE> ret;
ret.fill(SHMEM_SYNC_VALUE);
return ret;
}
static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync_init = make_psync_init();
static Vector< HandShake > XConnections;
static Vector< HandShake > RConnections;
void CartesianCommunicator::Init(int *argc, char ***argv) {
shmem_init();
XConnections.resize(shmem_n_pes());
RConnections.resize(shmem_n_pes());
for(int pe =0 ; pe<shmem_n_pes();pe++){
XConnections[pe].seq_local = 0;
XConnections[pe].seq_remote= 0;
RConnections[pe].seq_local = 0;
RConnections[pe].seq_remote= 0;
}
shmem_barrier_all();
ShmInitGeneric();
}
CartesianCommunicator::~CartesianCommunicator(){}
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent)
: CartesianCommunicator(processors)
{
std::cout << "Attempts to split SHMEM communicators will fail " <<std::endl;
}
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{
_ndimension = processors.size();
std::vector<int> periodic(_ndimension,1);
_Nprocessors=1;
_processors = processors;
_processor_coor.resize(_ndimension);
_processor = shmem_my_pe();
Lexicographic::CoorFromIndex(_processor_coor,_processor,_processors);
for(int i=0;i<_ndimension;i++){
_Nprocessors*=_processors[i];
}
int Size = shmem_n_pes();
assert(Size==_Nprocessors);
}
void CartesianCommunicator::GlobalSum(uint32_t &u){
static long long source ;
static long long dest ;
static long long llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync = psync_init;
// int nreduce=1;
// int pestart=0;
// int logStride=0;
source = u;
dest = 0;
shmem_longlong_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync.data());
shmem_barrier_all(); // necessary?
u = dest;
}
void CartesianCommunicator::GlobalSum(uint64_t &u){
static long long source ;
static long long dest ;
static long long llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync = psync_init;
// int nreduce=1;
// int pestart=0;
// int logStride=0;
source = u;
dest = 0;
shmem_longlong_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync.data());
shmem_barrier_all(); // necessary?
u = dest;
}
void CartesianCommunicator::GlobalSum(float &f){
static float source ;
static float dest ;
static float llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync = psync_init;
source = f;
dest =0.0;
shmem_float_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync.data());
shmem_barrier_all();
f = dest;
}
void CartesianCommunicator::GlobalSumVector(float *f,int N)
{
static float source ;
static float dest = 0 ;
static float llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync = psync_init;
if ( shmem_addr_accessible(f,_processor) ){
shmem_float_sum_to_all(f,f,N,0,0,_Nprocessors,llwrk,psync.data());
shmem_barrier_all();
return;
}
for(int i=0;i<N;i++){
dest =0.0;
source = f[i];
shmem_float_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync.data());
shmem_barrier_all();
f[i] = dest;
}
}
void CartesianCommunicator::GlobalSum(double &d)
{
static double source;
static double dest ;
static double llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync = psync_init;
source = d;
dest = 0;
shmem_double_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync.data());
shmem_barrier_all();
d = dest;
}
void CartesianCommunicator::GlobalSumVector(double *d,int N)
{
static double source ;
static double dest ;
static double llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync = psync_init;
if ( shmem_addr_accessible(d,_processor) ){
shmem_double_sum_to_all(d,d,N,0,0,_Nprocessors,llwrk,psync.data());
shmem_barrier_all();
return;
}
for(int i=0;i<N;i++){
source = d[i];
dest =0.0;
shmem_double_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync.data());
shmem_barrier_all();
d[i] = dest;
}
}
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
{
std::vector<int> coor = _processor_coor;
assert(std::abs(shift) <_processors[dim]);
coor[dim] = (_processor_coor[dim] + shift + _processors[dim])%_processors[dim];
Lexicographic::IndexFromCoor(coor,source,_processors);
coor[dim] = (_processor_coor[dim] - shift + _processors[dim])%_processors[dim];
Lexicographic::IndexFromCoor(coor,dest,_processors);
}
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
{
int rank;
Lexicographic::IndexFromCoor(coor,rank,_processors);
return rank;
}
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
{
Lexicographic::CoorFromIndex(coor,rank,_processors);
}
// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFrom(void *xmit,
int dest,
void *recv,
int from,
int bytes)
{
SHMEM_VET(xmit);
SHMEM_VET(recv);
std::vector<CommsRequest_t> reqs(0);
SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
SendToRecvFromComplete(reqs);
}
void CartesianCommunicator::SendRecvPacket(void *xmit,
void *recv,
int sender,
int receiver,
int bytes)
{
static uint64_t seq;
assert(recv!=xmit);
volatile HandShake *RecvSeq = (volatile HandShake *) & RConnections[sender];
volatile HandShake *SendSeq = (volatile HandShake *) & XConnections[receiver];
if ( _processor == sender ) {
// Check he has posted a receive
while(SendSeq->seq_remote == SendSeq->seq_local);
// Advance our send count
seq = ++(SendSeq->seq_local);
// Send this packet
SHMEM_VET(recv);
shmem_putmem(recv,xmit,bytes,receiver);
shmem_fence();
//Notify him we're done
shmem_putmem((void *)&(RecvSeq->seq_remote),&seq,sizeof(seq),receiver);
shmem_fence();
}
if ( _processor == receiver ) {
// Post a receive
seq = ++(RecvSeq->seq_local);
shmem_putmem((void *)&(SendSeq->seq_remote),&seq,sizeof(seq),sender);
// Now wait until he has advanced our reception counter
while(RecvSeq->seq_remote != RecvSeq->seq_local);
}
}
// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
int from,
int bytes)
{
SHMEM_VET(xmit);
SHMEM_VET(recv);
// shmem_putmem_nb(recv,xmit,bytes,dest,NULL);
shmem_putmem(recv,xmit,bytes,dest);
if ( CommunicatorPolicy == CommunicatorPolicySequential ) shmem_barrier_all();
}
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{
// shmem_quiet(); // I'm done
if( CommunicatorPolicy == CommunicatorPolicyConcurrent ) shmem_barrier_all();// He's done too
}
void CartesianCommunicator::Barrier(void)
{
shmem_barrier_all();
}
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
{
static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync = psync_init;
static uint32_t word;
uint32_t *array = (uint32_t *) data;
assert( (bytes % 4)==0);
int words = bytes/4;
if ( shmem_addr_accessible(data,_processor) ){
shmem_broadcast32(data,data,words,root,0,0,shmem_n_pes(),psync.data());
return;
}
for(int w=0;w<words;w++){
word = array[w];
shmem_broadcast32((void *)&word,(void *)&word,1,root,0,0,shmem_n_pes(),psync.data());
if ( shmem_my_pe() != root ) {
array[w] = word;
}
shmem_barrier_all();
}
}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
{
static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync = psync_init;
static uint32_t word;
uint32_t *array = (uint32_t *) data;
assert( (bytes % 4)==0);
int words = bytes/4;
for(int w=0;w<words;w++){
word = array[w];
shmem_broadcast32((void *)&word,(void *)&word,1,root,0,0,shmem_n_pes(),psync.data());
if ( shmem_my_pe() != root ) {
array[w]= word;
}
shmem_barrier_all();
}
}
int CartesianCommunicator::RankWorld(void){
return shmem_my_pe();
}
}
