
Merge branch 'feature/hadrons' of https://github.com/paboyle/Grid into feature/hadrons

Author: Guido Cossu, 2018-03-19 17:57:13 +00:00
Commit: f212b0a963
140 changed files with 8207 additions and 1087 deletions

.gitignore
View File

@@ -123,3 +123,10 @@ make-bin-BUCK.sh
 #####################
 lib/qcd/spin/gamma-gen/*.h
 lib/qcd/spin/gamma-gen/*.cc
+lib/version.h
+
+# vs code editor files #
+########################
+.vscode/
+.vscode/settings.json
+settings.json

View File

@@ -44,3 +44,4 @@ script:
     - make -j4
     - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
     - make check

View File

@@ -5,6 +5,10 @@ include $(top_srcdir)/doxygen.inc
 bin_SCRIPTS=grid-config
 
+BUILT_SOURCES = version.h
+
+version.h:
+	echo "`git log -n 1 --format=format:"#define GITHASH \\"%H:%d\\"%n" HEAD`" > $(srcdir)/lib/version.h
 .PHONY: bench check tests doxygen-run doxygen-doc $(DX_PS_GOAL) $(DX_PDF_GOAL)

View File

@@ -187,10 +187,11 @@ Alternatively, some CPU codenames can be directly used:
 | `<code>` | Description |
 | ----------- | -------------------------------------- |
 | `KNL` | [Intel Xeon Phi codename Knights Landing](http://ark.intel.com/products/codename/48999/Knights-Landing) |
+| `SKL` | [Intel Skylake with AVX512 extensions](https://ark.intel.com/products/codename/37572/Skylake#@server) |
 | `BGQ` | Blue Gene/Q |
 
 #### Notes:
-- We currently support AVX512 only for the Intel compiler. Support for GCC and clang will appear in future versions of Grid when the AVX512 support within GCC and clang will be more advanced.
+- We currently support AVX512 for the Intel compiler and GCC (KNL and SKL target). Support for clang will appear in future versions of Grid when the AVX512 support in the compiler will be more advanced.
 - For BG/Q only [bgclang](http://trac.alcf.anl.gov/projects/llvm-bgq) is supported. We do not presently plan to support more compilers for this platform.
 - BG/Q performances are currently rather poor. This is being investigated for future versions.
 - The vector size for the `GEN` target can be specified with the `configure` script option `--enable-gen-simd-width`.

View File

@@ -48,7 +48,6 @@ int main (int argc, char ** argv)
   int threads = GridThread::GetThreads();
-  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
   std::vector<int> latt4 = GridDefaultLatt();
   int Ls=16;
@@ -57,6 +56,10 @@ int main (int argc, char ** argv)
     std::stringstream ss(argv[i+1]); ss >> Ls;
   }
+  GridLogLayout();
+
+  long unsigned int single_site_flops = 8*QCD::Nc*(7+16*QCD::Nc);
+
   GridCartesian * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
   GridRedBlackCartesian * UrbGrid   = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
@@ -187,7 +190,7 @@ int main (int argc, char ** argv)
   FGrid->Barrier();
   double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-  double flops=1344*volume*ncall;
+  double flops=single_site_flops*volume*ncall;
   std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
   // std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
@@ -226,7 +229,7 @@ int main (int argc, char ** argv)
   FGrid->Barrier();
   double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-  double flops=1344*volume*ncall;
+  double flops=single_site_flops*volume*ncall;
   std::cout<<GridLogMessage << "Called half prec comms Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
   std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
@@ -277,7 +280,7 @@ int main (int argc, char ** argv)
   double t1=usecond();
   FGrid->Barrier();
   double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-  double flops=1344*volume*ncall;
+  double flops=single_site_flops*volume*ncall;
   std::cout<<GridLogMessage << "Called Dw s_inner "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
   std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
@@ -355,7 +358,7 @@ int main (int argc, char ** argv)
   // sDw.stat.print();
   double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-  double flops=(1344.0*volume*ncall)/2;
+  double flops=(single_site_flops*volume*ncall)/2.0;
   std::cout<<GridLogMessage << "sDeo mflop/s = "<< flops/(t1-t0)<<std::endl;
   std::cout<<GridLogMessage << "sDeo mflop/s per rank "<< flops/(t1-t0)/NP<<std::endl;
@@ -478,7 +481,7 @@ int main (int argc, char ** argv)
   FGrid->Barrier();
   double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-  double flops=(1344.0*volume*ncall)/2;
+  double flops=(single_site_flops*volume*ncall)/2.0;
   std::cout<<GridLogMessage << "Deo mflop/s = "<< flops/(t1-t0)<<std::endl;
   std::cout<<GridLogMessage << "Deo mflop/s per rank "<< flops/(t1-t0)/NP<<std::endl;

View File

@@ -51,6 +51,7 @@ int main (int argc, char ** argv)
 {
   Grid_init(&argc,&argv);
   std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
   std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
   std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
@@ -107,6 +108,7 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
   GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
   GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
   GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+  long unsigned int single_site_flops = 8*QCD::Nc*(7+16*QCD::Nc);
   std::vector<int> seeds4({1,2,3,4});
   std::vector<int> seeds5({5,6,7,8});
@@ -196,7 +198,7 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
   if ( ! report ) {
     double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-    double flops=1344*volume*ncall;
+    double flops=single_site_flops*volume*ncall;
     std::cout <<"\t"<<NP<< "\t"<<flops/(t1-t0)<< "\t";
   }
@@ -228,7 +230,7 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
   if(!report){
     double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-    double flops=(1344.0*volume*ncall)/2;
+    double flops=(single_site_flops*volume*ncall)/2.0;
     std::cout<< flops/(t1-t0);
   }
 }
@@ -237,6 +239,7 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
 #define CHECK_SDW
 void benchsDw(std::vector<int> & latt4, int Ls, int threads, int report )
 {
+  long unsigned int single_site_flops = 8*QCD::Nc*(7+16*QCD::Nc);
   GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
   GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
@@ -321,7 +324,7 @@ void benchsDw(std::vector<int> & latt4, int Ls, int threads, int report )
     Counter.Report();
   } else {
     double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-    double flops=1344*volume*ncall;
+    double flops=single_site_flops*volume*ncall;
     std::cout<<"\t"<< flops/(t1-t0);
   }
@@ -358,7 +361,7 @@ void benchsDw(std::vector<int> & latt4, int Ls, int threads, int report )
     CounterSdw.Report();
   } else {
     double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-    double flops=(1344.0*volume*ncall)/2;
+    double flops=(single_site_flops*volume*ncall)/2.0;
     std::cout<<"\t"<< flops/(t1-t0);
   }
 }

View File

@@ -107,7 +107,7 @@ int main (int argc, char ** argv)
   FGrid->Barrier();
   double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-  double flops=2*1344*volume*ncall;
+  double flops=2*1320*volume*ncall;
   std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
   // std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
@@ -134,7 +134,7 @@ int main (int argc, char ** argv)
   FGrid->Barrier();
   double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-  double flops=2*1344*volume*ncall;
+  double flops=2*1320*volume*ncall;
   std::cout<<GridLogMessage << "Called half prec comms Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
   std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
@@ -174,7 +174,7 @@ int main (int argc, char ** argv)
   FGrid_d->Barrier();
   double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-  double flops=2*1344*volume*ncall;
+  double flops=2*1320*volume*ncall;
   std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
   // std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;

View File

@@ -4,7 +4,7 @@
 Source file: ./benchmarks/Benchmark_wilson.cc
 
-Copyright (C) 2015
+Copyright (C) 2018
 
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
@@ -32,6 +32,9 @@ using namespace std;
 using namespace Grid;
 using namespace Grid::QCD;
 
+#include "Grid/util/Profiling.h"
+
 template<class d>
 struct scal {
   d internal;
@@ -45,6 +48,7 @@ struct scal {
 };
 
 bool overlapComms = false;
+bool perfProfiling = false;
 
 int main (int argc, char ** argv)
 {
@@ -53,6 +57,12 @@ int main (int argc, char ** argv)
   if( GridCmdOptionExists(argv,argv+argc,"--asynch") ){
     overlapComms = true;
   }
+  if( GridCmdOptionExists(argv,argv+argc,"--perf") ){
+    perfProfiling = true;
+  }
+
+  long unsigned int single_site_flops = 8*QCD::Nc*(7+16*QCD::Nc);
+
   std::vector<int> latt_size = GridDefaultLatt();
   std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
@@ -61,10 +71,15 @@ int main (int argc, char ** argv)
   GridRedBlackCartesian RBGrid(&Grid);
 
   int threads = GridThread::GetThreads();
-  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
+  GridLogLayout();
 
   std::cout<<GridLogMessage << "Grid floating point word size is REALF"<< sizeof(RealF)<<std::endl;
   std::cout<<GridLogMessage << "Grid floating point word size is REALD"<< sizeof(RealD)<<std::endl;
   std::cout<<GridLogMessage << "Grid floating point word size is REAL"<< sizeof(Real)<<std::endl;
+  std::cout<<GridLogMessage << "Grid number of colours : "<< QCD::Nc <<std::endl;
+  std::cout<<GridLogMessage << "Benchmarking Wilson operator in the fundamental representation" << std::endl;
 
   std::vector<int> seeds({1,2,3,4});
   GridParallelRNG pRNG(&Grid);
@@ -134,9 +149,25 @@ int main (int argc, char ** argv)
     Dw.Dhop(src,result,0);
   }
   double t1=usecond();
-  double flops=1344*volume*ncall;
+  double flops=single_site_flops*volume*ncall;
+
+  if (perfProfiling){
+    std::cout<<GridLogMessage << "Profiling Dw with perf"<<std::endl;
+
+    System::profile("kernel", [&]() {
+      for(int i=0;i<ncall;i++){
+        Dw.Dhop(src,result,0);
+      }
+    });
+
+    std::cout<<GridLogMessage << "Generated kernel.data"<<std::endl;
+    std::cout<<GridLogMessage << "Use with: perf report -i kernel.data"<<std::endl;
+  }
 
   std::cout<<GridLogMessage << "Called Dw"<<std::endl;
+  std::cout<<GridLogMessage << "flops per site " << single_site_flops << std::endl;
   std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
   std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
   std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;

View File

@@ -62,6 +62,7 @@ int main (int argc, char ** argv)
   std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
   std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
   std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+  std::cout << GridLogMessage<< "* Number of colours "<< QCD::Nc <<std::endl;
   std::cout << GridLogMessage<< "* Benchmarking WilsonFermionR::Dhop "<<std::endl;
   std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
   if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
@@ -69,13 +70,15 @@ int main (int argc, char ** argv)
   if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
   if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
   if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
+  std::cout << GridLogMessage << "* OpenMP threads : "<< GridThread::GetThreads() <<std::endl;
+  std::cout << GridLogMessage << "* MPI tasks : "<< GridCmdVectorIntToString(mpi_layout) << std::endl;
   std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
 
-  std::cout<<GridLogMessage << "============================================================================="<< std::endl;
-  std::cout<<GridLogMessage << "= Benchmarking Wilson" << std::endl;
-  std::cout<<GridLogMessage << "============================================================================="<< std::endl;
-  std::cout<<GridLogMessage << "Volume\t\t\tWilson/MFLOPs\tWilsonDag/MFLOPs" << std::endl;
-  std::cout<<GridLogMessage << "============================================================================="<< std::endl;
+  std::cout<<GridLogMessage << "================================================================================================="<< std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking Wilson operator in the fundamental representation" << std::endl;
+  std::cout<<GridLogMessage << "================================================================================================="<< std::endl;
+  std::cout<<GridLogMessage << "Volume\t\t\tWilson/MFLOPs\tWilsonDag/MFLOPs\tWilsonEO/MFLOPs\tWilsonDagEO/MFLOPs" << std::endl;
+  std::cout<<GridLogMessage << "================================================================================================="<< std::endl;
 
   int Lmax = 32;
   int dmin = 0;
@@ -97,13 +100,20 @@ int main (int argc, char ** argv)
       GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(seeds);
       LatticeGaugeField Umu(&Grid); random(pRNG,Umu);
       LatticeFermion src(&Grid); random(pRNG,src);
-      LatticeFermion result(&Grid); result=zero;
+      LatticeFermion src_o(&RBGrid); pickCheckerboard(Odd,src_o,src);
+      LatticeFermion result(&Grid); result=zero;
+      LatticeFermion result_e(&RBGrid); result_e=zero;
 
       double volume = std::accumulate(latt_size.begin(),latt_size.end(),1,std::multiplies<int>());
 
       WilsonFermionR Dw(Umu,Grid,RBGrid,mass,params);
 
+      // Full operator
+      bench_wilson(src,result,Dw,volume,DaggerNo);
+      bench_wilson(src,result,Dw,volume,DaggerYes);
+      std::cout << "\t";
+
+      // EO
       bench_wilson(src,result,Dw,volume,DaggerNo);
       bench_wilson(src,result,Dw,volume,DaggerYes);
       std::cout << std::endl;
@@ -122,9 +132,26 @@ void bench_wilson (
                    int const dag )
 {
   int ncall = 1000;
+  long unsigned int single_site_flops = 8*QCD::Nc*(7+16*QCD::Nc);
   double t0 = usecond();
   for(int i=0; i<ncall; i++) { Dw.Dhop(src,result,dag); }
   double t1 = usecond();
-  double flops = 1344 * volume * ncall;
+  double flops = single_site_flops * volume * ncall;
+  std::cout << flops/(t1-t0) << "\t\t";
+}
+
+void bench_wilson_eo (
+                   LatticeFermion & src,
+                   LatticeFermion & result,
+                   WilsonFermionR & Dw,
+                   double const volume,
+                   int const dag )
+{
+  int ncall = 1000;
+  long unsigned int single_site_flops = 8*QCD::Nc*(7+16*QCD::Nc);
+  double t0 = usecond();
+  for(int i=0; i<ncall; i++) { Dw.DhopEO(src,result,dag); }
+  double t1 = usecond();
+  double flops = (single_site_flops * volume * ncall)/2.0;
   std::cout << flops/(t1-t0) << "\t\t";
 }
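The new even-odd helper times Dw.DhopEO and divides the flop count by two, since the checkerboarded operator only updates sites of a single parity. Restated as a small standalone function (illustrative, not code from the commit):

```cpp
// Illustrative restatement of the even-odd throughput bookkeeping above.
double mflops_wilson_eo(double volume, int ncall, double t0_us, double t1_us)
{
    const unsigned long single_site_flops = 8 * 3 * (7 + 16 * 3); // Nc = 3 -> 1320
    double flops = (single_site_flops * volume * ncall) / 2.0;    // half the sites
    return flops / (t1_us - t0_us);                               // us -> Mflop/s
}
```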

View File

@@ -249,6 +249,9 @@ case ${ax_cv_cxx_compiler_vendor} in
       AVX512)
         AC_DEFINE([AVX512],[1],[AVX512 intrinsics])
         SIMD_FLAGS='-mavx512f -mavx512pf -mavx512er -mavx512cd';;
+      SKL)
+        AC_DEFINE([AVX512],[1],[AVX512 intrinsics for SkyLake Xeon])
+        SIMD_FLAGS='-march=skylake-avx512';;
       KNC)
         AC_DEFINE([IMCI],[1],[IMCI intrinsics for Knights Corner])
         SIMD_FLAGS='';;
@@ -337,7 +340,7 @@ case ${ac_PRECISION} in
 esac
 
 ###################### Shared memory allocation technique under MPI3
-AC_ARG_ENABLE([shm],[AC_HELP_STRING([--enable-shm=shmopen|hugetlbfs],
+AC_ARG_ENABLE([shm],[AC_HELP_STRING([--enable-shm=shmopen|hugetlbfs|shmnone],
               [Select SHM allocation technique])],[ac_SHM=${enable_shm}],[ac_SHM=shmopen])
 
 case ${ac_SHM} in
@@ -346,6 +349,10 @@ case ${ac_SHM} in
      AC_DEFINE([GRID_MPI3_SHMOPEN],[1],[GRID_MPI3_SHMOPEN] )
      ;;
 
+     shmnone)
+     AC_DEFINE([GRID_MPI3_SHM_NONE],[1],[GRID_MPI3_SHM_NONE] )
+     ;;
+
      hugetlbfs)
      AC_DEFINE([GRID_MPI3_SHMMMAP],[1],[GRID_MPI3_SHMMMAP] )
      ;;

View File

@@ -43,12 +43,6 @@ using namespace Hadrons;
 Application::Application(void)
 {
     initLogger();
-    LOG(Message) << "Modules available:" << std::endl;
-    auto list = ModuleFactory::getInstance().getBuilderList();
-    for (auto &m: list)
-    {
-        LOG(Message) << " " << m << std::endl;
-    }
     auto dim = GridDefaultLatt(), mpi = GridDefaultMpi(), loc(dim);
 
     locVol_ = 1;
     for (unsigned int d = 0; d < dim.size(); ++d)
@@ -138,24 +132,27 @@ void Application::parseParameterFile(const std::string parameterFileName)
 
 void Application::saveParameterFile(const std::string parameterFileName)
 {
-    XmlWriter          writer(parameterFileName);
-    ObjectId           id;
-    const unsigned int nMod = vm().getNModule();
-
     LOG(Message) << "Saving application to '" << parameterFileName << "'..." << std::endl;
-    write(writer, "parameters", getPar());
-    push(writer, "modules");
-    for (unsigned int i = 0; i < nMod; ++i)
+    if (env().getGrid()->IsBoss())
     {
-        push(writer, "module");
-        id.name = vm().getModuleName(i);
-        id.type = vm().getModule(i)->getRegisteredName();
-        write(writer, "id", id);
-        vm().getModule(i)->saveParameters(writer, "options");
+        XmlWriter          writer(parameterFileName);
+        ObjectId           id;
+        const unsigned int nMod = vm().getNModule();
+
+        write(writer, "parameters", getPar());
+        push(writer, "modules");
+        for (unsigned int i = 0; i < nMod; ++i)
+        {
+            push(writer, "module");
+            id.name = vm().getModuleName(i);
+            id.type = vm().getModule(i)->getRegisteredName();
+            write(writer, "id", id);
+            vm().getModule(i)->saveParameters(writer, "options");
+            pop(writer);
+        }
+        pop(writer);
         pop(writer);
     }
-    pop(writer);
-    pop(writer);
 }
 
// schedule computation ////////////////////////////////////////////////////////
@@ -170,20 +167,24 @@ void Application::schedule(void)
 
 void Application::saveSchedule(const std::string filename)
 {
-    TextWriter               writer(filename);
-    std::vector<std::string> program;
-
-    if (!scheduled_)
-    {
-        HADRON_ERROR(Definition, "Computation not scheduled");
-    }
     LOG(Message) << "Saving current schedule to '" << filename << "'..."
                  << std::endl;
-    for (auto address: program_)
+    if (env().getGrid()->IsBoss())
     {
-        program.push_back(vm().getModuleName(address));
+        TextWriter               writer(filename);
+        std::vector<std::string> program;
+
+        if (!scheduled_)
+        {
+            HADRON_ERROR(Definition, "Computation not scheduled");
+        }
+        for (auto address: program_)
+        {
+            program.push_back(vm().getModuleName(address));
+        }
+        write(writer, "schedule", program);
     }
-    write(writer, "schedule", program);
 }
 
 void Application::loadSchedule(const std::string filename)
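Both saveParameterFile and saveSchedule are now guarded by env().getGrid()->IsBoss(), so only one MPI rank opens the output file instead of every rank writing the same path. In isolation the pattern looks like the hedged sketch below, which assumes only the GridBase::IsBoss() call already used above:

```cpp
// Hedged sketch of the boss-only output pattern used above (illustrative).
#include <fstream>
#include <string>

void writeOnBossOnly(GridBase *grid, const std::string &filename,
                     const std::string &contents)
{
    if (grid->IsBoss())            // true on exactly one MPI rank
    {
        std::ofstream out(filename);
        out << contents;           // every other rank skips the I/O entirely
    }
}
```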

View File

@ -0,0 +1,218 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/EigenPack.hpp
Copyright (C) 2015-2018
Author: Antonin Portelli <antonin.portelli@me.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_EigenPack_hpp_
#define Hadrons_EigenPack_hpp_
#include <Grid/Hadrons/Global.hpp>
#include <Grid/algorithms/iterative/Deflation.h>
#include <Grid/algorithms/iterative/LocalCoherenceLanczos.h>
BEGIN_HADRONS_NAMESPACE
// Lanczos type
#ifndef HADRONS_DEFAULT_LANCZOS_NBASIS
#define HADRONS_DEFAULT_LANCZOS_NBASIS 60
#endif
template <typename F>
class EigenPack
{
public:
typedef F Field;
public:
std::vector<RealD> eval;
std::vector<F> evec;
public:
EigenPack(void) = default;
virtual ~EigenPack(void) = default;
EigenPack(const size_t size, GridBase *grid)
{
resize(size, grid);
}
void resize(const size_t size, GridBase *grid)
{
eval.resize(size);
evec.resize(size, grid);
}
virtual void read(const std::string fileStem, const int traj = -1)
{
std::string evecFilename, evalFilename;
makeFilenames(evecFilename, evalFilename, fileStem, traj);
XmlReader xmlReader(evalFilename);
basicRead(evec, evecFilename, evec.size());
LOG(Message) << "Reading " << eval.size() << " eigenvalues from '"
<< evalFilename << "'" << std::endl;
Grid::read(xmlReader, "evals", eval);
}
virtual void write(const std::string fileStem, const int traj = -1)
{
std::string evecFilename, evalFilename;
makeFilenames(evecFilename, evalFilename, fileStem, traj);
XmlWriter xmlWriter(evalFilename);
basicWrite(evecFilename, evec, evec.size());
LOG(Message) << "Writing " << eval.size() << " eigenvalues to '"
<< evalFilename << "'" << std::endl;
Grid::write(xmlWriter, "evals", eval);
}
protected:
void makeFilenames(std::string &evecFilename, std::string &evalFilename,
const std::string stem, const int traj = -1)
{
std::string t = (traj < 0) ? "" : ("." + std::to_string(traj));
evecFilename = stem + "_evec" + t + ".bin";
evalFilename = stem + "_eval" + t + ".xml";
}
template <typename T>
static void basicRead(std::vector<T> &evec, const std::string filename,
const unsigned int size)
{
emptyUserRecord record;
ScidacReader binReader;
binReader.open(filename);
for(int k = 0; k < size; ++k)
{
binReader.readScidacFieldRecord(evec[k], record);
}
binReader.close();
}
template <typename T>
static void basicWrite(const std::string filename, std::vector<T> &evec,
const unsigned int size)
{
emptyUserRecord record;
ScidacWriter binWriter;
binWriter.open(filename);
for(int k = 0; k < size; ++k)
{
binWriter.writeScidacFieldRecord(evec[k], record);
}
binWriter.close();
}
};
template <typename FineF, typename CoarseF>
class CoarseEigenPack: public EigenPack<FineF>
{
public:
typedef CoarseF CoarseField;
public:
std::vector<RealD> evalCoarse;
std::vector<CoarseF> evecCoarse;
public:
CoarseEigenPack(void) = default;
virtual ~CoarseEigenPack(void) = default;
CoarseEigenPack(const size_t sizeFine, const size_t sizeCoarse,
GridBase *gridFine, GridBase *gridCoarse)
{
resize(sizeFine, sizeCoarse, gridFine, gridCoarse);
}
void resize(const size_t sizeFine, const size_t sizeCoarse,
GridBase *gridFine, GridBase *gridCoarse)
{
EigenPack<FineF>::resize(sizeFine, gridFine);
evalCoarse.resize(sizeCoarse);
evecCoarse.resize(sizeCoarse, gridCoarse);
}
virtual void read(const std::string fileStem, const int traj = -1)
{
std::string evecFineFilename, evalFineFilename;
std::string evecCoarseFilename, evalCoarseFilename;
this->makeFilenames(evecFineFilename, evalFineFilename,
fileStem + "_fine", traj);
this->makeFilenames(evecCoarseFilename, evalCoarseFilename,
fileStem + "_coarse", traj);
XmlReader xmlFineReader(evalFineFilename);
XmlReader xmlCoarseReader(evalCoarseFilename);
LOG(Message) << "Reading " << this->evec.size() << " fine eigenvectors from '"
<< evecFineFilename << "'" << std::endl;
this->basicRead(this->evec, evecFineFilename, this->evec.size());
LOG(Message) << "Reading " << evecCoarse.size() << " coarse eigenvectors from '"
<< evecCoarseFilename << "'" << std::endl;
this->basicRead(evecCoarse, evecCoarseFilename, evecCoarse.size());
LOG(Message) << "Reading " << this->eval.size() << " fine eigenvalues from '"
<< evalFineFilename << "'" << std::endl;
Grid::read(xmlFineReader, "evals", this->eval);
LOG(Message) << "Reading " << evalCoarse.size() << " coarse eigenvalues from '"
<< evalCoarseFilename << "'" << std::endl;
Grid::read(xmlCoarseReader, "evals", evalCoarse);
}
virtual void write(const std::string fileStem, const int traj = -1)
{
std::string evecFineFilename, evalFineFilename;
std::string evecCoarseFilename, evalCoarseFilename;
this->makeFilenames(evecFineFilename, evalFineFilename,
fileStem + "_fine", traj);
this->makeFilenames(evecCoarseFilename, evalCoarseFilename,
fileStem + "_coarse", traj);
XmlWriter xmlFineWriter(evalFineFilename);
XmlWriter xmlCoarseWriter(evalCoarseFilename);
LOG(Message) << "Writing " << this->evec.size() << " fine eigenvectors to '"
<< evecFineFilename << "'" << std::endl;
this->basicWrite(evecFineFilename, this->evec, this->evec.size());
LOG(Message) << "Writing " << evecCoarse.size() << " coarse eigenvectors to '"
<< evecCoarseFilename << "'" << std::endl;
this->basicWrite(evecCoarseFilename, evecCoarse, evecCoarse.size());
LOG(Message) << "Writing " << this->eval.size() << " fine eigenvalues to '"
<< evalFineFilename << "'" << std::endl;
Grid::write(xmlFineWriter, "evals", this->eval);
LOG(Message) << "Writing " << evalCoarse.size() << " coarse eigenvalues to '"
<< evalCoarseFilename << "'" << std::endl;
Grid::write(xmlCoarseWriter, "evals", evalCoarse);
}
};
template <typename FImpl>
using FermionEigenPack = EigenPack<typename FImpl::FermionField>;
template <typename FImpl, int nBasis>
using CoarseFermionEigenPack = CoarseEigenPack<
typename FImpl::FermionField,
typename LocalCoherenceLanczos<typename FImpl::SiteSpinor,
typename FImpl::SiteComplex,
nBasis>::CoarseField>;
END_HADRONS_NAMESPACE
#endif // Hadrons_EigenPack_hpp_
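A hypothetical use of the new container from inside a Hadrons module; the pack size, file stem and trajectory number are invented for the example, and the I/O is per-field Scidac records plus an XML list of eigenvalues, as implemented above:

```cpp
// Hypothetical usage sketch (names and sizes are illustrative).
const unsigned int nVec = 50;
GridBase *grid = env().getRbGrid();            // eigenvectors held on the red-black grid here

FermionEigenPack<FIMPL> epack(nVec, grid);     // allocates eval[nVec] and evec[nVec]
epack.read("lanczos_run", 1500);               // lanczos_run_evec.1500.bin + lanczos_run_eval.1500.xml
// ... use epack.evec / epack.eval, e.g. for deflation ...
epack.write("lanczos_run", 1500);
```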

View File

@@ -61,7 +61,7 @@ Environment::Environment(void)
 // grids ///////////////////////////////////////////////////////////////////////
 void Environment::createGrid(const unsigned int Ls)
 {
-    if (grid5d_.find(Ls) == grid5d_.end())
+    if ((Ls > 1) and (grid5d_.find(Ls) == grid5d_.end()))
     {
         auto g = getGrid();
@@ -70,6 +70,53 @@ void Environment::createGrid(const unsigned int Ls)
     }
 }
void Environment::createCoarseGrid(const std::vector<int> &blockSize,
const unsigned int Ls)
{
int nd = getNd();
std::vector<int> fineDim = getDim(), coarseDim;
unsigned int cLs;
auto key4d = blockSize, key5d = blockSize;
createGrid(Ls);
coarseDim.resize(nd);
for (int d = 0; d < coarseDim.size(); d++)
{
coarseDim[d] = fineDim[d]/blockSize[d];
if (coarseDim[d]*blockSize[d] != fineDim[d])
{
HADRON_ERROR(Size, "Fine dimension " + std::to_string(d)
+ " (" + std::to_string(fineDim[d])
+ ") not divisible by coarse dimension ("
+ std::to_string(coarseDim[d]) + ")");
}
}
if (blockSize.size() > nd)
{
cLs = Ls/blockSize[nd];
if (cLs*blockSize[nd] != Ls)
{
HADRON_ERROR(Size, "Fine Ls (" + std::to_string(Ls)
+ ") not divisible by coarse Ls ("
+ std::to_string(cLs) + ")");
}
key4d.resize(nd);
key5d.push_back(Ls);
}
gridCoarse4d_[key4d].reset(
SpaceTimeGrid::makeFourDimGrid(coarseDim,
GridDefaultSimd(nd, vComplex::Nsimd()), GridDefaultMpi()));
gridCoarseRb4d_[key4d].reset(
SpaceTimeGrid::makeFourDimRedBlackGrid(gridCoarse4d_[key4d].get()));
if (Ls > 1)
{
gridCoarse5d_[key5d].reset(
SpaceTimeGrid::makeFiveDimGrid(cLs, gridCoarse4d_[key4d].get()));
gridCoarseRb5d_[key5d].reset(
SpaceTimeGrid::makeFiveDimRedBlackGrid(cLs, gridCoarse4d_[key4d].get()));
}
}
GridCartesian * Environment::getGrid(const unsigned int Ls) const GridCartesian * Environment::getGrid(const unsigned int Ls) const
{ {
try try
@@ -104,7 +151,55 @@ GridRedBlackCartesian * Environment::getRbGrid(const unsigned int Ls) const
     }
     catch(std::out_of_range &)
     {
-        HADRON_ERROR(Definition, "no red-black 5D grid with Ls= " + std::to_string(Ls));
+        HADRON_ERROR(Definition, "no red-black grid with Ls= " + std::to_string(Ls));
     }
 }
GridCartesian * Environment::getCoarseGrid(
const std::vector<int> &blockSize, const unsigned int Ls) const
{
auto key = blockSize;
try
{
if (Ls == 1)
{
key.resize(getNd());
return gridCoarse4d_.at(key).get();
}
else
{
key.push_back(Ls);
return gridCoarse5d_.at(key).get();
}
}
catch(std::out_of_range &)
{
HADRON_ERROR(Definition, "no coarse grid with Ls= " + std::to_string(Ls));
}
}
GridRedBlackCartesian * Environment::getRbCoarseGrid(
const std::vector<int> &blockSize, const unsigned int Ls) const
{
auto key = blockSize;
try
{
if (Ls == 1)
{
key.resize(getNd());
return gridCoarseRb4d_.at(key).get();
}
else
{
key.push_back(Ls);
return gridCoarseRb5d_.at(key).get();
}
}
catch(std::out_of_range &)
{
HADRON_ERROR(Definition, "no coarse red-black grid with Ls= " + std::to_string(Ls));
    }
}

@@ -270,7 +365,7 @@ int Environment::getObjectModule(const std::string name) const
 unsigned int Environment::getObjectLs(const unsigned int address) const
 {
-    if (hasObject(address))
+    if (hasCreatedObject(address))
     {
         return object_[address].Ls;
     }
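createCoarseGrid divides every fine dimension by the corresponding entry of blockSize and, if blockSize carries a fifth entry, divides Ls by it as well; any remainder raises a Size exception. A worked example with assumed sizes (not taken from the commit):

```cpp
// Assumed sizes for illustration: 16^3 x 32 fine grid, Ls = 12, blockSize = {4,4,4,4,6}.
#include <vector>

void coarseningExample(void)
{
    std::vector<int> fineDim   = {16, 16, 16, 32};
    std::vector<int> blockSize = {4, 4, 4, 4, 6};
    unsigned int     Ls        = 12;

    std::vector<int> coarseDim(fineDim.size());
    for (size_t d = 0; d < fineDim.size(); ++d)
    {
        coarseDim[d] = fineDim[d] / blockSize[d];      // {4, 4, 4, 8}
    }
    unsigned int cLs = Ls / blockSize[fineDim.size()]; // 12 / 6 = 2
    (void) cLs;
}
```

The resulting grids are cached under the block size (with Ls appended for the 5d case), so getCoarseGrid and getRbCoarseGrid must be queried with the same blockSize that was used to create them.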

View File

@ -78,7 +78,7 @@ private:
Size size{0}; Size size{0};
Storage storage{Storage::object}; Storage storage{Storage::object};
unsigned int Ls{0}; unsigned int Ls{0};
const std::type_info *type{nullptr}; const std::type_info *type{nullptr}, *derivedType{nullptr};
std::string name; std::string name;
int module{-1}; int module{-1};
std::unique_ptr<Object> data{nullptr}; std::unique_ptr<Object> data{nullptr};
@ -86,8 +86,14 @@ private:
public: public:
// grids // grids
void createGrid(const unsigned int Ls); void createGrid(const unsigned int Ls);
void createCoarseGrid(const std::vector<int> &blockSize,
const unsigned int Ls = 1);
GridCartesian * getGrid(const unsigned int Ls = 1) const; GridCartesian * getGrid(const unsigned int Ls = 1) const;
GridRedBlackCartesian * getRbGrid(const unsigned int Ls = 1) const; GridRedBlackCartesian * getRbGrid(const unsigned int Ls = 1) const;
GridCartesian * getCoarseGrid(const std::vector<int> &blockSize,
const unsigned int Ls = 1) const;
GridRedBlackCartesian * getRbCoarseGrid(const std::vector<int> &blockSize,
const unsigned int Ls = 1) const;
std::vector<int> getDim(void) const; std::vector<int> getDim(void) const;
int getDim(const unsigned int mu) const; int getDim(const unsigned int mu) const;
unsigned long int getLocalVolume(void) const; unsigned long int getLocalVolume(void) const;
@ -110,6 +116,10 @@ public:
Ts && ... args); Ts && ... args);
void setObjectModule(const unsigned int objAddress, void setObjectModule(const unsigned int objAddress,
const int modAddress); const int modAddress);
template <typename B, typename T>
T * getDerivedObject(const unsigned int address) const;
template <typename B, typename T>
T * getDerivedObject(const std::string name) const;
template <typename T> template <typename T>
T * getObject(const unsigned int address) const; T * getObject(const unsigned int address) const;
template <typename T> template <typename T>
@ -155,6 +165,10 @@ private:
std::map<unsigned int, GridPt> grid5d_; std::map<unsigned int, GridPt> grid5d_;
GridRbPt gridRb4d_; GridRbPt gridRb4d_;
std::map<unsigned int, GridRbPt> gridRb5d_; std::map<unsigned int, GridRbPt> gridRb5d_;
std::map<std::vector<int>, GridPt> gridCoarse4d_;
std::map<std::vector<int>, GridRbPt> gridCoarseRb4d_;
std::map<std::vector<int>, GridPt> gridCoarse5d_;
std::map<std::vector<int>, GridRbPt> gridCoarseRb5d_;
unsigned int nd_; unsigned int nd_;
// random number generator // random number generator
RngPt rng4d_; RngPt rng4d_;
@@ -176,7 +190,7 @@ Holder<T>::Holder(T *pt)
 template <typename T>
 T & Holder<T>::get(void) const
 {
-    return &objPt_.get();
+    return *objPt_.get();
 }
template <typename T> template <typename T>
@ -216,22 +230,24 @@ void Environment::createDerivedObject(const std::string name,
{ {
MemoryProfiler::stats = &memStats; MemoryProfiler::stats = &memStats;
} }
size_t initMem = MemoryProfiler::stats->currentlyAllocated; size_t initMem = MemoryProfiler::stats->currentlyAllocated;
object_[address].storage = storage; object_[address].storage = storage;
object_[address].Ls = Ls; object_[address].Ls = Ls;
object_[address].data.reset(new Holder<B>(new T(std::forward<Ts>(args)...))); object_[address].data.reset(new Holder<B>(new T(std::forward<Ts>(args)...)));
object_[address].size = MemoryProfiler::stats->maxAllocated - initMem; object_[address].size = MemoryProfiler::stats->maxAllocated - initMem;
object_[address].type = &typeid(T); object_[address].type = &typeid(B);
object_[address].derivedType = &typeid(T);
if (MemoryProfiler::stats == &memStats) if (MemoryProfiler::stats == &memStats)
{ {
MemoryProfiler::stats = nullptr; MemoryProfiler::stats = nullptr;
} }
} }
// object already exists, no error if it is a cache, error otherwise // object already exists, no error if it is a cache, error otherwise
else if ((object_[address].storage != Storage::cache) or else if ((object_[address].storage != Storage::cache) or
(object_[address].storage != storage) or (object_[address].storage != storage) or
(object_[address].name != name) or (object_[address].name != name) or
(object_[address].type != &typeid(T))) (object_[address].type != &typeid(B)) or
(object_[address].derivedType != &typeid(T)))
{ {
HADRON_ERROR(Definition, "object '" + name + "' already allocated"); HADRON_ERROR(Definition, "object '" + name + "' already allocated");
} }
@ -246,21 +262,37 @@ void Environment::createObject(const std::string name,
createDerivedObject<T, T>(name, storage, Ls, std::forward<Ts>(args)...); createDerivedObject<T, T>(name, storage, Ls, std::forward<Ts>(args)...);
} }
template <typename T> template <typename B, typename T>
T * Environment::getObject(const unsigned int address) const T * Environment::getDerivedObject(const unsigned int address) const
{ {
if (hasObject(address)) if (hasObject(address))
{ {
if (hasCreatedObject(address)) if (hasCreatedObject(address))
{ {
if (auto h = dynamic_cast<Holder<T> *>(object_[address].data.get())) if (auto h = dynamic_cast<Holder<B> *>(object_[address].data.get()))
{ {
return h->getPt(); if (&typeid(T) == &typeid(B))
{
return dynamic_cast<T *>(h->getPt());
}
else
{
if (auto hder = dynamic_cast<T *>(h->getPt()))
{
return hder;
}
else
{
HADRON_ERROR(Definition, "object with address " + std::to_string(address) +
" cannot be casted to '" + typeName(&typeid(T)) +
"' (has type '" + typeName(&typeid(h->get())) + "')");
}
}
} }
else else
{ {
HADRON_ERROR(Definition, "object with address " + std::to_string(address) + HADRON_ERROR(Definition, "object with address " + std::to_string(address) +
" does not have type '" + typeName(&typeid(T)) + " does not have type '" + typeName(&typeid(B)) +
"' (has type '" + getObjectType(address) + "')"); "' (has type '" + getObjectType(address) + "')");
} }
} }
@ -276,6 +308,18 @@ T * Environment::getObject(const unsigned int address) const
} }
} }
template <typename B, typename T>
T * Environment::getDerivedObject(const std::string name) const
{
return getDerivedObject<B, T>(getObjectAddress(name));
}
template <typename T>
T * Environment::getObject(const unsigned int address) const
{
return getDerivedObject<T, T>(address);
}
template <typename T> template <typename T>
T * Environment::getObject(const std::string name) const T * Environment::getObject(const std::string name) const
{ {
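The environment now records both the base type an object is stored under and the derived type it was created as, and getDerivedObject recovers the derived class with a dynamic_cast on the held base pointer. A minimal standalone sketch of that mechanism (simplified, not the actual Environment code):

```cpp
// Simplified sketch of the base/derived object store (illustrative only).
#include <memory>
#include <stdexcept>

struct Base           { virtual ~Base() = default; };
struct Derived : Base { int payload = 42; };

int main(void)
{
    std::unique_ptr<Base> holder(new Derived);             // createDerivedObject<B, T>

    if (auto *d = dynamic_cast<Derived *>(holder.get()))   // getDerivedObject<B, T>
    {
        return d->payload;                                  // cast succeeds: use as Derived
    }
    throw std::runtime_error("stored object cannot be cast to the requested type");
}
```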

View File

@@ -37,7 +37,6 @@ See the full license in the file "LICENSE" in the top level distribution directory
 #define SRC_LOC std::string(__FUNCTION__) + " at " + std::string(__FILE__) + ":"\
                 + std::to_string(__LINE__)
 #define HADRON_ERROR(exc, msg)\
-LOG(Error) << msg << std::endl;\
 throw(Exceptions::exc(msg, SRC_LOC));
 
 #define DECL_EXC(name, base) \

View File

@@ -130,7 +130,7 @@ void GeneticScheduler<V, T>::nextGeneration(void)
     {
         initPopulation();
     }
-    LOG(Debug) << "Starting population:\n" << *this << std::endl;
+    //LOG(Debug) << "Starting population:\n" << *this << std::endl;
 
     // random mutations
     //PARALLEL_FOR_LOOP
@@ -138,7 +138,7 @@ void GeneticScheduler<V, T>::nextGeneration(void)
     {
         doMutation();
     }
-    LOG(Debug) << "After mutations:\n" << *this << std::endl;
+    //LOG(Debug) << "After mutations:\n" << *this << std::endl;
 
     // mating
     //PARALLEL_FOR_LOOP
@@ -146,14 +146,14 @@ void GeneticScheduler<V, T>::nextGeneration(void)
     {
         doCrossover();
     }
-    LOG(Debug) << "After mating:\n" << *this << std::endl;
+    //LOG(Debug) << "After mating:\n" << *this << std::endl;
 
     // grim reaper
     auto it = population_.begin();
     std::advance(it, par_.popSize);
     population_.erase(it, population_.end());
-    LOG(Debug) << "After grim reaper:\n" << *this << std::endl;
+    //LOG(Debug) << "After grim reaper:\n" << *this << std::endl;
 }
 
 // evolution steps /////////////////////////////////////////////////////////////

View File

@ -37,20 +37,38 @@ HadronsLogger Hadrons::HadronsLogWarning(1,"Warning");
HadronsLogger Hadrons::HadronsLogMessage(1,"Message"); HadronsLogger Hadrons::HadronsLogMessage(1,"Message");
HadronsLogger Hadrons::HadronsLogIterative(1,"Iterative"); HadronsLogger Hadrons::HadronsLogIterative(1,"Iterative");
HadronsLogger Hadrons::HadronsLogDebug(1,"Debug"); HadronsLogger Hadrons::HadronsLogDebug(1,"Debug");
HadronsLogger Hadrons::HadronsLogIRL(1,"IRL");
void Hadrons::initLogger(void) void Hadrons::initLogger(void)
{ {
auto w = std::string("Hadrons").length(); auto w = std::string("Hadrons").length();
int cw = 8;
GridLogError.setTopWidth(w); GridLogError.setTopWidth(w);
GridLogWarning.setTopWidth(w); GridLogWarning.setTopWidth(w);
GridLogMessage.setTopWidth(w); GridLogMessage.setTopWidth(w);
GridLogIterative.setTopWidth(w); GridLogIterative.setTopWidth(w);
GridLogDebug.setTopWidth(w); GridLogDebug.setTopWidth(w);
GridLogIRL.setTopWidth(w);
GridLogError.setChanWidth(cw);
GridLogWarning.setChanWidth(cw);
GridLogMessage.setChanWidth(cw);
GridLogIterative.setChanWidth(cw);
GridLogDebug.setChanWidth(cw);
GridLogIRL.setChanWidth(cw);
HadronsLogError.Active(GridLogError.isActive()); HadronsLogError.Active(GridLogError.isActive());
HadronsLogWarning.Active(GridLogWarning.isActive()); HadronsLogWarning.Active(GridLogWarning.isActive());
HadronsLogMessage.Active(GridLogMessage.isActive()); HadronsLogMessage.Active(GridLogMessage.isActive());
HadronsLogIterative.Active(GridLogIterative.isActive()); HadronsLogIterative.Active(GridLogIterative.isActive());
HadronsLogDebug.Active(GridLogDebug.isActive()); HadronsLogDebug.Active(GridLogDebug.isActive());
HadronsLogIRL.Active(GridLogIRL.isActive());
HadronsLogError.setChanWidth(cw);
HadronsLogWarning.setChanWidth(cw);
HadronsLogMessage.setChanWidth(cw);
HadronsLogIterative.setChanWidth(cw);
HadronsLogDebug.setChanWidth(cw);
HadronsLogIRL.setChanWidth(cw);
} }
// type utilities ////////////////////////////////////////////////////////////// // type utilities //////////////////////////////////////////////////////////////

View File

@@ -43,12 +43,15 @@ See the full license in the file "LICENSE" in the top level distribution directory
 namespace Grid {\
 using namespace QCD;\
 namespace Hadrons {\
-using Grid::operator<<;
+using Grid::operator<<;\
+using Grid::operator>>;
 
 #define END_HADRONS_NAMESPACE }}
 
 #define BEGIN_MODULE_NAMESPACE(name)\
 namespace name {\
-using Grid::operator<<;
+using Grid::operator<<;\
+using Grid::operator>>;
 
 #define END_MODULE_NAMESPACE }
/* the 'using Grid::operator<<;' statement prevents a very nasty compilation /* the 'using Grid::operator<<;' statement prevents a very nasty compilation
@ -58,6 +61,9 @@ using Grid::operator<<;
#ifndef FIMPL #ifndef FIMPL
#define FIMPL WilsonImplR #define FIMPL WilsonImplR
#endif #endif
#ifndef ZFIMPL
#define ZFIMPL ZWilsonImplR
#endif
#ifndef SIMPL #ifndef SIMPL
#define SIMPL ScalarImplCR #define SIMPL ScalarImplCR
#endif #endif
@ -111,6 +117,7 @@ extern HadronsLogger HadronsLogWarning;
extern HadronsLogger HadronsLogMessage; extern HadronsLogger HadronsLogMessage;
extern HadronsLogger HadronsLogIterative; extern HadronsLogger HadronsLogIterative;
extern HadronsLogger HadronsLogDebug; extern HadronsLogger HadronsLogDebug;
extern HadronsLogger HadronsLogIRL;
void initLogger(void); void initLogger(void);
@@ -180,6 +187,18 @@ typedef XmlWriter ResultWriter;
 #define RESULT_FILE_NAME(name) \
 name + "." + std::to_string(vm().getTrajectory()) + "." + resultFileExt
 
+// default Schur convention
+#ifndef HADRONS_DEFAULT_SCHUR
+#define HADRONS_DEFAULT_SCHUR DiagTwo
+#endif
+#define _HADRONS_SCHUR_OP_(conv) Schur##conv##Operator
+#define HADRONS_SCHUR_OP(conv) _HADRONS_SCHUR_OP_(conv)
+#define HADRONS_DEFAULT_SCHUR_OP HADRONS_SCHUR_OP(HADRONS_DEFAULT_SCHUR)
+#define _HADRONS_SCHUR_SOLVE_(conv) SchurRedBlack##conv##Solve
+#define HADRONS_SCHUR_SOLVE(conv) _HADRONS_SCHUR_SOLVE_(conv)
+#define HADRONS_DEFAULT_SCHUR_SOLVE HADRONS_SCHUR_SOLVE(HADRONS_DEFAULT_SCHUR)
+
 END_HADRONS_NAMESPACE
 
 #include <Grid/Hadrons/Exceptions.hpp>
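The new macros build the default Schur operator and solver class names by token pasting the convention into Grid's class names. With the default DiagTwo convention they expand as in the comments below; the solver line is a hypothetical usage sketch, assuming Grid's red-black solver interface.

```cpp
// Expansion of the new macros with HADRONS_DEFAULT_SCHUR = DiagTwo:
//   HADRONS_SCHUR_OP(DiagTwo)     -> SchurDiagTwoOperator
//   HADRONS_SCHUR_SOLVE(DiagTwo)  -> SchurRedBlackDiagTwoSolve
//   HADRONS_DEFAULT_SCHUR_OP      -> SchurDiagTwoOperator
//   HADRONS_DEFAULT_SCHUR_SOLVE   -> SchurRedBlackDiagTwoSolve
//
// Hypothetical use in a solver module (FermionField, action and cg assumed):
HADRONS_DEFAULT_SCHUR_SOLVE<FermionField> solver(cg);  // SchurRedBlackDiagTwoSolve<FermionField>
solver(action, source, solution);                       // red-black preconditioned solve
```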

View File

@@ -56,14 +56,26 @@ int main(int argc, char *argv[])
     Grid_init(&argc, &argv);
 
     // execution
-    Application application(parameterFileName);
-
-    application.parseParameterFile(parameterFileName);
-    if (!scheduleFileName.empty())
+    try
     {
-        application.loadSchedule(scheduleFileName);
+        Application application(parameterFileName);
+
+        application.parseParameterFile(parameterFileName);
+        if (!scheduleFileName.empty())
+        {
+            application.loadSchedule(scheduleFileName);
+        }
+        application.run();
+    }
+    catch (const std::exception& e)
+    {
+        LOG(Error) << "FATAL ERROR -- Exception " << typeName(&typeid(e)) << std::endl;
+        LOG(Error) << e.what() << std::endl;
+        LOG(Error) << "Aborting program" << std::endl;
+        Grid_finalize();
+
+        return EXIT_FAILURE;
     }
-    application.run();
 
     // epilogue
     LOG(Message) << "Grid is finalizing now" << std::endl;

View File

@ -1,65 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/HadronsXmlSchedule.cc
Copyright (C) 2015-2018
Author: Antonin Portelli <antonin.portelli@me.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Hadrons/Application.hpp>
using namespace Grid;
using namespace QCD;
using namespace Hadrons;
int main(int argc, char *argv[])
{
// parse command line
std::string parameterFileName, scheduleFileName;
if (argc < 3)
{
std::cerr << "usage: " << argv[0] << " <parameter file> <schedule output> [Grid options]";
std::cerr << std::endl;
std::exit(EXIT_FAILURE);
}
parameterFileName = argv[1];
scheduleFileName = argv[2];
// initialization
Grid_init(&argc, &argv);
// execution
Application application;
application.parseParameterFile(parameterFileName);
application.schedule();
application.printSchedule();
application.saveSchedule(scheduleFileName);
// epilogue
LOG(Message) << "Grid is finalizing now" << std::endl;
Grid_finalize();
return EXIT_SUCCESS;
}

View File

@@ -1,5 +1,5 @@
 lib_LIBRARIES = libHadrons.a
-bin_PROGRAMS = HadronsXmlRun HadronsXmlSchedule
+bin_PROGRAMS = HadronsXmlRun
 
 include modules.inc
@@ -21,6 +21,7 @@ nobase_libHadrons_a_HEADERS = \
 	GeneticScheduler.hpp \
 	Global.hpp \
 	Graph.hpp \
+	EigenPack.hpp \
 	Module.hpp \
 	Modules.hpp \
 	ModuleFactory.hpp \
@@ -28,6 +29,3 @@ nobase_libHadrons_a_HEADERS = \
 HadronsXmlRun_SOURCES = HadronsXmlRun.cc
 HadronsXmlRun_LDADD = libHadrons.a -lGrid
-
-HadronsXmlSchedule_SOURCES = HadronsXmlSchedule.cc
-HadronsXmlSchedule_LDADD = libHadrons.a -lGrid

View File

@@ -91,6 +91,9 @@ static ns##mod##ModuleRegistrar ns##mod##ModuleRegistrarInstance;
 #define envGet(type, name)\
 *env().template getObject<type>(name)
 
+#define envGetDerived(base, type, name)\
+*env().template getDerivedObject<base, type>(name)
+
 #define envGetTmp(type, var)\
 type &var = *env().template getObject<type>(getName() + "_tmp_" + #var)
 
@@ -137,6 +140,13 @@ envTmp(type, name, Ls, env().getGrid(Ls))
 #define envTmpLat(...)\
 MACRO_REDIRECT(__VA_ARGS__, envTmpLat5, envTmpLat4)(__VA_ARGS__)
 
+#define saveResult(ioStem, name, result)\
+if (env().getGrid()->IsBoss())\
+{\
+    ResultWriter _writer(RESULT_FILE_NAME(ioStem));\
+    write(_writer, name, result);\
+}
+
 /******************************************************************************
  *                             Module class                                   *
  ******************************************************************************/
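Two conveniences for module writers follow from these additions: envGetDerived retrieves an object through its base class while keeping the derived type it was created with, and saveResult writes a result file on the boss rank only. A hypothetical module fragment (the parameter names and the Result type are illustrative, not from the commit):

```cpp
// Hypothetical module code; par().eigenPack, par().output and Result are illustrative.
typedef FermionEigenPack<FIMPL>                                       BasePack;
typedef CoarseFermionEigenPack<FIMPL, HADRONS_DEFAULT_LANCZOS_NBASIS> CoarsePack;

// fetch through the base class, use as the derived type it was created as
auto &epack = envGetDerived(BasePack, CoarsePack, par().eigenPack);

// write the result once per job (boss rank only), not once per MPI rank
Result result;
// ... fill result ...
saveResult(par().output, "meson", result);
```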

View File

@ -7,7 +7,9 @@ Source file: extras/Hadrons/Modules.hpp
Copyright (C) 2015-2018 Copyright (C) 2015-2018
Author: Antonin Portelli <antonin.portelli@me.com> Author: Antonin Portelli <antonin.portelli@me.com>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
Author: Lanny91 <andrew.lawson@gmail.com> Author: Lanny91 <andrew.lawson@gmail.com>
Author: pretidav <david.preti@csic.es>
This program is free software; you can redistribute it and/or modify This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
@ -43,9 +45,11 @@ See the full license in the file "LICENSE" in the top level distribution directo
#include <Grid/Hadrons/Modules/MSource/SeqConserved.hpp>
#include <Grid/Hadrons/Modules/MSink/Smear.hpp>
#include <Grid/Hadrons/Modules/MSink/Point.hpp>
#include <Grid/Hadrons/Modules/MSolver/LocalCoherenceLanczos.hpp>
#include <Grid/Hadrons/Modules/MSolver/RBPrecCG.hpp>
#include <Grid/Hadrons/Modules/MGauge/Unit.hpp>
#include <Grid/Hadrons/Modules/MGauge/Random.hpp>
#include <Grid/Hadrons/Modules/MGauge/FundtoHirep.hpp>
#include <Grid/Hadrons/Modules/MGauge/StochEm.hpp>
#include <Grid/Hadrons/Modules/MUtilities/TestSeqGamma.hpp>
#include <Grid/Hadrons/Modules/MUtilities/TestSeqConserved.hpp>
@ -55,8 +59,18 @@ See the full license in the file "LICENSE" in the top level distribution directo
#include <Grid/Hadrons/Modules/MScalar/ChargedProp.hpp>
#include <Grid/Hadrons/Modules/MAction/DWF.hpp>
#include <Grid/Hadrons/Modules/MAction/Wilson.hpp>
#include <Grid/Hadrons/Modules/MAction/WilsonClover.hpp>
#include <Grid/Hadrons/Modules/MAction/ZMobiusDWF.hpp>
#include <Grid/Hadrons/Modules/MScalarSUN/ShiftProbe.hpp>
#include <Grid/Hadrons/Modules/MScalarSUN/Div.hpp>
#include <Grid/Hadrons/Modules/MScalarSUN/TrMag.hpp>
#include <Grid/Hadrons/Modules/MScalarSUN/EMT.hpp>
#include <Grid/Hadrons/Modules/MScalarSUN/TwoPoint.hpp>
#include <Grid/Hadrons/Modules/MScalarSUN/TrPhi.hpp>
#include <Grid/Hadrons/Modules/MScalarSUN/Utils.hpp>
#include <Grid/Hadrons/Modules/MScalarSUN/TransProj.hpp>
#include <Grid/Hadrons/Modules/MScalarSUN/TrKinetic.hpp>
#include <Grid/Hadrons/Modules/MIO/LoadEigenPack.hpp>
#include <Grid/Hadrons/Modules/MIO/LoadNersc.hpp>
#include <Grid/Hadrons/Modules/MIO/LoadCoarseEigenPack.hpp>
#include <Grid/Hadrons/Modules/MIO/LoadBinary.hpp>

View File

@ -0,0 +1,154 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MAction/WilsonClover.hpp
Copyright (C) 2015-2018
Author: Antonin Portelli <antonin.portelli@me.com>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
Author: pretidav <david.preti@csic.es>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_MAction_WilsonClover_hpp_
#define Hadrons_MAction_WilsonClover_hpp_
#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
#include <Grid/Hadrons/ModuleFactory.hpp>
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
* TWilsonClover quark action *
******************************************************************************/
BEGIN_MODULE_NAMESPACE(MAction)
class WilsonCloverPar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(WilsonCloverPar,
std::string, gauge,
double , mass,
double , csw_r,
double , csw_t,
WilsonAnisotropyCoefficients ,clover_anisotropy,
std::string, boundary
);
};
template <typename FImpl>
class TWilsonClover: public Module<WilsonCloverPar>
{
public:
FGS_TYPE_ALIASES(FImpl,);
public:
// constructor
TWilsonClover(const std::string name);
// destructor
virtual ~TWilsonClover(void) = default;
// dependencies/products
virtual std::vector<std::string> getInput(void);
virtual std::vector<std::string> getOutput(void);
// setup
virtual void setup(void);
// execution
virtual void execute(void);
};
MODULE_REGISTER_NS(WilsonClover, TWilsonClover<FIMPL>, MAction);
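// Illustrative use from an application (sketch; the module name
// "clover_action" and the parameter values are hypothetical):
//     MAction::WilsonClover::Par actionPar;
//     actionPar.gauge    = "gauge";
//     actionPar.mass     = 0.1;
//     actionPar.csw_r    = 1.0;
//     actionPar.csw_t    = 1.0;
//     actionPar.boundary = "1 1 1 -1";
//     application.createModule<MAction::WilsonClover>("clover_action", actionPar);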
/******************************************************************************
* TWilsonClover template implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
template <typename FImpl>
TWilsonClover<FImpl>::TWilsonClover(const std::string name)
: Module<WilsonCloverPar>(name)
{}
// dependencies/products ///////////////////////////////////////////////////////
template <typename FImpl>
std::vector<std::string> TWilsonClover<FImpl>::getInput(void)
{
std::vector<std::string> in = {par().gauge};
return in;
}
template <typename FImpl>
std::vector<std::string> TWilsonClover<FImpl>::getOutput(void)
{
std::vector<std::string> out = {getName()};
return out;
}
// setup ///////////////////////////////////////////////////////////////////////
template <typename FImpl>
void TWilsonClover<FImpl>::setup(void)
{
//unsigned int size;
// size = 2*env().template lattice4dSize<typename FImpl::DoubledGaugeField>();
// env().registerObject(getName(), size);
LOG(Message) << "Setting up TWilsonClover fermion matrix with m= " << par().mass
<< " using gauge field '" << par().gauge << "'" << std::endl;
LOG(Message) << "Fermion boundary conditions: " << par().boundary
<< std::endl;
LOG(Message) << "Clover term csw_r: " << par().csw_r
<< " csw_t: " << par().csw_t
<< std::endl;
auto &U = envGet(LatticeGaugeField, par().gauge);
auto &grid = *env().getGrid();
auto &gridRb = *env().getRbGrid();
std::vector<Complex> boundary = strToVec<Complex>(par().boundary);
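// par().boundary holds one complex phase per direction; an illustrative
// value would be "1 1 1 -1" for fermions antiperiodic in time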
typename WilsonCloverFermion<FImpl>::ImplParams implParams(boundary);
envCreateDerived(FMat, WilsonCloverFermion<FImpl>, getName(), 1, U, grid, gridRb, par().mass,
par().csw_r,
par().csw_t,
par().clover_anisotropy,
implParams);
//FMat *fMatPt = new WilsonCloverFermion<FImpl>(U, grid, gridRb, par().mass,
// par().csw_r,
// par().csw_t,
// par().clover_anisotropy,
// implParams);
//env().setObject(getName(), fMatPt);
}
// execution ///////////////////////////////////////////////////////////////////
template <typename FImpl>
void TWilsonClover<FImpl>::execute()
{
}
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_MAction_WilsonClover_hpp_

View File

@ -0,0 +1,143 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MAction/ZMobiusDWF.hpp
Copyright (C) 2015-2018
Author: Antonin Portelli <antonin.portelli@me.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_MAction_ZMobiusDWF_hpp_
#define Hadrons_MAction_ZMobiusDWF_hpp_
#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
#include <Grid/Hadrons/ModuleFactory.hpp>
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
* ZMobiusDWF *
******************************************************************************/
BEGIN_MODULE_NAMESPACE(MAction)
class ZMobiusDWFPar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(ZMobiusDWFPar,
std::string , gauge,
unsigned int , Ls,
double , mass,
double , M5,
double , b,
double , c,
std::vector<std::complex<double>>, omega,
std::string , boundary);
};
template <typename FImpl>
class TZMobiusDWF: public Module<ZMobiusDWFPar>
{
public:
FGS_TYPE_ALIASES(FImpl,);
public:
// constructor
TZMobiusDWF(const std::string name);
// destructor
virtual ~TZMobiusDWF(void) = default;
// dependency relation
virtual std::vector<std::string> getInput(void);
virtual std::vector<std::string> getOutput(void);
// setup
virtual void setup(void);
// execution
virtual void execute(void);
};
MODULE_REGISTER_NS(ZMobiusDWF, TZMobiusDWF<ZFIMPL>, MAction);
/******************************************************************************
* TZMobiusDWF implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
template <typename FImpl>
TZMobiusDWF<FImpl>::TZMobiusDWF(const std::string name)
: Module<ZMobiusDWFPar>(name)
{}
// dependencies/products ///////////////////////////////////////////////////////
template <typename FImpl>
std::vector<std::string> TZMobiusDWF<FImpl>::getInput(void)
{
std::vector<std::string> in = {par().gauge};
return in;
}
template <typename FImpl>
std::vector<std::string> TZMobiusDWF<FImpl>::getOutput(void)
{
std::vector<std::string> out = {getName()};
return out;
}
// setup ///////////////////////////////////////////////////////////////////////
template <typename FImpl>
void TZMobiusDWF<FImpl>::setup(void)
{
LOG(Message) << "Setting up z-Mobius domain wall fermion matrix with m= "
<< par().mass << ", M5= " << par().M5 << ", Ls= " << par().Ls
<< ", b= " << par().b << ", c= " << par().c
<< " using gauge field '" << par().gauge << "'"
<< std::endl;
LOG(Message) << "Omegas: " << std::endl;
for (unsigned int i = 0; i < par().omega.size(); ++i)
{
LOG(Message) << " omega[" << i << "]= " << par().omega[i] << std::endl;
}
LOG(Message) << "Fermion boundary conditions: " << par().boundary
<< std::endl;
env().createGrid(par().Ls);
auto &U = envGet(LatticeGaugeField, par().gauge);
auto &g4 = *env().getGrid();
auto &grb4 = *env().getRbGrid();
auto &g5 = *env().getGrid(par().Ls);
auto &grb5 = *env().getRbGrid(par().Ls);
auto omega = par().omega;
std::vector<Complex> boundary = strToVec<Complex>(par().boundary);
typename ZMobiusFermion<FImpl>::ImplParams implParams(boundary);
envCreateDerived(FMat, ZMobiusFermion<FImpl>, getName(), par().Ls, U, g5,
grb5, g4, grb4, par().mass, par().M5, omega,
par().b, par().c, implParams);
}
// execution ///////////////////////////////////////////////////////////////////
template <typename FImpl>
void TZMobiusDWF<FImpl>::execute(void)
{}
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_MAction_ZMobiusDWF_hpp_

View File

@ -122,7 +122,6 @@ void TBaryon<FImpl1, FImpl2, FImpl3>::execute(void)
<< " quarks '" << par().q1 << "', '" << par().q2 << "', and '" << " quarks '" << par().q1 << "', '" << par().q2 << "', and '"
<< par().q3 << "'" << std::endl; << par().q3 << "'" << std::endl;
ResultWriter writer(RESULT_FILE_NAME(par().output));
auto &q1 = envGet(PropagatorField1, par().q1); auto &q1 = envGet(PropagatorField1, par().q1);
auto &q2 = envGet(PropagatorField2, par().q2); auto &q2 = envGet(PropagatorField2, par().q2);
auto &q3 = envGet(PropagatorField3, par().q2); auto &q3 = envGet(PropagatorField3, par().q2);
@ -131,7 +130,7 @@ void TBaryon<FImpl1, FImpl2, FImpl3>::execute(void)
// FIXME: do contractions
// write(writer, "meson", result); // saveResult(par().output, "meson", result);
}
END_MODULE_NAMESPACE

View File

@ -119,7 +119,6 @@ void TDiscLoop<FImpl>::execute(void)
<< "' using '" << par().q_loop << "' with " << par().gamma << "' using '" << par().q_loop << "' with " << par().gamma
<< " insertion." << std::endl; << " insertion." << std::endl;
ResultWriter writer(RESULT_FILE_NAME(par().output));
auto &q_loop = envGet(PropagatorField, par().q_loop); auto &q_loop = envGet(PropagatorField, par().q_loop);
Gamma gamma(par().gamma); Gamma gamma(par().gamma);
std::vector<TComplex> buf; std::vector<TComplex> buf;
@ -128,15 +127,13 @@ void TDiscLoop<FImpl>::execute(void)
envGetTmp(LatticeComplex, c);
c = trace(gamma*q_loop);
sliceSum(c, buf, Tp);
result.gamma = par().gamma;
result.corr.resize(buf.size());
for (unsigned int t = 0; t < buf.size(); ++t)
{
result.corr[t] = TensorRemove(buf[t]);
}
saveResult(par().output, "disc", result);
write(writer, "disc", result);
}
END_MODULE_NAMESPACE

View File

@ -153,7 +153,6 @@ void TGamma3pt<FImpl1, FImpl2, FImpl3>::execute(void)
// Initialise variables. q2 and q3 are normal propagators, q1 may be
// sink smeared.
ResultWriter writer(RESULT_FILE_NAME(par().output));
auto &q1 = envGet(SlicedPropagator1, par().q1);
auto &q2 = envGet(PropagatorField2, par().q2);
auto &q3 = envGet(PropagatorField2, par().q3);
@ -175,8 +174,7 @@ void TGamma3pt<FImpl1, FImpl2, FImpl3>::execute(void)
{
result.corr[t] = TensorRemove(buf[t]);
}
saveResult(par().output, "gamma3pt", result);
write(writer, "gamma3pt", result);
}
END_MODULE_NAMESPACE

View File

@ -172,7 +172,6 @@ void TMeson<FImpl1, FImpl2>::execute(void)
<< " quarks '" << par().q1 << "' and '" << par().q2 << "'" << " quarks '" << par().q1 << "' and '" << par().q2 << "'"
<< std::endl; << std::endl;
ResultWriter writer(RESULT_FILE_NAME(par().output));
std::vector<TComplex> buf; std::vector<TComplex> buf;
std::vector<Result> result; std::vector<Result> result;
Gamma g5(Gamma::Algebra::Gamma5); Gamma g5(Gamma::Algebra::Gamma5);
@ -239,7 +238,7 @@ void TMeson<FImpl1, FImpl2>::execute(void)
}
}
}
write(writer, "meson", result); saveResult(par().output, "meson", result);
}
END_MODULE_NAMESPACE

View File

@ -104,7 +104,6 @@ void TWeakHamiltonianEye::execute(void)
<< par().q2 << ", '" << par().q3 << "' and '" << par().q4 << par().q2 << ", '" << par().q3 << "' and '" << par().q4
<< "'." << std::endl; << "'." << std::endl;
ResultWriter writer(RESULT_FILE_NAME(par().output));
auto &q1 = envGet(SlicedPropagator, par().q1); auto &q1 = envGet(SlicedPropagator, par().q1);
auto &q2 = envGet(PropagatorField, par().q2); auto &q2 = envGet(PropagatorField, par().q2);
auto &q3 = envGet(PropagatorField, par().q3); auto &q3 = envGet(PropagatorField, par().q3);
@ -147,5 +146,6 @@ void TWeakHamiltonianEye::execute(void)
SUM_MU(expbuf, E_body[mu]*E_loop[mu])
MAKE_DIAG(expbuf, corrbuf, result[E_diag], "HW_E")
write(writer, "HW_Eye", result); // IO
saveResult(par().output, "HW_Eye", result);
}

View File

@ -104,7 +104,6 @@ void TWeakHamiltonianNonEye::execute(void)
<< par().q2 << ", '" << par().q3 << "' and '" << par().q4 << par().q2 << ", '" << par().q3 << "' and '" << par().q4
<< "'." << std::endl; << "'." << std::endl;
ResultWriter writer(RESULT_FILE_NAME(par().output));
auto &q1 = envGet(PropagatorField, par().q1); auto &q1 = envGet(PropagatorField, par().q1);
auto &q2 = envGet(PropagatorField, par().q2); auto &q2 = envGet(PropagatorField, par().q2);
auto &q3 = envGet(PropagatorField, par().q3); auto &q3 = envGet(PropagatorField, par().q3);
@ -144,5 +143,6 @@ void TWeakHamiltonianNonEye::execute(void)
SUM_MU(expbuf, W_i_side_loop[mu]*W_f_side_loop[mu])
MAKE_DIAG(expbuf, corrbuf, result[W_diag], "HW_W")
write(writer, "HW_NonEye", result); // IO
saveResult(par().output, "HW_NonEye", result);
}

View File

@ -104,7 +104,6 @@ void TWeakNeutral4ptDisc::execute(void)
<< par().q2 << ", '" << par().q3 << "' and '" << par().q4 << par().q2 << ", '" << par().q3 << "' and '" << par().q4
<< "'." << std::endl; << "'." << std::endl;
ResultWriter writer(RESULT_FILE_NAME(par().output));
auto &q1 = envGet(PropagatorField, par().q1); auto &q1 = envGet(PropagatorField, par().q1);
auto &q2 = envGet(PropagatorField, par().q2); auto &q2 = envGet(PropagatorField, par().q2);
auto &q3 = envGet(PropagatorField, par().q3); auto &q3 = envGet(PropagatorField, par().q3);
@ -138,5 +137,6 @@ void TWeakNeutral4ptDisc::execute(void)
expbuf *= curr;
MAKE_DIAG(expbuf, corrbuf, result[neut_disc_2_diag], "HW_disc0_2")
write(writer, "HW_disc0", result); // IO
saveResult(par().output, "HW_disc0", result);
}

View File

@ -7,7 +7,9 @@ Source file: extras/Hadrons/Modules/MFermion/GaugeProp.hpp
Copyright (C) 2015-2018
Author: Antonin Portelli <antonin.portelli@me.com>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
Author: Lanny91 <andrew.lawson@gmail.com>
Author: pretidav <david.preti@csic.es>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -94,7 +96,6 @@ private:
};
MODULE_REGISTER_NS(GaugeProp, TGaugeProp<FIMPL>, MFermion);
/******************************************************************************
* TGaugeProp implementation *
******************************************************************************/
@ -154,7 +155,7 @@ void TGaugeProp<FImpl>::execute(void)
LOG(Message) << "Inverting using solver '" << par().solver LOG(Message) << "Inverting using solver '" << par().solver
<< "' on source '" << par().source << "'" << std::endl; << "' on source '" << par().source << "'" << std::endl;
for (unsigned int s = 0; s < Ns; ++s) for (unsigned int s = 0; s < Ns; ++s)
for (unsigned int c = 0; c < Nc; ++c) for (unsigned int c = 0; c < FImpl::Dimension; ++c)
{ {
LOG(Message) << "Inversion for spin= " << s << ", color= " << c LOG(Message) << "Inversion for spin= " << s << ", color= " << c
<< std::endl; << std::endl;
@ -163,11 +164,11 @@ void TGaugeProp<FImpl>::execute(void)
{
if (Ls_ == 1)
{
PropToFerm(source, fullSrc, s, c); PropToFerm<FImpl>(source, fullSrc, s, c);
}
else
{
PropToFerm(tmp, fullSrc, s, c); PropToFerm<FImpl>(tmp, fullSrc, s, c);
make_5D(tmp, source, Ls_);
}
}
@ -180,18 +181,18 @@ void TGaugeProp<FImpl>::execute(void)
}
else
{
PropToFerm(source, fullSrc, s, c); PropToFerm<FImpl>(source, fullSrc, s, c);
}
}
sol = zero;
solver(sol, source);
FermToProp(prop, sol, s, c); FermToProp<FImpl>(prop, sol, s, c);
// create 4D propagators from 5D one if necessary
if (Ls_ > 1)
{
PropagatorField &p4d = envGet(PropagatorField, getName());
make_4D(sol, tmp, Ls_);
FermToProp(p4d, tmp, s, c); FermToProp<FImpl>(p4d, tmp, s, c);
}
}
}

View File

@ -0,0 +1,77 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MGauge/FundtoHirep.cc
Copyright (C) 2015
Copyright (C) 2016
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Hadrons/Modules/MGauge/FundtoHirep.hpp>
using namespace Grid;
using namespace Hadrons;
using namespace MGauge;
// constructor /////////////////////////////////////////////////////////////////
template <class Rep>
TFundtoHirep<Rep>::TFundtoHirep(const std::string name)
: Module<FundtoHirepPar>(name)
{}
// dependencies/products ///////////////////////////////////////////////////////
template <class Rep>
std::vector<std::string> TFundtoHirep<Rep>::getInput(void)
{
std::vector<std::string> in = {par().gaugeconf};
return in;
}
template <class Rep>
std::vector<std::string> TFundtoHirep<Rep>::getOutput(void)
{
std::vector<std::string> out = {getName()};
return out;
}
// setup ///////////////////////////////////////////////////////////////////////
template <typename Rep>
void TFundtoHirep<Rep>::setup(void)
{
envCreateLat(Rep::LatticeField, getName());
}
// execution ///////////////////////////////////////////////////////////////////
template <class Rep>
void TFundtoHirep<Rep>::execute(void)
{
LOG(Message) << "Transforming Representation" << std::endl;
auto &U = envGet(LatticeGaugeField, par().gaugeconf);
auto &URep = envGet(Rep::LatticeField, getName());
Rep TargetRepresentation(U._grid);
TargetRepresentation.update_representation(U);
URep = TargetRepresentation.U;
}

View File

@ -0,0 +1,76 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MGauge/FundtoHirep.hpp
Copyright (C) 2015-2018
Author: Antonin Portelli <antonin.portelli@me.com>
Author: pretidav <david.preti@csic.es>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_MGauge_FundtoHirep_hpp_
#define Hadrons_MGauge_FundtoHirep_hpp_
#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
#include <Grid/Hadrons/ModuleFactory.hpp>
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
* Convert a fundamental gauge field into a higher representation *
******************************************************************************/
BEGIN_MODULE_NAMESPACE(MGauge)
class FundtoHirepPar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(FundtoHirepPar,
std::string, gaugeconf);
};
template <class Rep>
class TFundtoHirep: public Module<FundtoHirepPar>
{
public:
// constructor
TFundtoHirep(const std::string name);
// destructor
virtual ~TFundtoHirep(void) = default;
// dependency relation
virtual std::vector<std::string> getInput(void);
virtual std::vector<std::string> getOutput(void);
// setup
void setup(void);
// execution
void execute(void);
};
//MODULE_REGISTER_NS(FundtoAdjoint, TFundtoHirep<AdjointRepresentation>, MGauge);
//MODULE_REGISTER_NS(FundtoTwoIndexSym, TFundtoHirep<TwoIndexSymmetricRepresentation>, MGauge);
//MODULE_REGISTER_NS(FundtoTwoIndexAsym, TFundtoHirep<TwoIndexAntiSymmetricRepresentation>, MGauge);
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_MGauge_FundtoHirep_hpp_

View File

@ -57,9 +57,11 @@ std::vector<std::string> TStochEm::getOutput(void)
// setup ///////////////////////////////////////////////////////////////////////
void TStochEm::setup(void)
{
create_weight = false;
if (!env().hasCreatedObject("_" + getName() + "_weight"))
{
envCacheLat(EmComp, "_" + getName() + "_weight");
create_weight = true;
}
envCreateLat(EmField, getName());
}
@ -67,13 +69,13 @@ void TStochEm::setup(void)
// execution ///////////////////////////////////////////////////////////////////
void TStochEm::execute(void)
{
LOG(Message) << "Generating stochatic EM potential..." << std::endl; LOG(Message) << "Generating stochastic EM potential..." << std::endl;
PhotonR photon(par().gauge, par().zmScheme);
auto &a = envGet(EmField, getName());
auto &w = envGet(EmComp, "_" + getName() + "_weight");
if (!env().hasCreatedObject("_" + getName() + "_weight")) if (create_weight)
{
LOG(Message) << "Caching stochastic EM potential weight (gauge: "
<< par().gauge << ", zero-mode scheme: "

View File

@ -7,6 +7,7 @@ Source file: extras/Hadrons/Modules/MGauge/StochEm.hpp
Copyright (C) 2015-2018
Author: Antonin Portelli <antonin.portelli@me.com>
Author: Vera Guelpers <vmg1n14@soton.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -60,6 +61,8 @@ public:
// dependency relation
virtual std::vector<std::string> getInput(void);
virtual std::vector<std::string> getOutput(void);
private:
bool create_weight;
protected:
// setup
virtual void setup(void);

View File

@ -0,0 +1,126 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MIO/LoadCoarseEigenPack.hpp
Copyright (C) 2015-2018
Author: Antonin Portelli <antonin.portelli@me.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_MIO_LoadCoarseEigenPack_hpp_
#define Hadrons_MIO_LoadCoarseEigenPack_hpp_
#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
#include <Grid/Hadrons/ModuleFactory.hpp>
#include <Grid/Hadrons/EigenPack.hpp>
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
* Load local coherence eigen vectors/values package *
******************************************************************************/
BEGIN_MODULE_NAMESPACE(MIO)
class LoadCoarseEigenPackPar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(LoadCoarseEigenPackPar,
std::string, filestem,
unsigned int, sizeFine,
unsigned int, sizeCoarse,
unsigned int, Ls,
std::vector<int>, blockSize);
};
template <typename Pack>
class TLoadCoarseEigenPack: public Module<LoadCoarseEigenPackPar>
{
public:
typedef CoarseEigenPack<typename Pack::Field, typename Pack::CoarseField> BasePack;
public:
// constructor
TLoadCoarseEigenPack(const std::string name);
// destructor
virtual ~TLoadCoarseEigenPack(void) = default;
// dependency relation
virtual std::vector<std::string> getInput(void);
virtual std::vector<std::string> getOutput(void);
// setup
virtual void setup(void);
// execution
virtual void execute(void);
};
MODULE_REGISTER_NS(LoadCoarseFermionEigenPack,
ARG(TLoadCoarseEigenPack<CoarseFermionEigenPack<FIMPL, HADRONS_DEFAULT_LANCZOS_NBASIS>>), MIO);
/******************************************************************************
* TLoadCoarseEigenPack implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
template <typename Pack>
TLoadCoarseEigenPack<Pack>::TLoadCoarseEigenPack(const std::string name)
: Module<LoadCoarseEigenPackPar>(name)
{}
// dependencies/products ///////////////////////////////////////////////////////
template <typename Pack>
std::vector<std::string> TLoadCoarseEigenPack<Pack>::getInput(void)
{
std::vector<std::string> in;
return in;
}
template <typename Pack>
std::vector<std::string> TLoadCoarseEigenPack<Pack>::getOutput(void)
{
std::vector<std::string> out = {getName()};
return out;
}
// setup ///////////////////////////////////////////////////////////////////////
template <typename Pack>
void TLoadCoarseEigenPack<Pack>::setup(void)
{
env().createGrid(par().Ls);
env().createCoarseGrid(par().blockSize, par().Ls);
envCreateDerived(BasePack, Pack, getName(), par().Ls, par().sizeFine,
par().sizeCoarse, env().getRbGrid(par().Ls),
env().getCoarseGrid(par().blockSize, par().Ls));
}
// execution ///////////////////////////////////////////////////////////////////
template <typename Pack>
void TLoadCoarseEigenPack<Pack>::execute(void)
{
auto &epack = envGetDerived(BasePack, Pack, getName());
epack.read(par().filestem, vm().getTrajectory());
}
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_MIO_LoadCoarseEigenPack_hpp_

View File

@ -0,0 +1,121 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MIO/LoadEigenPack.hpp
Copyright (C) 2015-2018
Author: Antonin Portelli <antonin.portelli@me.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_MIO_LoadEigenPack_hpp_
#define Hadrons_MIO_LoadEigenPack_hpp_
#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
#include <Grid/Hadrons/ModuleFactory.hpp>
#include <Grid/Hadrons/EigenPack.hpp>
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
* Load eigen vectors/values package *
******************************************************************************/
BEGIN_MODULE_NAMESPACE(MIO)
class LoadEigenPackPar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(LoadEigenPackPar,
std::string, filestem,
unsigned int, size,
unsigned int, Ls);
};
template <typename Pack>
class TLoadEigenPack: public Module<LoadEigenPackPar>
{
public:
typedef EigenPack<typename Pack::Field> BasePack;
public:
// constructor
TLoadEigenPack(const std::string name);
// destructor
virtual ~TLoadEigenPack(void) = default;
// dependency relation
virtual std::vector<std::string> getInput(void);
virtual std::vector<std::string> getOutput(void);
// setup
virtual void setup(void);
// execution
virtual void execute(void);
};
MODULE_REGISTER_NS(LoadFermionEigenPack, TLoadEigenPack<FermionEigenPack<FIMPL>>, MIO);
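// Illustrative use from an application (sketch; the file stem, size and Ls
// values are hypothetical):
//     MIO::LoadFermionEigenPack::Par epPar;
//     epPar.filestem = "eigen/epack";
//     epPar.size     = 100;
//     epPar.Ls       = 12;
//     application.createModule<MIO::LoadFermionEigenPack>("epack", epPar);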
/******************************************************************************
* TLoadEigenPack implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
template <typename Pack>
TLoadEigenPack<Pack>::TLoadEigenPack(const std::string name)
: Module<LoadEigenPackPar>(name)
{}
// dependencies/products ///////////////////////////////////////////////////////
template <typename Pack>
std::vector<std::string> TLoadEigenPack<Pack>::getInput(void)
{
std::vector<std::string> in;
return in;
}
template <typename Pack>
std::vector<std::string> TLoadEigenPack<Pack>::getOutput(void)
{
std::vector<std::string> out = {getName()};
return out;
}
// setup ///////////////////////////////////////////////////////////////////////
template <typename Pack>
void TLoadEigenPack<Pack>::setup(void)
{
env().createGrid(par().Ls);
envCreateDerived(BasePack, Pack, getName(), par().Ls, par().size,
env().getRbGrid(par().Ls));
}
// execution ///////////////////////////////////////////////////////////////////
template <typename Pack>
void TLoadEigenPack<Pack>::execute(void)
{
auto &epack = envGetDerived(BasePack, Pack, getName());
epack.read(par().filestem, vm().getTrajectory());
}
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_MIO_LoadEigenPack_hpp_

View File

@ -71,6 +71,4 @@ void TLoadNersc::execute(void)
auto &U = envGet(LatticeGaugeField, getName());
NerscIO::readConfiguration(U, header, fileName);
LOG(Message) << "NERSC header:" << std::endl;
dump_meta_data(header, LOG(Message));
}

View File

@ -133,7 +133,6 @@ void TChargedProp::execute(void)
LOG(Message) << "Saving zero-momentum projection to '" LOG(Message) << "Saving zero-momentum projection to '"
<< filename << "'..." << std::endl; << filename << "'..." << std::endl;
ResultWriter writer(RESULT_FILE_NAME(par().output));
std::vector<TComplex> vecBuf; std::vector<TComplex> vecBuf;
std::vector<Complex> result; std::vector<Complex> result;
@ -143,8 +142,8 @@ void TChargedProp::execute(void)
{
result[t] = TensorRemove(vecBuf[t]);
}
write(writer, "charge", q); saveResult(par().output, "charge", q);
write(writer, "prop", result); saveResult(par().output, "prop", result);
}
}

View File

@ -83,8 +83,6 @@ void TFreeProp::execute(void)
if (!par().output.empty())
{
TextWriter writer(par().output + "." +
std::to_string(vm().getTrajectory()));
std::vector<TComplex> buf;
std::vector<Complex> result;
@ -94,6 +92,6 @@ void TFreeProp::execute(void)
{
result[t] = TensorRemove(buf[t]);
}
write(writer, "prop", result); saveResult(par().output, "freeprop", result);
}
}

View File

@ -0,0 +1,154 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MScalarSUN/Div.hpp
Copyright (C) 2015-2018
Author: Antonin Portelli <antonin.portelli@me.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_MScalarSUN_Div_hpp_
#define Hadrons_MScalarSUN_Div_hpp_
#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
#include <Grid/Hadrons/ModuleFactory.hpp>
#include <Grid/Hadrons/Modules/MScalarSUN/Utils.hpp>
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
* Divergence of a vector field *
******************************************************************************/
BEGIN_MODULE_NAMESPACE(MScalarSUN)
class DivPar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(DivPar,
std::vector<std::string>, op,
DiffType, type,
std::string, output);
};
template <typename SImpl>
class TDiv: public Module<DivPar>
{
public:
typedef typename SImpl::Field Field;
typedef typename SImpl::ComplexField ComplexField;
class Result: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(Result,
DiffType, type,
Complex, value);
};
public:
// constructor
TDiv(const std::string name);
// destructor
virtual ~TDiv(void) = default;
// dependency relation
virtual std::vector<std::string> getInput(void);
virtual std::vector<std::string> getOutput(void);
// setup
virtual void setup(void);
// execution
virtual void execute(void);
};
MODULE_REGISTER_NS(DivSU2, TDiv<ScalarNxNAdjImplR<2>>, MScalarSUN);
MODULE_REGISTER_NS(DivSU3, TDiv<ScalarNxNAdjImplR<3>>, MScalarSUN);
MODULE_REGISTER_NS(DivSU4, TDiv<ScalarNxNAdjImplR<4>>, MScalarSUN);
MODULE_REGISTER_NS(DivSU5, TDiv<ScalarNxNAdjImplR<5>>, MScalarSUN);
MODULE_REGISTER_NS(DivSU6, TDiv<ScalarNxNAdjImplR<6>>, MScalarSUN);
/******************************************************************************
* TDiv implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
template <typename SImpl>
TDiv<SImpl>::TDiv(const std::string name)
: Module<DivPar>(name)
{}
// dependencies/products ///////////////////////////////////////////////////////
template <typename SImpl>
std::vector<std::string> TDiv<SImpl>::getInput(void)
{
return par().op;
}
template <typename SImpl>
std::vector<std::string> TDiv<SImpl>::getOutput(void)
{
std::vector<std::string> out = {getName()};
return out;
}
// setup ///////////////////////////////////////////////////////////////////////
template <typename SImpl>
void TDiv<SImpl>::setup(void)
{
if (par().op.size() != env().getNd())
{
HADRON_ERROR(Size, "the number of components differs from number of dimensions");
}
envCreateLat(ComplexField, getName());
}
// execution ///////////////////////////////////////////////////////////////////
template <typename SImpl>
void TDiv<SImpl>::execute(void)
{
const auto nd = env().getNd();
LOG(Message) << "Computing the " << par().type << " divergence of [";
for (unsigned int mu = 0; mu < nd; ++mu)
{
std::cout << par().op[mu] << ((mu == nd - 1) ? "]" : ", ");
}
std::cout << std::endl;
auto &div = envGet(ComplexField, getName());
div = zero;
for (unsigned int mu = 0; mu < nd; ++mu)
{
auto &op = envGet(ComplexField, par().op[mu]);
dmuAcc(div, op, mu, par().type);
}
if (!par().output.empty())
{
Result r;
r.type = par().type;
r.value = TensorRemove(sum(div));
saveResult(par().output, "div", r);
}
}
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_MScalarSUN_Div_hpp_

View File

@ -0,0 +1,181 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MScalarSUN/EMT.hpp
Copyright (C) 2015-2018
Author: Antonin Portelli <antonin.portelli@me.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_MScalarSUN_EMT_hpp_
#define Hadrons_MScalarSUN_EMT_hpp_
#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
#include <Grid/Hadrons/ModuleFactory.hpp>
#include <Grid/Hadrons/Modules/MScalarSUN/Utils.hpp>
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
* Energy-momentum tensor *
******************************************************************************/
BEGIN_MODULE_NAMESPACE(MScalarSUN)
class EMTPar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(EMTPar,
std::string, kinetic,
std::string, phiPow,
std::string, improvement,
double , m2,
double , lambda,
double , g,
double , xi,
std::string, output);
};
template <typename SImpl>
class TEMT: public Module<EMTPar>
{
public:
typedef typename SImpl::Field Field;
typedef typename SImpl::ComplexField ComplexField;
public:
// constructor
TEMT(const std::string name);
// destructor
virtual ~TEMT(void) = default;
// dependency relation
virtual std::vector<std::string> getInput(void);
virtual std::vector<std::string> getOutput(void);
// setup
virtual void setup(void);
// execution
virtual void execute(void);
};
MODULE_REGISTER_NS(EMTSU2, TEMT<ScalarNxNAdjImplR<2>>, MScalarSUN);
MODULE_REGISTER_NS(EMTSU3, TEMT<ScalarNxNAdjImplR<3>>, MScalarSUN);
MODULE_REGISTER_NS(EMTSU4, TEMT<ScalarNxNAdjImplR<4>>, MScalarSUN);
MODULE_REGISTER_NS(EMTSU5, TEMT<ScalarNxNAdjImplR<5>>, MScalarSUN);
MODULE_REGISTER_NS(EMTSU6, TEMT<ScalarNxNAdjImplR<6>>, MScalarSUN);
/******************************************************************************
* TEMT implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
template <typename SImpl>
TEMT<SImpl>::TEMT(const std::string name)
: Module<EMTPar>(name)
{}
// dependencies/products ///////////////////////////////////////////////////////
template <typename SImpl>
std::vector<std::string> TEMT<SImpl>::getInput(void)
{
std::vector<std::string> in;
for (unsigned int mu = 0; mu < env().getNd(); ++mu)
for (unsigned int nu = mu; nu < env().getNd(); ++nu)
{
in.push_back(varName(par().kinetic, mu, nu));
in.push_back(varName(par().improvement, mu, nu));
}
in.push_back(varName(par().phiPow, 2));
in.push_back(varName(par().phiPow, 4));
return in;
}
template <typename SImpl>
std::vector<std::string> TEMT<SImpl>::getOutput(void)
{
std::vector<std::string> out;
for (unsigned int mu = 0; mu < env().getNd(); ++mu)
for (unsigned int nu = mu; nu < env().getNd(); ++nu)
{
out.push_back(varName(getName(), mu, nu));
}
return out;
}
// setup ///////////////////////////////////////////////////////////////////////
template <typename SImpl>
void TEMT<SImpl>::setup(void)
{
for (unsigned int mu = 0; mu < env().getNd(); ++mu)
for (unsigned int nu = mu; nu < env().getNd(); ++nu)
{
envCreateLat(ComplexField, varName(getName(), mu, nu));
}
envTmpLat(ComplexField, "sumkin");
}
// execution ///////////////////////////////////////////////////////////////////
template <typename SImpl>
void TEMT<SImpl>::execute(void)
{
LOG(Message) << "Computing energy-momentum tensor" << std::endl;
LOG(Message) << " kinetic terms: '" << par().kinetic << "'" << std::endl;
LOG(Message) << " tr(phi^n): '" << par().phiPow << "'" << std::endl;
LOG(Message) << " improvement: '" << par().improvement << "'" << std::endl;
LOG(Message) << " m^2= " << par().m2 << std::endl;
LOG(Message) << " lambda= " << par().lambda << std::endl;
LOG(Message) << " g= " << par().g << std::endl;
LOG(Message) << " xi= " << par().xi << std::endl;
const unsigned int N = SImpl::Group::Dimension;
auto &trphi2 = envGet(ComplexField, varName(par().phiPow, 2));
auto &trphi4 = envGet(ComplexField, varName(par().phiPow, 4));
envGetTmp(ComplexField, sumkin);
sumkin = zero;
for (unsigned int mu = 0; mu < env().getNd(); ++mu)
{
auto &trkin = envGet(ComplexField, varName(par().kinetic, mu, mu));
sumkin += trkin;
}
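// assemble T_munu below: off-diagonal entries are 2*tr(d_mu phi d_nu phi)
// plus the xi-weighted improvement term; diagonal entries additionally
// subtract the summed kinetic term and the potential m^2 tr(phi^2)
// + lambda tr(phi^4); everything is rescaled by N/g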
for (unsigned int mu = 0; mu < env().getNd(); ++mu)
for (unsigned int nu = mu; nu < env().getNd(); ++nu)
{
auto &out = envGet(ComplexField, varName(getName(), mu, nu));
auto &trkin = envGet(ComplexField, varName(par().kinetic, mu, nu));
auto &imp = envGet(ComplexField, varName(par().improvement, mu, nu));
out = 2.*trkin + par().xi*imp;
if (mu == nu)
{
out -= sumkin + par().m2*trphi2 + par().lambda*trphi4;
}
out *= N/par().g;
}
}
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_MScalarSUN_EMT_hpp_

View File

@ -0,0 +1,169 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MScalarSUN/ShiftProbe.hpp
Copyright (C) 2015-2018
Author: Antonin Portelli <antonin.portelli@me.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_MScalarSUN_ShiftProbe_hpp_
#define Hadrons_MScalarSUN_ShiftProbe_hpp_
#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
#include <Grid/Hadrons/ModuleFactory.hpp>
#include <Grid/Hadrons/Modules/MScalarSUN/Utils.hpp>
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
* Ward identity phi^n probe with fields at different positions *
******************************************************************************/
BEGIN_MODULE_NAMESPACE(MScalarSUN)
typedef std::pair<unsigned int, unsigned int> ShiftPair;
class ShiftProbePar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(ShiftProbePar,
std::string, field,
std::string, shifts,
std::string, output);
};
template <typename SImpl>
class TShiftProbe: public Module<ShiftProbePar>
{
public:
typedef typename SImpl::Field Field;
typedef typename SImpl::ComplexField ComplexField;
class Result: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(Result,
std::string, op,
Complex , value);
};
public:
// constructor
TShiftProbe(const std::string name);
// destructor
virtual ~TShiftProbe(void) = default;
// dependency relation
virtual std::vector<std::string> getInput(void);
virtual std::vector<std::string> getOutput(void);
// setup
virtual void setup(void);
// execution
virtual void execute(void);
};
MODULE_REGISTER_NS(ShiftProbeSU2, TShiftProbe<ScalarNxNAdjImplR<2>>, MScalarSUN);
MODULE_REGISTER_NS(ShiftProbeSU3, TShiftProbe<ScalarNxNAdjImplR<3>>, MScalarSUN);
MODULE_REGISTER_NS(ShiftProbeSU4, TShiftProbe<ScalarNxNAdjImplR<4>>, MScalarSUN);
MODULE_REGISTER_NS(ShiftProbeSU5, TShiftProbe<ScalarNxNAdjImplR<5>>, MScalarSUN);
MODULE_REGISTER_NS(ShiftProbeSU6, TShiftProbe<ScalarNxNAdjImplR<6>>, MScalarSUN);
/******************************************************************************
* TShiftProbe implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
template <typename SImpl>
TShiftProbe<SImpl>::TShiftProbe(const std::string name)
: Module<ShiftProbePar>(name)
{}
// dependencies/products ///////////////////////////////////////////////////////
template <typename SImpl>
std::vector<std::string> TShiftProbe<SImpl>::getInput(void)
{
std::vector<std::string> in = {par().field};
return in;
}
template <typename SImpl>
std::vector<std::string> TShiftProbe<SImpl>::getOutput(void)
{
std::vector<std::string> out = {getName()};
return out;
}
// setup ///////////////////////////////////////////////////////////////////////
template <typename SImpl>
void TShiftProbe<SImpl>::setup(void)
{
envTmpLat(Field, "acc");
envCreateLat(ComplexField, getName());
}
// execution ///////////////////////////////////////////////////////////////////
template <typename SImpl>
void TShiftProbe<SImpl>::execute(void)
{
LOG(Message) << "Creating shift probe for shifts " << par().shifts
<< std::endl;
std::vector<ShiftPair> shift;
int sign;
auto &phi = envGet(Field, par().field);
auto &probe = envGet(ComplexField, getName());
shift = strToVec<ShiftPair>(par().shifts);
if (shift.size() % 2 != 0)
{
HADRON_ERROR(Size, "the number of shifts is odd");
}
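// sign of the probe: (-1)^(number of shift pairs), i.e. +1 when the total
// number of shifts is a multiple of 4 and -1 otherwise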
sign = (shift.size() % 4 == 0) ? 1 : -1;
for (auto &s: shift)
{
if (s.first >= env().getNd())
{
HADRON_ERROR(Size, "dimension to large for shift <"
+ std::to_string(s.first) + " "
+ std::to_string(s.second) + ">" );
}
}
envGetTmp(Field, acc);
acc = 1.;
for (unsigned int i = 0; i < shift.size(); ++i)
{
if (shift[i].second == 0)
{
acc *= phi;
}
else
{
acc *= Cshift(phi, shift[i].first, shift[i].second);
}
}
probe = sign*trace(acc);
}
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_MScalarSUN_ShiftProbe_hpp_

View File

@ -0,0 +1,170 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MScalarSUN/TrKinetic.hpp
Copyright (C) 2015-2018
Author: Antonin Portelli <antonin.portelli@me.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_MScalarSUN_TrKinetic_hpp_
#define Hadrons_MScalarSUN_TrKinetic_hpp_
#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
#include <Grid/Hadrons/ModuleFactory.hpp>
#include <Grid/Hadrons/Modules/MScalarSUN/Utils.hpp>
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
* Trace of kinetic term *
******************************************************************************/
BEGIN_MODULE_NAMESPACE(MScalarSUN)
class TrKineticPar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(TrKineticPar,
std::string, field,
DiffType, type,
std::string, output);
};
template <typename SImpl>
class TTrKinetic: public Module<TrKineticPar>
{
public:
typedef typename SImpl::Field Field;
typedef typename SImpl::ComplexField ComplexField;
class Result: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(Result,
std::string, op,
Complex , value);
};
public:
// constructor
TTrKinetic(const std::string name);
// destructor
virtual ~TTrKinetic(void) = default;
// dependency relation
virtual std::vector<std::string> getInput(void);
virtual std::vector<std::string> getOutput(void);
// setup
virtual void setup(void);
// execution
virtual void execute(void);
};
MODULE_REGISTER_NS(TrKineticSU2, TTrKinetic<ScalarNxNAdjImplR<2>>, MScalarSUN);
MODULE_REGISTER_NS(TrKineticSU3, TTrKinetic<ScalarNxNAdjImplR<3>>, MScalarSUN);
MODULE_REGISTER_NS(TrKineticSU4, TTrKinetic<ScalarNxNAdjImplR<4>>, MScalarSUN);
MODULE_REGISTER_NS(TrKineticSU5, TTrKinetic<ScalarNxNAdjImplR<5>>, MScalarSUN);
MODULE_REGISTER_NS(TrKineticSU6, TTrKinetic<ScalarNxNAdjImplR<6>>, MScalarSUN);
/******************************************************************************
* TTrKinetic implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
template <typename SImpl>
TTrKinetic<SImpl>::TTrKinetic(const std::string name)
: Module<TrKineticPar>(name)
{}
// dependencies/products ///////////////////////////////////////////////////////
template <typename SImpl>
std::vector<std::string> TTrKinetic<SImpl>::getInput(void)
{
std::vector<std::string> in = {par().field};
return in;
}
template <typename SImpl>
std::vector<std::string> TTrKinetic<SImpl>::getOutput(void)
{
std::vector<std::string> out ;
for (unsigned int mu = 0; mu < env().getNd(); ++mu)
for (unsigned int nu = mu; nu < env().getNd(); ++nu)
{
out.push_back(varName(getName(), mu, nu));
}
return out;
}
// setup ///////////////////////////////////////////////////////////////////////
template <typename SImpl>
void TTrKinetic<SImpl>::setup(void)
{
for (unsigned int mu = 0; mu < env().getNd(); ++mu)
for (unsigned int nu = mu; nu < env().getNd(); ++nu)
{
envCreateLat(ComplexField, varName(getName(), mu, nu));
}
envTmp(std::vector<Field>, "der", 1, env().getNd(), env().getGrid());
}
// execution ///////////////////////////////////////////////////////////////////
template <typename SImpl>
void TTrKinetic<SImpl>::execute(void)
{
LOG(Message) << "Computing tr(d_mu phi*d_nu phi) using " << par().type
<< " derivative" << std::endl;
std::vector<Result> result;
auto &phi = envGet(Field, par().field);
envGetTmp(std::vector<Field>, der);
for (unsigned int mu = 0; mu < env().getNd(); ++mu)
{
dmu(der[mu], phi, mu, par().type);
}
for (unsigned int mu = 0; mu < env().getNd(); ++mu)
for (unsigned int nu = mu; nu < env().getNd(); ++nu)
{
auto &out = envGet(ComplexField, varName(getName(), mu, nu));
out = -trace(der[mu]*der[nu]);
if (!par().output.empty())
{
Result r;
r.op = "tr(d_" + std::to_string(mu) + "phi*d_"
+ std::to_string(nu) + "phi)";
r.value = TensorRemove(sum(out));
result.push_back(r);
}
}
if (result.size() > 0)
{
saveResult(par().output, "trkinetic", result);
}
}
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_MScalarSUN_TrKinetic_hpp_

View File

@ -31,11 +31,12 @@ See the full license in the file "LICENSE" in the top level distribution directo
#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
#include <Grid/Hadrons/ModuleFactory.hpp>
#include <Grid/Hadrons/Modules/MScalarSUN/Utils.hpp>
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
* Module to compute tr(mag^n) * * Trace of powers of the magnetisation *
******************************************************************************/
BEGIN_MODULE_NAMESPACE(MScalarSUN)
@ -117,10 +118,9 @@ template <typename SImpl>
void TTrMag<SImpl>::execute(void) void TTrMag<SImpl>::execute(void)
{ {
LOG(Message) << "Computing tr(mag^n) for n even up to " << par().maxPow LOG(Message) << "Computing tr(mag^n) for n even up to " << par().maxPow
<< "..." << std::endl; << std::endl;
std::vector<Result> result; std::vector<Result> result;
ResultWriter writer(RESULT_FILE_NAME(par().output));
auto &phi = envGet(Field, par().field); auto &phi = envGet(Field, par().field);
auto m2 = sum(phi), mn = m2; auto m2 = sum(phi), mn = m2;
@ -136,7 +136,7 @@ void TTrMag<SImpl>::execute(void)
r.value = TensorRemove(trace(mn)).real(); r.value = TensorRemove(trace(mn)).real();
result.push_back(r); result.push_back(r);
} }
write(writer, "trmag", result); saveResult(par().output, "trmag", result);
} }
END_MODULE_NAMESPACE END_MODULE_NAMESPACE

@@ -31,11 +31,12 @@ See the full license in the file "LICENSE" in the top level distribution directory
 #include <Grid/Hadrons/Global.hpp>
 #include <Grid/Hadrons/Module.hpp>
 #include <Grid/Hadrons/ModuleFactory.hpp>
+#include <Grid/Hadrons/Modules/MScalarSUN/Utils.hpp>
 BEGIN_HADRONS_NAMESPACE
 /******************************************************************************
- *                       Module to compute tr(phi^n)                          *
+ *                    Trace of powers of a scalar field                       *
 ******************************************************************************/
 BEGIN_MODULE_NAMESPACE(MScalarSUN)
@@ -73,9 +74,6 @@ public:
     virtual void setup(void);
     // execution
     virtual void execute(void);
-private:
-    // output name generator
-    std::string outName(const unsigned int n);
 };
 MODULE_REGISTER_NS(TrPhiSU2, TTrPhi<ScalarNxNAdjImplR<2>>, MScalarSUN);
@@ -109,7 +107,7 @@ std::vector<std::string> TTrPhi<SImpl>::getOutput(void)
     for (unsigned int n = 2; n <= par().maxPow; n += 2)
     {
-        out.push_back(outName(n));
+        out.push_back(varName(getName(), n));
     }
     return out;
@@ -127,7 +125,7 @@ void TTrPhi<SImpl>::setup(void)
     envTmpLat(Field, "buf");
     for (unsigned int n = 2; n <= par().maxPow; n += 2)
     {
-        envCreateLat(ComplexField, outName(n));
+        envCreateLat(ComplexField, varName(getName(), n));
     }
 }
@@ -136,7 +134,7 @@ template <typename SImpl>
 void TTrPhi<SImpl>::execute(void)
 {
     LOG(Message) << "Computing tr(phi^n) for n even up to " << par().maxPow
-                 << "..." << std::endl;
+                 << std::endl;
     std::vector<Result> result;
     auto &phi = envGet(Field, par().field);
@@ -147,7 +145,7 @@ void TTrPhi<SImpl>::execute(void)
     phi2 = -phi*phi;
     for (unsigned int n = 2; n <= par().maxPow; n += 2)
     {
-        auto &phin = envGet(ComplexField, outName(n));
+        auto &phin = envGet(ComplexField, varName(getName(), n));
         buf  = buf*phi2;
         phin = trace(buf);
@@ -162,19 +160,10 @@ void TTrPhi<SImpl>::execute(void)
     }
     if (result.size() > 0)
     {
-        ResultWriter writer(RESULT_FILE_NAME(par().output));
-        write(writer, "trphi", result);
+        saveResult(par().output, "trphi", result);
     }
 }
-// output name generator ///////////////////////////////////////////////////////
-template <typename SImpl>
-std::string TTrPhi<SImpl>::outName(const unsigned int n)
-{
-    return getName() + "_" + std::to_string(n);
-}
 END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE

@@ -0,0 +1,185 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MScalarSUN/TransProj.hpp
Copyright (C) 2015-2018
Author: Antonin Portelli <antonin.portelli@me.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_MScalarSUN_TransProj_hpp_
#define Hadrons_MScalarSUN_TransProj_hpp_
#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
#include <Grid/Hadrons/ModuleFactory.hpp>
#include <Grid/Hadrons/Modules/MScalarSUN/Utils.hpp>
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
* Transverse projection *
******************************************************************************/
BEGIN_MODULE_NAMESPACE(MScalarSUN)
class TransProjPar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(TransProjPar,
std::string, op,
DiffType, type,
std::string, output);
};
template <typename SImpl>
class TTransProj: public Module<TransProjPar>
{
public:
typedef typename SImpl::Field Field;
typedef typename SImpl::ComplexField ComplexField;
class Result: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(Result,
std::string, op,
Complex , value);
};
public:
// constructor
TTransProj(const std::string name);
// destructor
virtual ~TTransProj(void) = default;
// dependency relation
virtual std::vector<std::string> getInput(void);
virtual std::vector<std::string> getOutput(void);
// setup
virtual void setup(void);
// execution
virtual void execute(void);
};
MODULE_REGISTER_NS(TransProjSU2, TTransProj<ScalarNxNAdjImplR<2>>, MScalarSUN);
MODULE_REGISTER_NS(TransProjSU3, TTransProj<ScalarNxNAdjImplR<3>>, MScalarSUN);
MODULE_REGISTER_NS(TransProjSU4, TTransProj<ScalarNxNAdjImplR<4>>, MScalarSUN);
MODULE_REGISTER_NS(TransProjSU5, TTransProj<ScalarNxNAdjImplR<5>>, MScalarSUN);
MODULE_REGISTER_NS(TransProjSU6, TTransProj<ScalarNxNAdjImplR<6>>, MScalarSUN);
/******************************************************************************
* TTransProj implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
template <typename SImpl>
TTransProj<SImpl>::TTransProj(const std::string name)
: Module<TransProjPar>(name)
{}
// dependencies/products ///////////////////////////////////////////////////////
template <typename SImpl>
std::vector<std::string> TTransProj<SImpl>::getInput(void)
{
std::vector<std::string> in = {par().op};
return in;
}
template <typename SImpl>
std::vector<std::string> TTransProj<SImpl>::getOutput(void)
{
std::vector<std::string> out;
for (unsigned int mu = 0; mu < env().getNd(); ++mu)
for (unsigned int nu = mu; nu < env().getNd(); ++nu)
{
out.push_back(varName(getName(), mu, nu));
}
return out;
}
// setup ///////////////////////////////////////////////////////////////////////
template <typename SImpl>
void TTransProj<SImpl>::setup(void)
{
for (unsigned int mu = 0; mu < env().getNd(); ++mu)
for (unsigned int nu = mu; nu < env().getNd(); ++nu)
{
envCreateLat(ComplexField, varName(getName(), mu, nu));
}
envTmpLat(ComplexField, "buf1");
envTmpLat(ComplexField, "buf2");
envTmpLat(ComplexField, "lap");
}
// execution ///////////////////////////////////////////////////////////////////
template <typename SImpl>
void TTransProj<SImpl>::execute(void)
{
LOG(Message) << "Computing (delta_mu,nu d^2 - d_mu*d_nu)*op using "
<< par().type << " derivatives and op= '" << par().op
<< "'" << std::endl;
std::vector<Result> result;
auto &op = envGet(ComplexField, par().op);
envGetTmp(ComplexField, buf1);
envGetTmp(ComplexField, buf2);
envGetTmp(ComplexField, lap);
lap = zero;
for (unsigned int mu = 0; mu < env().getNd(); ++mu)
{
dmu(buf1, op, mu, par().type);
dmu(buf2, buf1, mu, par().type);
lap += buf2;
}
for (unsigned int mu = 0; mu < env().getNd(); ++mu)
for (unsigned int nu = mu; nu < env().getNd(); ++nu)
{
auto &out = envGet(ComplexField, varName(getName(), mu, nu));
dmu(buf1, op, mu, par().type);
dmu(buf2, buf1, nu, par().type);
out = -buf2;
if (mu == nu)
{
out += lap;
}
if (!par().output.empty())
{
Result r;
r.op = "(delta_" + std::to_string(mu) + "," + std::to_string(nu)
+ " d^2 - d_" + std::to_string(mu) + "*d_"
+ std::to_string(nu) + ")*op";
r.value = TensorRemove(sum(out));
result.push_back(r);
}
}
if (result.size() > 0)
{
saveResult(par().output, "transproj", result);
}
}
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_MScalarSUN_TransProj_hpp_
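As a sketch of what this module computes (LaTeX notation, with the derivatives given by the chosen DiffType stencil and op the input complex field), each stored field is the transverse projection

    T_{\mu\nu}(x) = \left(\delta_{\mu\nu}\,\partial^2 - \partial_\mu\partial_\nu\right)\mathrm{op}(x),
    \qquad
    \partial^2 = \sum_\rho \partial_\rho\partial_\rho,

matching out = -buf2 plus the Laplacian term when mu == nu in execute(); the optional result file holds the volume sums of these fields.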

@@ -31,6 +31,7 @@ See the full license in the file "LICENSE" in the top level distribution directory
 #include <Grid/Hadrons/Global.hpp>
 #include <Grid/Hadrons/Module.hpp>
 #include <Grid/Hadrons/ModuleFactory.hpp>
+#include <Grid/Hadrons/Modules/MScalarSUN/Utils.hpp>
 BEGIN_HADRONS_NAMESPACE
@@ -87,7 +88,7 @@ MODULE_REGISTER_NS(TwoPointSU5, TTwoPoint<ScalarNxNAdjImplR<5>>, MScalarSUN);
 MODULE_REGISTER_NS(TwoPointSU6, TTwoPoint<ScalarNxNAdjImplR<6>>, MScalarSUN);
 /******************************************************************************
  *                         TTwoPoint implementation                           *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 template <typename SImpl>
@@ -129,7 +130,6 @@ void TTwoPoint<SImpl>::execute(void)
         LOG(Message) << "  '" << o << "'" << std::endl;
     }
-    ResultWriter        writer(RESULT_FILE_NAME(par().output));
     const unsigned int  nd = env().getDim().size();
     std::vector<Result> result;
@@ -150,7 +150,7 @@
             r.data = makeTwoPoint(slicedOp[i], slicedOp[j]);
             result.push_back(r);
         }
-    write(writer, "twopt", result);
+    saveResult(par().output, "twopt", result);
 }
 // make 2-pt function //////////////////////////////////////////////////////////

@@ -0,0 +1,107 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MScalarSUN/Utils.hpp
Copyright (C) 2015-2018
Author: Antonin Portelli <antonin.portelli@me.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_MScalarSUN_Utils_hpp_
#define Hadrons_MScalarSUN_Utils_hpp_
#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
BEGIN_HADRONS_NAMESPACE
BEGIN_MODULE_NAMESPACE(MScalarSUN)
GRID_SERIALIZABLE_ENUM(DiffType, undef, forward, 1, backward, 2, central, 3);
template <typename Field>
inline void dmu(Field &out, const Field &in, const unsigned int mu, const DiffType type)
{
auto & env = Environment::getInstance();
if (mu >= env.getNd())
{
HADRON_ERROR(Range, "Derivative direction out of range");
}
switch(type)
{
case DiffType::backward:
out = in - Cshift(in, mu, -1);
break;
case DiffType::forward:
out = Cshift(in, mu, 1) - in;
break;
case DiffType::central:
out = 0.5*(Cshift(in, mu, 1) - Cshift(in, mu, -1));
break;
default:
HADRON_ERROR(Argument, "Derivative type invalid");
break;
}
}
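For reference, in lattice units the three DiffType stencils implemented by dmu are the usual finite differences

    \partial^{+}_\mu f(x) = f(x+\hat\mu) - f(x), \qquad
    \partial^{-}_\mu f(x) = f(x) - f(x-\hat\mu), \qquad
    \partial^{C}_\mu f(x) = \tfrac{1}{2}\bigl[f(x+\hat\mu) - f(x-\hat\mu)\bigr],

realised with Cshift(in, mu, +/-1); dmuAcc below accumulates the same expressions into out instead of overwriting it.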
template <typename Field>
inline void dmuAcc(Field &out, const Field &in, const unsigned int mu, const DiffType type)
{
auto & env = Environment::getInstance();
if (mu >= env.getNd())
{
HADRON_ERROR(Range, "Derivative direction out of range");
}
switch(type)
{
case DiffType::backward:
out += in - Cshift(in, mu, -1);
break;
case DiffType::forward:
out += Cshift(in, mu, 1) - in;
break;
case DiffType::central:
out += 0.5*(Cshift(in, mu, 1) - Cshift(in, mu, -1));
break;
default:
HADRON_ERROR(Argument, "Derivative type invalid");
break;
}
}
inline std::string varName(const std::string name, const unsigned int mu)
{
return name + "_" + std::to_string(mu);
}
inline std::string varName(const std::string name, const unsigned int mu,
const unsigned int nu)
{
return name + "_" + std::to_string(mu) + "_" + std::to_string(nu);
}
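A small usage sketch (illustration only, not part of the header) of how the modules above derive their output object names:

    std::string a = varName("phi", 2);       // yields "phi_2"
    std::string b = varName("trkin", 0, 1);  // yields "trkin_0_1"

so TrPhi registers one complex field per even power and TrKinetic/TransProj one per (mu, nu) pair.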
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_MScalarSUN_Utils_hpp_

@@ -0,0 +1,184 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MSolver/LocalCoherenceLanczos.hpp
Copyright (C) 2015-2018
Author: Antonin Portelli <antonin.portelli@me.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_MSolver_LocalCoherenceLanczos_hpp_
#define Hadrons_MSolver_LocalCoherenceLanczos_hpp_
#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
#include <Grid/Hadrons/ModuleFactory.hpp>
#include <Grid/Hadrons/EigenPack.hpp>
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
* Local coherence Lanczos eigensolver *
*****************************************************************************/
BEGIN_MODULE_NAMESPACE(MSolver)
class LocalCoherenceLanczosPar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(LocalCoherenceLanczosPar,
std::string, action,
bool, doCoarse,
LanczosParams, fineParams,
LanczosParams, coarseParams,
ChebyParams, smoother,
RealD, coarseRelaxTol,
std::string, blockSize,
std::string, output);
};
template <typename FImpl, int nBasis>
class TLocalCoherenceLanczos: public Module<LocalCoherenceLanczosPar>
{
public:
FERM_TYPE_ALIASES(FImpl,);
typedef LocalCoherenceLanczos<typename FImpl::SiteSpinor,
typename FImpl::SiteComplex,
nBasis> LCL;
typedef FermionEigenPack<FImpl> BasePack;
typedef CoarseFermionEigenPack<FImpl, nBasis> CoarsePack;
typedef HADRONS_DEFAULT_SCHUR_OP<FMat, FermionField> SchurFMat;
public:
// constructor
TLocalCoherenceLanczos(const std::string name);
// destructor
virtual ~TLocalCoherenceLanczos(void) = default;
// dependency relation
virtual std::vector<std::string> getInput(void);
virtual std::vector<std::string> getOutput(void);
// setup
virtual void setup(void);
// execution
virtual void execute(void);
};
MODULE_REGISTER_NS(LocalCoherenceLanczos,
ARG(TLocalCoherenceLanczos<FIMPL, HADRONS_DEFAULT_LANCZOS_NBASIS>),
MSolver);
MODULE_REGISTER_NS(ZLocalCoherenceLanczos,
ARG(TLocalCoherenceLanczos<ZFIMPL, HADRONS_DEFAULT_LANCZOS_NBASIS>),
MSolver);
/******************************************************************************
* TLocalCoherenceLanczos implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
template <typename FImpl, int nBasis>
TLocalCoherenceLanczos<FImpl, nBasis>::TLocalCoherenceLanczos(const std::string name)
: Module<LocalCoherenceLanczosPar>(name)
{}
// dependencies/products ///////////////////////////////////////////////////////
template <typename FImpl, int nBasis>
std::vector<std::string> TLocalCoherenceLanczos<FImpl, nBasis>::getInput(void)
{
std::vector<std::string> in = {par().action};
return in;
}
template <typename FImpl, int nBasis>
std::vector<std::string> TLocalCoherenceLanczos<FImpl, nBasis>::getOutput(void)
{
std::vector<std::string> out = {getName()};
return out;
}
// setup ///////////////////////////////////////////////////////////////////////
template <typename FImpl, int nBasis>
void TLocalCoherenceLanczos<FImpl, nBasis>::setup(void)
{
LOG(Message) << "Setting up local coherence Lanczos eigensolver for"
<< " action '" << par().action << "' (" << nBasis
<< " eigenvectors)..." << std::endl;
unsigned int Ls = env().getObjectLs(par().action);
auto blockSize = strToVec<int>(par().blockSize);
env().createCoarseGrid(blockSize, Ls);
auto cg = env().getCoarseGrid(blockSize, Ls);
auto cgrb = env().getRbCoarseGrid(blockSize, Ls);
int cNm = (par().doCoarse) ? par().coarseParams.Nm : 0;
LOG(Message) << "Coarse grid: " << cg->GlobalDimensions() << std::endl;
envCreateDerived(BasePack, CoarsePack, getName(), Ls,
par().fineParams.Nm, cNm, env().getRbGrid(Ls), cgrb);
auto &epack = envGetDerived(BasePack, CoarsePack, getName());
envTmp(SchurFMat, "mat", Ls, envGet(FMat, par().action));
envGetTmp(SchurFMat, mat);
envTmp(LCL, "solver", Ls, env().getRbGrid(Ls), cgrb, mat,
Odd, epack.evec, epack.evecCoarse, epack.eval, epack.evalCoarse);
}
// execution ///////////////////////////////////////////////////////////////////
template <typename FImpl, int nBasis>
void TLocalCoherenceLanczos<FImpl, nBasis>::execute(void)
{
auto &finePar = par().fineParams;
auto &coarsePar = par().coarseParams;
auto &epack = envGetDerived(BasePack, CoarsePack, getName());
envGetTmp(LCL, solver);
LOG(Message) << "Performing fine grid IRL -- Nstop= "
<< finePar.Nstop << ", Nk= " << finePar.Nk << ", Nm= "
<< finePar.Nm << std::endl;
solver.calcFine(finePar.Cheby, finePar.Nstop, finePar.Nk, finePar.Nm,
finePar.resid,finePar.MaxIt, finePar.betastp,
finePar.MinRes);
solver.testFine(finePar.resid*100.0);
if (par().doCoarse)
{
LOG(Message) << "Orthogonalising" << std::endl;
solver.Orthogonalise();
LOG(Message) << "Performing coarse grid IRL -- Nstop= "
<< coarsePar.Nstop << ", Nk= " << coarsePar.Nk << ", Nm= "
<< coarsePar.Nm << std::endl;
solver.calcCoarse(coarsePar.Cheby, par().smoother, par().coarseRelaxTol,
coarsePar.Nstop, coarsePar.Nk, coarsePar.Nm,
coarsePar.resid, coarsePar.MaxIt, coarsePar.betastp,
coarsePar.MinRes);
solver.testCoarse(coarsePar.resid*100.0, par().smoother,
par().coarseRelaxTol);
}
if (!par().output.empty())
{
epack.write(par().output, vm().getTrajectory());
}
}
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_MSolver_LocalCoherenceLanczos_hpp_
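A brief orientation for this module (a sketch; the machinery it drives appears further down in this commit): the fine-grid IRL produces nBasis eigenvectors of the red-black preconditioned operator, and the optional coarse-grid IRL then diagonalises the same operator projected onto the block subspace spanned by those vectors,

    A_c = P^\dagger A P,

where P is the blockPromote map from coarse to fine and P^\dagger the corresponding blockProject, exactly as implemented by ProjectedHermOp in LocalCoherenceLanczos.h. The smoother and coarseRelaxTol parameters control how coarse candidates are smoothed and tested against the fine-grid residual.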

@@ -32,6 +32,7 @@ See the full license in the file "LICENSE" in the top level distribution directory
 #include <Grid/Hadrons/Global.hpp>
 #include <Grid/Hadrons/Module.hpp>
 #include <Grid/Hadrons/ModuleFactory.hpp>
+#include <Grid/Hadrons/EigenPack.hpp>
 BEGIN_HADRONS_NAMESPACE
@@ -43,16 +44,25 @@ BEGIN_MODULE_NAMESPACE(MSolver)
 class RBPrecCGPar: Serializable
 {
 public:
-    GRID_SERIALIZABLE_CLASS_MEMBERS(RBPrecCGPar,
-                                    std::string, action,
-                                    double     , residual);
+    GRID_SERIALIZABLE_CLASS_MEMBERS(RBPrecCGPar ,
+                                    std::string , action,
+                                    unsigned int, maxIteration,
+                                    double      , residual,
+                                    std::string , eigenPack);
 };
-template <typename FImpl>
+template <typename FImpl, int nBasis>
 class TRBPrecCG: public Module<RBPrecCGPar>
 {
 public:
     FGS_TYPE_ALIASES(FImpl,);
+    typedef FermionEigenPack<FImpl>                       EPack;
+    typedef CoarseFermionEigenPack<FImpl, nBasis>         CoarseEPack;
+    typedef std::shared_ptr<Guesser<FermionField>>        GuesserPt;
+    typedef DeflatedGuesser<typename FImpl::FermionField> FineGuesser;
+    typedef LocalCoherenceDeflatedGuesser<
+        typename FImpl::FermionField,
+        typename CoarseEPack::CoarseField>                CoarseGuesser;
 public:
     // constructor
     TRBPrecCG(const std::string name);
@@ -69,36 +79,39 @@ protected:
     virtual void execute(void);
 };
-MODULE_REGISTER_NS(RBPrecCG, TRBPrecCG<FIMPL>, MSolver);
+MODULE_REGISTER_NS(RBPrecCG,
+    ARG(TRBPrecCG<FIMPL, HADRONS_DEFAULT_LANCZOS_NBASIS>), MSolver);
+MODULE_REGISTER_NS(ZRBPrecCG,
+    ARG(TRBPrecCG<ZFIMPL, HADRONS_DEFAULT_LANCZOS_NBASIS>), MSolver);
 /******************************************************************************
  *                     TRBPrecCG template implementation                      *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
-template <typename FImpl>
-TRBPrecCG<FImpl>::TRBPrecCG(const std::string name)
+template <typename FImpl, int nBasis>
+TRBPrecCG<FImpl, nBasis>::TRBPrecCG(const std::string name)
 : Module(name)
 {}
 // dependencies/products ///////////////////////////////////////////////////////
-template <typename FImpl>
-std::vector<std::string> TRBPrecCG<FImpl>::getInput(void)
+template <typename FImpl, int nBasis>
+std::vector<std::string> TRBPrecCG<FImpl, nBasis>::getInput(void)
 {
     std::vector<std::string> in = {};
     return in;
 }
-template <typename FImpl>
-std::vector<std::string> TRBPrecCG<FImpl>::getReference(void)
+template <typename FImpl, int nBasis>
+std::vector<std::string> TRBPrecCG<FImpl, nBasis>::getReference(void)
 {
     std::vector<std::string> ref = {par().action};
     return ref;
 }
-template <typename FImpl>
-std::vector<std::string> TRBPrecCG<FImpl>::getOutput(void)
+template <typename FImpl, int nBasis>
+std::vector<std::string> TRBPrecCG<FImpl, nBasis>::getOutput(void)
 {
     std::vector<std::string> out = {getName()};
@@ -106,28 +119,60 @@ std::vector<std::string> TRBPrecCG<FImpl>::getOutput(void)
 }
 // setup ///////////////////////////////////////////////////////////////////////
-template <typename FImpl>
-void TRBPrecCG<FImpl>::setup(void)
+template <typename FImpl, int nBasis>
+void TRBPrecCG<FImpl, nBasis>::setup(void)
 {
+    if (par().maxIteration == 0)
+    {
+        HADRON_ERROR(Argument, "zero maximum iteration");
+    }
     LOG(Message) << "setting up Schur red-black preconditioned CG for"
                  << " action '" << par().action << "' with residual "
-                 << par().residual << std::endl;
+                 << par().residual << ", maximum iteration "
+                 << par().maxIteration << std::endl;
     auto Ls   = env().getObjectLs(par().action);
     auto &mat = envGet(FMat, par().action);
-    auto solver = [&mat, this](FermionField &sol, const FermionField &source)
+    std::string guesserName = getName() + "_guesser";
+    GuesserPt   guesser{nullptr};
+    if (par().eigenPack.empty())
     {
-        ConjugateGradient<FermionField>           cg(par().residual, 10000);
-        SchurRedBlackDiagMooeeSolve<FermionField> schurSolver(cg);
+        guesser.reset(new ZeroGuesser<FermionField>());
+    }
+    else
+    {
+        try
+        {
+            auto &epack = envGetDerived(EPack, CoarseEPack, par().eigenPack);
+            guesser.reset(new CoarseGuesser(epack.evec, epack.evecCoarse,
+                                            epack.evalCoarse));
+        }
+        catch (Exceptions::Definition &e)
+        {
+            auto &epack = envGet(EPack, par().eigenPack);
+            guesser.reset(new FineGuesser(epack.evec, epack.eval));
+        }
+    }
+    auto solver = [&mat, guesser, this](FermionField &sol,
+                                        const FermionField &source)
+    {
+        ConjugateGradient<FermionField>           cg(par().residual,
+                                                     par().maxIteration);
+        HADRONS_DEFAULT_SCHUR_SOLVE<FermionField> schurSolver(cg);
-        schurSolver(mat, source, sol);
+        schurSolver(mat, source, sol, *guesser);
     };
     envCreate(SolverFn, getName(), Ls, solver);
 }
 // execution ///////////////////////////////////////////////////////////////////
-template <typename FImpl>
-void TRBPrecCG<FImpl>::execute(void)
+template <typename FImpl, int nBasis>
+void TRBPrecCG<FImpl, nBasis>::execute(void)
 {}
 END_MODULE_NAMESPACE

@@ -8,6 +8,7 @@ Copyright (C) 2015-2018
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: Lanny91 <andrew.lawson@gmail.com>
+Author: Vera Guelpers <vmg1n14@soton.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -38,9 +39,11 @@ BEGIN_HADRONS_NAMESPACE
 /*
- Sequential source
+ Sequential source with insertion of conserved current.
+ Additionally optional insertion of a photon field A_\mu(x).
 -----------------------------
- * src_x = q_x * theta(x_3 - tA) * theta(tB - x_3) * J_mu * exp(i x.mom)
+ * src_x = sum_{mu=mu_min}^{mu_max}
+     q_x * theta(x_3 - tA) * theta(tB - x_3) * J_mu * exp(i x.mom) (* A_\mu(x))
 * options:
 - q: input propagator (string)
@@ -48,8 +51,10 @@ BEGIN_HADRONS_NAMESPACE
 - tA: begin timeslice (integer)
 - tB: end timesilce (integer)
 - curr_type: type of conserved current to insert (Current)
- - mu: Lorentz index of current to insert (integer)
+ - mu_min: begin Lorentz Index (integer)
+ - mu_max: end Lorentz Index (integer)
 - mom: momentum insertion, space-separated float sequence (e.g ".1 .2 1. 0.")
+ - photon: optional photon field (string)
 */
@@ -67,8 +72,10 @@ public:
                                     unsigned int, tA,
                                     unsigned int, tB,
                                     Current,      curr_type,
-                                    unsigned int, mu,
-                                    std::string,  mom);
+                                    unsigned int, mu_min,
+                                    unsigned int, mu_max,
+                                    std::string,  mom,
+                                    std::string,  photon);
 };
 template <typename FImpl>
@@ -76,6 +83,8 @@ class TSeqConserved: public Module<SeqConservedPar>
 {
 public:
     FERM_TYPE_ALIASES(FImpl,);
+public:
+    typedef PhotonR::GaugeField EmField;
 public:
     // constructor
     TSeqConserved(const std::string name);
@@ -89,10 +98,14 @@ protected:
     virtual void setup(void);
     // execution
     virtual void execute(void);
+private:
+    bool        SeqhasPhase_{false};
+    std::string SeqmomphName_;
 };
 MODULE_REGISTER_NS(SeqConserved, TSeqConserved<FIMPL>, MSource);
 /******************************************************************************
  *                      TSeqConserved implementation                          *
 ******************************************************************************/
@@ -100,6 +113,7 @@ MODULE_REGISTER_NS(SeqConserved, TSeqConserved<FIMPL>, MSource);
 template <typename FImpl>
 TSeqConserved<FImpl>::TSeqConserved(const std::string name)
 : Module<SeqConservedPar>(name)
+, SeqmomphName_ (name + "_Seqmomph")
 {}
 // dependencies/products ///////////////////////////////////////////////////////
@@ -107,7 +121,8 @@ template <typename FImpl>
 std::vector<std::string> TSeqConserved<FImpl>::getInput(void)
 {
     std::vector<std::string> in = {par().q, par().action};
+    if (!par().photon.empty()) in.push_back(par().photon);
     return in;
 }
@@ -116,7 +131,7 @@ std::vector<std::string> TSeqConserved<FImpl>::getOutput(void)
 {
     std::vector<std::string> out = {getName()};
     return out;
 }
 // setup ///////////////////////////////////////////////////////////////////////
@@ -125,6 +140,10 @@ void TSeqConserved<FImpl>::setup(void)
 {
     auto Ls_ = env().getObjectLs(par().action);
     envCreateLat(PropagatorField, getName(), Ls_);
+    envTmpLat(PropagatorField, "src_tmp");
+    envCacheLat(LatticeComplex, SeqmomphName_);
+    envTmpLat(LatticeComplex, "coor");
+    envTmpLat(LatticeComplex, "latt_compl");
 }
 // execution ///////////////////////////////////////////////////////////////////
@@ -134,27 +153,79 @@ void TSeqConserved<FImpl>::execute(void)
     if (par().tA == par().tB)
     {
         LOG(Message) << "Generating sequential source with conserved "
-                     << par().curr_type << " current insertion (mu = "
-                     << par().mu << ") at " << "t = " << par().tA << std::endl;
+                     << par().curr_type << " current at "
+                     << "t = " << par().tA << " summed over the indices "
+                     << par().mu_min << " <= mu <= " << par().mu_max
+                     << std::endl;
     }
     else
     {
         LOG(Message) << "Generating sequential source with conserved "
-                     << par().curr_type << " current insertion (mu = "
-                     << par().mu << ") for " << par().tA << " <= t <= "
-                     << par().tB << std::endl;
+                     << par().curr_type << " current for "
+                     << par().tA << " <= t <= "
+                     << par().tB << " summed over the indices "
+                     << par().mu_min << " <= mu <= " << par().mu_max
+                     << std::endl;
     }
     auto &src = envGet(PropagatorField, getName());
+    envGetTmp(PropagatorField, src_tmp);
+    src_tmp = src;
     auto &q   = envGet(PropagatorField, par().q);
     auto &mat = envGet(FMat, par().action);
+    envGetTmp(LatticeComplex, latt_compl);
-    std::vector<Real> mom = strToVec<Real>(par().mom);
-    mat.SeqConservedCurrent(q, src, par().curr_type, par().mu,
-                            mom, par().tA, par().tB);
+    src = zero;
+    //exp(ipx)
+    auto &mom_phase = envGet(LatticeComplex, SeqmomphName_);
+    if (!SeqhasPhase_)
+    {
+        std::vector<Real> mom = strToVec<Real>(par().mom);
+        mom_phase = zero;
+        Complex           i(0.0,1.0);
+        envGetTmp(LatticeComplex, coor);
+        for(unsigned int mu = 0; mu < env().getNd(); mu++)
+        {
+            LatticeCoordinate(coor, mu);
+            mom_phase = mom_phase + (mom[mu]/env().getGrid()->_fdimensions[mu])*coor;
+        }
+        mom_phase = exp((Real)(2*M_PI)*i*mom_phase);
+        SeqhasPhase_ = true;
+    }
+    LOG(Message) << "Inserting momentum " << strToVec<Real>(par().mom) << std::endl;
+    if (!par().photon.empty())
+    {
+        LOG(Message) << "Inserting the stochastic photon field " << par().photon << std::endl;
+    }
+    for(unsigned int mu=par().mu_min;mu<=par().mu_max;mu++)
+    {
+        if (!par().photon.empty())
+        {
+            //Get the stochastic photon field, if required
+            auto &stoch_photon = envGet(EmField, par().photon);
+            latt_compl = PeekIndex<LorentzIndex>(stoch_photon, mu) * mom_phase;
+        }
+        else
+        {
+            latt_compl = mom_phase;
+        }
+        mat.SeqConservedCurrent(q, src_tmp, par().curr_type, mu,
+                                par().tA, par().tB, latt_compl);
+        src += src_tmp;
+    }
 }
 END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
-#endif // Hadrons_SeqConserved_hpp_
+#endif // Hadrons_MSource_SeqConserved_hpp_
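Putting the pieces of execute() together, the generated source is, as a sketch in LaTeX notation (the photon factor only being present when the photon parameter is set):

    \mathrm{src}(x) = \sum_{\mu=\mu_{\min}}^{\mu_{\max}}
    \theta(x_3 - t_A)\,\theta(t_B - x_3)\, J_\mu\, q(x)\,
    e^{\,2\pi i \sum_\nu p_\nu x_\nu / L_\nu}\,\bigl[A_\mu(x)\bigr],

where the exponential is the cached mom_phase field, and the conserved-current insertion J_mu acting on q is carried out by SeqConservedCurrent for each mu in turn before the per-direction contributions are summed into src.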

@@ -111,6 +111,7 @@ void VirtualMachine::pushModule(VirtualMachine::ModPt &pt)
         {
             // output does not exists, add it
             env().addObject(out, address);
+            module_[address].output.push_back(env().getObjectAddress(out));
         }
         else
         {
@@ -296,12 +297,65 @@ void VirtualMachine::makeModuleGraph(void)
     {
         for (auto &in: module_[m].input)
         {
-            graph.addEdge(env().getObjectModule(in), m);
+            int min = env().getObjectModule(in);
+            if (min < 0)
+            {
+                HADRON_ERROR(Definition, "object with address "
+                             + std::to_string(in)
+                             + " is not produced by any module");
+            }
+            else
+            {
+                graph.addEdge(min, m);
+            }
         }
     }
     graph_ = graph;
 }
// dump GraphViz graph /////////////////////////////////////////////////////////
void VirtualMachine::dumpModuleGraph(std::ostream &out)
{
makeModuleGraph();
out << "digraph hadrons {" << std::endl;
out << "node [shape=record, fontname=\"Courier\", fontsize=\"11\"];" << std::endl;
out << "graph [fontname = \"Courier\", fontsize=\"11\"];" << std::endl;
out << "edge [fontname = \"Courier\", fontsize=\"11\"];"<< std::endl;
for (unsigned int m = 0; m < module_.size(); ++m)
{
}
for (unsigned int m = 0; m < module_.size(); ++m)
{
for (auto &in: module_[m].input)
{
int min = env().getObjectModule(in);
out << min << " -> " << m << " [ label = \""
<< env().getObjectName(in) << "\" ];" << std::endl;
}
}
for (unsigned int m = 0; m < module_.size(); ++m)
{
out << m << " [ label = \"{<f0> " << getModule(m)->getRegisteredName()
<< " |<f1> " << getModuleName(m) << "}\" ];" << std::endl;
}
out << "}\n" << std::endl;
}
void VirtualMachine::dumpModuleGraph(void)
{
dumpModuleGraph(std::cout);
}
void VirtualMachine::dumpModuleGraph(const std::string filename)
{
std::ofstream f(filename);
dumpModuleGraph(f);
}
 // memory profile //////////////////////////////////////////////////////////////
 const VirtualMachine::MemoryProfile & VirtualMachine::getMemoryProfile(void)
 {
@@ -327,7 +381,6 @@
     env().protectObjects(false);
     GridLogMessage.Active(false);
     HadronsLogMessage.Active(false);
-    HadronsLogError.Active(false);
     for (auto it = program.rbegin(); it != program.rend(); ++it)
     {
         auto a = *it;
@@ -343,7 +396,6 @@
     env().protectObjects(protect);
     GridLogMessage.Active(gmsg);
     HadronsLogMessage.Active(hmsg);
-    HadronsLogError.Active(err);
     LOG(Debug) << "Memory profile:" << std::endl;
     LOG(Debug) << "----------------" << std::endl;
     for (unsigned int a = 0; a < profile_.module.size(); ++a)
@@ -424,11 +476,17 @@ void VirtualMachine::memoryProfile(const unsigned int address)
     cleanEnvironment();
     for (auto &in: m->getInput())
     {
-        memoryProfile(env().getObjectModule(in));
+        if (!env().hasCreatedObject(in))
+        {
+            memoryProfile(env().getObjectModule(in));
+        }
     }
     for (auto &ref: m->getReference())
     {
-        memoryProfile(env().getObjectModule(ref));
+        if (!env().hasCreatedObject(ref))
+        {
+            memoryProfile(env().getObjectModule(ref));
+        }
     }
     m->setup();
     updateProfile(address);
@@ -532,7 +590,7 @@ VirtualMachine::Program VirtualMachine::schedule(const GeneticPar &par)
     gen = 0;
     do
     {
-        LOG(Debug) << "Generation " << gen << ":" << std::endl;
+        //LOG(Debug) << "Generation " << gen << ":" << std::endl;
         scheduler.nextGeneration();
         if (gen != 0)
         {
@@ -572,6 +630,17 @@ void VirtualMachine::executeProgram(const Program &p) const
     // build garbage collection schedule
     LOG(Debug) << "Building garbage collection schedule..." << std::endl;
     freeProg = makeGarbageSchedule(p);
+    for (unsigned int i = 0; i < freeProg.size(); ++i)
+    {
+        std::string msg = "";
+        for (auto &a: freeProg[i])
+        {
+            msg += env().getObjectName(a) + " ";
+        }
+        msg += "]";
+        LOG(Debug) << std::setw(4) << i + 1 << ": [" << msg << std::endl;
+    }
     // program execution
     LOG(Debug) << "Executing program..." << std::endl;

@@ -84,7 +84,7 @@ private:
         const std::type_info      *type{nullptr};
         std::string               name;
         ModPt                     data{nullptr};
-        std::vector<unsigned int> input;
+        std::vector<unsigned int> input, output;
         size_t                    maxAllocated;
     };
 public:
@@ -120,6 +120,10 @@ public:
     void printContent(void) const;
     // module graph (could be a const reference if topoSort was const)
     Graph<unsigned int> getModuleGraph(void);
+    // dump GraphViz graph
+    void dumpModuleGraph(std::ostream &out);
+    void dumpModuleGraph(void);
+    void dumpModuleGraph(const std::string filename);
     // memory profile
     const MemoryProfile &getMemoryProfile(void);
     // garbage collector

@@ -5,6 +5,7 @@ modules_cc =\
   Modules/MGauge/Unit.cc \
   Modules/MGauge/StochEm.cc \
   Modules/MGauge/Random.cc \
+  Modules/MGauge/FundtoHirep.cc \
   Modules/MScalar/FreeProp.cc \
   Modules/MScalar/ChargedProp.cc \
   Modules/MIO/LoadNersc.cc
@@ -27,9 +28,11 @@ modules_hpp =\
   Modules/MSource/SeqConserved.hpp \
   Modules/MSink/Smear.hpp \
   Modules/MSink/Point.hpp \
+  Modules/MSolver/LocalCoherenceLanczos.hpp \
  Modules/MSolver/RBPrecCG.hpp \
   Modules/MGauge/Unit.hpp \
   Modules/MGauge/Random.hpp \
+  Modules/MGauge/FundtoHirep.hpp \
   Modules/MGauge/StochEm.hpp \
   Modules/MUtilities/TestSeqGamma.hpp \
   Modules/MUtilities/TestSeqConserved.hpp \
@@ -39,9 +42,19 @@ modules_hpp =\
   Modules/MScalar/ChargedProp.hpp \
   Modules/MAction/DWF.hpp \
   Modules/MAction/Wilson.hpp \
+  Modules/MAction/WilsonClover.hpp \
+  Modules/MAction/ZMobiusDWF.hpp \
+  Modules/MScalarSUN/ShiftProbe.hpp \
+  Modules/MScalarSUN/Div.hpp \
   Modules/MScalarSUN/TrMag.hpp \
+  Modules/MScalarSUN/EMT.hpp \
   Modules/MScalarSUN/TwoPoint.hpp \
   Modules/MScalarSUN/TrPhi.hpp \
+  Modules/MScalarSUN/Utils.hpp \
+  Modules/MScalarSUN/TransProj.hpp \
+  Modules/MScalarSUN/TrKinetic.hpp \
+  Modules/MIO/LoadEigenPack.hpp \
   Modules/MIO/LoadNersc.hpp \
+  Modules/MIO/LoadCoarseEigenPack.hpp \
   Modules/MIO/LoadBinary.hpp

@@ -39,6 +39,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/algorithms/approx/MultiShiftFunction.h>
 #include <Grid/algorithms/approx/Forecast.h>
+#include <Grid/algorithms/iterative/Deflation.h>
 #include <Grid/algorithms/iterative/ConjugateGradient.h>
 #include <Grid/algorithms/iterative/ConjugateResidual.h>
 #include <Grid/algorithms/iterative/NormalEquations.h>

@@ -183,11 +183,13 @@ namespace Grid {
     virtual  RealD Mpc      (const Field &in, Field &out) =0;
     virtual  RealD MpcDag   (const Field &in, Field &out) =0;
     virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) {
       Field tmp(in._grid);
+      tmp.checkerboard = in.checkerboard;
       ni=Mpc(in,tmp);
       no=MpcDag(tmp,out);
     }
     virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
+      out.checkerboard = in.checkerboard;
       MpcDagMpc(in,out,n1,n2);
     }
     virtual void HermOp(const Field &in, Field &out){
@@ -215,13 +217,15 @@
   public:
     SchurDiagMooeeOperator (Matrix &Mat): _Mat(Mat){};
     virtual  RealD Mpc      (const Field &in, Field &out) {
       Field tmp(in._grid);
-      // std::cout <<"grid pointers: in._grid="<< in._grid << " out._grid=" << out._grid << " _Mat.Grid=" << _Mat.Grid() << " _Mat.RedBlackGrid=" << _Mat.RedBlackGrid() << std::endl;
+      tmp.checkerboard = !in.checkerboard;
+      //std::cout <<"grid pointers: in._grid="<< in._grid << " out._grid=" << out._grid << " _Mat.Grid=" << _Mat.Grid() << " _Mat.RedBlackGrid=" << _Mat.RedBlackGrid() << std::endl;
       _Mat.Meooe(in,tmp);
       _Mat.MooeeInv(tmp,out);
       _Mat.Meooe(out,tmp);
+      //std::cout << "cb in " << in.checkerboard << " cb out " << out.checkerboard << std::endl;
       _Mat.Mooee(in,out);
       return axpy_norm(out,-1.0,tmp,out);
     }

@@ -0,0 +1,110 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_DEFLATION_H
#define GRID_DEFLATION_H
namespace Grid {
template<class Field>
class Guesser {
public:
Guesser(void) = default;
virtual ~Guesser(void) = default;
virtual void operator()(const Field &src, Field &guess) = 0;
};
template<class Field>
class ZeroGuesser: public Guesser<Field> {
public:
virtual void operator()(const Field &src, Field &guess) { guess = zero; };
};
template<class Field>
class SourceGuesser: public Guesser<Field> {
public:
virtual void operator()(const Field &src, Field &guess) { guess = src; };
};
////////////////////////////////
// Fine grid deflation
////////////////////////////////
template<class Field>
class DeflatedGuesser: public Guesser<Field> {
private:
const std::vector<Field> &evec;
const std::vector<RealD> &eval;
public:
DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval) : evec(_evec), eval(_eval) {};
virtual void operator()(const Field &src,Field &guess) {
guess = zero;
assert(evec.size()==eval.size());
auto N = evec.size();
for (int i=0;i<N;i++) {
const Field& tmp = evec[i];
axpy(guess,TensorRemove(innerProduct(tmp,src)) / eval[i],tmp,guess);
}
}
};
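In formula form, the guess produced above is the standard deflated starting vector

    x_0 = \sum_i \frac{\langle v_i, b\rangle}{\lambda_i}\, v_i,

that is, the exact inverse applied on the span of the supplied eigenpairs (v_i, \lambda_i) of the Hermitian operator and zero on its orthogonal complement.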
template<class FineField, class CoarseField>
class LocalCoherenceDeflatedGuesser: public Guesser<FineField> {
private:
const std::vector<FineField> &subspace;
const std::vector<CoarseField> &evec_coarse;
const std::vector<RealD> &eval_coarse;
public:
LocalCoherenceDeflatedGuesser(const std::vector<FineField> &_subspace,
const std::vector<CoarseField> &_evec_coarse,
const std::vector<RealD> &_eval_coarse)
: subspace(_subspace),
evec_coarse(_evec_coarse),
eval_coarse(_eval_coarse)
{
}
void operator()(const FineField &src,FineField &guess) {
int N = (int)evec_coarse.size();
CoarseField src_coarse(evec_coarse[0]._grid);
CoarseField guess_coarse(evec_coarse[0]._grid); guess_coarse = zero;
blockProject(src_coarse,src,subspace);
for (int i=0;i<N;i++) {
const CoarseField & tmp = evec_coarse[i];
axpy(guess_coarse,TensorRemove(innerProduct(tmp,src_coarse)) / eval_coarse[i],tmp,guess_coarse);
}
blockPromote(guess_coarse,guess,subspace);
};
};
}
#endif
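The local-coherence variant just above applies the same formula on the coarse grid: the source is block-projected onto the subspace basis, deflated with the coarse eigenpairs, and the result promoted back to the fine grid,

    x_0 = P \sum_i \frac{\langle w_i, P^\dagger b\rangle}{\lambda_i}\, w_i,

with P and P^\dagger the blockPromote and blockProject maps and (w_i, \lambda_i) the coarse eigenpairs.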

@@ -149,19 +149,6 @@ void basisSortInPlace(std::vector<Field> & _v,std::vector<RealD>& sort_vals, boo
   basisReorderInPlace(_v,sort_vals,idx);
 }
-// PAB: faster to compute the inner products first then fuse loops.
-// If performance critical can improve.
-template<class Field>
-void basisDeflate(const std::vector<Field> &_v,const std::vector<RealD>& eval,const Field& src_orig,Field& result) {
-  result = zero;
-  assert(_v.size()==eval.size());
-  int N = (int)_v.size();
-  for (int i=0;i<N;i++) {
-    Field& tmp = _v[i];
-    axpy(result,TensorRemove(innerProduct(tmp,src_orig)) / eval[i],tmp,result);
-  }
-}
 /////////////////////////////////////////////////////////////
 // Implicitly restarted lanczos
 /////////////////////////////////////////////////////////////
@@ -181,6 +168,7 @@ enum IRLdiagonalisation {
 template<class Field> class ImplicitlyRestartedLanczosHermOpTester : public ImplicitlyRestartedLanczosTester<Field>
 {
  public:
   LinearFunction<Field> &_HermOp;
   ImplicitlyRestartedLanczosHermOpTester(LinearFunction<Field> &HermOp) : _HermOp(HermOp) { };
   int ReconstructEval(int j,RealD resid,Field &B, RealD &eval,RealD evalMaxApprox)
@@ -243,6 +231,7 @@ class ImplicitlyRestartedLanczos {
   /////////////////////////
  public:
   //////////////////////////////////////////////////////////////////
   // PAB:
   //////////////////////////////////////////////////////////////////

@ -28,7 +28,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_LOCAL_COHERENCE_IRL_H #ifndef GRID_LOCAL_COHERENCE_IRL_H
#define GRID_LOCAL_COHERENCE_IRL_H #define GRID_LOCAL_COHERENCE_IRL_H
namespace Grid { namespace Grid {
struct LanczosParams : Serializable { struct LanczosParams : Serializable {
public: public:
GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParams, GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParams,
@ -70,21 +73,24 @@ public:
typedef Lattice<Fobj> FineField; typedef Lattice<Fobj> FineField;
LinearOperatorBase<FineField> &_Linop; LinearOperatorBase<FineField> &_Linop;
Aggregation<Fobj,CComplex,nbasis> &_Aggregate; std::vector<FineField> &subspace;
ProjectedHermOp(LinearOperatorBase<FineField>& linop, Aggregation<Fobj,CComplex,nbasis> &aggregate) : ProjectedHermOp(LinearOperatorBase<FineField>& linop, std::vector<FineField> & _subspace) :
_Linop(linop), _Linop(linop), subspace(_subspace)
_Aggregate(aggregate) { }; {
assert(subspace.size() >0);
};
void operator()(const CoarseField& in, CoarseField& out) { void operator()(const CoarseField& in, CoarseField& out) {
GridBase *FineGrid = subspace[0]._grid;
int checkerboard = subspace[0].checkerboard;
FineField fin (FineGrid); fin.checkerboard= checkerboard;
FineField fout(FineGrid); fout.checkerboard = checkerboard;
GridBase *FineGrid = _Aggregate.FineGrid; blockPromote(in,fin,subspace); std::cout<<GridLogIRL<<"ProjectedHermop : Promote to fine"<<std::endl;
FineField fin(FineGrid); _Linop.HermOp(fin,fout); std::cout<<GridLogIRL<<"ProjectedHermop : HermOp (fine) "<<std::endl;
FineField fout(FineGrid); blockProject(out,fout,subspace); std::cout<<GridLogIRL<<"ProjectedHermop : Project to coarse "<<std::endl;
_Aggregate.PromoteFromSubspace(in,fin); std::cout<<GridLogIRL<<"ProjectedHermop : Promote to fine"<<std::endl;
_Linop.HermOp(fin,fout); std::cout<<GridLogIRL<<"ProjectedHermop : HermOp (fine) "<<std::endl;
_Aggregate.ProjectToSubspace(out,fout); std::cout<<GridLogIRL<<"ProjectedHermop : Project to coarse "<<std::endl;
} }
}; };
@ -99,24 +105,27 @@ public:
OperatorFunction<FineField> & _poly; OperatorFunction<FineField> & _poly;
LinearOperatorBase<FineField> &_Linop; LinearOperatorBase<FineField> &_Linop;
Aggregation<Fobj,CComplex,nbasis> &_Aggregate; std::vector<FineField> &subspace;
ProjectedFunctionHermOp(OperatorFunction<FineField> & poly,LinearOperatorBase<FineField>& linop, ProjectedFunctionHermOp(OperatorFunction<FineField> & poly,
Aggregation<Fobj,CComplex,nbasis> &aggregate) : LinearOperatorBase<FineField>& linop,
std::vector<FineField> & _subspace) :
_poly(poly), _poly(poly),
_Linop(linop), _Linop(linop),
_Aggregate(aggregate) { }; subspace(_subspace)
{ };
void operator()(const CoarseField& in, CoarseField& out) { void operator()(const CoarseField& in, CoarseField& out) {
GridBase *FineGrid = _Aggregate.FineGrid;
FineField fin(FineGrid) ;fin.checkerboard =_Aggregate.checkerboard;
FineField fout(FineGrid);fout.checkerboard =_Aggregate.checkerboard;
_Aggregate.PromoteFromSubspace(in,fin); std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Promote to fine"<<std::endl; GridBase *FineGrid = subspace[0]._grid;
int checkerboard = subspace[0].checkerboard;
FineField fin (FineGrid); fin.checkerboard =checkerboard;
FineField fout(FineGrid);fout.checkerboard =checkerboard;
blockPromote(in,fin,subspace); std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Promote to fine"<<std::endl;
_poly(_Linop,fin,fout); std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Poly "<<std::endl; _poly(_Linop,fin,fout); std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Poly "<<std::endl;
_Aggregate.ProjectToSubspace(out,fout); std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Project to coarse "<<std::endl; blockProject(out,fout,subspace); std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Project to coarse "<<std::endl;
} }
}; };
@ -132,19 +141,23 @@ class ImplicitlyRestartedLanczosSmoothedTester : public ImplicitlyRestartedLanc
LinearFunction<CoarseField> & _Poly; LinearFunction<CoarseField> & _Poly;
OperatorFunction<FineField> & _smoother; OperatorFunction<FineField> & _smoother;
LinearOperatorBase<FineField> &_Linop; LinearOperatorBase<FineField> &_Linop;
Aggregation<Fobj,CComplex,nbasis> &_Aggregate; RealD _coarse_relax_tol;
RealD _coarse_relax_tol; std::vector<FineField> &_subspace;
ImplicitlyRestartedLanczosSmoothedTester(LinearFunction<CoarseField> &Poly, ImplicitlyRestartedLanczosSmoothedTester(LinearFunction<CoarseField> &Poly,
OperatorFunction<FineField> &smoother, OperatorFunction<FineField> &smoother,
LinearOperatorBase<FineField> &Linop, LinearOperatorBase<FineField> &Linop,
Aggregation<Fobj,CComplex,nbasis> &Aggregate, std::vector<FineField> &subspace,
RealD coarse_relax_tol=5.0e3) RealD coarse_relax_tol=5.0e3)
: _smoother(smoother), _Linop(Linop),_Aggregate(Aggregate), _Poly(Poly), _coarse_relax_tol(coarse_relax_tol) { }; : _smoother(smoother), _Linop(Linop), _Poly(Poly), _subspace(subspace),
_coarse_relax_tol(coarse_relax_tol)
{ };
int TestConvergence(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox) int TestConvergence(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
{ {
CoarseField v(B); CoarseField v(B);
RealD eval_poly = eval; RealD eval_poly = eval;
// Apply operator // Apply operator
_Poly(B,v); _Poly(B,v);
@ -168,14 +181,13 @@ class ImplicitlyRestartedLanczosSmoothedTester : public ImplicitlyRestartedLanc
} }
int ReconstructEval(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox) int ReconstructEval(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
{ {
GridBase *FineGrid = _Aggregate.FineGrid; GridBase *FineGrid = _subspace[0]._grid;
int checkerboard = _subspace[0].checkerboard;
int checkerboard = _Aggregate.checkerboard;
FineField fB(FineGrid);fB.checkerboard =checkerboard; FineField fB(FineGrid);fB.checkerboard =checkerboard;
FineField fv(FineGrid);fv.checkerboard =checkerboard; FineField fv(FineGrid);fv.checkerboard =checkerboard;
_Aggregate.PromoteFromSubspace(B,fv); blockPromote(B,fv,_subspace);
_smoother(_Linop,fv,fB); _smoother(_Linop,fv,fB);
RealD eval_poly = eval; RealD eval_poly = eval;
@ -217,27 +229,65 @@ protected:
  int _checkerboard;
  LinearOperatorBase<FineField> & _FineOp;

  std::vector<RealD>       &evals_fine;
  std::vector<RealD>       &evals_coarse;
  std::vector<FineField>   &subspace;
  std::vector<CoarseField> &evec_coarse;

private:
  std::vector<RealD>       _evals_fine;
  std::vector<RealD>       _evals_coarse;
  std::vector<FineField>   _subspace;
  std::vector<CoarseField> _evec_coarse;

public:

  LocalCoherenceLanczos(GridBase *FineGrid,
                        GridBase *CoarseGrid,
                        LinearOperatorBase<FineField> &FineOp,
                        int checkerboard) :
    _CoarseGrid(CoarseGrid),
    _FineGrid(FineGrid),
    _FineOp(FineOp),
    _checkerboard(checkerboard),
    evals_fine  (_evals_fine),
    evals_coarse(_evals_coarse),
    subspace    (_subspace),
    evec_coarse (_evec_coarse)
  {
    evals_fine.resize(0);
    evals_coarse.resize(0);
  };
  //////////////////////////////////////////////////////////////////////////
  // Alternate constructor, external storage for use by Hadrons module
  //////////////////////////////////////////////////////////////////////////
LocalCoherenceLanczos(GridBase *FineGrid,
GridBase *CoarseGrid,
LinearOperatorBase<FineField> &FineOp,
int checkerboard,
std::vector<FineField> &ext_subspace,
std::vector<CoarseField> &ext_coarse,
std::vector<RealD> &ext_eval_fine,
std::vector<RealD> &ext_eval_coarse
) :
_CoarseGrid(CoarseGrid),
_FineGrid(FineGrid),
_FineOp(FineOp),
_checkerboard(checkerboard),
evals_fine (ext_eval_fine),
evals_coarse(ext_eval_coarse),
subspace (ext_subspace),
evec_coarse (ext_coarse)
{
evals_fine.resize(0);
evals_coarse.resize(0);
};
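The two constructors differ only in whether the public reference members bind to the private `_subspace`/`_evec_coarse`/`_evals_*` vectors or to caller-owned storage that outlives the solver (the Hadrons use case). A standalone sketch of that storage pattern, with a hypothetical `Cache` class that is not Grid code:

```cpp
#include <cstdio>
#include <vector>

// Illustration of the pattern above: a public reference member binds either
// to internal storage (default constructor) or to external, caller-owned
// storage (second constructor), so results can outlive the object.
class Cache {
  std::vector<double> _evals;                       // internal storage
public:
  std::vector<double> &evals;                       // what users actually read
  Cache() : evals(_evals) {}                        // self-owned storage
  Cache(std::vector<double> &ext) : evals(ext) {}   // external storage
  void compute() { evals.assign(3, 1.0); }          // stand-in for the solve
};

int main() {
  std::vector<double> external;
  { Cache c(external); c.compute(); }               // object destroyed here
  std::printf("%zu values survive the solver\n", external.size());
  return 0;
}
```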
void Orthogonalise(void ) {
CoarseScalar InnerProd(_CoarseGrid);
blockOrthogonalise(InnerProd,subspace);std::cout << GridLogMessage <<" Gramm-Schmidt pass 1"<<std::endl;
blockOrthogonalise(InnerProd,subspace);std::cout << GridLogMessage <<" Gramm-Schmidt pass 2"<<std::endl;
};
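The orthogonalisation is deliberately applied twice: a single (block) Gram-Schmidt pass in finite precision can leave a residual overlap between the later vectors, and repeating the pass drives it down to rounding level. A standalone numerical illustration with ordinary vectors, not Grid code:

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// Gram-Schmidt on nearly parallel vectors: one pass leaves a residual
// overlap well above machine precision; a second pass removes it.
static double dot(const std::vector<double> &a, const std::vector<double> &b) {
  double s = 0; for (size_t i = 0; i < a.size(); i++) s += a[i] * b[i]; return s;
}
static void orthonormalise(std::vector<std::vector<double>> &v) {
  for (size_t k = 0; k < v.size(); k++) {
    for (size_t j = 0; j < k; j++) {
      double c = dot(v[j], v[k]);
      for (size_t i = 0; i < v[k].size(); i++) v[k][i] -= c * v[j][i];
    }
    double n = std::sqrt(dot(v[k], v[k]));
    for (size_t i = 0; i < v[k].size(); i++) v[k][i] /= n;
  }
}
int main() {
  std::vector<std::vector<double>> v = {{1.0, 1e-10, 0.0},
                                        {1.0, 0.0, 1e-10},
                                        {1.0, 1e-10, 1e-10}};
  orthonormalise(v); std::printf("after pass 1: <v0,v1> = %.3e\n", dot(v[0], v[1]));
  orthonormalise(v); std::printf("after pass 2: <v0,v1> = %.3e\n", dot(v[0], v[1]));
  return 0;
}
```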
template<typename T> static RealD normalise(T& v) template<typename T> static RealD normalise(T& v)
{ {
@ -246,43 +296,44 @@ public:
v = v * (1.0/nn); v = v * (1.0/nn);
return nn; return nn;
} }
  /*
  void fakeFine(void)
  {
    int Nk = nbasis;
    subspace.resize(Nk,_FineGrid);
    subspace[0]=1.0;
    subspace[0].checkerboard=_checkerboard;
    normalise(subspace[0]);
    PlainHermOp<FineField> Op(_FineOp);
    for(int k=1;k<Nk;k++){
      subspace[k].checkerboard=_checkerboard;
      Op(subspace[k-1],subspace[k]);
      normalise(subspace[k]);
    }
  }
  */
  void testFine(RealD resid)
  {
    assert(evals_fine.size() == nbasis);
    assert(subspace.size() == nbasis);
    PlainHermOp<FineField> Op(_FineOp);
    ImplicitlyRestartedLanczosHermOpTester<FineField> SimpleTester(Op);
    for(int k=0;k<nbasis;k++){
      assert(SimpleTester.ReconstructEval(k,resid,subspace[k],evals_fine[k],1.0)==1);
    }
  }
  void testCoarse(RealD resid,ChebyParams cheby_smooth,RealD relax)
  {
    assert(evals_fine.size() == nbasis);
    assert(subspace.size() == nbasis);
    //////////////////////////////////////////////////////////////////////////////////////////////////
    // create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
    //////////////////////////////////////////////////////////////////////////////////////////////////
    Chebyshev<FineField> ChebySmooth(cheby_smooth);
    ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (ChebySmooth,_FineOp,subspace);
    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax);

    for(int k=0;k<evec_coarse.size();k++){
      if ( k < nbasis ) {
@ -302,34 +353,34 @@ public:
    PlainHermOp<FineField> Op(_FineOp);

    evals_fine.resize(Nm);
    subspace.resize(Nm,_FineGrid);

    ImplicitlyRestartedLanczos<FineField> IRL(ChebyOp,Op,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes);

    FineField src(_FineGrid); src=1.0; src.checkerboard = _checkerboard;

    int Nconv;
    IRL.calc(evals_fine,subspace,src,Nconv,false);

    // Shrink down to number saved
    assert(Nstop>=nbasis);
    assert(Nconv>=nbasis);
    evals_fine.resize(nbasis);
    subspace.resize(nbasis,_FineGrid);
  }
  void calcCoarse(ChebyParams cheby_op,ChebyParams cheby_smooth,RealD relax,
                  int Nstop, int Nk, int Nm,RealD resid,
                  RealD MaxIt, RealD betastp, int MinRes)
  {
    Chebyshev<FineField> Cheby(cheby_op);
    ProjectedHermOp<Fobj,CComplex,nbasis> Op(_FineOp,subspace);
    ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,subspace);
    //////////////////////////////////////////////////////////////////////////////////////////////////
    // create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
    //////////////////////////////////////////////////////////////////////////////////////////////////
    Chebyshev<FineField> ChebySmooth(cheby_smooth);
    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax);

    evals_coarse.resize(Nm);
    evec_coarse.resize(Nm,_CoarseGrid);
View File
@ -107,7 +107,12 @@ namespace Grid {
}; };
template<class Matrix> template<class Matrix>
void operator() (Matrix & _Matrix,const Field &in, Field &out){ void operator() (Matrix & _Matrix,const Field &in, Field &out){
ZeroGuesser<Field> guess;
(*this)(_Matrix,in,out,guess);
}
template<class Matrix, class Guesser>
void operator() (Matrix & _Matrix,const Field &in, Field &out, Guesser &guess){
// FIXME CGdiagonalMee not implemented virtual function // FIXME CGdiagonalMee not implemented virtual function
// FIXME use CBfactorise to control schur decomp // FIXME use CBfactorise to control schur decomp
@ -129,7 +134,6 @@ namespace Grid {
pickCheckerboard(Odd ,src_o,in); pickCheckerboard(Odd ,src_o,in);
pickCheckerboard(Even,sol_e,out); pickCheckerboard(Even,sol_e,out);
pickCheckerboard(Odd ,sol_o,out); pickCheckerboard(Odd ,sol_o,out);
std::cout << GridLogMessage << " SchurRedBlackStaggeredSolve checkerboards picked" <<std::endl; std::cout << GridLogMessage << " SchurRedBlackStaggeredSolve checkerboards picked" <<std::endl;
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
@ -146,6 +150,7 @@ namespace Grid {
// Call the red-black solver // Call the red-black solver
////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////
std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver calling the Mpc solver" <<std::endl; std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver calling the Mpc solver" <<std::endl;
guess(src_o,sol_o);
_HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.checkerboard==Odd); _HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.checkerboard==Odd);
std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver called the Mpc solver" <<std::endl; std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver called the Mpc solver" <<std::endl;
@ -189,7 +194,12 @@ namespace Grid {
CBfactorise=cb; CBfactorise=cb;
}; };
template<class Matrix> template<class Matrix>
void operator() (Matrix & _Matrix,const Field &in, Field &out){ void operator() (Matrix & _Matrix,const Field &in, Field &out){
ZeroGuesser<Field> guess;
(*this)(_Matrix,in,out,guess);
}
template<class Matrix, class Guesser>
void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){
// FIXME CGdiagonalMee not implemented virtual function // FIXME CGdiagonalMee not implemented virtual function
// FIXME use CBfactorise to control schur decomp // FIXME use CBfactorise to control schur decomp
@ -225,6 +235,7 @@ namespace Grid {
// Call the red-black solver // Call the red-black solver
////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////
std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl; std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
guess(src_o,sol_o);
_HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.checkerboard==Odd); _HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.checkerboard==Odd);
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
@ -268,7 +279,12 @@ namespace Grid {
}; };
template<class Matrix> template<class Matrix>
void operator() (Matrix & _Matrix,const Field &in, Field &out){ void operator() (Matrix & _Matrix,const Field &in, Field &out){
ZeroGuesser<Field> guess;
(*this)(_Matrix,in,out,guess);
}
template<class Matrix,class Guesser>
void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){
// FIXME CGdiagonalMee not implemented virtual function // FIXME CGdiagonalMee not implemented virtual function
// FIXME use CBfactorise to control schur decomp // FIXME use CBfactorise to control schur decomp
@ -305,6 +321,7 @@ namespace Grid {
////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////
std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl; std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
// _HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.checkerboard==Odd); // _HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.checkerboard==Odd);
guess(src_o,tmp);
_HermitianRBSolver(_HermOpEO,src_o,tmp); assert(tmp.checkerboard==Odd); _HermitianRBSolver(_HermOpEO,src_o,tmp); assert(tmp.checkerboard==Odd);
_Matrix.MooeeInv(tmp,sol_o); assert( sol_o.checkerboard ==Odd); _Matrix.MooeeInv(tmp,sol_o); assert( sol_o.checkerboard ==Odd);
@ -347,7 +364,12 @@ namespace Grid {
}; };
template<class Matrix> template<class Matrix>
void operator() (Matrix & _Matrix,const Field &in, Field &out){ void operator() (Matrix & _Matrix,const Field &in, Field &out){
ZeroGuesser<Field> guess;
(*this)(_Matrix,in,out,guess);
}
template<class Matrix, class Guesser>
void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){
// FIXME CGdiagonalMee not implemented virtual function // FIXME CGdiagonalMee not implemented virtual function
// FIXME use CBfactorise to control schur decomp // FIXME use CBfactorise to control schur decomp
@ -385,6 +407,7 @@ namespace Grid {
std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl; std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
// _HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.checkerboard==Odd); // _HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.checkerboard==Odd);
// _HermitianRBSolver(_HermOpEO,src_o,tmp); assert(tmp.checkerboard==Odd); // _HermitianRBSolver(_HermOpEO,src_o,tmp); assert(tmp.checkerboard==Odd);
guess(src_o,tmp);
_HermitianRBSolver(src_o,tmp); assert(tmp.checkerboard==Odd); _HermitianRBSolver(src_o,tmp); assert(tmp.checkerboard==Odd);
_Matrix.MooeeInv(tmp,sol_o); assert( sol_o.checkerboard ==Odd); _Matrix.MooeeInv(tmp,sol_o); assert( sol_o.checkerboard ==Odd);
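The four `operator()` overloads above all gained a form taking a `Guesser`, with `ZeroGuesser<Field>` reproducing the old zero starting vector. The only requirement on a guesser is that it is callable as `guess(src_o, sol_o)` on the odd-checkerboard fields. A standalone sketch of that concept; `ZeroGuess`, `RecycleGuess` and `Field` are illustrative stand-ins, not Grid classes:

```cpp
#include <cstdio>
#include <vector>

using Field = std::vector<double>;   // stand-in for a Grid lattice field

struct ZeroGuess {                   // mirrors the role of ZeroGuesser<Field>
  void operator()(const Field &src, Field &sol) const { sol.assign(src.size(), 0.0); }
};

struct RecycleGuess {                // e.g. reuse a previously converged solution
  Field last;
  void operator()(const Field &src, Field &sol) const {
    if (last.size() == src.size()) sol = last;
    else                           sol.assign(src.size(), 0.0);
  }
};

template <class Guesser>
void solve(const Field &src, Field &sol, Guesser &guess) {
  guess(src, sol);                   // seed the iterative solve
  // ... iterate to convergence (omitted in this sketch) ...
}

int main() {
  Field src(8, 1.0), sol;
  ZeroGuess zg;
  solve(src, sol, zg);
  std::printf("initial guess[0] = %f\n", sol[0]);
  return 0;
}
```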
View File
@ -79,6 +79,8 @@ public:
std::vector<int> _lstart; // local start of array in gcoors _processor_coor[d]*_ldimensions[d] std::vector<int> _lstart; // local start of array in gcoors _processor_coor[d]*_ldimensions[d]
std::vector<int> _lend ; // local end of array in gcoors _processor_coor[d]*_ldimensions[d]+_ldimensions_[d]-1 std::vector<int> _lend ; // local end of array in gcoors _processor_coor[d]*_ldimensions[d]+_ldimensions_[d]-1
bool _isCheckerBoarded;
public: public:
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
View File
@ -97,6 +97,7 @@ public:
/////////////////////// ///////////////////////
// Grid information // Grid information
/////////////////////// ///////////////////////
_isCheckerBoarded = false;
_ndimension = dimensions.size(); _ndimension = dimensions.size();
_fdimensions.resize(_ndimension); _fdimensions.resize(_ndimension);
View File
@ -171,9 +171,8 @@ public:
const std::vector<int> &checker_dim_mask, const std::vector<int> &checker_dim_mask,
int checker_dim) int checker_dim)
{ {
    _isCheckerBoarded = true;
    _checker_dim = checker_dim;
    assert(checker_dim_mask[checker_dim] == 1);
    _ndimension = dimensions.size();
View File
@ -44,11 +44,15 @@ void CartesianCommunicator::Init(int *argc, char ***argv)
MPI_Initialized(&flag); // needed to coexist with other libs apparently MPI_Initialized(&flag); // needed to coexist with other libs apparently
if ( !flag ) { if ( !flag ) {
MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided); MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
    //If only 1 comms thread we require any threading mode other than SINGLE, but for multiple comms threads we need MULTIPLE
    if( (nCommThreads == 1 && provided == MPI_THREAD_SINGLE) ||
        (nCommThreads > 1  && provided != MPI_THREAD_MULTIPLE) )
      assert(0);
  }
Grid_quiesce_nodes(); Grid_quiesce_nodes();
// Never clean up as done once.
MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world); MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
GlobalSharedMemory::Init(communicator_world); GlobalSharedMemory::Init(communicator_world);
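The relaxed requirement encoded above: with a single communications thread any provided level other than `MPI_THREAD_SINGLE` is accepted, while several communicating threads still demand `MPI_THREAD_MULTIPLE`. A minimal standalone sketch of the same check; `nCommThreads` here is a local stand-in for Grid's setting:

```cpp
#include <mpi.h>
#include <cassert>

int main(int argc, char **argv) {
  int nCommThreads = 1;   // assumption: in Grid this comes from the command line
  int provided;
  MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
  // One comms thread: FUNNELED/SERIALIZED/MULTIPLE are all acceptable.
  // Several comms threads: only MULTIPLE is safe.
  if ((nCommThreads == 1 && provided == MPI_THREAD_SINGLE) ||
      (nCommThreads >  1 && provided != MPI_THREAD_MULTIPLE)) {
    assert(0 && "insufficient MPI threading support");
  }
  MPI_Finalize();
  return 0;
}
```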
@ -85,9 +89,17 @@ void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &c
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors) CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{ {
  MPI_Comm optimal_comm;
  ////////////////////////////////////////////////////
  // Remap using the shared memory optimising routine
  // The remap creates a comm which must be freed
  ////////////////////////////////////////////////////
  GlobalSharedMemory::OptimalCommunicator (processors,optimal_comm);
  InitFromMPICommunicator(processors,optimal_comm);
  SetCommunicator(optimal_comm);
///////////////////////////////////////////////////
// Free the temp communicator
///////////////////////////////////////////////////
MPI_Comm_free(&optimal_comm);
} }
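Both constructor paths now treat the communicator they are handed as a temporary: it is duplicated into the communicator the object keeps (inside SetCommunicator / InitFromMPICommunicator) and then freed, and the split constructor below applies the same MPI_Comm_dup / MPI_Comm_free discipline. A standalone sketch of that handle-lifetime pattern:

```cpp
#include <mpi.h>

// A temporary communicator is duplicated into the handle we keep, then freed.
int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);

  MPI_Comm temp;                        // e.g. the result of a remapping step
  MPI_Comm_dup(MPI_COMM_WORLD, &temp);

  MPI_Comm kept;                        // what the object actually retains
  MPI_Comm_dup(temp, &kept);

  MPI_Comm_free(&temp);                 // the temporary is no longer needed
  // ... use 'kept' for the lifetime of the object ...
  MPI_Comm_free(&kept);

  MPI_Finalize();
  return 0;
}
```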
////////////////////////////////// //////////////////////////////////
@ -183,8 +195,8 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,
  } else {
    srank = 0;
    int ierr = MPI_Comm_dup (parent.communicator,&comm_split);
    assert(ierr==0);
  }
////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////
@ -196,6 +208,11 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,
// Take the right SHM buffers // Take the right SHM buffers
////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////
SetCommunicator(comm_split); SetCommunicator(comm_split);
///////////////////////////////////////////////
// Free the temp communicator
///////////////////////////////////////////////
MPI_Comm_free(&comm_split);
if(0){ if(0){
std::cout << " ndim " <<_ndimension<<" " << parent._ndimension << std::endl; std::cout << " ndim " <<_ndimension<<" " << parent._ndimension << std::endl;
@ -210,6 +227,9 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,
void CartesianCommunicator::InitFromMPICommunicator(const std::vector<int> &processors, MPI_Comm communicator_base) void CartesianCommunicator::InitFromMPICommunicator(const std::vector<int> &processors, MPI_Comm communicator_base)
{ {
////////////////////////////////////////////////////
// Creates communicator, and the communicator_halo
////////////////////////////////////////////////////
_ndimension = processors.size(); _ndimension = processors.size();
_processor_coor.resize(_ndimension); _processor_coor.resize(_ndimension);
View File
@ -133,6 +133,7 @@ class SharedMemory
public: public:
SharedMemory() {}; SharedMemory() {};
~SharedMemory();
/////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////
// set the buffers & sizes // set the buffers & sizes
/////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////
View File
@ -182,6 +182,7 @@ void GlobalSharedMemory::OptimalCommunicator(const std::vector<int> &processors,
#ifdef GRID_MPI3_SHMMMAP #ifdef GRID_MPI3_SHMMMAP
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{ {
std::cout << "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<<std::endl;
assert(_ShmSetup==1); assert(_ShmSetup==1);
assert(_ShmAlloc==0); assert(_ShmAlloc==0);
////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////
@ -218,6 +219,49 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
assert(((uint64_t)ptr&0x3F)==0); assert(((uint64_t)ptr&0x3F)==0);
close(fd); close(fd);
WorldShmCommBufs[r] =ptr; WorldShmCommBufs[r] =ptr;
std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
}
_ShmAlloc=1;
_ShmAllocBytes = bytes;
};
#endif // MMAP
#ifdef GRID_MPI3_SHM_NONE
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
std::cout << "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// allocate the shared windows for our group
//////////////////////////////////////////////////////////////////////////////////////////////////////////
MPI_Barrier(WorldShmComm);
WorldShmCommBufs.resize(WorldShmSize);
////////////////////////////////////////////////////////////////////////////////////////////
// Hugetlbf and others map filesystems as mappable huge pages
////////////////////////////////////////////////////////////////////////////////////////////
char shm_name [NAME_MAX];
assert(WorldShmSize == 1);
for(int r=0;r<WorldShmSize;r++){
int fd=-1;
int mmap_flag = MAP_SHARED |MAP_ANONYMOUS ;
#ifdef MAP_POPULATE
mmap_flag|=MAP_POPULATE;
#endif
#ifdef MAP_HUGETLB
if ( flags ) mmap_flag |= MAP_HUGETLB;
#endif
void *ptr = (void *) mmap(NULL, bytes, PROT_READ | PROT_WRITE, mmap_flag,fd, 0);
if ( ptr == (void *)MAP_FAILED ) {
printf("mmap %s failed\n",shm_name);
perror("failed mmap"); assert(0);
}
assert(((uint64_t)ptr&0x3F)==0);
close(fd);
WorldShmCommBufs[r] =ptr;
std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
} }
_ShmAlloc=1; _ShmAlloc=1;
_ShmAllocBytes = bytes; _ShmAllocBytes = bytes;
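The new GRID_MPI3_SHM_NONE path backs the communication buffer with an anonymous (optionally huge-page) mapping instead of a shm_open file, which is only valid when a single rank owns the whole node, hence the `WorldShmSize == 1` assert. A minimal standalone sketch of that allocation; the size and flag choices here are illustrative only:

```cpp
#include <sys/mman.h>
#include <cstdint>
#include <cstdio>

int main() {
  uint64_t bytes = 1ULL << 20;                     // 1 MiB for the sketch
  int mmap_flag = MAP_SHARED | MAP_ANONYMOUS;      // no backing file, fd = -1
#ifdef MAP_HUGETLB
  // mmap_flag |= MAP_HUGETLB;                     // optional, as in the Grid path
#endif
  void *ptr = mmap(NULL, bytes, PROT_READ | PROT_WRITE, mmap_flag, -1, 0);
  if (ptr == MAP_FAILED) { std::perror("mmap"); return 1; }
  std::printf("anonymous buffer at %p (%llu bytes)\n", ptr,
              (unsigned long long)bytes);
  munmap(ptr, bytes);
  return 0;
}
```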
@ -232,6 +276,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{ {
std::cout << "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl;
assert(_ShmSetup==1); assert(_ShmSetup==1);
assert(_ShmAlloc==0); assert(_ShmAlloc==0);
MPI_Barrier(WorldShmComm); MPI_Barrier(WorldShmComm);
@ -243,7 +288,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
      size_t size = bytes;

      sprintf(shm_name,"/myGrid_mpi3_shm_%d_%d",WorldNode,r);

      shm_unlink(shm_name);
      int fd=shm_open(shm_name,O_RDWR|O_CREAT,0666);
@ -259,7 +304,11 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
#endif #endif
      void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0);

      std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< size<< "bytes)"<<std::endl;
      if ( ptr == (void * )MAP_FAILED ) {
        perror("failed mmap");
        assert(0);
      }
assert(((uint64_t)ptr&0x3F)==0); assert(((uint64_t)ptr&0x3F)==0);
WorldShmCommBufs[r] =ptr; WorldShmCommBufs[r] =ptr;
@ -318,11 +367,12 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
heap_size = GlobalSharedMemory::ShmAllocBytes(); heap_size = GlobalSharedMemory::ShmAllocBytes();
  for(int r=0;r<ShmSize;r++){
    uint32_t wsr = (r==ShmRank) ? GlobalSharedMemory::WorldShmRank : 0 ;

    MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm);

    ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr];
    //    std::cout << "SetCommunicator ShmCommBufs ["<< r<< "] = "<< ShmCommBufs[r]<< " wsr = "<<wsr<<std::endl;
  }
ShmBufferFreeAll(); ShmBufferFreeAll();
@ -391,5 +441,12 @@ void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
return (void *) remote; return (void *) remote;
} }
} }
SharedMemory::~SharedMemory()
{
int MPI_is_finalised; MPI_Finalized(&MPI_is_finalised);
if ( !MPI_is_finalised ) {
MPI_Comm_free(&ShmComm);
}
};
} }
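The new destructor only releases ShmComm while `MPI_Finalized` reports that MPI is still alive, since freeing MPI handles after `MPI_Finalize` is erroneous (for example when the object is destroyed during static tear-down). A standalone sketch of the guarded clean-up:

```cpp
#include <mpi.h>

// Never free MPI handles once MPI_Finalize() has run.
struct CommHolder {
  MPI_Comm comm = MPI_COMM_NULL;
  ~CommHolder() {
    int finalised = 0;
    MPI_Finalized(&finalised);
    if (!finalised && comm != MPI_COMM_NULL) MPI_Comm_free(&comm);
  }
};

int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);
  {
    CommHolder h;
    MPI_Comm_dup(MPI_COMM_WORLD, &h.comm);
  }                                  // freed here, safely before MPI_Finalize
  MPI_Finalize();
  return 0;
}
```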
View File
@ -122,5 +122,7 @@ void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
{ {
return NULL; return NULL;
} }
SharedMemory::~SharedMemory()
{};
} }
View File
@ -198,7 +198,7 @@ namespace Grid {
typedef typename vsimd::scalar_type scalar;\ typedef typename vsimd::scalar_type scalar;\
return Comparison(functor<scalar,scalar>(),lhs,rhs);\ return Comparison(functor<scalar,scalar>(),lhs,rhs);\
}\ }\
  template<class vsimd,IfSimd<vsimd> = 0>\
inline vInteger operator op(const iScalar<vsimd> &lhs,const iScalar<vsimd> &rhs)\ inline vInteger operator op(const iScalar<vsimd> &lhs,const iScalar<vsimd> &rhs)\
{ \ { \
return lhs._internal op rhs._internal; \ return lhs._internal op rhs._internal; \
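The added `IfSimd<vsimd> = 0` parameter keeps this comparison overload out of overload resolution unless `vsimd` really is a Grid SIMD type. A standalone sketch of the same enable_if idiom; `vFloat`, `is_simd` and `IfSimd` here are illustrative, not the Grid definitions:

```cpp
#include <cstdio>
#include <type_traits>

struct vFloat { float lane[8]; };                 // pretend SIMD type

template <class T> struct is_simd : std::false_type {};
template <>        struct is_simd<vFloat> : std::true_type {};

template <class T>
using IfSimd = typename std::enable_if<is_simd<T>::value, int>::type;

// Participates in overload resolution only when T is a SIMD type.
template <class T, IfSimd<T> = 0>
const char *classify(const T &) { return "simd overload"; }

const char *classify(double) { return "scalar overload"; }

int main() {
  vFloat v{};
  std::printf("%s\n", classify(v));     // simd overload
  std::printf("%s\n", classify(1.0));   // scalar overload
  return 0;
}
```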
View File
@ -86,7 +86,7 @@ protected:
Colours &Painter; Colours &Painter;
int active; int active;
int timing_mode; int timing_mode;
  int topWidth{-1}, chanWidth{-1};
static int timestamp; static int timestamp;
std::string name, topName; std::string name, topName;
std::string COLOUR; std::string COLOUR;
@ -126,6 +126,7 @@ public:
} }
} }
void setTopWidth(const int w) {topWidth = w;} void setTopWidth(const int w) {topWidth = w;}
void setChanWidth(const int w) {chanWidth = w;}
friend std::ostream& operator<< (std::ostream& stream, Logger& log){ friend std::ostream& operator<< (std::ostream& stream, Logger& log){
@ -136,7 +137,12 @@ public:
stream << std::setw(log.topWidth); stream << std::setw(log.topWidth);
} }
      stream << log.topName << log.background()<< " : ";
      stream << log.colour() << std::left;
      if (log.chanWidth > 0)
      {
        stream << std::setw(log.chanWidth);
      }
      stream << log.name << log.background() << " : ";
if ( log.timestamp ) { if ( log.timestamp ) {
log.StopWatch->Stop(); log.StopWatch->Stop();
GridTime now = log.StopWatch->Elapsed(); GridTime now = log.StopWatch->Elapsed();
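`setChanWidth` pads the channel name with `std::setw` so that the different logger channels line up in columns. A standalone illustration of the formatting effect (channel names are examples):

```cpp
#include <iomanip>
#include <iostream>

int main() {
  const int chanWidth = 14;   // analogous to Logger::setChanWidth(14)
  for (const char *chan : {"Message", "Error", "HadronsMessage"}) {
    // std::setw pads only the next field, here the channel name.
    std::cout << "Grid : " << std::left << std::setw(chanWidth) << chan
              << " : example line" << std::endl;
  }
  return 0;
}
```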
View File
@ -91,7 +91,7 @@ class BinaryIO {
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
GridBase *grid = lat._grid; GridBase *grid = lat._grid;
int lsites = grid->lSites(); uint64_t lsites = grid->lSites();
std::vector<sobj> scalardata(lsites); std::vector<sobj> scalardata(lsites);
unvectorizeToLexOrdArray(scalardata,lat); unvectorizeToLexOrdArray(scalardata,lat);
@ -160,7 +160,9 @@ class BinaryIO {
      /*
       * Scidac csum is rather more heavyweight
       * FIXME -- 128^3 x 256 x 16 will overflow.
       */
int global_site; int global_site;
Lexicographic::CoorFromIndex(coor,local_site,local_vol); Lexicographic::CoorFromIndex(coor,local_site,local_vol);
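The accompanying change of site counts and offsets from `int`/`Integer` to `uint64_t` is forced by exactly the volume quoted in the FIXME: 128^3 x 256 x 16 = 8,589,934,592, which exceeds UINT32_MAX = 4,294,967,295, so a 32-bit counter would wrap. A quick standalone check:

```cpp
#include <cstdint>
#include <cstdio>

// 128^3 x 256 x 16 no longer fits in 32 bits, so 32-bit counters would wrap.
int main() {
  uint64_t v = 128ULL * 128 * 128 * 256 * 16;      // 8,589,934,592
  std::printf("value           = %llu\n", (unsigned long long)v);
  std::printf("UINT32_MAX      = %llu\n", (unsigned long long)UINT32_MAX);
  std::printf("fits in 32 bits = %s\n", v <= UINT32_MAX ? "yes" : "no");
  return 0;
}
```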
@ -261,7 +263,7 @@ class BinaryIO {
GridBase *grid, GridBase *grid,
std::vector<fobj> &iodata, std::vector<fobj> &iodata,
std::string file, std::string file,
Integer offset, uint64_t offset,
const std::string &format, int control, const std::string &format, int control,
uint32_t &nersc_csum, uint32_t &nersc_csum,
uint32_t &scidac_csuma, uint32_t &scidac_csuma,
@ -523,7 +525,7 @@ class BinaryIO {
static inline void readLatticeObject(Lattice<vobj> &Umu, static inline void readLatticeObject(Lattice<vobj> &Umu,
std::string file, std::string file,
munger munge, munger munge,
Integer offset, uint64_t offset,
const std::string &format, const std::string &format,
uint32_t &nersc_csum, uint32_t &nersc_csum,
uint32_t &scidac_csuma, uint32_t &scidac_csuma,
@ -533,7 +535,7 @@ class BinaryIO {
typedef typename vobj::Realified::scalar_type word; word w=0; typedef typename vobj::Realified::scalar_type word; word w=0;
GridBase *grid = Umu._grid; GridBase *grid = Umu._grid;
int lsites = grid->lSites(); uint64_t lsites = grid->lSites();
std::vector<sobj> scalardata(lsites); std::vector<sobj> scalardata(lsites);
std::vector<fobj> iodata(lsites); // Munge, checksum, byte order in here std::vector<fobj> iodata(lsites); // Munge, checksum, byte order in here
@ -544,7 +546,7 @@ class BinaryIO {
GridStopWatch timer; GridStopWatch timer;
timer.Start(); timer.Start();
parallel_for(int x=0;x<lsites;x++) munge(iodata[x], scalardata[x]); parallel_for(uint64_t x=0;x<lsites;x++) munge(iodata[x], scalardata[x]);
vectorizeFromLexOrdArray(scalardata,Umu); vectorizeFromLexOrdArray(scalardata,Umu);
grid->Barrier(); grid->Barrier();
@ -560,7 +562,7 @@ class BinaryIO {
static inline void writeLatticeObject(Lattice<vobj> &Umu, static inline void writeLatticeObject(Lattice<vobj> &Umu,
std::string file, std::string file,
munger munge, munger munge,
Integer offset, uint64_t offset,
const std::string &format, const std::string &format,
uint32_t &nersc_csum, uint32_t &nersc_csum,
uint32_t &scidac_csuma, uint32_t &scidac_csuma,
@ -569,7 +571,7 @@ class BinaryIO {
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
typedef typename vobj::Realified::scalar_type word; word w=0; typedef typename vobj::Realified::scalar_type word; word w=0;
GridBase *grid = Umu._grid; GridBase *grid = Umu._grid;
int lsites = grid->lSites(); uint64_t lsites = grid->lSites();
std::vector<sobj> scalardata(lsites); std::vector<sobj> scalardata(lsites);
std::vector<fobj> iodata(lsites); // Munge, checksum, byte order in here std::vector<fobj> iodata(lsites); // Munge, checksum, byte order in here
@ -580,7 +582,7 @@ class BinaryIO {
GridStopWatch timer; timer.Start(); GridStopWatch timer; timer.Start();
unvectorizeToLexOrdArray(scalardata,Umu); unvectorizeToLexOrdArray(scalardata,Umu);
parallel_for(int x=0;x<lsites;x++) munge(scalardata[x],iodata[x]); parallel_for(uint64_t x=0;x<lsites;x++) munge(scalardata[x],iodata[x]);
grid->Barrier(); grid->Barrier();
timer.Stop(); timer.Stop();
@ -597,7 +599,7 @@ class BinaryIO {
static inline void readRNG(GridSerialRNG &serial, static inline void readRNG(GridSerialRNG &serial,
GridParallelRNG &parallel, GridParallelRNG &parallel,
std::string file, std::string file,
Integer offset, uint64_t offset,
uint32_t &nersc_csum, uint32_t &nersc_csum,
uint32_t &scidac_csuma, uint32_t &scidac_csuma,
uint32_t &scidac_csumb) uint32_t &scidac_csumb)
@ -610,8 +612,8 @@ class BinaryIO {
std::string format = "IEEE32BIG"; std::string format = "IEEE32BIG";
GridBase *grid = parallel._grid; GridBase *grid = parallel._grid;
int gsites = grid->gSites(); uint64_t gsites = grid->gSites();
int lsites = grid->lSites(); uint64_t lsites = grid->lSites();
uint32_t nersc_csum_tmp = 0; uint32_t nersc_csum_tmp = 0;
uint32_t scidac_csuma_tmp = 0; uint32_t scidac_csuma_tmp = 0;
@ -626,7 +628,7 @@ class BinaryIO {
nersc_csum,scidac_csuma,scidac_csumb); nersc_csum,scidac_csuma,scidac_csumb);
timer.Start(); timer.Start();
parallel_for(int lidx=0;lidx<lsites;lidx++){ parallel_for(uint64_t lidx=0;lidx<lsites;lidx++){
std::vector<RngStateType> tmp(RngStateCount); std::vector<RngStateType> tmp(RngStateCount);
std::copy(iodata[lidx].begin(),iodata[lidx].end(),tmp.begin()); std::copy(iodata[lidx].begin(),iodata[lidx].end(),tmp.begin());
parallel.SetState(tmp,lidx); parallel.SetState(tmp,lidx);
@ -659,7 +661,7 @@ class BinaryIO {
static inline void writeRNG(GridSerialRNG &serial, static inline void writeRNG(GridSerialRNG &serial,
GridParallelRNG &parallel, GridParallelRNG &parallel,
std::string file, std::string file,
Integer offset, uint64_t offset,
uint32_t &nersc_csum, uint32_t &nersc_csum,
uint32_t &scidac_csuma, uint32_t &scidac_csuma,
uint32_t &scidac_csumb) uint32_t &scidac_csumb)
@ -670,8 +672,8 @@ class BinaryIO {
typedef std::array<RngStateType,RngStateCount> RNGstate; typedef std::array<RngStateType,RngStateCount> RNGstate;
GridBase *grid = parallel._grid; GridBase *grid = parallel._grid;
int gsites = grid->gSites(); uint64_t gsites = grid->gSites();
int lsites = grid->lSites(); uint64_t lsites = grid->lSites();
uint32_t nersc_csum_tmp; uint32_t nersc_csum_tmp;
uint32_t scidac_csuma_tmp; uint32_t scidac_csuma_tmp;
@ -684,7 +686,7 @@ class BinaryIO {
timer.Start(); timer.Start();
std::vector<RNGstate> iodata(lsites); std::vector<RNGstate> iodata(lsites);
parallel_for(int lidx=0;lidx<lsites;lidx++){ parallel_for(uint64_t lidx=0;lidx<lsites;lidx++){
std::vector<RngStateType> tmp(RngStateCount); std::vector<RngStateType> tmp(RngStateCount);
parallel.GetState(tmp,lidx); parallel.GetState(tmp,lidx);
std::copy(tmp.begin(),tmp.end(),iodata[lidx].begin()); std::copy(tmp.begin(),tmp.end(),iodata[lidx].begin());
View File
@ -337,6 +337,20 @@ class GridLimeWriter : public BinaryIO {
template<class vobj> template<class vobj>
void writeLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name) void writeLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name)
{ {
////////////////////////////////////////////////////////////////////
// NB: FILE and iostream are jointly writing disjoint sequences in the
// the same file through different file handles (integer units).
//
// These are both buffered, so why I think this code is right is as follows.
//
// i) write record header to FILE *File, telegraphing the size; flush
// ii) ftello reads the offset from FILE *File .
// iii) iostream / MPI Open independently seek this offset. Write sequence direct to disk.
// Closes iostream and flushes.
// iv) fseek on FILE * to end of this disjoint section.
// v) Continue writing scidac record.
////////////////////////////////////////////////////////////////////
//////////////////////////////////////////// ////////////////////////////////////////////
// Create record header // Create record header
//////////////////////////////////////////// ////////////////////////////////////////////
@ -350,25 +364,24 @@ class GridLimeWriter : public BinaryIO {
    // std::cout << "W Gsites " <<field._grid->_gsites<<std::endl;
    // std::cout << "W Payload expected " <<PayloadSize<<std::endl;

    fflush(File);

    ///////////////////////////////////////////
    // Write by other means into the binary record
    ///////////////////////////////////////////
    uint64_t offset1 = ftello(File);    // std::cout << " Writing to offset "<<offset1 << std::endl;

    std::string format = getFormatString<vobj>();
    BinarySimpleMunger<sobj,sobj> munge;
    BinaryIO::writeLatticeObject<vobj,sobj>(field, filename, munge, offset1, format,nersc_csum,scidac_csuma,scidac_csumb);

    ///////////////////////////////////////////
    // Wind forward and close the record
    ///////////////////////////////////////////
    fseek(File,0,SEEK_END);
    uint64_t offset2 = ftello(File);    // std::cout << " now at offset "<<offset2 << std::endl;

    assert((offset2-offset1) == PayloadSize);

    err=limeWriterCloseRecord(LimeW);  assert(err>=0);
//////////////////////////////////////// ////////////////////////////////////////
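The relocated comment block and the offset1/offset2 bookkeeping implement the sequence it describes: flush the C stream, note where the payload starts, let an independent handle write the payload, wind the C stream to the end, and verify that exactly PayloadSize bytes were written. A standalone sketch of that two-handle pattern on an ordinary file; the file name is illustrative, and `ftello` is the same POSIX call the Grid code uses:

```cpp
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <fstream>
#include <string>

int main() {
  const char *fn = "bracketed_write.bin";
  const std::string header  = "HEADER";
  const std::string payload = "0123456789";        // the "binary" record body

  FILE *File = std::fopen(fn, "w");
  std::fwrite(header.data(), 1, header.size(), File);
  std::fflush(File);                               // i)  header is on disk

  uint64_t offset1 = ftello(File);                 // ii) where the payload starts

  {                                                // iii) independent handle writes the payload
    std::ofstream out(fn, std::ios::in | std::ios::out | std::ios::binary);
    out.seekp(offset1);
    out.write(payload.data(), payload.size());
  }                                                // closed and flushed here

  std::fseek(File, 0, SEEK_END);                   // iv) wind the FILE* past the payload
  uint64_t offset2 = ftello(File);
  assert(offset2 - offset1 == payload.size());     // v)  sizes must agree

  std::fclose(File);
  return 0;
}
```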
@ -568,7 +581,6 @@ class IldgWriter : public ScidacWriter {
writeLimeIldgLFN(header.ildg_lfn); // rec writeLimeIldgLFN(header.ildg_lfn); // rec
writeLimeLatticeBinaryObject(Umu,std::string(ILDG_BINARY_DATA)); // Closes message with checksum writeLimeLatticeBinaryObject(Umu,std::string(ILDG_BINARY_DATA)); // Closes message with checksum
// limeDestroyWriter(LimeW); // limeDestroyWriter(LimeW);
fclose(File);
} }
}; };
View File
@ -57,7 +57,7 @@ namespace Grid {
// for the header-reader // for the header-reader
static inline int readHeader(std::string file,GridBase *grid, FieldMetaData &field) static inline int readHeader(std::string file,GridBase *grid, FieldMetaData &field)
{ {
int offset=0; uint64_t offset=0;
std::map<std::string,std::string> header; std::map<std::string,std::string> header;
std::string line; std::string line;
@ -139,7 +139,7 @@ namespace Grid {
typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField; typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
GridBase *grid = Umu._grid; GridBase *grid = Umu._grid;
int offset = readHeader(file,Umu._grid,header); uint64_t offset = readHeader(file,Umu._grid,header);
FieldMetaData clone(header); FieldMetaData clone(header);
@ -236,7 +236,7 @@ namespace Grid {
GaugeStatistics(Umu,header); GaugeStatistics(Umu,header);
MachineCharacteristics(header); MachineCharacteristics(header);
int offset; uint64_t offset;
truncate(file); truncate(file);
@ -278,7 +278,7 @@ namespace Grid {
header.plaquette=0.0; header.plaquette=0.0;
MachineCharacteristics(header); MachineCharacteristics(header);
int offset; uint64_t offset;
#ifdef RNG_RANLUX #ifdef RNG_RANLUX
header.floating_point = std::string("UINT64"); header.floating_point = std::string("UINT64");
@ -313,7 +313,7 @@ namespace Grid {
GridBase *grid = parallel._grid; GridBase *grid = parallel._grid;
int offset = readHeader(file,grid,header); uint64_t offset = readHeader(file,grid,header);
FieldMetaData clone(header); FieldMetaData clone(header);
View File
@ -39,6 +39,7 @@ namespace QCD {
static const int Zdir = 2; static const int Zdir = 2;
static const int Tdir = 3; static const int Tdir = 3;
static const int Xp = 0; static const int Xp = 0;
static const int Yp = 1; static const int Yp = 1;
static const int Zp = 2; static const int Zp = 2;
@ -420,15 +421,16 @@ namespace QCD {
////////////////////////////////////////////// //////////////////////////////////////////////
// Fermion <-> propagator assignements // Fermion <-> propagator assignements
////////////////////////////////////////////// //////////////////////////////////////////////
  //template <class Prop, class Ferm>
  template <class Fimpl>
  void FermToProp(typename Fimpl::PropagatorField &p, const typename Fimpl::FermionField &f, const int s, const int c)
  {
    for(int j = 0; j < Ns; ++j)
    {
      auto pjs = peekSpin(p, j, s);
      auto fj  = peekSpin(f, j);

      for(int i = 0; i < Fimpl::Dimension; ++i)
      {
        pokeColour(pjs, peekColour(fj, i), i, c);
      }
@ -436,15 +438,16 @@ namespace QCD {
} }
} }
  //template <class Prop, class Ferm>
  template <class Fimpl>
  void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::PropagatorField &p, const int s, const int c)
  {
    for(int j = 0; j < Ns; ++j)
    {
      auto pjs = peekSpin(p, j, s);
      auto fj  = peekSpin(f, j);

      for(int i = 0; i < Fimpl::Dimension; ++i)
      {
        pokeColour(fj, peekColour(pjs, i, c), i);
      }
@ -503,38 +506,6 @@ namespace QCD {
} //namespace QCD } //namespace QCD
} // Grid } // Grid
/*
<<<<<<< HEAD
#include <Grid/qcd/utils/SpaceTimeGrid.h>
#include <Grid/qcd/spin/Dirac.h>
#include <Grid/qcd/spin/TwoSpinor.h>
#include <Grid/qcd/utils/LinalgUtils.h>
#include <Grid/qcd/utils/CovariantCshift.h>
// Include representations
#include <Grid/qcd/utils/SUn.h>
#include <Grid/qcd/utils/SUnAdjoint.h>
#include <Grid/qcd/utils/SUnTwoIndex.h>
#include <Grid/qcd/representations/hmc_types.h>
// Scalar field
#include <Grid/qcd/utils/ScalarObjs.h>
#include <Grid/qcd/action/Actions.h>
#include <Grid/qcd/smearing/Smearing.h>
#include <Grid/qcd/hmc/integrators/Integrator.h>
#include <Grid/qcd/hmc/integrators/Integrator_algorithm.h>
#include <Grid/qcd/observables/hmc_observable.h>
#include <Grid/qcd/hmc/HMC.h>
//#include <Grid/qcd/modules/mods.h>
=======
>>>>>>> develop
*/
#endif #endif
View File
@ -73,7 +73,7 @@ void CayleyFermion5D<Impl>::DminusDag(const FermionField &psi, FermionField &chi
this->DW(psi,tmp_f,DaggerYes); this->DW(psi,tmp_f,DaggerYes);
  for(int s=0;s<Ls;s++){
    axpby_ssp(chi,Coeff_t(1.0),psi,conjugate(-cs[s]),tmp_f,s,s);// chi = (1-c[s] D_W) psi
  }
}
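A sketch of the reasoning behind this one-argument fix: `DW(psi,tmp_f,DaggerYes)` already supplies $D_W^\dagger\psi$, so forming the dagger of the per-slice operator requires the conjugated coefficient,

```latex
\left(\,1 - c_s\, D_W\,\right)^{\dagger} \;=\; 1 - c_s^{*}\, D_W^{\dagger},
```

which is what `conjugate(-cs[s])` now provides; for real coefficients the change is a no-op.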
View File
@ -469,7 +469,7 @@ void CayleyFermion5D<Impl>::MooeeInternalAsm(const FermionField &psi, FermionFie
} }
a0 = a0+incr; a0 = a0+incr;
a1 = a1+incr; a1 = a1+incr;
      a2 = a2+sizeof(typename Simd::scalar_type);
}} }}
{ {
int lexa = s1+LLs*site; int lexa = s1+LLs*site;
@ -701,7 +701,7 @@ void CayleyFermion5D<Impl>::MooeeInternalZAsm(const FermionField &psi, FermionFi
} }
a0 = a0+incr; a0 = a0+incr;
a1 = a1+incr; a1 = a1+incr;
      a2 = a2+sizeof(typename Simd::scalar_type);
}} }}
{ {
int lexa = s1+LLs*site; int lexa = s1+LLs*site;
View File
@ -475,7 +475,7 @@ namespace QCD {
} }
a0 = a0 + incr; a0 = a0 + incr;
a1 = a1 + incr; a1 = a1 + incr;
      a2 = a2 + sizeof(typename Simd::scalar_type);
} }
} }
View File
@ -50,11 +50,13 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
//////////////////////////////////////////// ////////////////////////////////////////////
#include <Grid/qcd/action/fermion/WilsonFermion.h> // 4d wilson like #include <Grid/qcd/action/fermion/WilsonFermion.h> // 4d wilson like
#include <Grid/qcd/action/fermion/WilsonTMFermion.h> // 4d wilson like #include <Grid/qcd/action/fermion/WilsonTMFermion.h> // 4d wilson like
#include <Grid/qcd/action/fermion/WilsonCloverFermion.h> // 4d wilson clover fermions
#include <Grid/qcd/action/fermion/WilsonFermion5D.h> // 5d base used by all 5d overlap types #include <Grid/qcd/action/fermion/WilsonFermion5D.h> // 5d base used by all 5d overlap types
//#include <Grid/qcd/action/fermion/CloverFermion.h>
#include <Grid/qcd/action/fermion/ImprovedStaggeredFermion.h> #include <Grid/qcd/action/fermion/ImprovedStaggeredFermion.h>
#include <Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h> #include <Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h>
#include <Grid/qcd/action/fermion/CayleyFermion5D.h> // Cayley types #include <Grid/qcd/action/fermion/CayleyFermion5D.h> // Cayley types
#include <Grid/qcd/action/fermion/DomainWallFermion.h> #include <Grid/qcd/action/fermion/DomainWallFermion.h>
#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h> #include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
@ -104,10 +106,33 @@ typedef WilsonFermion<WilsonTwoIndexSymmetricImplR> WilsonTwoIndexSymmetricFermi
typedef WilsonFermion<WilsonTwoIndexSymmetricImplF> WilsonTwoIndexSymmetricFermionF; typedef WilsonFermion<WilsonTwoIndexSymmetricImplF> WilsonTwoIndexSymmetricFermionF;
typedef WilsonFermion<WilsonTwoIndexSymmetricImplD> WilsonTwoIndexSymmetricFermionD; typedef WilsonFermion<WilsonTwoIndexSymmetricImplD> WilsonTwoIndexSymmetricFermionD;
typedef WilsonFermion<WilsonTwoIndexAntiSymmetricImplR> WilsonTwoIndexAntiSymmetricFermionR;
typedef WilsonFermion<WilsonTwoIndexAntiSymmetricImplF> WilsonTwoIndexAntiSymmetricFermionF;
typedef WilsonFermion<WilsonTwoIndexAntiSymmetricImplD> WilsonTwoIndexAntiSymmetricFermionD;
// Twisted mass fermion
typedef WilsonTMFermion<WilsonImplR> WilsonTMFermionR; typedef WilsonTMFermion<WilsonImplR> WilsonTMFermionR;
typedef WilsonTMFermion<WilsonImplF> WilsonTMFermionF; typedef WilsonTMFermion<WilsonImplF> WilsonTMFermionF;
typedef WilsonTMFermion<WilsonImplD> WilsonTMFermionD; typedef WilsonTMFermion<WilsonImplD> WilsonTMFermionD;
// Clover fermions
typedef WilsonCloverFermion<WilsonImplR> WilsonCloverFermionR;
typedef WilsonCloverFermion<WilsonImplF> WilsonCloverFermionF;
typedef WilsonCloverFermion<WilsonImplD> WilsonCloverFermionD;
typedef WilsonCloverFermion<WilsonAdjImplR> WilsonCloverAdjFermionR;
typedef WilsonCloverFermion<WilsonAdjImplF> WilsonCloverAdjFermionF;
typedef WilsonCloverFermion<WilsonAdjImplD> WilsonCloverAdjFermionD;
typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplR> WilsonCloverTwoIndexSymmetricFermionR;
typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplF> WilsonCloverTwoIndexSymmetricFermionF;
typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplD> WilsonCloverTwoIndexSymmetricFermionD;
typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplR> WilsonCloverTwoIndexAntiSymmetricFermionR;
typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplF> WilsonCloverTwoIndexAntiSymmetricFermionF;
typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiSymmetricFermionD;
// Domain Wall fermions
typedef DomainWallFermion<WilsonImplR> DomainWallFermionR; typedef DomainWallFermion<WilsonImplR> DomainWallFermionR;
typedef DomainWallFermion<WilsonImplF> DomainWallFermionF; typedef DomainWallFermion<WilsonImplF> DomainWallFermionF;
typedef DomainWallFermion<WilsonImplD> DomainWallFermionD; typedef DomainWallFermion<WilsonImplD> DomainWallFermionD;
View File
@ -70,7 +70,9 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
#define TwoIndexFermOpTemplateInstantiate(A) \
  template class A<WilsonTwoIndexSymmetricImplF>; \
  template class A<WilsonTwoIndexSymmetricImplD>; \
  template class A<WilsonTwoIndexAntiSymmetricImplF>; \
  template class A<WilsonTwoIndexAntiSymmetricImplD>;
#define FermOp5dVecTemplateInstantiate(A) \ #define FermOp5dVecTemplateInstantiate(A) \
template class A<DomainWallVec5dImplF>; \ template class A<DomainWallVec5dImplF>; \
View File
@ -125,9 +125,9 @@ namespace Grid {
PropagatorField &q_out, PropagatorField &q_out,
Current curr_type, Current curr_type,
                                      unsigned int mu,
                                      unsigned int tmin,
                                      unsigned int tmax,
                                      ComplexField &lattice_cmplx)=0;
}; };
} }
View File
@ -164,6 +164,7 @@ namespace QCD {
public: public:
static const int Dimension = Representation::Dimension; static const int Dimension = Representation::Dimension;
static const bool isFundamental = Representation::isFundamental;
static const bool LsVectorised=false; static const bool LsVectorised=false;
static const int Nhcs = Options::Nhcs; static const int Nhcs = Options::Nhcs;
@ -261,8 +262,22 @@ namespace QCD {
GaugeLinkField link(mat._grid); GaugeLinkField link(mat._grid);
link = TraceIndex<SpinIndex>(outerProduct(Btilde,A)); link = TraceIndex<SpinIndex>(outerProduct(Btilde,A));
PokeIndex<LorentzIndex>(mat,link,mu); PokeIndex<LorentzIndex>(mat,link,mu);
} }
inline void outerProductImpl(PropagatorField &mat, const FermionField &B, const FermionField &A){
mat = outerProduct(B,A);
}
inline void TraceSpinImpl(GaugeLinkField &mat, PropagatorField&P) {
mat = TraceIndex<SpinIndex>(P);
}
inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds){
for (int mu = 0; mu < Nd; mu++)
mat[mu] = PeekIndex<LorentzIndex>(Uds, mu);
}
inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu){ inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu){
int Ls=Btilde._grid->_fdimensions[0]; int Ls=Btilde._grid->_fdimensions[0];
@ -284,27 +299,28 @@ namespace QCD {
//////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////
// Single flavour four spinors with colour index, 5d redblack // Single flavour four spinors with colour index, 5d redblack
//////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////
template<class S,class Representation = FundamentalRepresentation, class Options=CoeffReal>
class DomainWallVec5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Representation::Dimension> > {
  public:

  typedef PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension> > Gimpl;
  INHERIT_GIMPL_TYPES(Gimpl);

  static const int  Dimension     = Representation::Dimension;
  static const bool isFundamental = Representation::isFundamental;
  static const bool LsVectorised  = true;
  static const int  Nhcs          = Options::Nhcs;

  typedef typename Options::_Coeff_t Coeff_t;
  typedef typename Options::template PrecisionMapper<Simd>::LowerPrecVector SimdL;

  template <typename vtype> using iImplSpinor            = iScalar<iVector<iVector<vtype, Dimension>, Ns> >;
  template <typename vtype> using iImplPropagator        = iScalar<iMatrix<iMatrix<vtype, Dimension>, Ns> >;
  template <typename vtype> using iImplHalfSpinor        = iScalar<iVector<iVector<vtype, Dimension>, Nhs> >;
  template <typename vtype> using iImplHalfCommSpinor    = iScalar<iVector<iVector<vtype, Dimension>, Nhcs> >;
  template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>;
  template <typename vtype> using iImplGaugeField        = iVector<iScalar<iMatrix<vtype, Dimension> >, Nd>;
  template <typename vtype> using iImplGaugeLink         = iScalar<iScalar<iMatrix<vtype, Dimension> > >;
typedef iImplSpinor<Simd> SiteSpinor; typedef iImplSpinor<Simd> SiteSpinor;
typedef iImplPropagator<Simd> SitePropagator; typedef iImplPropagator<Simd> SitePropagator;
@ -340,8 +356,8 @@ class DomainWallVec5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepres
const SiteHalfSpinor &chi, int mu, StencilEntry *SE, const SiteHalfSpinor &chi, int mu, StencilEntry *SE,
StencilImpl &St) { StencilImpl &St) {
SiteGaugeLink UU; SiteGaugeLink UU;
    for (int i = 0; i < Dimension; i++) {
      for (int j = 0; j < Dimension; j++) {
vsplat(UU()()(i, j), U(mu)()(i, j)); vsplat(UU()()(i, j), U(mu)()(i, j));
} }
} }
@ -353,8 +369,8 @@ class DomainWallVec5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepres
const SitePropagator &chi, const SitePropagator &chi,
int mu) { int mu) {
SiteGaugeLink UU; SiteGaugeLink UU;
    for (int i = 0; i < Dimension; i++) {
      for (int j = 0; j < Dimension; j++) {
vsplat(UU()()(i, j), U(mu)()(i, j)); vsplat(UU()()(i, j), U(mu)()(i, j));
} }
} }
@ -393,6 +409,19 @@ class DomainWallVec5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepres
assert(0); assert(0);
} }
inline void outerProductImpl(PropagatorField &mat, const FermionField &Btilde, const FermionField &A){
assert(0);
}
inline void TraceSpinImpl(GaugeLinkField &mat, PropagatorField&P) {
assert(0);
}
inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds){
assert(0);
}
inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde, int mu) { inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde, int mu) {
assert(0); assert(0);
@ -445,25 +474,26 @@ class DomainWallVec5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepres
//////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////
// Flavour doubled spinors; is Gparity the only? what about C*? // Flavour doubled spinors; is Gparity the only? what about C*?
//////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////
template <class S, class Representation = FundamentalRepresentation, class Options=CoeffReal>
class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Representation::Dimension> > {
 public:

 static const int  Dimension     = Representation::Dimension;
 static const bool isFundamental = Representation::isFundamental;
 static const int  Nhcs          = Options::Nhcs;
 static const bool LsVectorised  = false;

 typedef ConjugateGaugeImpl< GaugeImplTypes<S,Dimension> > Gimpl;
 INHERIT_GIMPL_TYPES(Gimpl);

 typedef typename Options::_Coeff_t Coeff_t;
 typedef typename Options::template PrecisionMapper<Simd>::LowerPrecVector SimdL;

 template <typename vtype> using iImplSpinor            = iVector<iVector<iVector<vtype, Dimension>, Ns>, Ngp>;
 template <typename vtype> using iImplPropagator        = iVector<iMatrix<iMatrix<vtype, Dimension>, Ns>, Ngp>;
 template <typename vtype> using iImplHalfSpinor        = iVector<iVector<iVector<vtype, Dimension>, Nhs>, Ngp>;
 template <typename vtype> using iImplHalfCommSpinor    = iVector<iVector<iVector<vtype, Dimension>, Nhcs>, Ngp>;
 template <typename vtype> using iImplDoubledGaugeField = iVector<iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>, Ngp>;
typedef iImplSpinor<Simd> SiteSpinor; typedef iImplSpinor<Simd> SiteSpinor;
typedef iImplPropagator<Simd> SitePropagator; typedef iImplPropagator<Simd> SitePropagator;
@ -636,6 +666,25 @@ class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresent
return; return;
} }
inline void outerProductImpl(PropagatorField &mat, const FermionField &Btilde, const FermionField &A){
//mat = outerProduct(Btilde, A);
assert(0);
}
inline void TraceSpinImpl(GaugeLinkField &mat, PropagatorField&P) {
assert(0);
/*
auto tmp = TraceIndex<SpinIndex>(P);
parallel_for(auto ss = tmp.begin(); ss < tmp.end(); ss++) {
mat[ss]() = tmp[ss](0, 0) + conjugate(tmp[ss](1, 1));
}
*/
}
inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds){
assert(0);
}
inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde, int mu) { inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde, int mu) {
int Ls = Btilde._grid->_fdimensions[0]; int Ls = Btilde._grid->_fdimensions[0];
@ -665,6 +714,7 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation:
typedef RealD _Coeff_t ; typedef RealD _Coeff_t ;
static const int Dimension = Representation::Dimension; static const int Dimension = Representation::Dimension;
static const bool isFundamental = Representation::isFundamental;
static const bool LsVectorised=false; static const bool LsVectorised=false;
typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl; typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl;
@ -776,8 +826,8 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation:
GaugeLinkField link(mat._grid); GaugeLinkField link(mat._grid);
link = TraceIndex<SpinIndex>(outerProduct(Btilde,A)); link = TraceIndex<SpinIndex>(outerProduct(Btilde,A));
PokeIndex<LorentzIndex>(mat,link,mu); PokeIndex<LorentzIndex>(mat,link,mu);
} }
inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu){ inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu){
assert (0); assert (0);
// Must never hit // Must never hit
@ -793,6 +843,7 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation:
public: public:
static const int Dimension = Representation::Dimension; static const int Dimension = Representation::Dimension;
static const bool isFundamental = Representation::isFundamental;
static const bool LsVectorised=true; static const bool LsVectorised=true;
typedef RealD Coeff_t ; typedef RealD Coeff_t ;
typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl; typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl;
@ -983,29 +1034,33 @@ typedef WilsonImpl<vComplex, TwoIndexSymmetricRepresentation, CoeffReal > Wilso
typedef WilsonImpl<vComplexF, TwoIndexSymmetricRepresentation, CoeffReal > WilsonTwoIndexSymmetricImplF; // Float typedef WilsonImpl<vComplexF, TwoIndexSymmetricRepresentation, CoeffReal > WilsonTwoIndexSymmetricImplF; // Float
typedef WilsonImpl<vComplexD, TwoIndexSymmetricRepresentation, CoeffReal > WilsonTwoIndexSymmetricImplD; // Double typedef WilsonImpl<vComplexD, TwoIndexSymmetricRepresentation, CoeffReal > WilsonTwoIndexSymmetricImplD; // Double
typedef DomainWallVec5dImpl<vComplex ,Nc, CoeffReal> DomainWallVec5dImplR; // Real.. whichever prec typedef WilsonImpl<vComplex, TwoIndexAntiSymmetricRepresentation, CoeffReal > WilsonTwoIndexAntiSymmetricImplR; // Real.. whichever prec
typedef DomainWallVec5dImpl<vComplexF,Nc, CoeffReal> DomainWallVec5dImplF; // Float typedef WilsonImpl<vComplexF, TwoIndexAntiSymmetricRepresentation, CoeffReal > WilsonTwoIndexAntiSymmetricImplF; // Float
typedef DomainWallVec5dImpl<vComplexD,Nc, CoeffReal> DomainWallVec5dImplD; // Double typedef WilsonImpl<vComplexD, TwoIndexAntiSymmetricRepresentation, CoeffReal > WilsonTwoIndexAntiSymmetricImplD; // Double
typedef DomainWallVec5dImpl<vComplex ,FundamentalRepresentation, CoeffReal> DomainWallVec5dImplR; // Real.. whichever prec
typedef DomainWallVec5dImpl<vComplexF,FundamentalRepresentation, CoeffReal> DomainWallVec5dImplF; // Float
typedef DomainWallVec5dImpl<vComplexD,FundamentalRepresentation, CoeffReal> DomainWallVec5dImplD; // Double
typedef DomainWallVec5dImpl<vComplex ,Nc, CoeffRealHalfComms> DomainWallVec5dImplRL; // Real.. whichever prec typedef DomainWallVec5dImpl<vComplex ,FundamentalRepresentation, CoeffRealHalfComms> DomainWallVec5dImplRL; // Real.. whichever prec
typedef DomainWallVec5dImpl<vComplexF,Nc, CoeffRealHalfComms> DomainWallVec5dImplFH; // Float typedef DomainWallVec5dImpl<vComplexF,FundamentalRepresentation, CoeffRealHalfComms> DomainWallVec5dImplFH; // Float
typedef DomainWallVec5dImpl<vComplexD,Nc, CoeffRealHalfComms> DomainWallVec5dImplDF; // Double typedef DomainWallVec5dImpl<vComplexD,FundamentalRepresentation, CoeffRealHalfComms> DomainWallVec5dImplDF; // Double
typedef DomainWallVec5dImpl<vComplex ,Nc,CoeffComplex> ZDomainWallVec5dImplR; // Real.. whichever prec typedef DomainWallVec5dImpl<vComplex ,FundamentalRepresentation,CoeffComplex> ZDomainWallVec5dImplR; // Real.. whichever prec
typedef DomainWallVec5dImpl<vComplexF,Nc,CoeffComplex> ZDomainWallVec5dImplF; // Float typedef DomainWallVec5dImpl<vComplexF,FundamentalRepresentation,CoeffComplex> ZDomainWallVec5dImplF; // Float
typedef DomainWallVec5dImpl<vComplexD,Nc,CoeffComplex> ZDomainWallVec5dImplD; // Double typedef DomainWallVec5dImpl<vComplexD,FundamentalRepresentation,CoeffComplex> ZDomainWallVec5dImplD; // Double
typedef DomainWallVec5dImpl<vComplex ,Nc,CoeffComplexHalfComms> ZDomainWallVec5dImplRL; // Real.. whichever prec typedef DomainWallVec5dImpl<vComplex ,FundamentalRepresentation,CoeffComplexHalfComms> ZDomainWallVec5dImplRL; // Real.. whichever prec
typedef DomainWallVec5dImpl<vComplexF,Nc,CoeffComplexHalfComms> ZDomainWallVec5dImplFH; // Float typedef DomainWallVec5dImpl<vComplexF,FundamentalRepresentation,CoeffComplexHalfComms> ZDomainWallVec5dImplFH; // Float
typedef DomainWallVec5dImpl<vComplexD,Nc,CoeffComplexHalfComms> ZDomainWallVec5dImplDF; // Double typedef DomainWallVec5dImpl<vComplexD,FundamentalRepresentation,CoeffComplexHalfComms> ZDomainWallVec5dImplDF; // Double
typedef GparityWilsonImpl<vComplex , Nc,CoeffReal> GparityWilsonImplR; // Real.. whichever prec typedef GparityWilsonImpl<vComplex , FundamentalRepresentation,CoeffReal> GparityWilsonImplR; // Real.. whichever prec
typedef GparityWilsonImpl<vComplexF, Nc,CoeffReal> GparityWilsonImplF; // Float typedef GparityWilsonImpl<vComplexF, FundamentalRepresentation,CoeffReal> GparityWilsonImplF; // Float
typedef GparityWilsonImpl<vComplexD, Nc,CoeffReal> GparityWilsonImplD; // Double typedef GparityWilsonImpl<vComplexD, FundamentalRepresentation,CoeffReal> GparityWilsonImplD; // Double
typedef GparityWilsonImpl<vComplex , Nc,CoeffRealHalfComms> GparityWilsonImplRL; // Real.. whichever prec typedef GparityWilsonImpl<vComplex , FundamentalRepresentation,CoeffRealHalfComms> GparityWilsonImplRL; // Real.. whichever prec
typedef GparityWilsonImpl<vComplexF, Nc,CoeffRealHalfComms> GparityWilsonImplFH; // Float typedef GparityWilsonImpl<vComplexF, FundamentalRepresentation,CoeffRealHalfComms> GparityWilsonImplFH; // Float
typedef GparityWilsonImpl<vComplexD, Nc,CoeffRealHalfComms> GparityWilsonImplDF; // Double typedef GparityWilsonImpl<vComplexD, FundamentalRepresentation,CoeffRealHalfComms> GparityWilsonImplDF; // Double
typedef StaggeredImpl<vComplex, FundamentalRepresentation > StaggeredImplR; // Real.. whichever prec typedef StaggeredImpl<vComplex, FundamentalRepresentation > StaggeredImplR; // Real.. whichever prec
typedef StaggeredImpl<vComplexF, FundamentalRepresentation > StaggeredImplF; // Float typedef StaggeredImpl<vComplexF, FundamentalRepresentation > StaggeredImplF; // Float

View File

@ -407,17 +407,19 @@ void ImprovedStaggeredFermion<Impl>::ContractConservedCurrent(PropagatorField &q
} }
template <class Impl> template <class Impl>
void ImprovedStaggeredFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in, void ImprovedStaggeredFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
PropagatorField &q_out, PropagatorField &q_out,
Current curr_type, Current curr_type,
unsigned int mu, unsigned int mu,
std::vector<Real> mom, unsigned int tmin,
unsigned int tmin, unsigned int tmax,
unsigned int tmax) ComplexField &lattice_cmplx)
{ {
assert(0); assert(0);
} }
FermOpStaggeredTemplateInstantiate(ImprovedStaggeredFermion); FermOpStaggeredTemplateInstantiate(ImprovedStaggeredFermion);
//AdjointFermOpTemplateInstantiate(ImprovedStaggeredFermion); //AdjointFermOpTemplateInstantiate(ImprovedStaggeredFermion);

View File

@ -166,13 +166,13 @@ class ImprovedStaggeredFermion : public StaggeredKernels<Impl>, public ImprovedS
PropagatorField &q_out, PropagatorField &q_out,
Current curr_type, Current curr_type,
unsigned int mu); unsigned int mu);
void SeqConservedCurrent(PropagatorField &q_in, void SeqConservedCurrent(PropagatorField &q_in,
PropagatorField &q_out, PropagatorField &q_out,
Current curr_type, Current curr_type,
unsigned int mu, unsigned int mu,
std::vector<Real> mom, unsigned int tmin,
unsigned int tmin, unsigned int tmax,
unsigned int tmax); ComplexField &lattice_cmplx);
}; };
typedef ImprovedStaggeredFermion<StaggeredImplF> ImprovedStaggeredFermionF; typedef ImprovedStaggeredFermion<StaggeredImplF> ImprovedStaggeredFermionF;

View File

@ -419,15 +419,16 @@ void ImprovedStaggeredFermion5D<Impl>::ContractConservedCurrent(PropagatorField
} }
template <class Impl> template <class Impl>
void ImprovedStaggeredFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in, void ImprovedStaggeredFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
PropagatorField &q_out, PropagatorField &q_out,
Current curr_type, Current curr_type,
unsigned int mu, unsigned int mu,
std::vector<Real> mom, unsigned int tmin,
unsigned int tmin, unsigned int tmax,
unsigned int tmax) ComplexField &lattice_cmplx)
{ {
assert(0); assert(0);
} }
FermOpStaggeredTemplateInstantiate(ImprovedStaggeredFermion5D); FermOpStaggeredTemplateInstantiate(ImprovedStaggeredFermion5D);

View File

@ -178,13 +178,13 @@ namespace QCD {
PropagatorField &q_out, PropagatorField &q_out,
Current curr_type, Current curr_type,
unsigned int mu); unsigned int mu);
void SeqConservedCurrent(PropagatorField &q_in, void SeqConservedCurrent(PropagatorField &q_in,
PropagatorField &q_out, PropagatorField &q_out,
Current curr_type, Current curr_type,
unsigned int mu, unsigned int mu,
std::vector<Real> mom, unsigned int tmin,
unsigned int tmin, unsigned int tmax,
unsigned int tmax); ComplexField &lattice_cmplx);
}; };
}} }}

View File

@ -853,7 +853,7 @@ namespace QCD {
a0 = a0 + incr; a0 = a0 + incr;
a1 = a1 + incr; a1 = a1 + incr;
a2 = a2 + sizeof(Simd::scalar_type); a2 = a2 + sizeof(typename Simd::scalar_type);
} }
} }

View File

@ -0,0 +1,243 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonCloverFermion.cc
Copyright (C) 2017
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/Eigen/Dense>
#include <Grid/qcd/spin/Dirac.h>
namespace Grid
{
namespace QCD
{
// *NOT* EO
template <class Impl>
RealD WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out)
{
FermionField temp(out._grid);
// Wilson term
out.checkerboard = in.checkerboard;
this->Dhop(in, out, DaggerNo);
// Clover term
Mooee(in, temp);
out += temp;
return norm2(out);
}
template <class Impl>
RealD WilsonCloverFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
{
FermionField temp(out._grid);
// Wilson term
out.checkerboard = in.checkerboard;
this->Dhop(in, out, DaggerYes);
// Clover term
MooeeDag(in, temp);
out += temp;
return norm2(out);
}
template <class Impl>
void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
{
WilsonFermion<Impl>::ImportGauge(_Umu);
GridBase *grid = _Umu._grid;
typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);
// Compute the field strength terms mu>nu
WilsonLoops<Impl>::FieldStrength(Bx, _Umu, Zdir, Ydir);
WilsonLoops<Impl>::FieldStrength(By, _Umu, Zdir, Xdir);
WilsonLoops<Impl>::FieldStrength(Bz, _Umu, Ydir, Xdir);
WilsonLoops<Impl>::FieldStrength(Ex, _Umu, Tdir, Xdir);
WilsonLoops<Impl>::FieldStrength(Ey, _Umu, Tdir, Ydir);
WilsonLoops<Impl>::FieldStrength(Ez, _Umu, Tdir, Zdir);
// Compute the Clover Operator acting on Colour and Spin
// multiply here by the clover coefficients for the anisotropy
CloverTerm = fillCloverYZ(Bx) * csw_r;
CloverTerm += fillCloverXZ(By) * csw_r;
CloverTerm += fillCloverXY(Bz) * csw_r;
CloverTerm += fillCloverXT(Ex) * csw_t;
CloverTerm += fillCloverYT(Ey) * csw_t;
CloverTerm += fillCloverZT(Ez) * csw_t;
CloverTerm += diag_mass;
int lvol = _Umu._grid->lSites();
int DimRep = Impl::Dimension;
Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
std::vector<int> lcoor;
typename SiteCloverType::scalar_object Qx = zero, Qxinv = zero;
for (int site = 0; site < lvol; site++)
{
grid->LocalIndexToLocalCoor(site, lcoor);
EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
peekLocalSite(Qx, CloverTerm, lcoor);
Qxinv = zero;
//if (csw!=0){
for (int j = 0; j < Ns; j++)
for (int k = 0; k < Ns; k++)
for (int a = 0; a < DimRep; a++)
for (int b = 0; b < DimRep; b++)
EigenCloverOp(a + j * DimRep, b + k * DimRep) = Qx()(j, k)(a, b);
// if (site==0) std::cout << "site =" << site << "\n" << EigenCloverOp << std::endl;
EigenInvCloverOp = EigenCloverOp.inverse();
//std::cout << EigenInvCloverOp << std::endl;
for (int j = 0; j < Ns; j++)
for (int k = 0; k < Ns; k++)
for (int a = 0; a < DimRep; a++)
for (int b = 0; b < DimRep; b++)
Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep);
// if (site==0) std::cout << "site =" << site << "\n" << EigenInvCloverOp << std::endl;
// }
pokeLocalSite(Qxinv, CloverTermInv, lcoor);
}
// Separate the even and odd parts
pickCheckerboard(Even, CloverTermEven, CloverTerm);
pickCheckerboard(Odd, CloverTermOdd, CloverTerm);
pickCheckerboard(Even, CloverTermDagEven, adj(CloverTerm));
pickCheckerboard(Odd, CloverTermDagOdd, adj(CloverTerm));
pickCheckerboard(Even, CloverTermInvEven, CloverTermInv);
pickCheckerboard(Odd, CloverTermInvOdd, CloverTermInv);
pickCheckerboard(Even, CloverTermInvDagEven, adj(CloverTermInv));
pickCheckerboard(Odd, CloverTermInvDagOdd, adj(CloverTermInv));
}
template <class Impl>
void WilsonCloverFermion<Impl>::Mooee(const FermionField &in, FermionField &out)
{
this->MooeeInternal(in, out, DaggerNo, InverseNo);
}
template <class Impl>
void WilsonCloverFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out)
{
this->MooeeInternal(in, out, DaggerYes, InverseNo);
}
template <class Impl>
void WilsonCloverFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out)
{
this->MooeeInternal(in, out, DaggerNo, InverseYes);
}
template <class Impl>
void WilsonCloverFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
{
this->MooeeInternal(in, out, DaggerYes, InverseYes);
}
template <class Impl>
void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv)
{
out.checkerboard = in.checkerboard;
CloverFieldType *Clover;
assert(in.checkerboard == Odd || in.checkerboard == Even);
if (dag)
{
if (in._grid->_isCheckerBoarded)
{
if (in.checkerboard == Odd)
{
Clover = (inv) ? &CloverTermInvDagOdd : &CloverTermDagOdd;
}
else
{
Clover = (inv) ? &CloverTermInvDagEven : &CloverTermDagEven;
}
out = *Clover * in;
}
else
{
Clover = (inv) ? &CloverTermInv : &CloverTerm;
out = adj(*Clover) * in;
}
}
else
{
if (in._grid->_isCheckerBoarded)
{
if (in.checkerboard == Odd)
{
// std::cout << "Calling clover term Odd" << std::endl;
Clover = (inv) ? &CloverTermInvOdd : &CloverTermOdd;
}
else
{
// std::cout << "Calling clover term Even" << std::endl;
Clover = (inv) ? &CloverTermInvEven : &CloverTermEven;
}
out = *Clover * in;
// std::cout << GridLogMessage << "*Clover.checkerboard " << (*Clover).checkerboard << std::endl;
}
else
{
Clover = (inv) ? &CloverTermInv : &CloverTerm;
out = *Clover * in;
}
}
} // MooeeInternal
// Derivative parts
template <class Impl>
void WilsonCloverFermion<Impl>::MooDeriv(GaugeField &mat, const FermionField &X, const FermionField &Y, int dag)
{
assert(0);
}
// Derivative parts
template <class Impl>
void WilsonCloverFermion<Impl>::MeeDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
{
assert(0); // not implemented yet
}
FermOpTemplateInstantiate(WilsonCloverFermion);
AdjointFermOpTemplateInstantiate(WilsonCloverFermion);
TwoIndexFermOpTemplateInstantiate(WilsonCloverFermion);
//GparityFermOpTemplateInstantiate(WilsonCloverFermion);
}
}

View File

@ -0,0 +1,366 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonCloverFermion.h
Copyright (C) 2017
Author: Guido Cossu <guido.cossu@ed.ac.uk>
Author: David Preti <>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_QCD_WILSON_CLOVER_FERMION_H
#define GRID_QCD_WILSON_CLOVER_FERMION_H
#include <Grid/Grid.h>
namespace Grid
{
namespace QCD
{
///////////////////////////////////////////////////////////////////
// Wilson Clover
//
// Operator ( with anisotropy coefficients):
//
// Q = 1 + (Nd-1)/xi_0 + m
// + W_t + (nu/xi_0) * W_s
// - 1/2*[ csw_t * sum_s (sigma_ts F_ts) + (csw_s/xi_0) * sum_ss (sigma_ss F_ss) ]
//
// s spatial, t temporal directions.
// where W_t and W_s are the temporal and spatial components of the
// Wilson Dirac operator
//
// csw_r = csw_t to recover the isotropic version
//////////////////////////////////////////////////////////////////
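// (Coefficient bookkeeping in the implementation below: the diagonal piece is
//  stored as diag_mass = 4 + m in the isotropic case, or m + 1 + (Nd-1)*nu/xi_0
//  with anisotropy; the input csw_r and csw_t are halved, and csw_r is further
//  divided by xi_0 in the anisotropic case, before entering the clover term.)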
template <class Impl>
class WilsonCloverFermion : public WilsonFermion<Impl>
{
public:
// Types definitions
INHERIT_IMPL_TYPES(Impl);
template <typename vtype>
using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
typedef iImplClover<Simd> SiteCloverType;
typedef Lattice<SiteCloverType> CloverFieldType;
public:
typedef WilsonFermion<Impl> WilsonBase;
virtual void Instantiatable(void){};
// Constructors
WilsonCloverFermion(GaugeField &_Umu, GridCartesian &Fgrid,
GridRedBlackCartesian &Hgrid,
const RealD _mass,
const RealD _csw_r = 0.0,
const RealD _csw_t = 0.0,
const WilsonAnisotropyCoefficients &clover_anisotropy = WilsonAnisotropyCoefficients(),
const ImplParams &impl_p = ImplParams()) : WilsonFermion<Impl>(_Umu,
Fgrid,
Hgrid,
_mass, impl_p, clover_anisotropy),
CloverTerm(&Fgrid),
CloverTermInv(&Fgrid),
CloverTermEven(&Hgrid),
CloverTermOdd(&Hgrid),
CloverTermInvEven(&Hgrid),
CloverTermInvOdd(&Hgrid),
CloverTermDagEven(&Hgrid),
CloverTermDagOdd(&Hgrid),
CloverTermInvDagEven(&Hgrid),
CloverTermInvDagOdd(&Hgrid)
{
assert(Nd == 4); // require 4 dimensions
if (clover_anisotropy.isAnisotropic)
{
csw_r = _csw_r * 0.5 / clover_anisotropy.xi_0;
diag_mass = _mass + 1.0 + (Nd - 1) * (clover_anisotropy.nu / clover_anisotropy.xi_0);
}
else
{
csw_r = _csw_r * 0.5;
diag_mass = 4.0 + _mass;
}
csw_t = _csw_t * 0.5;
if (csw_r == 0)
std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_r = 0" << std::endl;
if (csw_t == 0)
std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_t = 0" << std::endl;
ImportGauge(_Umu);
}
virtual RealD M(const FermionField &in, FermionField &out);
virtual RealD Mdag(const FermionField &in, FermionField &out);
virtual void Mooee(const FermionField &in, FermionField &out);
virtual void MooeeDag(const FermionField &in, FermionField &out);
virtual void MooeeInv(const FermionField &in, FermionField &out);
virtual void MooeeInvDag(const FermionField &in, FermionField &out);
virtual void MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv);
//virtual void MDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
virtual void MooDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
virtual void MeeDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
void ImportGauge(const GaugeField &_Umu);
// Derivative parts unpreconditioned pseudofermions
void MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag)
{
conformable(X._grid, Y._grid);
conformable(X._grid, force._grid);
GaugeLinkField force_mu(force._grid), lambda(force._grid);
GaugeField clover_force(force._grid);
PropagatorField Lambda(force._grid);
// Guido: Here we are hitting some performance issues:
// need to extract the components of the DoubledGaugeField
// for each call
// Possible solution
// Create a vector object to store them? (cons: wasting space)
std::vector<GaugeLinkField> U(Nd, this->Umu._grid);
Impl::extractLinkField(U, this->Umu);
force = zero;
// Derivative of the Wilson hopping term
this->DhopDeriv(force, X, Y, dag);
///////////////////////////////////////////////////////////
// Clover term derivative
///////////////////////////////////////////////////////////
Impl::outerProductImpl(Lambda, X, Y);
//std::cout << "Lambda:" << Lambda << std::endl;
Gamma::Algebra sigma[] = {
Gamma::Algebra::SigmaXY,
Gamma::Algebra::SigmaXZ,
Gamma::Algebra::SigmaXT,
Gamma::Algebra::MinusSigmaXY,
Gamma::Algebra::SigmaYZ,
Gamma::Algebra::SigmaYT,
Gamma::Algebra::MinusSigmaXZ,
Gamma::Algebra::MinusSigmaYZ,
Gamma::Algebra::SigmaZT,
Gamma::Algebra::MinusSigmaXT,
Gamma::Algebra::MinusSigmaYT,
Gamma::Algebra::MinusSigmaZT};
/*
sigma_{\mu \nu}=
| 0 sigma[0] sigma[1] sigma[2] |
| sigma[3] 0 sigma[4] sigma[5] |
| sigma[6] sigma[7] 0 sigma[8] |
| sigma[9] sigma[10] sigma[11] 0 |
*/
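// Indexing note: the nu-loop below visits, for fixed mu, the three directions
// nu != mu in increasing order, so 'count' walks row mu of the sigma table
// above, i.e. sigma[3*mu + k] with k = 0,1,2
// (e.g. mu = 1 picks MinusSigmaXY, SigmaYZ, SigmaYT).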
int count = 0;
clover_force = zero;
for (int mu = 0; mu < 4; mu++)
{
force_mu = zero;
for (int nu = 0; nu < 4; nu++)
{
if (mu == nu)
continue;
RealD factor;
if (nu == 4 || mu == 4)
{
factor = 2.0 * csw_t;
}
else
{
factor = 2.0 * csw_r;
}
PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
Impl::TraceSpinImpl(lambda, Slambda); // traceSpin ok
force_mu -= factor*Cmunu(U, lambda, mu, nu); // checked
count++;
}
pokeLorentz(clover_force, U[mu] * force_mu, mu);
}
//clover_force *= csw;
force += clover_force;
}
// Computing C_{\mu \nu}(x) as in Eq.(B.39) in Zbigniew Sroczynski's PhD thesis
GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu)
{
conformable(lambda._grid, U[0]._grid);
GaugeLinkField out(lambda._grid), tmp(lambda._grid);
// insertion in upper staple
// please check redundancy of shift operations
// C1+
tmp = lambda * U[nu];
out = Impl::ShiftStaple(Impl::CovShiftForward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);
// C2+
tmp = U[mu] * Impl::ShiftStaple(adj(lambda), mu);
out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(tmp, mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);
// C3+
tmp = U[nu] * Impl::ShiftStaple(adj(lambda), nu);
out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(tmp, nu))), mu);
// C4+
out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu) * lambda;
// insertion in lower staple
// C1-
out -= Impl::ShiftStaple(lambda, mu) * Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);
// C2-
tmp = adj(lambda) * U[nu];
out -= Impl::ShiftStaple(Impl::CovShiftBackward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);
// C3-
tmp = lambda * U[nu];
out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, tmp)), mu);
// C4-
out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu) * lambda;
return out;
}
private:
// here fixing the 4 dimensions, make it more general?
RealD csw_r; // Clover coefficient - spatial
RealD csw_t; // Clover coefficient - temporal
RealD diag_mass; // Mass term
CloverFieldType CloverTerm, CloverTermInv; // Clover term
CloverFieldType CloverTermEven, CloverTermOdd; // Clover term EO
CloverFieldType CloverTermInvEven, CloverTermInvOdd; // Clover term Inv EO
CloverFieldType CloverTermDagEven, CloverTermDagOdd; // Clover term Dag EO
CloverFieldType CloverTermInvDagEven, CloverTermInvDagOdd; // Clover term Inv Dag EO
// eventually these can be compressed into 6x6 blocks instead of the 12x12
// using the DeGrand-Rossi basis for the gamma matrices
CloverFieldType fillCloverYZ(const GaugeLinkField &F)
{
CloverFieldType T(F._grid);
T = zero;
PARALLEL_FOR_LOOP
for (int i = 0; i < CloverTerm._grid->oSites(); i++)
{
T._odata[i]()(0, 1) = timesMinusI(F._odata[i]()());
T._odata[i]()(1, 0) = timesMinusI(F._odata[i]()());
T._odata[i]()(2, 3) = timesMinusI(F._odata[i]()());
T._odata[i]()(3, 2) = timesMinusI(F._odata[i]()());
}
return T;
}
CloverFieldType fillCloverXZ(const GaugeLinkField &F)
{
CloverFieldType T(F._grid);
T = zero;
PARALLEL_FOR_LOOP
for (int i = 0; i < CloverTerm._grid->oSites(); i++)
{
T._odata[i]()(0, 1) = -F._odata[i]()();
T._odata[i]()(1, 0) = F._odata[i]()();
T._odata[i]()(2, 3) = -F._odata[i]()();
T._odata[i]()(3, 2) = F._odata[i]()();
}
return T;
}
CloverFieldType fillCloverXY(const GaugeLinkField &F)
{
CloverFieldType T(F._grid);
T = zero;
PARALLEL_FOR_LOOP
for (int i = 0; i < CloverTerm._grid->oSites(); i++)
{
T._odata[i]()(0, 0) = timesMinusI(F._odata[i]()());
T._odata[i]()(1, 1) = timesI(F._odata[i]()());
T._odata[i]()(2, 2) = timesMinusI(F._odata[i]()());
T._odata[i]()(3, 3) = timesI(F._odata[i]()());
}
return T;
}
CloverFieldType fillCloverXT(const GaugeLinkField &F)
{
CloverFieldType T(F._grid);
T = zero;
PARALLEL_FOR_LOOP
for (int i = 0; i < CloverTerm._grid->oSites(); i++)
{
T._odata[i]()(0, 1) = timesI(F._odata[i]()());
T._odata[i]()(1, 0) = timesI(F._odata[i]()());
T._odata[i]()(2, 3) = timesMinusI(F._odata[i]()());
T._odata[i]()(3, 2) = timesMinusI(F._odata[i]()());
}
return T;
}
CloverFieldType fillCloverYT(const GaugeLinkField &F)
{
CloverFieldType T(F._grid);
T = zero;
PARALLEL_FOR_LOOP
for (int i = 0; i < CloverTerm._grid->oSites(); i++)
{
T._odata[i]()(0, 1) = -(F._odata[i]()());
T._odata[i]()(1, 0) = (F._odata[i]()());
T._odata[i]()(2, 3) = (F._odata[i]()());
T._odata[i]()(3, 2) = -(F._odata[i]()());
}
return T;
}
CloverFieldType fillCloverZT(const GaugeLinkField &F)
{
CloverFieldType T(F._grid);
T = zero;
PARALLEL_FOR_LOOP
for (int i = 0; i < CloverTerm._grid->oSites(); i++)
{
T._odata[i]()(0, 0) = timesI(F._odata[i]()());
T._odata[i]()(1, 1) = timesMinusI(F._odata[i]()());
T._odata[i]()(2, 2) = timesMinusI(F._odata[i]()());
T._odata[i]()(3, 3) = timesI(F._odata[i]()());
}
return T;
}
};
}
}
#endif // GRID_QCD_WILSON_CLOVER_FERMION_H
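A minimal construction sketch for the new clover fermion, based only on the constructor signature above; the gauge field Umu, the grids FGrid/FrbGrid and the numerical values are placeholders, not taken from this commit:

    WilsonAnisotropyCoefficients anis;               // isotropic defaults: xi_0 = nu = 1
    RealD mass = 0.1, csw_r = 1.0, csw_t = 1.0;      // example values only
    WilsonCloverFermion<WilsonImplR> Dwc(Umu, *FGrid, *FrbGrid, mass, csw_r, csw_t, anis);
    // M, Mdag, Mooee, MooeeInv etc. are then available as for WilsonFermion.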

View File

@ -47,7 +47,8 @@ int WilsonFermionStatic::HandOptDslash;
template <class Impl> template <class Impl>
WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid, WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
GridRedBlackCartesian &Hgrid, RealD _mass, GridRedBlackCartesian &Hgrid, RealD _mass,
const ImplParams &p) const ImplParams &p,
const WilsonAnisotropyCoefficients &anis)
: Kernels(p), : Kernels(p),
_grid(&Fgrid), _grid(&Fgrid),
_cbgrid(&Hgrid), _cbgrid(&Hgrid),
@ -60,16 +61,41 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
Umu(&Fgrid), Umu(&Fgrid),
UmuEven(&Hgrid), UmuEven(&Hgrid),
UmuOdd(&Hgrid), UmuOdd(&Hgrid),
_tmp(&Hgrid) _tmp(&Hgrid),
anisotropyCoeff(anis)
{ {
// Allocate the required comms buffer // Allocate the required comms buffer
ImportGauge(_Umu); ImportGauge(_Umu);
if (anisotropyCoeff.isAnisotropic){
diag_mass = mass + 1.0 + (Nd-1)*(anisotropyCoeff.nu / anisotropyCoeff.xi_0);
} else {
diag_mass = 4.0 + mass;
}
} }
template <class Impl> template <class Impl>
void WilsonFermion<Impl>::ImportGauge(const GaugeField &_Umu) { void WilsonFermion<Impl>::ImportGauge(const GaugeField &_Umu) {
GaugeField HUmu(_Umu._grid); GaugeField HUmu(_Umu._grid);
HUmu = _Umu * (-0.5);
//Here multiply the anisotropy coefficients
if (anisotropyCoeff.isAnisotropic)
{
for (int mu = 0; mu < Nd; mu++)
{
GaugeLinkField U_dir = (-0.5)*PeekIndex<LorentzIndex>(_Umu, mu);
if (mu != anisotropyCoeff.t_direction)
U_dir *= (anisotropyCoeff.nu / anisotropyCoeff.xi_0);
PokeIndex<LorentzIndex>(HUmu, U_dir, mu);
}
}
else
{
HUmu = _Umu * (-0.5);
}
Impl::DoubleStore(GaugeGrid(), Umu, HUmu); Impl::DoubleStore(GaugeGrid(), Umu, HUmu);
pickCheckerboard(Even, UmuEven, Umu); pickCheckerboard(Even, UmuEven, Umu);
pickCheckerboard(Odd, UmuOdd, Umu); pickCheckerboard(Odd, UmuOdd, Umu);
@ -83,14 +109,14 @@ template <class Impl>
RealD WilsonFermion<Impl>::M(const FermionField &in, FermionField &out) { RealD WilsonFermion<Impl>::M(const FermionField &in, FermionField &out) {
out.checkerboard = in.checkerboard; out.checkerboard = in.checkerboard;
Dhop(in, out, DaggerNo); Dhop(in, out, DaggerNo);
return axpy_norm(out, 4 + mass, in, out); return axpy_norm(out, diag_mass, in, out);
} }
template <class Impl> template <class Impl>
RealD WilsonFermion<Impl>::Mdag(const FermionField &in, FermionField &out) { RealD WilsonFermion<Impl>::Mdag(const FermionField &in, FermionField &out) {
out.checkerboard = in.checkerboard; out.checkerboard = in.checkerboard;
Dhop(in, out, DaggerYes); Dhop(in, out, DaggerYes);
return axpy_norm(out, 4 + mass, in, out); return axpy_norm(out, diag_mass, in, out);
} }
template <class Impl> template <class Impl>
@ -114,7 +140,7 @@ void WilsonFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out) {
template <class Impl> template <class Impl>
void WilsonFermion<Impl>::Mooee(const FermionField &in, FermionField &out) { void WilsonFermion<Impl>::Mooee(const FermionField &in, FermionField &out) {
out.checkerboard = in.checkerboard; out.checkerboard = in.checkerboard;
typename FermionField::scalar_type scal(4.0 + mass); typename FermionField::scalar_type scal(diag_mass);
out = scal * in; out = scal * in;
} }
@ -127,7 +153,7 @@ void WilsonFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
template<class Impl> template<class Impl>
void WilsonFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) { void WilsonFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
out.checkerboard = in.checkerboard; out.checkerboard = in.checkerboard;
out = (1.0/(4.0+mass))*in; out = (1.0/(diag_mass))*in;
} }
template<class Impl> template<class Impl>
@ -204,7 +230,7 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
FermionField Btilde(B._grid); FermionField Btilde(B._grid);
FermionField Atilde(B._grid); FermionField Atilde(B._grid);
Atilde = A; Atilde = A;//redundant
st.HaloExchange(B, compressor); st.HaloExchange(B, compressor);
@ -381,40 +407,30 @@ void WilsonFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
} }
} }
template <class Impl> template <class Impl>
void WilsonFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in, void WilsonFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
PropagatorField &q_out, PropagatorField &q_out,
Current curr_type, Current curr_type,
unsigned int mu, unsigned int mu,
std::vector<Real> mom,
unsigned int tmin, unsigned int tmin,
unsigned int tmax) unsigned int tmax,
ComplexField &lattice_cmplx)
{ {
conformable(_grid, q_in._grid); conformable(_grid, q_in._grid);
conformable(_grid, q_out._grid); conformable(_grid, q_out._grid);
Lattice<iSinglet<Simd>> ph(_grid), coor(_grid);
Complex i(0.0,1.0);
PropagatorField tmpFwd(_grid), tmpBwd(_grid), tmp(_grid); PropagatorField tmpFwd(_grid), tmpBwd(_grid), tmp(_grid);
unsigned int tshift = (mu == Tp) ? 1 : 0; unsigned int tshift = (mu == Tp) ? 1 : 0;
unsigned int LLt = GridDefaultLatt()[Tp]; unsigned int LLt = GridDefaultLatt()[Tp];
// Momentum projection
ph = zero;
for(unsigned int mu = 0; mu < Nd - 1; mu++)
{
LatticeCoordinate(coor, mu);
ph = ph + mom[mu]*coor*((1./(_grid->_fdimensions[mu])));
}
ph = exp((Real)(2*M_PI)*i*ph);
q_out = zero; q_out = zero;
LatticeInteger coords(_grid); LatticeInteger coords(_grid);
LatticeCoordinate(coords, Tp); LatticeCoordinate(coords, Tp);
// Need q(x + mu) and q(x - mu). // Need q(x + mu) and q(x - mu).
tmp = Cshift(q_in, mu, 1); tmp = Cshift(q_in, mu, 1);
tmpFwd = tmp*ph; tmpFwd = tmp*lattice_cmplx;
tmp = ph*q_in; tmp = lattice_cmplx*q_in;
tmpBwd = Cshift(tmp, mu, -1); tmpBwd = Cshift(tmp, mu, -1);
parallel_for (unsigned int sU = 0; sU < Umu._grid->oSites(); ++sU) parallel_for (unsigned int sU = 0; sU < Umu._grid->oSites(); ++sU)
@ -449,6 +465,8 @@ void WilsonFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
Umu, sU, mu, t_mask); Umu, sU, mu, t_mask);
} }
} }
} }
FermOpTemplateInstantiate(WilsonFermion); FermOpTemplateInstantiate(WilsonFermion);
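With the momentum projection removed from SeqConservedCurrent, a caller that previously passed mom now builds the complex phase field itself and hands it in as lattice_cmplx. A minimal sketch, reproducing what the deleted code above computed internally and assuming ComplexField is the usual Lattice<iSinglet<Simd>> singlet field; grid, mom, Dw and the propagator arguments are placeholders:

    ComplexField ph(grid), coor(grid);
    Complex i(0.0, 1.0);
    ph = zero;
    for (unsigned int nu = 0; nu < Nd - 1; nu++)
    {
      LatticeCoordinate(coor, nu);                               // coordinate in direction nu
      ph = ph + mom[nu]*coor*((1./(grid->_fdimensions[nu])));    // mom.x / L_nu
    }
    ph = exp((Real)(2*M_PI)*i*ph);                               // exp(2*pi*i*mom.x/L)
    Dw.SeqConservedCurrent(q_in, q_out, curr_type, mu, tmin, tmax, ph);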

View File

@ -44,6 +44,21 @@ class WilsonFermionStatic {
static const int npoint = 8; static const int npoint = 8;
}; };
struct WilsonAnisotropyCoefficients: Serializable
{
GRID_SERIALIZABLE_CLASS_MEMBERS(WilsonAnisotropyCoefficients,
bool, isAnisotropic,
int, t_direction,
double, xi_0,
double, nu);
WilsonAnisotropyCoefficients():
isAnisotropic(false),
t_direction(Nd-1),
xi_0(1.0),
nu(1.0){}
};
template <class Impl> template <class Impl>
class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic { class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {
public: public:
@ -65,8 +80,8 @@ class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {
// override multiply; cut number routines if pass dagger argument // override multiply; cut number routines if pass dagger argument
// and also make interface more uniformly consistent // and also make interface more uniformly consistent
////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////
RealD M(const FermionField &in, FermionField &out); virtual RealD M(const FermionField &in, FermionField &out);
RealD Mdag(const FermionField &in, FermionField &out); virtual RealD Mdag(const FermionField &in, FermionField &out);
///////////////////////////////////////////////////////// /////////////////////////////////////////////////////////
// half checkerboard operations // half checkerboard operations
@ -117,8 +132,9 @@ class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {
// Constructor // Constructor
WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid, WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
GridRedBlackCartesian &Hgrid, RealD _mass, GridRedBlackCartesian &Hgrid, RealD _mass,
const ImplParams &p = ImplParams()); const ImplParams &p = ImplParams(),
const WilsonAnisotropyCoefficients &anis = WilsonAnisotropyCoefficients() );
// DoubleStore impl dependent // DoubleStore impl dependent
void ImportGauge(const GaugeField &_Umu); void ImportGauge(const GaugeField &_Umu);
@ -130,6 +146,7 @@ class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {
// protected: // protected:
public: public:
RealD mass; RealD mass;
RealD diag_mass;
GridBase *_grid; GridBase *_grid;
GridBase *_cbgrid; GridBase *_cbgrid;
@ -146,6 +163,8 @@ class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {
LebesgueOrder Lebesgue; LebesgueOrder Lebesgue;
LebesgueOrder LebesgueEvenOdd; LebesgueOrder LebesgueEvenOdd;
WilsonAnisotropyCoefficients anisotropyCoeff;
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
// Conserved current utilities // Conserved current utilities
@ -155,13 +174,13 @@ class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {
PropagatorField &q_out, PropagatorField &q_out,
Current curr_type, Current curr_type,
unsigned int mu); unsigned int mu);
void SeqConservedCurrent(PropagatorField &q_in, void SeqConservedCurrent(PropagatorField &q_in,
PropagatorField &q_out, PropagatorField &q_out,
Current curr_type, Current curr_type,
unsigned int mu, unsigned int mu,
std::vector<Real> mom, unsigned int tmin,
unsigned int tmin, unsigned int tmax,
unsigned int tmax); ComplexField &lattice_cmplx);
}; };
typedef WilsonFermion<WilsonImplF> WilsonFermionF; typedef WilsonFermion<WilsonImplF> WilsonFermionF;

View File

@ -779,92 +779,89 @@ void WilsonFermion5D<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
} }
template <class Impl> template <class Impl>
void WilsonFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in, void WilsonFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
PropagatorField &q_out, PropagatorField &q_out,
Current curr_type, Current curr_type,
unsigned int mu, unsigned int mu,
std::vector<Real> mom,
unsigned int tmin, unsigned int tmin,
unsigned int tmax) unsigned int tmax,
ComplexField &lattice_cmplx)
{ {
conformable(q_in._grid, FermionGrid()); conformable(q_in._grid, FermionGrid());
conformable(q_in._grid, q_out._grid); conformable(q_in._grid, q_out._grid);
Lattice<iSinglet<Simd>> ph(FermionGrid()), coor(FermionGrid()); PropagatorField tmp(GaugeGrid()),tmp2(GaugeGrid());
PropagatorField tmpFwd(FermionGrid()), tmpBwd(FermionGrid()),
tmp(FermionGrid());
Complex i(0.0, 1.0);
unsigned int tshift = (mu == Tp) ? 1 : 0; unsigned int tshift = (mu == Tp) ? 1 : 0;
unsigned int LLs = q_in._grid->_rdimensions[0]; unsigned int LLs = q_in._grid->_rdimensions[0];
unsigned int LLt = GridDefaultLatt()[Tp]; unsigned int LLt = GridDefaultLatt()[Tp];
// Momentum projection.
ph = zero;
for(unsigned int nu = 0; nu < Nd - 1; nu++)
{
// Shift coordinate lattice index by 1 to account for 5th dimension.
LatticeCoordinate(coor, nu + 1);
ph = ph + mom[nu]*coor*((1./(_FourDimGrid->_fdimensions[nu])));
}
ph = exp((Real)(2*M_PI)*i*ph);
q_out = zero; q_out = zero;
LatticeInteger coords(_FourDimGrid); LatticeInteger coords(_FourDimGrid);
LatticeCoordinate(coords, Tp); LatticeCoordinate(coords, Tp);
// Need q(x + mu, s) and q(x - mu, s). 5D lattice so shift 4D coordinate mu
// by one.
tmp = Cshift(q_in, mu + 1, 1);
tmpFwd = tmp*ph;
tmp = ph*q_in;
tmpBwd = Cshift(tmp, mu + 1, -1);
parallel_for (unsigned int sU = 0; sU < Umu._grid->oSites(); ++sU) for (unsigned int s = 0; s < LLs; ++s)
{ {
// Compute the sequential conserved current insertion only if our simd bool axial_sign = ((curr_type == Current::Axial) && (s < (LLs / 2)));
// object contains a timeslice we need. bool tadpole_sign = (curr_type == Current::Tadpole);
vInteger t_mask = ((coords._odata[sU] >= tmin) && bool switch_sgn = tadpole_sign || axial_sign;
(coords._odata[sU] <= tmax));
Integer timeSlices = Reduce(t_mask);
if (timeSlices > 0)
{ //forward direction: Need q(x + mu, s)*A(x)
unsigned int sF = sU * LLs; ExtractSlice(tmp2, q_in, s, 0); //q(x,s)
for (unsigned int s = 0; s < LLs; ++s) tmp = Cshift(tmp2, mu, 1); //q(x+mu,s)
tmp2 = tmp*lattice_cmplx; //q(x+mu,s)*A(x)
parallel_for (unsigned int sU = 0; sU < Umu._grid->oSites(); ++sU)
{
// Compute the sequential conserved current insertion only if our simd
// object contains a timeslice we need.
vInteger t_mask = ((coords._odata[sU] >= tmin) &&
(coords._odata[sU] <= tmax));
Integer timeSlices = Reduce(t_mask);
if (timeSlices > 0)
{ {
bool axial_sign = ((curr_type == Current::Axial) && (s < (LLs / 2))); unsigned int sF = sU * LLs + s;
Kernels::SeqConservedCurrentSiteFwd(tmpFwd._odata[sF], Kernels::SeqConservedCurrentSiteFwd(tmp2._odata[sU],
q_out._odata[sF], Umu, sU, q_out._odata[sF], Umu, sU,
mu, t_mask, axial_sign); mu, t_mask, switch_sgn);
++sF;
} }
} }
// Repeat for backward direction. //backward direction: Need q(x - mu, s)*A(x-mu)
t_mask = ((coords._odata[sU] >= (tmin + tshift)) && ExtractSlice(tmp2, q_in, s, 0); //q(x,s)
(coords._odata[sU] <= (tmax + tshift))); tmp = lattice_cmplx*tmp2; //q(x,s)*A(x)
tmp2 = Cshift(tmp, mu, -1); //q(x-mu,s)*A(x-mu,s)
//if tmax = LLt-1 (last timeslice) include timeslice 0 if the time is shifted (mu=3) parallel_for (unsigned int sU = 0; sU < Umu._grid->oSites(); ++sU)
unsigned int t0 = 0; {
if((tmax==LLt-1) && (tshift==1)) t_mask = (t_mask || (coords._odata[sU] == t0 )); vInteger t_mask = ((coords._odata[sU] >= (tmin + tshift)) &&
(coords._odata[sU] <= (tmax + tshift)));
timeSlices = Reduce(t_mask); //if tmax = LLt-1 (last timeslice) include timeslice 0 if the time is shifted (mu=3)
unsigned int t0 = 0;
if((tmax==LLt-1) && (tshift==1)) t_mask = (t_mask || (coords._odata[sU] == t0 ));
if (timeSlices > 0) Integer timeSlices = Reduce(t_mask);
{
unsigned int sF = sU * LLs; if (timeSlices > 0)
for (unsigned int s = 0; s < LLs; ++s)
{ {
bool axial_sign = ((curr_type == Current::Axial) && (s < (LLs / 2))); unsigned int sF = sU * LLs + s;
Kernels::SeqConservedCurrentSiteBwd(tmpBwd._odata[sF], Kernels::SeqConservedCurrentSiteBwd(tmp2._odata[sU],
q_out._odata[sF], Umu, sU, q_out._odata[sF], Umu, sU,
mu, t_mask, axial_sign); mu, t_mask, axial_sign);
++sF;
} }
} }
} }
} }
FermOpTemplateInstantiate(WilsonFermion5D); FermOpTemplateInstantiate(WilsonFermion5D);
GparityFermOpTemplateInstantiate(WilsonFermion5D); GparityFermOpTemplateInstantiate(WilsonFermion5D);

View File

@ -222,13 +222,13 @@ namespace QCD {
PropagatorField &q_out, PropagatorField &q_out,
Current curr_type, Current curr_type,
unsigned int mu); unsigned int mu);
void SeqConservedCurrent(PropagatorField &q_in, void SeqConservedCurrent(PropagatorField &q_in,
PropagatorField &q_out, PropagatorField &q_out,
Current curr_type, Current curr_type,
unsigned int mu, unsigned int mu,
std::vector<Real> mom, unsigned int tmin,
unsigned int tmin, unsigned int tmax,
unsigned int tmax); ComplexField &lattice_cmplx);
}; };
}} }}

View File

@ -55,7 +55,7 @@ template<class Impl> class WilsonKernels : public FermionOperator<Impl> , public
public: public:
template <bool EnableBool = true> template <bool EnableBool = true>
typename std::enable_if<Impl::Dimension == 3 && Nc == 3 &&EnableBool, void>::type typename std::enable_if<Impl::isFundamental==true && Nc == 3 &&EnableBool, void>::type
DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1) int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1)
{ {
@ -99,7 +99,7 @@ public:
} }
template <bool EnableBool = true> template <bool EnableBool = true>
typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool, void>::type typename std::enable_if<(Impl::isFundamental==false || (Impl::isFundamental==true && Nc != 3)) && EnableBool, void>::type
DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1 ) { int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1 ) {
// no kernel choice // no kernel choice
@ -116,7 +116,7 @@ public:
} }
template <bool EnableBool = true> template <bool EnableBool = true>
typename std::enable_if<Impl::Dimension == 3 && Nc == 3 && EnableBool,void>::type typename std::enable_if<Impl::isFundamental==true && Nc == 3 && EnableBool,void>::type
DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1) int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1)
{ {
@ -161,7 +161,7 @@ public:
} }
template <bool EnableBool = true> template <bool EnableBool = true>
typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool,void>::type typename std::enable_if<(Impl::isFundamental==false || (Impl::isFundamental==true && Nc != 3)) && EnableBool,void>::type
DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,SiteHalfSpinor * buf, DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,SiteHalfSpinor * buf,
int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1) { int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1) {

View File

@ -946,5 +946,6 @@ INSTANTIATE_THEM(DomainWallVec5dImplFH);
INSTANTIATE_THEM(DomainWallVec5dImplDF); INSTANTIATE_THEM(DomainWallVec5dImplDF);
INSTANTIATE_THEM(ZDomainWallVec5dImplFH); INSTANTIATE_THEM(ZDomainWallVec5dImplFH);
INSTANTIATE_THEM(ZDomainWallVec5dImplDF); INSTANTIATE_THEM(ZDomainWallVec5dImplDF);
INSTANTIATE_THEM(WilsonTwoIndexAntiSymmetricImplF);
INSTANTIATE_THEM(WilsonTwoIndexAntiSymmetricImplD);
}} }}

View File

@ -71,18 +71,14 @@ class WilsonGaugeAction : public Action<typename Gimpl::GaugeField> {
RealD factor = 0.5 * beta / RealD(Nc); RealD factor = 0.5 * beta / RealD(Nc);
//GaugeLinkField Umu(U._grid); GaugeLinkField Umu(U._grid);
GaugeLinkField dSdU_mu(U._grid); GaugeLinkField dSdU_mu(U._grid);
for (int mu = 0; mu < Nd; mu++) { for (int mu = 0; mu < Nd; mu++) {
//Umu = PeekIndex<LorentzIndex>(U, mu); Umu = PeekIndex<LorentzIndex>(U, mu);
// Staple in direction mu // Staple in direction mu
//WilsonLoops<Gimpl>::Staple(dSdU_mu, U, mu); WilsonLoops<Gimpl>::Staple(dSdU_mu, U, mu);
//dSdU_mu = Ta(Umu * dSdU_mu) * factor; dSdU_mu = Ta(Umu * dSdU_mu) * factor;
WilsonLoops<Gimpl>::StapleMult(dSdU_mu, U, mu);
dSdU_mu = Ta(dSdU_mu) * factor;
PokeIndex<LorentzIndex>(dSdU, dSdU_mu, mu); PokeIndex<LorentzIndex>(dSdU, dSdU_mu, mu);
} }

View File

@ -92,6 +92,19 @@ class PlaquetteMod: public ObservableModule<PlaquetteLogger<Impl>, NoParameters>
PlaquetteMod(): ObsBase(NoParameters()){} PlaquetteMod(): ObsBase(NoParameters()){}
}; };
template < class Impl >
class PolyakovMod: public ObservableModule<PolyakovLogger<Impl>, NoParameters>{
typedef ObservableModule<PolyakovLogger<Impl>, NoParameters> ObsBase;
using ObsBase::ObsBase; // for constructors
// acquire resource
virtual void initialize(){
this->ObservablePtr.reset(new PolyakovLogger<Impl>());
}
public:
PolyakovMod(): ObsBase(NoParameters()){}
};
template < class Impl > template < class Impl >
class TopologicalChargeMod: public ObservableModule<TopologicalCharge<Impl>, TopologyObsParameters>{ class TopologicalChargeMod: public ObservableModule<TopologicalCharge<Impl>, TopologyObsParameters>{

View File

@ -45,5 +45,7 @@ class HmcObservable {
#include "plaquette.h" #include "plaquette.h"
#include "topological_charge.h" #include "topological_charge.h"
#include "polyakov_loop.h"
#endif // HMC_OBSERVABLE_H #endif // HMC_OBSERVABLE_H

View File

@ -0,0 +1,68 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/modules/polyakov_line.h
Copyright (C) 2017
Author: David Preti <david.preti@csic.es>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef HMC_POLYAKOV_H
#define HMC_POLYAKOV_H
namespace Grid {
namespace QCD {
// this is only defined for a gauge theory
template <class Impl>
class PolyakovLogger : public HmcObservable<typename Impl::Field> {
public:
// here forces the Impl to be of gauge fields
// if not the compiler will complain
INHERIT_GIMPL_TYPES(Impl);
// necessary for HmcObservable compatibility
typedef typename Impl::Field Field;
void TrajectoryComplete(int traj,
Field &U,
GridSerialRNG &sRNG,
GridParallelRNG &pRNG) {
ComplexD polyakov = WilsonLoops<Impl>::avgPolyakovLoop(U);
int def_prec = std::cout.precision();
std::cout << GridLogMessage
<< std::setprecision(std::numeric_limits<Real>::digits10 + 1)
<< "Polyakov Loop: [ " << traj << " ] "<< polyakov << std::endl;
std::cout.precision(def_prec);
}
};
} // namespace QCD
} // namespace Grid
#endif // HMC_POLYAKOV_H
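A possible way to switch the new observable on in an HMC driver, assuming the same registration pattern used for PlaquetteMod (HMCWrapper, TheHMC and ImplPolicy are names from a typical Grid HMC test program, not part of this commit):

    typedef PolyakovMod<HMCWrapper::ImplPolicy> PolyakovObs;
    TheHMC.Resources.AddObservable<PolyakovObs>();

PolyakovLogger::TrajectoryComplete then prints the volume-averaged Polyakov loop after each completed trajectory.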

Some files were not shown because too many files have changed in this diff.