update my fork and fixed conflicts

2025-11-16 11:39:31 +00:00 · 2018-03-02 17:08:08 +00:00
parent 315a42843f 550142bd6a
commit aaf39222c3
48 changed files with 1277 additions and 164 deletions
--- a/README.md
+++ b/README.md
@@ -187,10 +187,11 @@ Alternatively, some CPU codenames can be directly used:
 | `<code>`    | Description                            |
 | ----------- | -------------------------------------- |
 | `KNL`       | [Intel Xeon Phi codename Knights Landing](http://ark.intel.com/products/codename/48999/Knights-Landing) |
+| `SKL`       | [Intel Skylake with AVX512 extensions](https://ark.intel.com/products/codename/37572/Skylake#@server) |
 | `BGQ`       | Blue Gene/Q                            |

 #### Notes:
- We currently support AVX512 only for the Intel compiler. Support for GCC and clang will appear in future versions of Grid when the AVX512 support within GCC and clang will be more advanced.
+- We currently support AVX512 for the Intel compiler and GCC (KNL and SKL targets). Support for clang will appear in future versions of Grid once AVX512 support in that compiler is more mature.
 - For BG/Q only [bgclang](http://trac.alcf.anl.gov/projects/llvm-bgq) is supported. We do not presently plan to support more compilers for this platform.
 - BG/Q performances are currently rather poor. This is being investigated for future versions.
 - The vector size for the `GEN` target can be specified with the `configure` script option `--enable-gen-simd-width`.
--- a/benchmarks/Benchmark_dwf.cc
+++ b/benchmarks/Benchmark_dwf.cc
@@ -48,7 +48,6 @@ int main (int argc, char ** argv)


  int threads = GridThread::GetThreads();
-  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;

  std::vector<int> latt4 = GridDefaultLatt();
  int Ls=16;
@@ -57,6 +56,10 @@ int main (int argc, char ** argv)
      std::stringstream ss(argv[i+1]); ss >> Ls;
    }

+  GridLogLayout();
+
+  long unsigned int single_site_flops = 8*QCD::Nc*(7+16*QCD::Nc);
+

  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
@@ -187,7 +190,7 @@ int main (int argc, char ** argv)
    FGrid->Barrier();
    
    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-    double flops=1344*volume*ncall;
+    double flops=single_site_flops*volume*ncall;

    std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
    //    std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
@@ -226,7 +229,7 @@ int main (int argc, char ** argv)
    FGrid->Barrier();
    
    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-    double flops=1344*volume*ncall;
+    double flops=single_site_flops*volume*ncall;

    std::cout<<GridLogMessage << "Called half prec comms Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
    std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
@@ -277,7 +280,7 @@ int main (int argc, char ** argv)
    double t1=usecond();
    FGrid->Barrier();
    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-    double flops=1344*volume*ncall;
+    double flops=single_site_flops*volume*ncall;

    std::cout<<GridLogMessage << "Called Dw s_inner "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
    std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
@@ -355,7 +358,7 @@ int main (int argc, char ** argv)
      //      sDw.stat.print();

      double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-      double flops=(1344.0*volume*ncall)/2;
+      double flops=(single_site_flops*volume*ncall)/2.0;

      std::cout<<GridLogMessage << "sDeo mflop/s =   "<< flops/(t1-t0)<<std::endl;
      std::cout<<GridLogMessage << "sDeo mflop/s per rank   "<< flops/(t1-t0)/NP<<std::endl;
@@ -478,7 +481,7 @@ int main (int argc, char ** argv)
    FGrid->Barrier();
    
    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-    double flops=(1344.0*volume*ncall)/2;
+    double flops=(single_site_flops*volume*ncall)/2.0;

    std::cout<<GridLogMessage << "Deo mflop/s =   "<< flops/(t1-t0)<<std::endl;
    std::cout<<GridLogMessage << "Deo mflop/s per rank   "<< flops/(t1-t0)/NP<<std::endl;
--- a/benchmarks/Benchmark_dwf_sweep.cc
+++ b/benchmarks/Benchmark_dwf_sweep.cc
@@ -51,6 +51,7 @@ int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);

+
  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
  std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
@@ -107,6 +108,7 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+  long unsigned int single_site_flops = 8*QCD::Nc*(7+16*QCD::Nc);

  std::vector<int> seeds4({1,2,3,4});
  std::vector<int> seeds5({5,6,7,8});
@@ -196,7 +198,7 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
  
  if ( ! report ) {
    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-    double flops=1344*volume*ncall;
+    double flops=single_site_flops*volume*ncall;
    std::cout <<"\t"<<NP<< "\t"<<flops/(t1-t0)<< "\t";
  }
  
@@ -228,7 +230,7 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
    
    if(!report){
      double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-      double flops=(1344.0*volume*ncall)/2;
+      double flops=(single_site_flops*volume*ncall)/2.0;
      std::cout<< flops/(t1-t0);
    }
  }
@@ -237,6 +239,7 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
 #define CHECK_SDW
 void benchsDw(std::vector<int> & latt4, int Ls, int threads, int report )
 {
+  long unsigned int single_site_flops = 8*QCD::Nc*(7+16*QCD::Nc);

  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
@@ -321,7 +324,7 @@ void benchsDw(std::vector<int> & latt4, int Ls, int threads, int report )
    Counter.Report();
  } else { 
    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-    double flops=1344*volume*ncall;
+    double flops=single_site_flops*volume*ncall;
    std::cout<<"\t"<< flops/(t1-t0);
  }

@@ -358,7 +361,7 @@ void benchsDw(std::vector<int> & latt4, int Ls, int threads, int report )
    CounterSdw.Report();
  } else {
    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-    double flops=(1344.0*volume*ncall)/2;
+    double flops=(single_site_flops*volume*ncall)/2.0;
    std::cout<<"\t"<< flops/(t1-t0);
  }
 }
--- a/benchmarks/Benchmark_gparity.cc
+++ b/benchmarks/Benchmark_gparity.cc
@@ -107,7 +107,7 @@ int main (int argc, char ** argv)
    FGrid->Barrier();
    
    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-    double flops=2*1344*volume*ncall;
+    double flops=2*1320*volume*ncall;

    std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
    //    std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
@@ -134,7 +134,7 @@ int main (int argc, char ** argv)
    FGrid->Barrier();
    
    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-    double flops=2*1344*volume*ncall;
+    double flops=2*1320*volume*ncall;

    std::cout<<GridLogMessage << "Called half prec comms Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
    std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
@@ -174,7 +174,7 @@ int main (int argc, char ** argv)
    FGrid_d->Barrier();
    
    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-    double flops=2*1344*volume*ncall;
+    double flops=2*1320*volume*ncall;

    std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
    //    std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
--- a/benchmarks/Benchmark_wilson.cc
+++ b/benchmarks/Benchmark_wilson.cc
@@ -4,7 +4,7 @@

    Source file: ./benchmarks/Benchmark_wilson.cc

-    Copyright (C) 2015
+    Copyright (C) 2018

 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
@@ -32,6 +32,9 @@ using namespace std;
 using namespace Grid;
 using namespace Grid::QCD;

+
+#include "Grid/util/Profiling.h"
+
 template<class d>
 struct scal {
  d internal;
@@ -45,6 +48,7 @@ struct scal {
  };

 bool overlapComms = false;
+bool perfProfiling = false;

 int main (int argc, char ** argv)
 {
@@ -53,6 +57,12 @@ int main (int argc, char ** argv)
  if( GridCmdOptionExists(argv,argv+argc,"--asynch") ){
    overlapComms = true;
  }
+  if( GridCmdOptionExists(argv,argv+argc,"--perf") ){
+    perfProfiling = true;
+  }
+
+  long unsigned int single_site_flops = 8*QCD::Nc*(7+16*QCD::Nc);
+

  std::vector<int> latt_size   = GridDefaultLatt();
  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
@@ -61,10 +71,15 @@ int main (int argc, char ** argv)
  GridRedBlackCartesian     RBGrid(&Grid);

  int threads = GridThread::GetThreads();
-  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
+
+  GridLogLayout();
+
  std::cout<<GridLogMessage << "Grid floating point word size is REALF"<< sizeof(RealF)<<std::endl;
  std::cout<<GridLogMessage << "Grid floating point word size is REALD"<< sizeof(RealD)<<std::endl;
  std::cout<<GridLogMessage << "Grid floating point word size is REAL"<< sizeof(Real)<<std::endl;
+  std::cout<<GridLogMessage << "Grid number of colours : "<< QCD::Nc <<std::endl;
+  std::cout<<GridLogMessage << "Benchmarking Wilson operator in the fundamental representation" << std::endl;
+

  std::vector<int> seeds({1,2,3,4});
  GridParallelRNG          pRNG(&Grid);
@@ -134,9 +149,25 @@ int main (int argc, char ** argv)
    Dw.Dhop(src,result,0);
  }
  double t1=usecond();
-  double flops=1344*volume*ncall;
+  double flops=single_site_flops*volume*ncall;
  
+  if (perfProfiling){
+  std::cout<<GridLogMessage << "Profiling Dw with perf"<<std::endl;
+    
+  System::profile("kernel", [&]() {
+    for(int i=0;i<ncall;i++){
+      Dw.Dhop(src,result,0);
+    }
+  });
+
+  std::cout<<GridLogMessage << "Generated kernel.data"<<std::endl;
+  std::cout<<GridLogMessage << "Use with: perf report -i kernel.data"<<std::endl;
+
+  }
+
+
  std::cout<<GridLogMessage << "Called Dw"<<std::endl;
+  std::cout<<GridLogMessage << "flops per site " << single_site_flops << std::endl;
  std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
  std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
  std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
--- a/benchmarks/Benchmark_wilson_sweep.cc
+++ b/benchmarks/Benchmark_wilson_sweep.cc
@@ -62,6 +62,7 @@ int main (int argc, char ** argv)
  std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+  std::cout << GridLogMessage<< "* Number of colours "<< QCD::Nc <<std::endl;
  std::cout << GridLogMessage<< "* Benchmarking WilsonFermionR::Dhop                  "<<std::endl;
  std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
  if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
@@ -69,13 +70,15 @@ int main (int argc, char ** argv)
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
+  std::cout << GridLogMessage << "* OpenMP threads       : "<< GridThread::GetThreads() <<std::endl;
+  std::cout << GridLogMessage << "* MPI tasks            : "<< GridCmdVectorIntToString(mpi_layout) << std::endl;
  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;

-  std::cout<<GridLogMessage << "============================================================================="<< std::endl;
-  std::cout<<GridLogMessage << "= Benchmarking Wilson" << std::endl;
-  std::cout<<GridLogMessage << "============================================================================="<< std::endl;
-  std::cout<<GridLogMessage << "Volume\t\t\tWilson/MFLOPs\tWilsonDag/MFLOPs" << std::endl;
-  std::cout<<GridLogMessage << "============================================================================="<< std::endl;
+  std::cout<<GridLogMessage << "================================================================================================="<< std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking Wilson operator in the fundamental representation" << std::endl;
+  std::cout<<GridLogMessage << "================================================================================================="<< std::endl;
+  std::cout<<GridLogMessage << "Volume\t\t\tWilson/MFLOPs\tWilsonDag/MFLOPs\tWilsonEO/MFLOPs\tWilsonDagEO/MFLOPs" << std::endl;
+  std::cout<<GridLogMessage << "================================================================================================="<< std::endl;

  int Lmax = 32;
  int dmin = 0;
@@ -97,13 +100,20 @@ int main (int argc, char ** argv)

 	  GridParallelRNG  pRNG(&Grid); pRNG.SeedFixedIntegers(seeds);
 	  LatticeGaugeField Umu(&Grid); random(pRNG,Umu);
-	  LatticeFermion    src(&Grid); random(pRNG,src);
-	  LatticeFermion result(&Grid); result=zero;
+	  LatticeFermion        src(&Grid); random(pRNG,src);
+	  LatticeFermion    src_o(&RBGrid); pickCheckerboard(Odd,src_o,src);
+	  LatticeFermion     result(&Grid); result=zero;
+	  LatticeFermion result_e(&RBGrid); result_e=zero;

 	  double volume = std::accumulate(latt_size.begin(),latt_size.end(),1,std::multiplies<int>());

 	  WilsonFermionR Dw(Umu,Grid,RBGrid,mass,params);
-      
+
+    // Full operator      
+	  bench_wilson(src,result,Dw,volume,DaggerNo);
+	  bench_wilson(src,result,Dw,volume,DaggerYes);
+    std::cout << "\t";
+    // EO
 	  bench_wilson(src,result,Dw,volume,DaggerNo);
 	  bench_wilson(src,result,Dw,volume,DaggerYes);
 	  std::cout << std::endl;
@@ -122,9 +132,26 @@ void bench_wilson (
 		   int const           dag )
 {
  int ncall    = 1000;
+  long unsigned int single_site_flops = 8*QCD::Nc*(7+16*QCD::Nc);
  double t0    = usecond();
  for(int i=0; i<ncall; i++) { Dw.Dhop(src,result,dag); }
  double t1    = usecond();
-  double flops = 1344 * volume * ncall;
+  double flops = single_site_flops * volume * ncall;
+  std::cout << flops/(t1-t0) << "\t\t";
+}
+
+// Times 1000 back-to-back Dw.DhopEO(src,result,dag) calls and prints half the full-volume flop
+// count per microsecond. NOTE(review): the "EO" call site in this diff still uses bench_wilson.
+void bench_wilson_eo (LatticeFermion & src, LatticeFermion & result,
+		   WilsonFermionR & Dw, double const volume, int const dag )
+{
+  long unsigned int const flops_per_site = 8*QCD::Nc*(7+16*QCD::Nc);
+  int const    n_iter  = 1000;
+  double const t_begin = usecond();
+  for(int it=0; it<n_iter; ++it) { Dw.DhopEO(src,result,dag); }
+  double const t_end   = usecond();
+  // Evaluated left to right in double precision; the /2.0 halves the full-volume count.
+  double const total_flops = (flops_per_site * volume * n_iter)/2.0;
+  // Two trailing tabs separate this column from the next one on the same output line.
+  std::cout << total_flops/(t_end-t_begin) << "\t\t";
+}
--- a/configure.ac
+++ b/configure.ac
@@ -249,6 +249,9 @@ case ${ax_cv_cxx_compiler_vendor} in
      AVX512)
        AC_DEFINE([AVX512],[1],[AVX512 intrinsics])
        SIMD_FLAGS='-mavx512f -mavx512pf -mavx512er -mavx512cd';;
+      SKL)
+        AC_DEFINE([AVX512],[1],[AVX512 intrinsics for SkyLake Xeon])
+        SIMD_FLAGS='-march=skylake-avx512';;
      KNC)
        AC_DEFINE([IMCI],[1],[IMCI intrinsics for Knights Corner])
        SIMD_FLAGS='';;
--- a/extras/Hadrons/Environment.cc
+++ b/extras/Hadrons/Environment.cc
@@ -270,7 +270,7 @@ int Environment::getObjectModule(const std::string name) const

 unsigned int Environment::getObjectLs(const unsigned int address) const
 {
-    if (hasObject(address))
+    if (hasCreatedObject(address))
    {
        return object_[address].Ls;
    }
--- a/extras/Hadrons/Global.cc
+++ b/extras/Hadrons/Global.cc
@@ -37,20 +37,38 @@ HadronsLogger Hadrons::HadronsLogWarning(1,"Warning");
 HadronsLogger Hadrons::HadronsLogMessage(1,"Message");
 HadronsLogger Hadrons::HadronsLogIterative(1,"Iterative");
 HadronsLogger Hadrons::HadronsLogDebug(1,"Debug");
+HadronsLogger Hadrons::HadronsLogIRL(1,"IRL");

 void Hadrons::initLogger(void)
 {
-    auto w = std::string("Hadrons").length();
+    auto w  = std::string("Hadrons").length();
+    int  cw = 8;
+
+
    GridLogError.setTopWidth(w);
    GridLogWarning.setTopWidth(w);
    GridLogMessage.setTopWidth(w);
    GridLogIterative.setTopWidth(w);
    GridLogDebug.setTopWidth(w);
+    GridLogIRL.setTopWidth(w);
+    GridLogError.setChanWidth(cw);
+    GridLogWarning.setChanWidth(cw);
+    GridLogMessage.setChanWidth(cw);
+    GridLogIterative.setChanWidth(cw);
+    GridLogDebug.setChanWidth(cw);
+    GridLogIRL.setChanWidth(cw);
    HadronsLogError.Active(GridLogError.isActive());
    HadronsLogWarning.Active(GridLogWarning.isActive());
    HadronsLogMessage.Active(GridLogMessage.isActive());
    HadronsLogIterative.Active(GridLogIterative.isActive());
    HadronsLogDebug.Active(GridLogDebug.isActive());
+    HadronsLogIRL.Active(GridLogIRL.isActive());
+    HadronsLogError.setChanWidth(cw);
+    HadronsLogWarning.setChanWidth(cw);
+    HadronsLogMessage.setChanWidth(cw);
+    HadronsLogIterative.setChanWidth(cw);
+    HadronsLogDebug.setChanWidth(cw);
+    HadronsLogIRL.setChanWidth(cw);
 }

 // type utilities //////////////////////////////////////////////////////////////
--- a/extras/Hadrons/Global.hpp
+++ b/extras/Hadrons/Global.hpp
@@ -58,6 +58,9 @@ using Grid::operator<<;
 #ifndef FIMPL
 #define FIMPL WilsonImplR
 #endif
+#ifndef ZFIMPL
+#define ZFIMPL ZWilsonImplR
+#endif
 #ifndef SIMPL
 #define SIMPL ScalarImplCR
 #endif
@@ -111,6 +114,7 @@ extern HadronsLogger HadronsLogWarning;
 extern HadronsLogger HadronsLogMessage;
 extern HadronsLogger HadronsLogIterative;
 extern HadronsLogger HadronsLogDebug;
+extern HadronsLogger HadronsLogIRL;

 void initLogger(void);

@@ -180,6 +184,18 @@ typedef XmlWriter ResultWriter;
 #define RESULT_FILE_NAME(name) \
 name + "." + std::to_string(vm().getTrajectory()) + "." + resultFileExt

+// default Schur convention (define HADRONS_DEFAULT_SCHUR before this point to override)
+// two-level macros: `conv` is macro-expanded first, then pasted, e.g. SchurDiagMooeeOperator
+#ifndef HADRONS_DEFAULT_SCHUR 
+#define HADRONS_DEFAULT_SCHUR DiagMooee
+#endif
+#define _HADRONS_SCHUR_OP_(conv) Schur##conv##Operator
+#define HADRONS_SCHUR_OP(conv) _HADRONS_SCHUR_OP_(conv)
+#define HADRONS_DEFAULT_SCHUR_OP HADRONS_SCHUR_OP(HADRONS_DEFAULT_SCHUR)
+#define _HADRONS_SCHUR_SOLVE_(conv) SchurRedBlack##conv##Solve
+#define HADRONS_SCHUR_SOLVE(conv) _HADRONS_SCHUR_SOLVE_(conv)
+#define HADRONS_DEFAULT_SCHUR_SOLVE HADRONS_SCHUR_SOLVE(HADRONS_DEFAULT_SCHUR)
+
 END_HADRONS_NAMESPACE

 #include <Grid/Hadrons/Exceptions.hpp>
--- a/extras/Hadrons/LanczosUtils.hpp
+++ b/extras/Hadrons/LanczosUtils.hpp
@@ -0,0 +1,115 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: extras/Hadrons/LanczosUtils.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef Hadrons_LanczosUtils_hpp_
+#define Hadrons_LanczosUtils_hpp_
+
+#include <Grid/Hadrons/Global.hpp>
+#include <Grid/algorithms/iterative/LocalCoherenceLanczos.h>
+
+BEGIN_HADRONS_NAMESPACE
+
+// Default Lanczos nBasis (presumably; the macro is not used in this header); pre-define to override
+#ifndef HADRONS_DEFAULT_LANCZOS_NBASIS
+#define HADRONS_DEFAULT_LANCZOS_NBASIS 60
+#endif
+
+template <typename T>
+struct EigenPack
+{
+    typedef T VectorType;
+    std::vector<RealD> eval;
+    std::vector<T>     evec;
+    // Container of eigenvalues (eval) and eigenvectors (evec); resize() sizes both together.
+    EigenPack(void) = default;
+    // Builds a pack of `size` eigenpairs by delegating to resize(size, grid).
+    EigenPack(const size_t size, GridBase *grid)
+    {
+        resize(size, grid);
+    }
+    // Resizes eval and evec to `size`; `grid` is the fill value handed to evec.resize().
+    void resize(const size_t size, GridBase *grid)
+    {
+        eval.resize(size);
+        evec.resize(size, grid);
+    }
+    // Reads the vectors from '<fileStem>_evec.bin' and the values from '<fileStem>_eval.xml'.
+    void read(const std::string fileStem)
+    {
+        std::string     evecFilename = fileStem + "_evec.bin";
+        std::string     evalFilename = fileStem + "_eval.xml";
+        emptyUserRecord record;
+        ScidacReader    binReader;
+        XmlReader       xmlReader(evalFilename);
+        // One record is read per element currently in evec, so size the pack beforehand.
+        LOG(Message) << "Reading " << evec.size() << " eigenvectors from '"
+                     << evecFilename << "'" << std::endl;
+        binReader.open(evecFilename);
+        for (auto &vec : evec)
+        {
+            binReader.readScidacFieldRecord(vec, record);
+        }
+        binReader.close();
+        LOG(Message) << "Reading " << eval.size() << " eigenvalues from '"
+                     << evalFilename << "'" << std::endl;
+        Grid::read(xmlReader, "evals", eval);
+    }
+    // Writes the vectors to '<fileStem>_evec.bin' and the values to '<fileStem>_eval.xml'.
+    void write(const std::string fileStem)
+    {
+        std::string     evecFilename = fileStem + "_evec.bin";
+        std::string     evalFilename = fileStem + "_eval.xml";
+        emptyUserRecord record;
+        ScidacWriter    binWriter;
+        XmlWriter       xmlWriter(evalFilename);
+        // Open the very name that is logged, instead of rebuilding it from fileStem.
+        LOG(Message) << "Writing " << evec.size() << " eigenvectors to '"
+                     << evecFilename << "'" << std::endl;
+        binWriter.open(evecFilename);
+        for (auto &vec : evec)
+        {
+            binWriter.writeScidacFieldRecord(vec, record);
+        }
+        binWriter.close();
+        LOG(Message) << "Writing " << eval.size() << " eigenvalues to '"
+                     << evalFilename << "'" << std::endl;
+        Grid::write(xmlWriter, "evals", eval);
+    }
+};
+
+// Eigenpack whose vectors are of type FImpl::FermionField.
+template <typename FImpl> using FineEigenPack = EigenPack<typename FImpl::FermionField>;
+
+// Eigenpack whose vectors are the CoarseField of LocalCoherenceLanczos, parameterised by nBasis.
+template <typename FImpl, int nBasis>
+using CoarseEigenPack = EigenPack<
+    typename LocalCoherenceLanczos<typename FImpl::SiteSpinor,
+                                   typename FImpl::SiteComplex, nBasis>::CoarseField>;
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_LanczosUtils_hpp_
--- a/extras/Hadrons/Makefile.am
+++ b/extras/Hadrons/Makefile.am
@@ -21,6 +21,7 @@ nobase_libHadrons_a_HEADERS = \
 	GeneticScheduler.hpp      \
 	Global.hpp                \
 	Graph.hpp                 \
+	LanczosUtils.hpp          \
 	Module.hpp                \
 	Modules.hpp               \
 	ModuleFactory.hpp         \
--- a/extras/Hadrons/Modules.hpp
+++ b/extras/Hadrons/Modules.hpp
@@ -1,20 +1,30 @@
 /*************************************************************************************
+
 Grid physics library, www.github.com/paboyle/Grid 
+
 Source file: extras/Hadrons/Modules.hpp
+
 Copyright (C) 2015-2018
+
 Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Guido Cossu <guido.cossu@ed.ac.uk>
 Author: Lanny91 <andrew.lawson@gmail.com>
+Author: pretidav <david.preti@csic.es>
+
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
+
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
+
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
@@ -35,11 +45,12 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #include <Grid/Hadrons/Modules/MSource/SeqConserved.hpp>
 #include <Grid/Hadrons/Modules/MSink/Smear.hpp>
 #include <Grid/Hadrons/Modules/MSink/Point.hpp>
+#include <Grid/Hadrons/Modules/MSolver/LocalCoherenceLanczos.hpp>
 #include <Grid/Hadrons/Modules/MSolver/RBPrecCG.hpp>
 #include <Grid/Hadrons/Modules/MGauge/Unit.hpp>
 #include <Grid/Hadrons/Modules/MGauge/Random.hpp>
-#include <Grid/Hadrons/Modules/MGauge/StochEm.hpp>
 #include <Grid/Hadrons/Modules/MGauge/FundtoHirep.hpp>
+#include <Grid/Hadrons/Modules/MGauge/StochEm.hpp>
 #include <Grid/Hadrons/Modules/MUtilities/TestSeqGamma.hpp>
 #include <Grid/Hadrons/Modules/MUtilities/TestSeqConserved.hpp>
 #include <Grid/Hadrons/Modules/MLoop/NoiseLoop.hpp>
@@ -49,9 +60,11 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #include <Grid/Hadrons/Modules/MAction/DWF.hpp>
 #include <Grid/Hadrons/Modules/MAction/Wilson.hpp>
 #include <Grid/Hadrons/Modules/MAction/WilsonClover.hpp>
+#include <Grid/Hadrons/Modules/MAction/ZMobiusDWF.hpp>
 #include <Grid/Hadrons/Modules/MScalarSUN/Div.hpp>
 #include <Grid/Hadrons/Modules/MScalarSUN/TrMag.hpp>
 #include <Grid/Hadrons/Modules/MScalarSUN/TwoPoint.hpp>
 #include <Grid/Hadrons/Modules/MScalarSUN/TrPhi.hpp>
+#include <Grid/Hadrons/Modules/MScalarSUN/TrKinetic.hpp>
 #include <Grid/Hadrons/Modules/MIO/LoadNersc.hpp>
 #include <Grid/Hadrons/Modules/MIO/LoadBinary.hpp>
--- a/extras/Hadrons/Modules/MAction/WilsonClover.hpp
+++ b/extras/Hadrons/Modules/MAction/WilsonClover.hpp
@@ -2,12 +2,13 @@

 Grid physics library, www.github.com/paboyle/Grid 

-Source file: extras/Hadrons/Modules/MAction/Wilson.hpp
+Source file: extras/Hadrons/Modules/MAction/WilsonClover.hpp

-Copyright (C) 2015
-Copyright (C) 2016
+Copyright (C) 2015-2018

 Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Guido Cossu <guido.cossu@ed.ac.uk>
+Author: pretidav <david.preti@csic.es>

 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
--- a/extras/Hadrons/Modules/MAction/ZMobiusDWF.hpp
+++ b/extras/Hadrons/Modules/MAction/ZMobiusDWF.hpp
@@ -0,0 +1,143 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: extras/Hadrons/Modules/MAction/ZMobiusDWF.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef Hadrons_MAction_ZMobiusDWF_hpp_
+#define Hadrons_MAction_ZMobiusDWF_hpp_
+
+#include <Grid/Hadrons/Global.hpp>
+#include <Grid/Hadrons/Module.hpp>
+#include <Grid/Hadrons/ModuleFactory.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *                         ZMobiusDWF                                 *
+ ******************************************************************************/
+BEGIN_MODULE_NAMESPACE(MAction)
+// Parameters of the ZMobiusDWF module; setup() logs them and passes them to the operator.
+class ZMobiusDWFPar: Serializable
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(ZMobiusDWFPar,
+                                    std::string                      , gauge,
+                                    unsigned int                     , Ls,
+                                    double                           , mass,
+                                    double                           , M5,
+                                    double                           , b,
+                                    double                           , c,
+                                    std::vector<std::complex<double>>, omega,
+                                    std::string                      , boundary);
+};
+// Hadrons module building a ZMobiusFermion<FImpl> operator from a gauge field (see setup()).
+template <typename FImpl>
+class TZMobiusDWF: public Module<ZMobiusDWFPar>
+{
+public:
+    FGS_TYPE_ALIASES(FImpl,);
+public:
+    // constructor: takes the module name
+    TZMobiusDWF(const std::string name);
+    // destructor
+    virtual ~TZMobiusDWF(void) = default;
+    // dependency relation: names of the objects read (input) and produced (output)
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+    // setup: creates the grids for Ls and the fermion operator
+    virtual void setup(void);
+    // execution: empty for this module
+    virtual void execute(void);
+};
+// presumably registers TZMobiusDWF<ZFIMPL> as MAction::ZMobiusDWF - confirm against the macro
+MODULE_REGISTER_NS(ZMobiusDWF, TZMobiusDWF<ZFIMPL>, MAction);
+
+/******************************************************************************
+ *                 TZMobiusDWF implementation                             *
+ ******************************************************************************/
+// constructor: forwards `name` to the Module<ZMobiusDWFPar> base ///////////////
+template <typename FImpl>
+TZMobiusDWF<FImpl>::TZMobiusDWF(const std::string name)
+: Module<ZMobiusDWFPar>(name)
+{}
+
+// dependencies/products ///////////////////////////////////////////////////////
+// Input: the single object named by the `gauge` parameter.
+template <typename FImpl>
+std::vector<std::string> TZMobiusDWF<FImpl>::getInput(void)
+{
+    // list-initialise the returned vector with that one name
+    return {par().gauge};
+}
+
+// Output: the single object stored under this module's own name.
+template <typename FImpl>
+std::vector<std::string> TZMobiusDWF<FImpl>::getOutput(void)
+{
+    // list-initialise the returned vector with that one name
+    return {getName()};
+}
+
+// setup ///////////////////////////////////////////////////////////////////////
+template <typename FImpl>
+void TZMobiusDWF<FImpl>::setup(void)
+{
+    // Local copy of the omegas: logged below, then passed on to the operator constructor.
+    auto zOmega = par().omega;
+    LOG(Message) << "Setting up z-Mobius domain wall fermion matrix with m= "
+                 << par().mass << ", M5= " << par().M5 << ", Ls= " << par().Ls
+                 << ", b= " << par().b << ", c= " << par().c
+                 << " using gauge field '" << par().gauge << "'" << std::endl;
+    LOG(Message) << "Omegas: " << std::endl;
+    for (unsigned int s = 0; s < zOmega.size(); ++s)
+    {
+        LOG(Message) << "  omega[" << s << "]= " << zOmega[s] << std::endl;
+    }
+    LOG(Message) << "Fermion boundary conditions: " << par().boundary << std::endl;
+    // Create the grids for this Ls first; they are fetched right below.
+    env().createGrid(par().Ls);
+    auto &gaugeField = envGet(LatticeGaugeField, par().gauge);
+    auto &grid4d     = *env().getGrid();
+    auto &rbGrid4d   = *env().getRbGrid();
+    auto &grid5d     = *env().getGrid(par().Ls);
+    auto &rbGrid5d   = *env().getRbGrid(par().Ls);
+    std::vector<Complex> phases = strToVec<Complex>(par().boundary);
+    typename ZMobiusFermion<FImpl>::ImplParams implParams(phases);
+    // NOTE(review): zOmega.size() is not checked against Ls here - confirm the operator does.
+    envCreateDerived(FMat, ZMobiusFermion<FImpl>, getName(), par().Ls, gaugeField,
+                     grid5d, rbGrid5d, grid4d, rbGrid4d, par().mass, par().M5,
+                     zOmega, par().b, par().c, implParams);
+}
+
+// execution: nothing to do, the operator is created in setup() /////////////////
+template <typename FImpl>
+void TZMobiusDWF<FImpl>::execute(void)
+{}
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_MAction_ZMobiusDWF_hpp_
--- a/extras/Hadrons/Modules/MFermion/GaugeProp.hpp
+++ b/extras/Hadrons/Modules/MFermion/GaugeProp.hpp
@@ -7,7 +7,9 @@ Source file: extras/Hadrons/Modules/MFermion/GaugeProp.hpp
 Copyright (C) 2015-2018

 Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Guido Cossu <guido.cossu@ed.ac.uk>
 Author: Lanny91 <andrew.lawson@gmail.com>
+Author: pretidav <david.preti@csic.es>

 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
--- a/extras/Hadrons/Modules/MGauge/FundtoHirep.cc
+++ b/extras/Hadrons/Modules/MGauge/FundtoHirep.cc
@@ -57,7 +57,7 @@ std::vector<std::string> TFundtoHirep<Rep>::getOutput(void)
 template <typename Rep>
 void TFundtoHirep<Rep>::setup(void)
 {
-    env().template registerLattice<typename Rep::LatticeField>(getName());
+    envCreateLat(typename Rep::LatticeField, getName());
 }

 // execution ///////////////////////////////////////////////////////////////////
@@ -70,6 +70,6 @@ void TFundtoHirep<Rep>::execute(void)
    Rep TargetRepresentation(U._grid);
    TargetRepresentation.update_representation(U);

-   typename Rep::LatticeField &URep = *env().template createLattice<typename Rep::LatticeField>(getName());
+    auto &URep = envGet(typename Rep::LatticeField, getName());
    URep = TargetRepresentation.U;
 }
--- a/extras/Hadrons/Modules/MGauge/FundtoHirep.hpp
+++ b/extras/Hadrons/Modules/MGauge/FundtoHirep.hpp
@@ -4,11 +4,10 @@ Grid physics library, www.github.com/paboyle/Grid

 Source file: extras/Hadrons/Modules/MGauge/FundtoHirep.hpp

-Copyright (C) 2015
-Copyright (C) 2016
+Copyright (C) 2015-2018

-Author: David Preti <david.preti@to.infn.it>
-	Guido Cossu <guido.cossu@ed.ac.uk>
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: pretidav <david.preti@csic.es>

 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
--- a/extras/Hadrons/Modules/MIO/LoadNersc.cc
+++ b/extras/Hadrons/Modules/MIO/LoadNersc.cc
@@ -71,6 +71,4 @@ void TLoadNersc::execute(void)

    auto &U = envGet(LatticeGaugeField, getName());
    NerscIO::readConfiguration(U, header, fileName);
-    LOG(Message) << "NERSC header:" << std::endl;
-    dump_meta_data(header, LOG(Message));
 }
--- a/extras/Hadrons/Modules/MScalarSUN/Div.hpp
+++ b/extras/Hadrons/Modules/MScalarSUN/Div.hpp
@@ -35,7 +35,7 @@ See the full license in the file "LICENSE" in the top level distribution directo
 BEGIN_HADRONS_NAMESPACE

 /******************************************************************************
- *                         Div                                 *
+ *                       Divergence of a vector field                         *
 ******************************************************************************/
 BEGIN_MODULE_NAMESPACE(MScalarSUN)

@@ -83,7 +83,7 @@ MODULE_REGISTER_NS(DivSU5, TDiv<ScalarNxNAdjImplR<5>>, MScalarSUN);
 MODULE_REGISTER_NS(DivSU6, TDiv<ScalarNxNAdjImplR<6>>, MScalarSUN);

 /******************************************************************************
- *                 TDiv implementation                             *
+ *                           TDiv implementation                              *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 template <typename SImpl>
--- a/extras/Hadrons/Modules/MScalarSUN/TrKinetic.hpp
+++ b/extras/Hadrons/Modules/MScalarSUN/TrKinetic.hpp
@@ -0,0 +1,199 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: extras/Hadrons/Modules/MScalarSUN/TrKinetic.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef Hadrons_MScalarSUN_TrKinetic_hpp_
+#define Hadrons_MScalarSUN_TrKinetic_hpp_
+
+#include <Grid/Hadrons/Global.hpp>
+#include <Grid/Hadrons/Module.hpp>
+#include <Grid/Hadrons/ModuleFactory.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *                         Trace of kinetic term                              *
+ ******************************************************************************/
+BEGIN_MODULE_NAMESPACE(MScalarSUN)
+
+class TrKineticPar: Serializable // parameters read by the TTrKinetic modules through par()
+{
+public:
+    GRID_SERIALIZABLE_ENUM(DiffType, undef, forward, 1, backward, 2, central, 3); // finite-difference scheme
+    GRID_SERIALIZABLE_CLASS_MEMBERS(TrKineticPar,
+                                    std::string,  field,   // name of the input field object
+                                    DiffType,     type,    // derivative selected in execute()
+                                    std::string,  output); // passed to saveResult(); empty disables saving
+};
+
+template <typename SImpl>
+class TTrKinetic: public Module<TrKineticPar>
+{
+public:
+    typedef typename SImpl::Field        Field;
+    typedef typename SImpl::ComplexField ComplexField;
+    class Result: Serializable // one saved entry: operator label and the sum() of its field
+    {
+    public:
+        GRID_SERIALIZABLE_CLASS_MEMBERS(Result,
+                                        std::string, op,
+                                        Complex    , value);
+    };
+public:
+    // constructor
+    TTrKinetic(const std::string name);
+    // destructor
+    virtual ~TTrKinetic(void) = default;
+    // dependency relation: the input field name / one output name per pair mu <= nu
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+    // setup: creates the output lattices and the "der" temporary
+    virtual void setup(void);
+    // execution: fills the outputs and, when an output file is set, saves their sums
+    virtual void execute(void);
+private:
+    std::string outName(const unsigned int mu, const unsigned int nu); // "<module name>_<mu>_<nu>"
+    std::string bufName(const unsigned int mu); // "d_<mu>"; not called by the other members of this class
+};
+
+MODULE_REGISTER_NS(TrKineticSU2, TTrKinetic<ScalarNxNAdjImplR<2>>, MScalarSUN); // one registration per ScalarNxNAdjImplR<N>, N = 2..6
+MODULE_REGISTER_NS(TrKineticSU3, TTrKinetic<ScalarNxNAdjImplR<3>>, MScalarSUN);
+MODULE_REGISTER_NS(TrKineticSU4, TTrKinetic<ScalarNxNAdjImplR<4>>, MScalarSUN);
+MODULE_REGISTER_NS(TrKineticSU5, TTrKinetic<ScalarNxNAdjImplR<5>>, MScalarSUN);
+MODULE_REGISTER_NS(TrKineticSU6, TTrKinetic<ScalarNxNAdjImplR<6>>, MScalarSUN);
+
+/******************************************************************************
+ *                      TTrKinetic implementation                             *
+ ******************************************************************************/
+// constructor /////////////////////////////////////////////////////////////////
+template <typename SImpl>
+TTrKinetic<SImpl>::TTrKinetic(const std::string name)
+: Module<TrKineticPar>(name) // the name is only forwarded to the base class
+{}
+
+// dependencies/products ///////////////////////////////////////////////////////
+template <typename SImpl>
+std::vector<std::string> TTrKinetic<SImpl>::getInput(void)
+{
+    // single dependency: the field named in the parameters
+    std::vector<std::string> deps(1, par().field);
+    return deps;
+}
+
+template <typename SImpl>
+std::vector<std::string> TTrKinetic<SImpl>::getOutput(void)
+{
+    // one product per pair (mu, nu) with mu <= nu, named by outName()
+    std::vector<std::string> names;
+
+    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
+    {
+        for (unsigned int nu = mu; nu < env().getNd(); ++nu) names.push_back(outName(mu, nu));
+    }
+
+    return names;
+}
+
+// setup ///////////////////////////////////////////////////////////////////////
+template <typename SImpl>
+void TTrKinetic<SImpl>::setup(void)
+{
+    for (unsigned int mu = 0; mu < env().getNd(); ++mu)  // one ComplexField lattice per pair mu <= nu,
+    for (unsigned int nu = mu; nu < env().getNd(); ++nu) // under the names listed by getOutput()
+    {
+        envCreateLat(ComplexField, outName(mu, nu));
+    }
+    envTmp(std::vector<Field>, "der", 1, env().getNd(), env().getGrid()); // temporary "der", fetched in execute() with envGetTmp
+}
+
+// execution ///////////////////////////////////////////////////////////////////
+template <typename SImpl>
+void TTrKinetic<SImpl>::execute(void)
+{
+    // Fills each output (mu <= nu) with -trace(der[mu]*der[nu]); throws on an unknown derivative type.
+    LOG(Message) << "Computing tr(d_mu phi*d_nu phi) using " << par().type
+                 << " derivative" << std::endl;
+    std::vector<Result> result;
+    auto                &phi = envGet(Field, par().field);
+    envGetTmp(std::vector<Field>, der);
+    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
+    {
+        switch(par().type)
+        {
+            case TrKineticPar::DiffType::backward:
+                der[mu] = phi - Cshift(phi, mu, -1);
+                break;
+            case TrKineticPar::DiffType::forward:
+                der[mu] = Cshift(phi, mu, 1) - phi;
+                break;
+            case TrKineticPar::DiffType::central:
+                der[mu] = 0.5*(Cshift(phi, mu, 1) - Cshift(phi, mu, -1));
+                break;
+            default: // e.g. 'undef': fail instead of computing from an unassigned der[mu]
+                HADRON_ERROR(Argument, "unknown derivative type");
+        }
+    }
+    // the Result entries are only collected when an output file name is given
+    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
+    for (unsigned int nu = mu; nu < env().getNd(); ++nu)
+    {
+        auto &out = envGet(ComplexField, outName(mu, nu));
+        out = -trace(der[mu]*der[nu]);
+        if (!par().output.empty())
+        {
+            Result r;
+            r.op    = "tr(d_" + std::to_string(mu) + "phi*d_"
+                      + std::to_string(nu) + "phi)";
+            r.value = TensorRemove(sum(out)); // sum() of the output field
+            result.push_back(r);
+        }
+    }
+    if (result.size() > 0)
+    {
+        saveResult(par().output, "trkinetic", result);
+    }
+}
+
+// variable name generators ////////////////////////////////////////////////////
+template <typename SImpl>
+std::string TTrKinetic<SImpl>::outName(const unsigned int mu, const unsigned int nu)
+{
+    // "<module name>_<mu>_<nu>"
+    return getName() + ("_" + std::to_string(mu)) + ("_" + std::to_string(nu));
+}
+
+template <typename SImpl>
+std::string TTrKinetic<SImpl>::bufName(const unsigned int mu)
+{
+    return std::string("d_").append(std::to_string(mu)); // "d_<mu>"
+}
+
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_MScalarSUN_TrKinetic_hpp_
--- a/extras/Hadrons/Modules/MScalarSUN/TrMag.hpp
+++ b/extras/Hadrons/Modules/MScalarSUN/TrMag.hpp
@@ -35,7 +35,7 @@ See the full license in the file "LICENSE" in the top level distribution directo
 BEGIN_HADRONS_NAMESPACE

 /******************************************************************************
- *                       Module to compute tr(mag^n)                          *
+ *                     Trace of powers of the magnetisation                   *
 ******************************************************************************/
 BEGIN_MODULE_NAMESPACE(MScalarSUN)

@@ -117,7 +117,7 @@ template <typename SImpl>
 void TTrMag<SImpl>::execute(void)
 {
    LOG(Message) << "Computing tr(mag^n) for n even up to " << par().maxPow
-                 << "..." << std::endl;
+                 << std::endl;

    std::vector<Result> result;
    auto                &phi = envGet(Field, par().field);
--- a/extras/Hadrons/Modules/MScalarSUN/TrPhi.hpp
+++ b/extras/Hadrons/Modules/MScalarSUN/TrPhi.hpp
@@ -35,7 +35,7 @@ See the full license in the file "LICENSE" in the top level distribution directo
 BEGIN_HADRONS_NAMESPACE

 /******************************************************************************
- *                         Module to compute tr(phi^n)                        *
+ *                      Trace of powers of a scalar field                     *
 ******************************************************************************/
 BEGIN_MODULE_NAMESPACE(MScalarSUN)

@@ -136,7 +136,7 @@ template <typename SImpl>
 void TTrPhi<SImpl>::execute(void)
 {
    LOG(Message) << "Computing tr(phi^n) for n even up to " << par().maxPow
-                 << "..." << std::endl; 
+                 << std::endl; 

    std::vector<Result> result;
    auto                &phi = envGet(Field, par().field);
--- a/extras/Hadrons/Modules/MScalarSUN/TwoPoint.hpp
+++ b/extras/Hadrons/Modules/MScalarSUN/TwoPoint.hpp
@@ -87,7 +87,7 @@ MODULE_REGISTER_NS(TwoPointSU5, TTwoPoint<ScalarNxNAdjImplR<5>>, MScalarSUN);
 MODULE_REGISTER_NS(TwoPointSU6, TTwoPoint<ScalarNxNAdjImplR<6>>, MScalarSUN);

 /******************************************************************************
- *                 TTwoPoint implementation                             *
+ *                       TTwoPoint implementation                             *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 template <typename SImpl>
--- a/extras/Hadrons/Modules/MSolver/LocalCoherenceLanczos.hpp
+++ b/extras/Hadrons/Modules/MSolver/LocalCoherenceLanczos.hpp
@@ -0,0 +1,249 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: extras/Hadrons/Modules/MSolver/LocalCoherenceLanczos.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef Hadrons_MSolver_LocalCoherenceLanczos_hpp_
+#define Hadrons_MSolver_LocalCoherenceLanczos_hpp_
+
+#include <Grid/Hadrons/Global.hpp>
+#include <Grid/Hadrons/Module.hpp>
+#include <Grid/Hadrons/ModuleFactory.hpp>
+#include <Grid/Hadrons/LanczosUtils.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *                    Local coherence Lanczos eigensolver                     *
+ *****************************************************************************/
+BEGIN_MODULE_NAMESPACE(MSolver)
+
+class LocalCoherenceLanczosPar: Serializable // parameters read by TLocalCoherenceLanczos through par()
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(LocalCoherenceLanczosPar,
+                                    std::string,   action,         // name of the input action object
+                                    int,           doFine,         // non-zero: run the fine grid solve in execute()
+                                    int,           doCoarse,       // non-zero: run the coarse grid solve in execute()
+                                    LanczosParams, fineParams,     // forwarded to calcFine()
+                                    LanczosParams, coarseParams,   // forwarded to calcCoarse()
+                                    ChebyParams,   smoother,       // forwarded to calcCoarse()/testCoarse()
+                                    RealD,         coarseRelaxTol, // forwarded to calcCoarse()/testCoarse()
+                                    std::string,   blockSize,      // parsed with strToVec<int>: block size per dimension
+                                    std::string,   output);        // file stem ("_fine"/"_coarse" appended); empty: no write
+};
+
+template <typename FImpl, int nBasis>
+class TLocalCoherenceLanczos: public Module<LocalCoherenceLanczosPar>
+{
+public:
+    FERM_TYPE_ALIASES(FImpl,);
+    typedef LocalCoherenceLanczos<typename FImpl::SiteSpinor, 
+                                  typename FImpl::SiteComplex, 
+                                  nBasis>                LCL;        // solver type used in setup()/execute()
+    typedef FineEigenPack<FImpl>                         FinePack;   // type of the "<name>_fine" product
+    typedef CoarseEigenPack<FImpl, nBasis>               CoarsePack; // type of the "<name>_coarse" product
+    typedef HADRONS_DEFAULT_SCHUR_OP<FMat, FermionField> SchurFMat;  // operator wrapped around the action
+public:
+    // constructor
+    TLocalCoherenceLanczos(const std::string name);
+    // destructor
+    virtual ~TLocalCoherenceLanczos(void) = default;
+    // dependency relation: the action name / the fine and coarse product names
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+    // setup: builds the coarse grids on first use, then creates products and temporaries
+    virtual void setup(void);
+    // execution: fine and/or coarse solves, as selected by doFine/doCoarse
+    virtual void execute(void);
+private:
+    void makeCoarseGrid(void); // fills coarseDim_, Ls_, cLs_ and the coarse grid pointers
+private:
+    std::vector<int>                       coarseDim_;              // fine dimensions divided by the block sizes
+    int                                    Ls_, cLs_{1};            // Ls_ is assigned in makeCoarseGrid()
+    std::unique_ptr<GridCartesian>         coarseGrid4_{nullptr};   // only set when Ls_ > 1
+    std::unique_ptr<GridCartesian>         coarseGrid_{nullptr};    // null until makeCoarseGrid() has run
+    std::unique_ptr<GridRedBlackCartesian> coarseGrid4Rb_{nullptr}; // only set when Ls_ > 1
+    std::unique_ptr<GridRedBlackCartesian> coarseGridRb_{nullptr};
+    std::string                            fineName_, coarseName_;  // "<name>_fine", "<name>_coarse"
+};
+
+MODULE_REGISTER_NS(LocalCoherenceLanczos, // FIMPL variant, HADRONS_DEFAULT_LANCZOS_NBASIS basis vectors
+    ARG(TLocalCoherenceLanczos<FIMPL, HADRONS_DEFAULT_LANCZOS_NBASIS>), 
+    MSolver);
+MODULE_REGISTER_NS(ZLocalCoherenceLanczos, // same template instantiated with ZFIMPL
+    ARG(TLocalCoherenceLanczos<ZFIMPL, HADRONS_DEFAULT_LANCZOS_NBASIS>), 
+    MSolver);
+
+/******************************************************************************
+ *                 TLocalCoherenceLanczos implementation                      *
+ ******************************************************************************/
+// constructor /////////////////////////////////////////////////////////////////
+template <typename FImpl, int nBasis>
+TLocalCoherenceLanczos<FImpl, nBasis>::TLocalCoherenceLanczos(const std::string name)
+: Module<LocalCoherenceLanczosPar>(name)
+// the two product names are derived once from the module name
+, fineName_(getName() + "_fine")
+, coarseName_(getName() + "_coarse")
+{}
+
+// dependencies/products ///////////////////////////////////////////////////////
+template <typename FImpl, int nBasis>
+std::vector<std::string> TLocalCoherenceLanczos<FImpl, nBasis>::getInput(void)
+{
+    // single dependency: the action named in the parameters
+    std::vector<std::string> deps(1, par().action);
+    return deps;
+}
+
+template <typename FImpl, int nBasis>
+std::vector<std::string> TLocalCoherenceLanczos<FImpl, nBasis>::getOutput(void)
+{
+    // two products, in this order: the fine one, then the coarse one
+    // (both names are fixed in the constructor)
+    return std::vector<std::string>{fineName_, coarseName_};
+}
+
+// setup ///////////////////////////////////////////////////////////////////////
+template <typename FImpl, int nBasis>
+void TLocalCoherenceLanczos<FImpl, nBasis>::makeCoarseGrid(void)
+{
+    // Builds the coarse grids: each fine dimension (and Ls, when blockSize has
+    // an extra entry) is divided by its block size; bad sizes throw a Size error.
+    const int              nd        = env().getNd();
+    const std::vector<int> blockSize = strToVec<int>(par().blockSize);
+    const int              nBlock    = static_cast<int>(blockSize.size());
+    const auto             fineDim   = env().getDim();
+    const auto             simd      = GridDefaultSimd(nd, vComplex::Nsimd());
+    Ls_ = env().getObjectLs(par().action);
+    env().createGrid(Ls_);
+    coarseDim_.resize(nd);
+    for (int d = 0; d < nd; d++)
+    {
+        // a missing entry counts as an invalid (zero) block size
+        const int bs = (d < nBlock) ? blockSize[d] : 0;
+        if ((bs <= 0) || (fineDim[d] % bs != 0))
+        {
+            HADRON_ERROR(Size, "Fine dimension " + std::to_string(d)
+                         + " (" + std::to_string(fineDim[d])
+                         + ") not divisible by block size (" + std::to_string(bs) + ")");
+        }
+        coarseDim_[d] = fineDim[d]/bs;
+    }
+    if (nBlock > nd)
+    {
+        const int bs = blockSize[nd];
+        if ((bs <= 0) || (Ls_ % bs != 0))
+        {
+            HADRON_ERROR(Size, "Fine Ls (" + std::to_string(Ls_)
+                         + ") not divisible by block size (" + std::to_string(bs) + ")");
+        }
+        cLs_ = Ls_/bs;
+    }
+    if (Ls_ > 1)
+    {
+        coarseGrid4_.reset(SpaceTimeGrid::makeFourDimGrid(coarseDim_, simd, GridDefaultMpi()));
+        coarseGrid4Rb_.reset(SpaceTimeGrid::makeFourDimRedBlackGrid(coarseGrid4_.get()));
+        coarseGrid_.reset(SpaceTimeGrid::makeFiveDimGrid(cLs_, coarseGrid4_.get()));
+        coarseGridRb_.reset(SpaceTimeGrid::makeFiveDimRedBlackGrid(cLs_, coarseGrid4_.get()));
+    }
+    else
+    {
+        coarseGrid_.reset(SpaceTimeGrid::makeFourDimGrid(coarseDim_, simd, GridDefaultMpi()));
+        coarseGridRb_.reset(SpaceTimeGrid::makeFourDimRedBlackGrid(coarseGrid_.get()));
+    }
+}
+
+template <typename FImpl, int nBasis>
+void TLocalCoherenceLanczos<FImpl, nBasis>::setup(void)
+{
+    LOG(Message) << "Setting up local coherence Lanczos eigensolver for"
+                 << " action '" << par().action << "' (" << nBasis
+                 << " eigenvectors)..." << std::endl;
+    // the coarse grids are only built while coarseGrid_ is still null
+    if (!coarseGrid_)
+    {
+        makeCoarseGrid();
+    }
+    LOG(Message) << "Coarse grid: " << coarseGrid_->GlobalDimensions() << std::endl;
+    envCreate(FinePack, fineName_, Ls_, par().fineParams.Nm, env().getRbGrid(Ls_)); // fine product, sized with fineParams.Nm
+    envCreate(CoarsePack, coarseName_, Ls_, par().coarseParams.Nm, coarseGridRb_.get()); // coarse product, sized with coarseParams.Nm
+    auto &fine   = envGet(FinePack, fineName_);
+    auto &coarse = envGet(CoarsePack, coarseName_);
+    envTmp(SchurFMat, "mat", Ls_, envGet(FMat, par().action)); // temporary operator built from the action
+    envGetTmp(SchurFMat, mat);
+    envTmp(LCL, "solver", Ls_, env().getRbGrid(Ls_), coarseGridRb_.get(), mat, 
+           Odd, fine.evec, coarse.evec, fine.eval, coarse.eval); // the solver receives the products' evec/eval members
+}
+
+// execution ///////////////////////////////////////////////////////////////////
+template <typename FImpl, int nBasis>
+void TLocalCoherenceLanczos<FImpl, nBasis>::execute(void)
+{
+    auto &finePar   = par().fineParams;
+    auto &coarsePar = par().coarseParams;
+    auto &fine      = envGet(FinePack, fineName_);
+    auto &coarse    = envGet(CoarsePack, coarseName_);
+    // "solver" is the temporary declared in setup()
+    envGetTmp(LCL, solver);
+    if (par().doFine) // fine stage: calcFine, testFine, Orthogonalise, optional write
+    {
+        LOG(Message) << "Performing fine grid IRL -- Nstop= " 
+                     << finePar.Nstop << ", Nk= " << finePar.Nk << ", Nm= " 
+                     << finePar.Nm << std::endl;
+        solver.calcFine(finePar.Cheby, finePar.Nstop, finePar.Nk, finePar.Nm,
+                        finePar.resid,finePar.MaxIt, finePar.betastp, 
+                        finePar.MinRes);
+        solver.testFine(finePar.resid*100.0); // test called with 100 times the solve residual
+        LOG(Message) << "Orthogonalising" << std::endl;
+        solver.Orthogonalise();
+        if (!par().output.empty()) // an empty output name disables the write
+        {
+            fine.write(par().output + "_fine");
+        }
+    }
+    if (par().doCoarse) // coarse stage: calcCoarse, testCoarse, optional write
+    {
+        LOG(Message) << "Performing coarse grid IRL -- Nstop= " 
+                     << coarsePar.Nstop << ", Nk= " << coarsePar.Nk << ", Nm= " 
+                     << coarsePar.Nm << std::endl;
+        solver.calcCoarse(coarsePar.Cheby, par().smoother, par().coarseRelaxTol,
+			              coarsePar.Nstop, coarsePar.Nk, coarsePar.Nm, 
+                          coarsePar.resid, coarsePar.MaxIt, coarsePar.betastp, 
+                          coarsePar.MinRes);
+        solver.testCoarse(coarsePar.resid*100.0, par().smoother, 
+                          par().coarseRelaxTol); // same factor of 100 as for the fine test
+        if (!par().output.empty())
+        {
+            coarse.write(par().output + "_coarse");
+        }
+    }
+}
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_MSolver_LocalCoherenceLanczos_hpp_
--- a/extras/Hadrons/Modules/MSolver/RBPrecCG.hpp
+++ b/extras/Hadrons/Modules/MSolver/RBPrecCG.hpp
@@ -43,9 +43,10 @@ BEGIN_MODULE_NAMESPACE(MSolver)
 class RBPrecCGPar: Serializable
 {
 public:
-    GRID_SERIALIZABLE_CLASS_MEMBERS(RBPrecCGPar,
-                                    std::string, action,
-                                    double     , residual);
+    GRID_SERIALIZABLE_CLASS_MEMBERS(RBPrecCGPar ,
+                                    std::string    , action,
+                                    unsigned int   , maxIteration,
+                                    double         , residual);
 };

 template <typename FImpl>
@@ -69,7 +70,8 @@ protected:
    virtual void execute(void);
 };

-MODULE_REGISTER_NS(RBPrecCG, TRBPrecCG<FIMPL>, MSolver);
+MODULE_REGISTER_NS(RBPrecCG,  TRBPrecCG<FIMPL>, MSolver);
+MODULE_REGISTER_NS(ZRBPrecCG, TRBPrecCG<ZFIMPL>, MSolver);

 /******************************************************************************
 *                      TRBPrecCG template implementation                     *
@@ -117,14 +119,16 @@ void TRBPrecCG<FImpl>::setup(void)
    auto &mat   = envGet(FMat, par().action);
    auto solver = [&mat, this](FermionField &sol, const FermionField &source)
    {
-        ConjugateGradient<FermionField>           cg(par().residual, 10000);
-        SchurRedBlackDiagMooeeSolve<FermionField> schurSolver(cg);
+        ConjugateGradient<FermionField>           cg(par().residual, 
+                                                     par().maxIteration);
+        HADRONS_DEFAULT_SCHUR_SOLVE<FermionField> schurSolver(cg);
        
        schurSolver(mat, source, sol);
    };
    envCreate(SolverFn, getName(), Ls, solver);
 }

+
 // execution ///////////////////////////////////////////////////////////////////
 template <typename FImpl>
 void TRBPrecCG<FImpl>::execute(void)
--- a/extras/Hadrons/VirtualMachine.cc
+++ b/extras/Hadrons/VirtualMachine.cc
@@ -424,11 +424,17 @@ void VirtualMachine::memoryProfile(const unsigned int address)
        cleanEnvironment();
        for (auto &in: m->getInput())
        {
-            memoryProfile(env().getObjectModule(in));
+            if (!env().hasCreatedObject(in))
+            {
+                memoryProfile(env().getObjectModule(in));
+            }
        }
        for (auto &ref: m->getReference())
        {
-            memoryProfile(env().getObjectModule(ref));
+            if (!env().hasCreatedObject(ref))
+            {
+                memoryProfile(env().getObjectModule(ref));
+            }
        }
        m->setup();
        updateProfile(address);
--- a/extras/Hadrons/modules.inc
+++ b/extras/Hadrons/modules.inc
@@ -1,6 +1,4 @@
 modules_cc =\
-  Modules/MScalar/ChargedProp.cc \
-  Modules/MScalar/FreeProp.cc \
  Modules/MContraction/WeakHamiltonianEye.cc \
  Modules/MContraction/WeakNeutral4ptDisc.cc \
  Modules/MContraction/WeakHamiltonianNonEye.cc \
@@ -30,11 +28,12 @@ modules_hpp =\
  Modules/MSource/SeqConserved.hpp \
  Modules/MSink/Smear.hpp \
  Modules/MSink/Point.hpp \
+  Modules/MSolver/LocalCoherenceLanczos.hpp \
  Modules/MSolver/RBPrecCG.hpp \
  Modules/MGauge/Unit.hpp \
  Modules/MGauge/Random.hpp \
-  Modules/MGauge/StochEm.hpp \
  Modules/MGauge/FundtoHirep.hpp \
+  Modules/MGauge/StochEm.hpp \
  Modules/MUtilities/TestSeqGamma.hpp \
  Modules/MUtilities/TestSeqConserved.hpp \
  Modules/MLoop/NoiseLoop.hpp \
@@ -44,10 +43,12 @@ modules_hpp =\
  Modules/MAction/DWF.hpp \
  Modules/MAction/Wilson.hpp \
  Modules/MAction/WilsonClover.hpp \
+  Modules/MAction/ZMobiusDWF.hpp \
  Modules/MScalarSUN/Div.hpp \
  Modules/MScalarSUN/TrMag.hpp \
  Modules/MScalarSUN/TwoPoint.hpp \
  Modules/MScalarSUN/TrPhi.hpp \
+  Modules/MScalarSUN/TrKinetic.hpp \
  Modules/MIO/LoadNersc.hpp \
  Modules/MIO/LoadBinary.hpp

--- a/lib/algorithms/Algorithms.h
+++ b/lib/algorithms/Algorithms.h
@@ -39,6 +39,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/algorithms/approx/MultiShiftFunction.h>
 #include <Grid/algorithms/approx/Forecast.h>

+#include <Grid/algorithms/iterative/Deflation.h>
 #include <Grid/algorithms/iterative/ConjugateGradient.h>
 #include <Grid/algorithms/iterative/ConjugateResidual.h>
 #include <Grid/algorithms/iterative/NormalEquations.h>
--- a/lib/algorithms/iterative/Deflation.h
+++ b/lib/algorithms/iterative/Deflation.h
@@ -0,0 +1,101 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/iterative/Deflation.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_DEFLATION_H
+#define GRID_DEFLATION_H
+
+namespace Grid { 
+
+// Initial-guess functor: assigns Zero() to guess; src is not read.
+struct ZeroGuesser {
+  template<class Field>
+  void operator()(const Field &src, Field &guess) { guess = Zero(); }
+};
+// Initial-guess functor: copies src into guess.
+struct SourceGuesser {
+  template<class Field>
+  void operator()(const Field &src, Field &guess) { guess = src; }
+};
+
+////////////////////////////////
+// Fine grid deflation
+////////////////////////////////
+template<class Field>
+struct DeflatedGuesser {
+private:
+  // Borrowed eigenvectors/eigenvalues: only references are kept, so the caller must keep both vectors alive.
+  const std::vector<Field> &evec;
+  const std::vector<RealD> &eval;
+public:
+  DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval) : evec(_evec), eval(_eval) {};
+
+  // Starts from zero and accumulates, with axpy, evec[i] scaled by innerProduct(evec[i],src)/eval[i].
+  void operator()(const Field &src,Field &guess) { 
+    guess = zero;
+    assert(evec.size()==eval.size());
+    auto N = evec.size();
+    for (decltype(N) i=0;i<N;i++) {  // index has the size type: no signed/unsigned comparison
+      const Field& tmp = evec[i];    // evec is const: a non-const reference cannot bind to its elements
+      axpy(guess,TensorRemove(innerProduct(tmp,src)) / eval[i],tmp,guess);
+    }
+  }
+};
+
+template<class FineField, class CoarseField>
+class LocalCoherenceDeflatedGuesser {
+private:
+  const std::vector<FineField>   &subspace;
+  const std::vector<CoarseField> &evec_coarse;
+  const std::vector<RealD>       &eval_coarse;
+public:
+  // Keeps references only: the caller must keep the three vectors alive.
+  LocalCoherenceDeflatedGuesser(const std::vector<FineField>   &_subspace,
+				const std::vector<CoarseField> &_evec_coarse,
+				const std::vector<RealD>       &_eval_coarse)
+    : subspace(_subspace), 
+      evec_coarse(_evec_coarse), 
+      eval_coarse(_eval_coarse)  
+  {}
+  // Projects src onto the subspace, deflates with the coarse eigenpairs, promotes the result into guess.
+  void operator()(const FineField &src,FineField &guess) { 
+    // evec_coarse[0] provides the coarse grid, so at least one eigenvector is required
+    assert(!evec_coarse.empty() && evec_coarse.size()==eval_coarse.size());
+    int N = (int)evec_coarse.size();
+    CoarseField src_coarse(evec_coarse[0]._grid);
+    CoarseField guess_coarse(evec_coarse[0]._grid);    guess_coarse = zero;
+    blockProject(src_coarse,src,subspace); // coarse destination first, fine source second
+    for (int i=0;i<N;i++) {
+      const CoarseField & tmp = evec_coarse[i]; // const reference: evec_coarse is a const vector
+      axpy(guess_coarse,TensorRemove(innerProduct(tmp,src_coarse)) / eval_coarse[i],tmp,guess_coarse);
+    }
+    blockPromote(guess_coarse,guess,subspace);
+  };
+};
+
+
+
+}
+#endif
--- a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
+++ b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
@@ -149,19 +149,6 @@ void basisSortInPlace(std::vector<Field> & _v,std::vector<RealD>& sort_vals, boo
  basisReorderInPlace(_v,sort_vals,idx);
 }

-// PAB: faster to compute the inner products first then fuse loops.
-// If performance critical can improve.
-template<class Field>
-void basisDeflate(const std::vector<Field> &_v,const std::vector<RealD>& eval,const Field& src_orig,Field& result) {
-  result = zero;
-  assert(_v.size()==eval.size());
-  int N = (int)_v.size();
-  for (int i=0;i<N;i++) {
-    Field& tmp = _v[i];
-    axpy(result,TensorRemove(innerProduct(tmp,src_orig)) / eval[i],tmp,result);
-  }
-}
-
 /////////////////////////////////////////////////////////////
 // Implicitly restarted lanczos
 /////////////////////////////////////////////////////////////
@@ -181,6 +168,7 @@ enum IRLdiagonalisation {
 template<class Field> class ImplicitlyRestartedLanczosHermOpTester  : public ImplicitlyRestartedLanczosTester<Field>
 {
 public:
+
  LinearFunction<Field>       &_HermOp;
  ImplicitlyRestartedLanczosHermOpTester(LinearFunction<Field> &HermOp) : _HermOp(HermOp)  {  };
  int ReconstructEval(int j,RealD resid,Field &B, RealD &eval,RealD evalMaxApprox)
@@ -243,6 +231,7 @@ class ImplicitlyRestartedLanczos {
  /////////////////////////
  
 public:       
+
  //////////////////////////////////////////////////////////////////
  // PAB:
  //////////////////////////////////////////////////////////////////
--- a/lib/algorithms/iterative/LocalCoherenceLanczos.h
+++ b/lib/algorithms/iterative/LocalCoherenceLanczos.h
@@ -28,7 +28,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    /*  END LEGAL */
 #ifndef GRID_LOCAL_COHERENCE_IRL_H
 #define GRID_LOCAL_COHERENCE_IRL_H
+
 namespace Grid { 
+
+
 struct LanczosParams : Serializable {
 public:
  GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParams,
@@ -70,21 +73,24 @@ public:
  typedef Lattice<Fobj>          FineField;

  LinearOperatorBase<FineField> &_Linop;
-  Aggregation<Fobj,CComplex,nbasis> &_Aggregate;
+  std::vector<FineField>        &subspace;

-  ProjectedHermOp(LinearOperatorBase<FineField>& linop,  Aggregation<Fobj,CComplex,nbasis> &aggregate) : 
-    _Linop(linop),
-    _Aggregate(aggregate)  {  };
+  ProjectedHermOp(LinearOperatorBase<FineField>& linop, std::vector<FineField> & _subspace) : 
+    _Linop(linop), subspace(_subspace) // both are stored as references, so they must outlive this object
+  {  
+    assert(subspace.size() >0); // operator() takes the fine grid and checkerboard from subspace[0]
+  };

  void operator()(const CoarseField& in, CoarseField& out) {
+    GridBase *FineGrid = subspace[0]._grid;    
+    int   checkerboard = subspace[0].checkerboard;
+      
+    FineField fin (FineGrid);     fin.checkerboard= checkerboard;
+    FineField fout(FineGrid);   fout.checkerboard = checkerboard;

-    GridBase *FineGrid = _Aggregate.FineGrid;
-    FineField fin(FineGrid);
-    FineField fout(FineGrid);
-
-    _Aggregate.PromoteFromSubspace(in,fin);    std::cout<<GridLogIRL<<"ProjectedHermop : Promote to fine"<<std::endl;
-    _Linop.HermOp(fin,fout);                   std::cout<<GridLogIRL<<"ProjectedHermop : HermOp (fine) "<<std::endl;
-    _Aggregate.ProjectToSubspace(out,fout);    std::cout<<GridLogIRL<<"ProjectedHermop : Project to coarse "<<std::endl;
+    blockPromote(in,fin,subspace);       std::cout<<GridLogIRL<<"ProjectedHermop : Promote to fine"<<std::endl;
+    _Linop.HermOp(fin,fout);             std::cout<<GridLogIRL<<"ProjectedHermop : HermOp (fine) "<<std::endl;
+    blockProject(out,fout,subspace);     std::cout<<GridLogIRL<<"ProjectedHermop : Project to coarse "<<std::endl;
  }
 };

@@ -99,24 +105,27 @@ public:

  OperatorFunction<FineField>   & _poly;
  LinearOperatorBase<FineField> &_Linop;
-  Aggregation<Fobj,CComplex,nbasis> &_Aggregate;
+  std::vector<FineField>        &subspace;

-  ProjectedFunctionHermOp(OperatorFunction<FineField> & poly,LinearOperatorBase<FineField>& linop, 
-			  Aggregation<Fobj,CComplex,nbasis> &aggregate) : 
+  ProjectedFunctionHermOp(OperatorFunction<FineField> & poly, // all three arguments are stored as references and must outlive this object
+			  LinearOperatorBase<FineField>& linop, 
+			  std::vector<FineField> & _subspace) :
     _poly(poly),
     _Linop(linop),
-    _Aggregate(aggregate)  {  };
+    subspace(_subspace)
+  { assert(subspace.size() >0); }; // operator() dereferences subspace[0], so an empty basis is rejected up front

  void operator()(const CoarseField& in, CoarseField& out) {
-
-    GridBase *FineGrid = _Aggregate.FineGrid;
-
-    FineField fin(FineGrid) ;fin.checkerboard  =_Aggregate.checkerboard;
-    FineField fout(FineGrid);fout.checkerboard =_Aggregate.checkerboard;
    
-    _Aggregate.PromoteFromSubspace(in,fin);    std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Promote to fine"<<std::endl;
+    GridBase *FineGrid = subspace[0]._grid;    
+    int   checkerboard = subspace[0].checkerboard;
+
+    FineField fin (FineGrid); fin.checkerboard =checkerboard;
+    FineField fout(FineGrid);fout.checkerboard =checkerboard;
+    
+    blockPromote(in,fin,subspace);             std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Promote to fine"<<std::endl;
    _poly(_Linop,fin,fout);                    std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Poly "<<std::endl;
-    _Aggregate.ProjectToSubspace(out,fout);    std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Project to coarse "<<std::endl;
+    blockProject(out,fout,subspace);           std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Project to coarse "<<std::endl;
  }
 };

@@ -132,19 +141,23 @@ class ImplicitlyRestartedLanczosSmoothedTester  : public ImplicitlyRestartedLanc
  LinearFunction<CoarseField> & _Poly;
  OperatorFunction<FineField>   & _smoother;
  LinearOperatorBase<FineField> &_Linop;
-  Aggregation<Fobj,CComplex,nbasis> &_Aggregate;
-  RealD                             _coarse_relax_tol;
+  RealD                          _coarse_relax_tol;
+  std::vector<FineField>        &_subspace;
+  
  ImplicitlyRestartedLanczosSmoothedTester(LinearFunction<CoarseField>   &Poly,
 					   OperatorFunction<FineField>   &smoother,
 					   LinearOperatorBase<FineField> &Linop,
-					   Aggregation<Fobj,CComplex,nbasis> &Aggregate,
+					   std::vector<FineField>        &subspace,
 					   RealD coarse_relax_tol=5.0e3) 
-    : _smoother(smoother), _Linop(Linop),_Aggregate(Aggregate), _Poly(Poly), _coarse_relax_tol(coarse_relax_tol)  {    };
+    : _smoother(smoother), _Linop(Linop), _Poly(Poly), _subspace(subspace),
+      _coarse_relax_tol(coarse_relax_tol)  
+  {    };

  int TestConvergence(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
  {
    CoarseField v(B);
    RealD eval_poly = eval;
+
    // Apply operator
    _Poly(B,v);

@@ -168,14 +181,13 @@ class ImplicitlyRestartedLanczosSmoothedTester  : public ImplicitlyRestartedLanc
  }
  int ReconstructEval(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
  {
-    GridBase *FineGrid = _Aggregate.FineGrid;
-
-    int checkerboard   = _Aggregate.checkerboard;
-
+    GridBase *FineGrid = _subspace[0]._grid;    
+    int checkerboard   = _subspace[0].checkerboard;
    FineField fB(FineGrid);fB.checkerboard =checkerboard;
    FineField fv(FineGrid);fv.checkerboard =checkerboard;

-    _Aggregate.PromoteFromSubspace(B,fv);
+    blockPromote(B,fv,_subspace);  
+    
    _smoother(_Linop,fv,fB); 

    RealD eval_poly = eval;
@@ -217,27 +229,65 @@ protected:
  int _checkerboard;
  LinearOperatorBase<FineField>                 & _FineOp;
  
-  // FIXME replace Aggregation with vector of fine; the code reuse is too small for
-  // the hassle and complexity of cross coupling.
-  Aggregation<Fobj,CComplex,nbasis>               _Aggregate;  
-  std::vector<RealD>                              evals_fine;
-  std::vector<RealD>                              evals_coarse; 
-  std::vector<CoarseField>                        evec_coarse;
+  std::vector<RealD>                              &evals_fine;
+  std::vector<RealD>                              &evals_coarse; 
+  std::vector<FineField>                          &subspace;
+  std::vector<CoarseField>                        &evec_coarse;
+
+private:
+  std::vector<RealD>                              _evals_fine;
+  std::vector<RealD>                              _evals_coarse; 
+  std::vector<FineField>                          _subspace;
+  std::vector<CoarseField>                        _evec_coarse;
+
 public:
+
  LocalCoherenceLanczos(GridBase *FineGrid,
-		GridBase *CoarseGrid,
-		LinearOperatorBase<FineField> &FineOp,
-		int checkerboard) :
+			GridBase *CoarseGrid,
+			LinearOperatorBase<FineField> &FineOp,
+			int checkerboard) :
    _CoarseGrid(CoarseGrid),
    _FineGrid(FineGrid),
-    _Aggregate(CoarseGrid,FineGrid,checkerboard),
    _FineOp(FineOp),
-    _checkerboard(checkerboard)
+    _checkerboard(checkerboard),
+    evals_fine  (_evals_fine),
+    evals_coarse(_evals_coarse),
+    subspace    (_subspace),
+    evec_coarse(_evec_coarse)
  {
    evals_fine.resize(0);
    evals_coarse.resize(0);
  };
-  void Orthogonalise(void ) { _Aggregate.Orthogonalise(); }
+  //////////////////////////////////////////////////////////////////////////
+  // Alternate constructor: uses caller-provided external storage, for use by the Hadrons module
+  //////////////////////////////////////////////////////////////////////////
+  LocalCoherenceLanczos(GridBase *FineGrid,
+			GridBase *CoarseGrid,
+			LinearOperatorBase<FineField> &FineOp,
+			int checkerboard,
+			std::vector<FineField>   &ext_subspace,    // caller-owned storage bound to `subspace`
+			std::vector<CoarseField> &ext_coarse,      // caller-owned storage bound to `evec_coarse`
+			std::vector<RealD>       &ext_eval_fine,   // caller-owned storage bound to `evals_fine`
+			std::vector<RealD>       &ext_eval_coarse  // caller-owned storage bound to `evals_coarse`
+			) :
+    _CoarseGrid(CoarseGrid),
+    _FineGrid(FineGrid),
+    _FineOp(FineOp),
+    _checkerboard(checkerboard),
+    evals_fine  (ext_eval_fine),  // the reference members bind to the caller's vectors, not to the private _-prefixed ones
+    evals_coarse(ext_eval_coarse),
+    subspace    (ext_subspace),
+    evec_coarse (ext_coarse)
+  {
+    evals_fine.resize(0);   // empties the caller's eigenvalue vectors; ext_subspace and ext_coarse are left as passed in
+    evals_coarse.resize(0);
+  };
+
+  void Orthogonalise(void ) { // applies blockOrthogonalise to `subspace` twice, logging after each pass
+    CoarseScalar InnerProd(_CoarseGrid); // coarse-grid field handed to both passes; presumably scratch space - confirm against blockOrthogonalise
+    blockOrthogonalise(InnerProd,subspace);std::cout << GridLogMessage <<" Gramm-Schmidt pass 1"<<std::endl;
+    blockOrthogonalise(InnerProd,subspace);std::cout << GridLogMessage <<" Gramm-Schmidt pass 2"<<std::endl;
+  };

  template<typename T>  static RealD normalise(T& v) 
  {
@@ -246,43 +296,44 @@ public:
    v = v * (1.0/nn);
    return nn;
  }
-
+  /*
  void fakeFine(void)
  {
    int Nk = nbasis;
-    _Aggregate.subspace.resize(Nk,_FineGrid);
-    _Aggregate.subspace[0]=1.0;
-    _Aggregate.subspace[0].checkerboard=_checkerboard;
-    normalise(_Aggregate.subspace[0]);
+    subspace.resize(Nk,_FineGrid);
+    subspace[0]=1.0;
+    subspace[0].checkerboard=_checkerboard;
+    normalise(subspace[0]);
    PlainHermOp<FineField>    Op(_FineOp);
    for(int k=1;k<Nk;k++){
-      _Aggregate.subspace[k].checkerboard=_checkerboard;
-      Op(_Aggregate.subspace[k-1],_Aggregate.subspace[k]);
-      normalise(_Aggregate.subspace[k]);
+      subspace[k].checkerboard=_checkerboard;
+      Op(subspace[k-1],subspace[k]);
+      normalise(subspace[k]);
    }
  }
+  */

  void testFine(RealD resid) 
  {
    assert(evals_fine.size() == nbasis);
-    assert(_Aggregate.subspace.size() == nbasis);
+    assert(subspace.size() == nbasis);
    PlainHermOp<FineField>    Op(_FineOp);
    ImplicitlyRestartedLanczosHermOpTester<FineField> SimpleTester(Op);
    for(int k=0;k<nbasis;k++){
-      assert(SimpleTester.ReconstructEval(k,resid,_Aggregate.subspace[k],evals_fine[k],1.0)==1);
+      assert(SimpleTester.ReconstructEval(k,resid,subspace[k],evals_fine[k],1.0)==1);
    }
  }

  void testCoarse(RealD resid,ChebyParams cheby_smooth,RealD relax) 
  {
    assert(evals_fine.size() == nbasis);
-    assert(_Aggregate.subspace.size() == nbasis);
+    assert(subspace.size() == nbasis);
    //////////////////////////////////////////////////////////////////////////////////////////////////
    // create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
    //////////////////////////////////////////////////////////////////////////////////////////////////
    Chebyshev<FineField>                          ChebySmooth(cheby_smooth);
-    ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (ChebySmooth,_FineOp,_Aggregate);
-    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,_Aggregate,relax);
+    ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (ChebySmooth,_FineOp,subspace);
+    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax);

    for(int k=0;k<evec_coarse.size();k++){
      if ( k < nbasis ) { 
@@ -302,34 +353,34 @@ public:
    PlainHermOp<FineField>    Op(_FineOp);

    evals_fine.resize(Nm);
-    _Aggregate.subspace.resize(Nm,_FineGrid);
+    subspace.resize(Nm,_FineGrid);

    ImplicitlyRestartedLanczos<FineField> IRL(ChebyOp,Op,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes);

    FineField src(_FineGrid); src=1.0; src.checkerboard = _checkerboard;

    int Nconv;
-    IRL.calc(evals_fine,_Aggregate.subspace,src,Nconv,false);
+    IRL.calc(evals_fine,subspace,src,Nconv,false);
    
    // Shrink down to number saved
    assert(Nstop>=nbasis);
    assert(Nconv>=nbasis);
    evals_fine.resize(nbasis);
-    _Aggregate.subspace.resize(nbasis,_FineGrid);
+    subspace.resize(nbasis,_FineGrid);
  }
  void calcCoarse(ChebyParams cheby_op,ChebyParams cheby_smooth,RealD relax,
 		  int Nstop, int Nk, int Nm,RealD resid, 
 		  RealD MaxIt, RealD betastp, int MinRes)
  {
    Chebyshev<FineField>                          Cheby(cheby_op);
-    ProjectedHermOp<Fobj,CComplex,nbasis>         Op(_FineOp,_Aggregate);
-    ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,_Aggregate);
+    ProjectedHermOp<Fobj,CComplex,nbasis>         Op(_FineOp,subspace);
+    ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,subspace);
    //////////////////////////////////////////////////////////////////////////////////////////////////
    // create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
    //////////////////////////////////////////////////////////////////////////////////////////////////

    Chebyshev<FineField>                                           ChebySmooth(cheby_smooth);
-    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,_Aggregate,relax);
+    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax);

    evals_coarse.resize(Nm);
    evec_coarse.resize(Nm,_CoarseGrid);
--- a/lib/algorithms/iterative/SchurRedBlack.h
+++ b/lib/algorithms/iterative/SchurRedBlack.h
@@ -107,7 +107,12 @@ namespace Grid {
    };

    template<class Matrix>
-      void operator() (Matrix & _Matrix,const Field &in, Field &out){
+    void operator() (Matrix & _Matrix,const Field &in, Field &out){ // three-argument form, same signature as before this change
+      ZeroGuesser guess;               // default-constructed guesser used when the caller supplies none
+      (*this)(_Matrix,in,out,guess);   // forwards to the overload that takes an explicit guesser
+    }
+    template<class Matrix, class Guesser>
+    void operator() (Matrix & _Matrix,const Field &in, Field &out, Guesser &guess){

      // FIXME CGdiagonalMee not implemented virtual function
      // FIXME use CBfactorise to control schur decomp
@@ -129,7 +134,6 @@ namespace Grid {
      pickCheckerboard(Odd ,src_o,in);
      pickCheckerboard(Even,sol_e,out);
      pickCheckerboard(Odd ,sol_o,out);
-
      std::cout << GridLogMessage << " SchurRedBlackStaggeredSolve checkerboards picked" <<std::endl;
    
      /////////////////////////////////////////////////////
@@ -146,6 +150,7 @@ namespace Grid {
      // Call the red-black solver
      //////////////////////////////////////////////////////////////
      std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver calling the Mpc solver" <<std::endl;
+      guess(src_o,sol_o);
      _HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd);
      std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver called  the Mpc solver" <<std::endl;

@@ -189,7 +194,12 @@ namespace Grid {
    CBfactorise=cb;
  };
    template<class Matrix>
-      void operator() (Matrix & _Matrix,const Field &in, Field &out){
+    void operator() (Matrix & _Matrix,const Field &in, Field &out){ // three-argument form, same signature as before this change
+      ZeroGuesser guess;               // default-constructed guesser used when the caller supplies none
+      (*this)(_Matrix,in,out,guess);   // forwards to the overload that takes an explicit guesser
+    }
+    template<class Matrix, class Guesser>
+    void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){

      // FIXME CGdiagonalMee not implemented virtual function
      // FIXME use CBfactorise to control schur decomp
@@ -225,6 +235,7 @@ namespace Grid {
      // Call the red-black solver
      //////////////////////////////////////////////////////////////
      std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
+      guess(src_o,sol_o);
      _HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd);

      ///////////////////////////////////////////////////
@@ -268,7 +279,12 @@ namespace Grid {
    };

    template<class Matrix>
-      void operator() (Matrix & _Matrix,const Field &in, Field &out){
+    void operator() (Matrix & _Matrix,const Field &in, Field &out){ // three-argument form, same signature as before this change
+      ZeroGuesser guess;               // default-constructed guesser used when the caller supplies none
+      (*this)(_Matrix,in,out,guess);   // forwards to the overload that takes an explicit guesser
+    }
+    template<class Matrix,class Guesser>
+    void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){

      // FIXME CGdiagonalMee not implemented virtual function
      // FIXME use CBfactorise to control schur decomp
@@ -305,6 +321,7 @@ namespace Grid {
      //////////////////////////////////////////////////////////////
      std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
 //      _HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd);
+      guess(src_o,tmp);
      _HermitianRBSolver(_HermOpEO,src_o,tmp);  assert(tmp.checkerboard==Odd);
      _Matrix.MooeeInv(tmp,sol_o);        assert(  sol_o.checkerboard   ==Odd);

@@ -347,7 +364,12 @@ namespace Grid {
    };

    template<class Matrix>
-      void operator() (Matrix & _Matrix,const Field &in, Field &out){
+    void operator() (Matrix & _Matrix,const Field &in, Field &out){ // three-argument form, same signature as before this change
+      ZeroGuesser guess;               // default-constructed guesser used when the caller supplies none
+      (*this)(_Matrix,in,out,guess);   // forwards to the overload that takes an explicit guesser
+    }
+    template<class Matrix, class Guesser>
+    void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){

      // FIXME CGdiagonalMee not implemented virtual function
      // FIXME use CBfactorise to control schur decomp
@@ -385,6 +407,7 @@ namespace Grid {
      std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
 //      _HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd);
 //      _HermitianRBSolver(_HermOpEO,src_o,tmp);  assert(tmp.checkerboard==Odd);
+      guess(src_o,tmp);
      _HermitianRBSolver(src_o,tmp);  assert(tmp.checkerboard==Odd);
      _Matrix.MooeeInv(tmp,sol_o);        assert(  sol_o.checkerboard   ==Odd);

--- a/lib/communicator/Communicator_mpi3.cc
+++ b/lib/communicator/Communicator_mpi3.cc
@@ -44,11 +44,15 @@ void CartesianCommunicator::Init(int *argc, char ***argv)
  MPI_Initialized(&flag); // needed to coexist with other libs apparently
  if ( !flag ) {
    MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
-    assert (provided == MPI_THREAD_MULTIPLE);
+    //If only 1 comms thread we require any threading mode other than SINGLE, but for multiple comms threads we need MULTIPLE
+    if( (nCommThreads == 1 && provided == MPI_THREAD_SINGLE) ||
+        (nCommThreads > 1 && provided != MPI_THREAD_MULTIPLE) )
+      assert(0);
  }

  Grid_quiesce_nodes();

+  // Never clean up as done once.
  MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);

  GlobalSharedMemory::Init(communicator_world);
@@ -85,9 +89,17 @@ void  CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &c
 CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors) 
 {
  MPI_Comm optimal_comm;
-  GlobalSharedMemory::OptimalCommunicator    (processors,optimal_comm); // Remap using the shared memory optimising routine
+  ////////////////////////////////////////////////////
+  // Remap using the shared memory optimising routine
+  // The remap creates a comm which must be freed
+  ////////////////////////////////////////////////////
+  GlobalSharedMemory::OptimalCommunicator    (processors,optimal_comm);
  InitFromMPICommunicator(processors,optimal_comm);
  SetCommunicator(optimal_comm);
+  ///////////////////////////////////////////////////
+  // Free the temp communicator
+  ///////////////////////////////////////////////////
+  MPI_Comm_free(&optimal_comm);
 }

 //////////////////////////////////
@@ -183,8 +195,8 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,

  } else {
    srank = 0;
-    comm_split    = parent.communicator;
-    //    std::cout << " Inherited communicator " <<comm_split <<std::endl;
+    int ierr = MPI_Comm_dup (parent.communicator,&comm_split);
+    assert(ierr==0);
  }

  //////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -196,6 +208,11 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,
  // Take the right SHM buffers
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  SetCommunicator(comm_split);
+  
+  ///////////////////////////////////////////////
+  // Free the temp communicator 
+  ///////////////////////////////////////////////
+  MPI_Comm_free(&comm_split);

  if(0){ 
    std::cout << " ndim " <<_ndimension<<" " << parent._ndimension << std::endl;
@@ -210,6 +227,9 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,

 void CartesianCommunicator::InitFromMPICommunicator(const std::vector<int> &processors, MPI_Comm communicator_base)
 {
+  ////////////////////////////////////////////////////
+  // Creates communicator, and the communicator_halo
+  ////////////////////////////////////////////////////
  _ndimension = processors.size();
  _processor_coor.resize(_ndimension);

--- a/lib/communicator/SharedMemory.h
+++ b/lib/communicator/SharedMemory.h
@@ -133,6 +133,7 @@ class SharedMemory

 public:
  SharedMemory() {};
+  ~SharedMemory();
  ///////////////////////////////////////////////////////////////////////////////////////
  // set the buffers & sizes
  ///////////////////////////////////////////////////////////////////////////////////////
--- a/lib/communicator/SharedMemoryMPI.cc
+++ b/lib/communicator/SharedMemoryMPI.cc
@@ -182,6 +182,7 @@ void GlobalSharedMemory::OptimalCommunicator(const std::vector<int> &processors,
 #ifdef GRID_MPI3_SHMMMAP
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 {
+  std::cout << "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<<std::endl;
  assert(_ShmSetup==1);
  assert(_ShmAlloc==0);
  //////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -218,6 +219,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
    assert(((uint64_t)ptr&0x3F)==0);
    close(fd);
    WorldShmCommBufs[r] =ptr;
+    std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
  }
  _ShmAlloc=1;
  _ShmAllocBytes  = bytes;
@@ -232,6 +234,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 ////////////////////////////////////////////////////////////////////////////////////////////
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 { 
+  std::cout << "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl;
  assert(_ShmSetup==1);
  assert(_ShmAlloc==0); 
  MPI_Barrier(WorldShmComm);
@@ -259,7 +262,11 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 #endif
      void * ptr =  mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0);
      
-      if ( ptr == (void * )MAP_FAILED ) {       perror("failed mmap");      assert(0);    }
+      std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< size<< "bytes)"<<std::endl;
+      if ( ptr == (void * )MAP_FAILED ) {       
+	perror("failed mmap");     
+	assert(0);    
+      }
      assert(((uint64_t)ptr&0x3F)==0);
      
      WorldShmCommBufs[r] =ptr;
@@ -318,11 +325,12 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
  heap_size = GlobalSharedMemory::ShmAllocBytes();
  for(int r=0;r<ShmSize;r++){

-    uint32_t sr = (r==ShmRank) ? GlobalSharedMemory::WorldRank : 0 ;
+    uint32_t wsr = (r==ShmRank) ? GlobalSharedMemory::WorldShmRank : 0 ;

-    MPI_Allreduce(MPI_IN_PLACE,&sr,1,MPI_UINT32_T,MPI_SUM,comm);
+    MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm);

-    ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[sr];
+    ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr];
+    //    std::cout << "SetCommunicator ShmCommBufs ["<< r<< "] = "<< ShmCommBufs[r]<< "  wsr = "<<wsr<<std::endl;
  }
  ShmBufferFreeAll();

@@ -391,5 +399,9 @@ void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
    return (void *) remote;
  }
 }
+SharedMemory::~SharedMemory()
+{ // Free ShmComm only while MPI is still live: MPI_Comm_free is invalid after MPI_Finalize and on MPI_COMM_NULL.
+  int finalized = 0;  MPI_Finalized(&finalized);  if ( !finalized && ShmComm != MPI_COMM_NULL ) MPI_Comm_free(&ShmComm);
+};

 }
--- a/lib/communicator/SharedMemoryNone.cc
+++ b/lib/communicator/SharedMemoryNone.cc
@@ -122,5 +122,7 @@ void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
 {
  return NULL;
 }
+SharedMemory::~SharedMemory()
+{}; // intentionally empty: this definition of the destructor releases nothing

 }
--- a/lib/log/Log.h
+++ b/lib/log/Log.h
@@ -86,7 +86,7 @@ protected:
  Colours &Painter;
  int active;
  int timing_mode;
-  int topWidth{-1};
+  int topWidth{-1}, chanWidth{-1};
  static int timestamp;
  std::string name, topName;
  std::string COLOUR;
@@ -126,6 +126,7 @@ public:
    }
  }
  void setTopWidth(const int w) {topWidth = w;}
+  void setChanWidth(const int w) {chanWidth = w;} // width operator<< applies via std::setw before printing `name`; w <= 0 leaves it unpadded

  friend std::ostream& operator<< (std::ostream& stream, Logger& log){

@@ -136,7 +137,12 @@ public:
        stream << std::setw(log.topWidth);
      }
      stream << log.topName << log.background()<< " : ";
-      stream << log.colour() <<  std::left << log.name << log.background() << " : ";
+      stream << log.colour() <<  std::left;
+      if (log.chanWidth > 0)
+      {
+        stream << std::setw(log.chanWidth);
+      }
+      stream << log.name << log.background() << " : ";
      if ( log.timestamp ) {
 	log.StopWatch->Stop();
 	GridTime now = log.StopWatch->Elapsed();
--- a/lib/qcd/action/fermion/CayleyFermion5D.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5D.cc
@@ -73,7 +73,7 @@ void CayleyFermion5D<Impl>::DminusDag(const FermionField &psi, FermionField &chi
  this->DW(psi,tmp_f,DaggerYes);

  for(int s=0;s<Ls;s++){
-    axpby_ssp(chi,Coeff_t(1.0),psi,-cs[s],tmp_f,s,s);// chi = (1-c[s] D_W) psi
+    axpby_ssp(chi,Coeff_t(1.0),psi,conjugate(-cs[s]),tmp_f,s,s);// chi = (1 - conj(c[s]) D_W^dag) psi ; tmp_f holds DW applied with DaggerYes
  }
 }

--- a/lib/qcd/action/fermion/CayleyFermion5Dvec.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5Dvec.cc
@@ -469,7 +469,7 @@ void CayleyFermion5D<Impl>::MooeeInternalAsm(const FermionField &psi, FermionFie
 	}
 	a0 = a0+incr;
 	a1 = a1+incr;
-	a2 = a2+sizeof(Simd::scalar_type);
+	a2 = a2+sizeof(typename Simd::scalar_type);
      }}
    {
      int lexa = s1+LLs*site;
@@ -701,7 +701,7 @@ void CayleyFermion5D<Impl>::MooeeInternalZAsm(const FermionField &psi, FermionFi
 	}
 	a0 = a0+incr;
 	a1 = a1+incr;
-	a2 = a2+sizeof(Simd::scalar_type);
+	a2 = a2+sizeof(typename Simd::scalar_type);
      }}
    {
      int lexa = s1+LLs*site;
--- a/lib/qcd/action/fermion/DomainWallEOFAFermionvec.cc
+++ b/lib/qcd/action/fermion/DomainWallEOFAFermionvec.cc
@@ -475,7 +475,7 @@ namespace QCD {
                        }
                        a0 = a0 + incr;
                        a1 = a1 + incr;
-                        a2 = a2 + sizeof(Simd::scalar_type);
+                        a2 = a2 + sizeof(typename Simd::scalar_type);
                    }
                }

--- a/lib/qcd/action/fermion/MobiusEOFAFermionvec.cc
+++ b/lib/qcd/action/fermion/MobiusEOFAFermionvec.cc
@@ -853,7 +853,7 @@ namespace QCD {

              a0 = a0 + incr;
              a1 = a1 + incr;
-              a2 = a2 + sizeof(Simd::scalar_type);
+              a2 = a2 + sizeof(typename Simd::scalar_type);
            }
          }

--- a/lib/simd/Grid_avx512.h
+++ b/lib/simd/Grid_avx512.h
@@ -556,7 +556,7 @@ namespace Optimization {
    v3  = _mm256_add_epi32(v1, v2);
    v1  = _mm256_hadd_epi32(v3, v3);
    v2  = _mm256_hadd_epi32(v1, v1);
-    u1  = _mm256_castsi256_si128(v2)        // upper half
+    u1  = _mm256_castsi256_si128(v2);        // upper half
    u2  = _mm256_extracti128_si256(v2, 1);  // lower half
    ret = _mm_add_epi32(u1, u2);
    return _mm_cvtsi128_si32(ret);
--- a/lib/simd/Intel512avx.h
+++ b/lib/simd/Intel512avx.h
@@ -79,7 +79,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define ZEND2f(Criir,Ciirr, tmp)  "vshufps $0xb1," #Ciirr "," #Ciirr "," #tmp   ";\n"\
                                  "vsubps  " #tmp "," #Ciirr "," #Criir"{%k7}"  ";\n"

-#define ZEND1d(Criir,Ciirr, tmp)  "vshufpd $0x55," #Criir "," #Criir "," #tmp  ";\n"\ 
+#define ZEND1d(Criir,Ciirr, tmp)  "vshufpd $0x55," #Criir "," #Criir "," #tmp  ";\n"\
                                  "vaddps  " #tmp "," #Criir "," #Criir"{%k6}"  ";\n"

 #define ZEND2d(Criir,Ciirr, tmp)  "vshufpd $0x55," #Ciirr "," #Ciirr "," #tmp   ";\n"\
--- a/lib/util/Profiling.h
+++ b/lib/util/Profiling.h
@@ -0,0 +1,72 @@
+   /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/util/Profiling.h
+
+    Copyright (C) 2018
+
+    Author: Guido Cossu <guido.cossu@ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+
+#ifndef GRID_PERF_PROFILING_H
+#define GRID_PERF_PROFILING_H
+
+#include <sstream>
+#include <iostream>
+#include <functional>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/wait.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <signal.h>
+
+// Helper that samples the calling process with an external `perf record` while a callable runs.
+struct System
+{
+    // Runs body() under perf; samples are written to `name`, with ".data" appended when `name` lacks it.
+    static void profile(const std::string& name,std::function<void()> body) {
+        std::string filename = name.find(".data") == std::string::npos ? (name + ".data") : name;
+        std::stringstream s;
+        s << getpid();   // pid of this (parent) process, handed to perf through -p
+        const pid_t pid = fork();
+        if (pid == 0) {
+            // Child: send stdout/stderr to /dev/null, then replace the process image with perf.
+            const int fd = open("/dev/null",O_RDWR);
+            if (fd >= 0) { dup2(fd,1); dup2(fd,2); if (fd > 2) close(fd); }
+            execl("/usr/bin/perf","perf","record","-o",filename.c_str(),"-p",s.str().c_str(),nullptr);
+            _exit(127);  // execl returned, so it failed; _exit skips the atexit handlers and stdio flush inherited from the parent
+        }
+        if (pid < 0) std::cerr << "System::profile: fork failed, running without profiler" << std::endl;
+
+        body();          // the body always runs, profiled or not
+        // With pid == -1, kill() would signal every process this user may signal,
+        // so the profiler is only interrupted and reaped when fork() succeeded.
+        if (pid > 0) { kill(pid,SIGINT); waitpid(pid,nullptr,0); }
+    }
+
+    // Convenience overload that writes to "perf.data".
+    static void profile(std::function<void()> body) {
+        profile("perf.data",body);
+    }
+};
+
+#endif // GRID_PERF_PROFILING_H
--- a/tests/debug/Test_cayley_coarsen_support.cc
+++ b/tests/debug/Test_cayley_coarsen_support.cc
@@ -111,6 +111,7 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage<<"Error "<<norm2(err)<<std::endl;

  const int nbasis = 2;
+  const int cb = 0 ;
  LatticeFermion prom(FGrid);

  std::vector<LatticeFermion> subspace(nbasis,FGrid);
@@ -119,7 +120,7 @@ int main (int argc, char ** argv)

  MdagMLinearOperator<DomainWallFermionR,LatticeFermion> HermDefOp(Ddwf);
  typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace;
-  Subspace Aggregates(Coarse5d,FGrid);
+  Subspace Aggregates(Coarse5d,FGrid,cb);
  Aggregates.CreateSubspaceRandom(RNG5);

  subspace=Aggregates.subspace;
--- a/tests/debug/Test_cayley_ldop_cr.cc
+++ b/tests/debug/Test_cayley_ldop_cr.cc
@@ -78,6 +78,7 @@ int main (int argc, char ** argv)

  RealD mass=0.1;
  RealD M5=1.5;
+  int cb=0;

  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
  std::cout<<GridLogMessage << "Building g5R5 hermitian DWF operator" <<std::endl;
@@ -95,7 +96,7 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage << "Calling Aggregation class to build subspace" <<std::endl;
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
  MdagMLinearOperator<DomainWallFermionR,LatticeFermion> HermDefOp(Ddwf);
-  Subspace Aggregates(Coarse5d,FGrid);
+  Subspace Aggregates(Coarse5d,FGrid,cb);
  Aggregates.CreateSubspace(RNG5,HermDefOp);


--- a/tests/lanczos/Test_dwf_compressed_lanczos_reorg.cc
+++ b/tests/lanczos/Test_dwf_compressed_lanczos_reorg.cc
@@ -56,12 +56,12 @@ public:

  void checkpointFine(std::string evecs_file,std::string evals_file)
  {
-    assert(this->_Aggregate.subspace.size()==nbasis);
+    assert(this->subspace.size()==nbasis);
    emptyUserRecord record;
    Grid::QCD::ScidacWriter WR;
    WR.open(evecs_file);
    for(int k=0;k<nbasis;k++) {
-      WR.writeScidacFieldRecord(this->_Aggregate.subspace[k],record);
+      WR.writeScidacFieldRecord(this->subspace[k],record);
    }
    WR.close();
    
@@ -72,7 +72,7 @@ public:
  void checkpointFineRestore(std::string evecs_file,std::string evals_file)
  {
    this->evals_fine.resize(nbasis);
-    this->_Aggregate.subspace.resize(nbasis,this->_FineGrid);
+    this->subspace.resize(nbasis,this->_FineGrid);
    
    std::cout << GridLogIRL<< "checkpointFineRestore:  Reading evals from "<<evals_file<<std::endl;
    XmlReader RDx(evals_file);
@@ -85,8 +85,8 @@ public:
    Grid::QCD::ScidacReader RD ;
    RD.open(evecs_file);
    for(int k=0;k<nbasis;k++) {
-      this->_Aggregate.subspace[k].checkerboard=this->_checkerboard;
-      RD.readScidacFieldRecord(this->_Aggregate.subspace[k],record);
+      this->subspace[k].checkerboard=this->_checkerboard;
+      RD.readScidacFieldRecord(this->subspace[k],record);
      
    }
    RD.close();
@@ -221,7 +221,9 @@ int main (int argc, char ** argv) {
    std::cout << GridLogIRL<<"Checkpointing Fine evecs"<<std::endl;
    _LocalCoherenceLanczos.checkpointFine(std::string("evecs.scidac"),std::string("evals.xml"));
    _LocalCoherenceLanczos.testFine(fine.resid*100.0); // Coarse check
+    std::cout << GridLogIRL<<"Orthogonalising"<<std::endl;
    _LocalCoherenceLanczos.Orthogonalise();
+    std::cout << GridLogIRL<<"Orthogonalised"<<std::endl; // completion message, printed once Orthogonalise() has returned
  }

  if ( Params.doFineRead ) { 
@@ -231,8 +233,6 @@ int main (int argc, char ** argv) {
  }

  if ( Params.doCoarse ) {
-    std::cout << GridLogMessage << "Orthogonalising " << nbasis<<" Nm "<<Nm2<< std::endl;
-    
    std::cout << GridLogMessage << "Performing coarse grid IRL Nstop "<< Ns2<< " Nk "<<Nk2<<" Nm "<<Nm2<< std::endl;
    _LocalCoherenceLanczos.calcCoarse(coarse.Cheby,Params.Smoother,Params.coarse_relax_tol,
 			      coarse.Nstop, coarse.Nk,coarse.Nm,