Merge branch 'develop' of https://github.com/paboyle/Grid into develop

2025-10-29 19:14:33 +00:00 · 2018-10-08 10:12:11 +01:00
parent d9de8fd5c9 148fc052bd
commit dac9f8622e
84 changed files with 12100 additions and 4646 deletions
--- a/Grid/Grid_Eigen_Dense.h
+++ b/Grid/Grid_Eigen_Dense.h
@@ -1,4 +1,9 @@
 #pragma once
 // Force Eigen to use MKL if Grid has been configured with --enable-mkl
 #ifdef USE_MKL
 #define EIGEN_USE_MKL_ALL
 #endif
 #if defined __GNUC__
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
--- a/Grid/json/json.hpp
+++ b/Grid/json/json.hpp
--- a/Grid/parallelIO/IldgIO.h
+++ b/Grid/parallelIO/IldgIO.h
@@ -250,8 +250,7 @@ class GridLimeReader : public BinaryIO {
  ////////////////////////////////////////////
  // Read a generic serialisable object
  ////////////////////////////////////////////
-  template<class serialisable_object>
+  void readLimeObject(std::string &xmlstring,std::string record_name)
  void readLimeObject(serialisable_object &object,std::string object_name,std::string record_name)
  {
    // should this be a do while; can we miss a first record??
    while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) { 
@@ -266,15 +265,23 @@ class GridLimeReader : public BinaryIO {
 	limeReaderReadData((void *)&xmlc[0], &nbytes, LimeR);    
 	//	std::cout << GridLogMessage<< " readLimeObject matches XML " << &xmlc[0] <<std::endl;
-  std::string xmlstring(&xmlc[0]);
+   xmlstring = std::string(&xmlc[0]);
 	XmlReader RD(xmlstring, true, "");
 	read(RD,object_name,object);
 	return;
      }
    }  
    assert(0);
  }
  template<class serialisable_object>
  void readLimeObject(serialisable_object &object,std::string object_name,std::string record_name)
  {
    std::string xmlstring;
    readLimeObject(xmlstring, record_name);
 	  XmlReader RD(xmlstring, true, "");
 	  read(RD,object_name,object);
  }
 };
 class GridLimeWriter : public BinaryIO 
--- a/Grid/qcd/action/fermion/WilsonFermion.h
+++ b/Grid/qcd/action/fermion/WilsonFermion.h
@@ -141,7 +141,7 @@ class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {
                GridRedBlackCartesian &Hgrid, RealD _mass, 
                const ImplParams &p = ImplParams(), 
                const WilsonAnisotropyCoefficients &anis = WilsonAnisotropyCoefficients() );
-
+  
  // DoubleStore impl dependent
  void ImportGauge(const GaugeField &_Umu);
--- a/Grid/qcd/utils/A2Autils.h
+++ b/Grid/qcd/utils/A2Autils.h
--- a/Grid/qcd/utils/Utils.h
+++ b/Grid/qcd/utils/Utils.h
@@ -12,4 +12,10 @@
 #include <Grid/qcd/utils/SUnAdjoint.h>
 #include <Grid/qcd/utils/SUnTwoIndex.h>
 // All-to-all contraction kernels that touch the 
 // internal lattice structure
 #include <Grid/qcd/utils/A2Autils.h>
 #endif
--- a/Grid/serialisation/XmlIO.cc
+++ b/Grid/serialisation/XmlIO.cc
@@ -121,17 +121,26 @@ XmlReader::XmlReader(const std::string &s,  const bool isBuffer,
  }
 }
 #define XML_SAFE_NODE(expr)\
 if (expr)\
 {\
  node_ = expr;\
  return true;\
 }\
 else\
 {\
  return false;\
 }
 bool XmlReader::push(const std::string &s)
 {
-  if (node_.child(s.c_str()))
+  if (s.empty())
  {
-    node_ = node_.child(s.c_str());
+    XML_SAFE_NODE(node_.first_child());
    return true;
  }
  else
  {
-    return false;
+    XML_SAFE_NODE(node_.child(s.c_str()));
  }
 }
@@ -142,18 +151,26 @@ void XmlReader::pop(void)
 bool XmlReader::nextElement(const std::string &s)
 {
-  if (node_.next_sibling(s.c_str()))
+  if (s.empty())
  {
-    node_ = node_.next_sibling(s.c_str());
+    XML_SAFE_NODE(node_.next_sibling());
    return true;
  }
  else
  {
-    return false;
+    XML_SAFE_NODE(node_.next_sibling(s.c_str()));
  }
 }
 void XmlReader::readCurrentSubtree(std::string &s)
 {
  std::ostringstream oss; 
  pugi::xml_document doc;
  doc.append_copy(node_);
  doc.save(oss, indent_.c_str(), pugi::format_default | pugi::format_no_declaration);
  s = oss.str();
 }
 template <>
 void XmlReader::readDefault(const std::string &s, std::string &output)
 {
--- a/Grid/serialisation/XmlIO.h
+++ b/Grid/serialisation/XmlIO.h
@@ -72,16 +72,18 @@ namespace Grid
    XmlReader(const std::string &fileName, const bool isBuffer = false, 
              std::string toplev = std::string("grid") );
    virtual ~XmlReader(void) = default;
-    bool push(const std::string &s);
+    bool push(const std::string &s = "");
    void pop(void);
-    bool nextElement(const std::string &s);
+    bool nextElement(const std::string &s = "");
    template <typename U>
    void readDefault(const std::string &s, U &output);
    template <typename U>
    void readDefault(const std::string &s, std::vector<U> &output);
    void readCurrentSubtree(std::string &s);
  private:
    void checkParse(const pugi::xml_parse_result &result, const std::string name);
  private:
    const std::string  indent_{"  "};
    pugi::xml_document doc_;
    pugi::xml_node     node_;
    std::string        fileName_;
--- a/Hadrons/A2AMatrix.hpp
+++ b/Hadrons/A2AMatrix.hpp
@@ -7,6 +7,7 @@ Source file: Hadrons/A2AMatrix.hpp
 Copyright (C) 2015-2018
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -29,46 +30,136 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #define A2A_Matrix_hpp_
 #include <Hadrons/Global.hpp>
 #include <Hadrons/TimerArray.hpp>
 #include <Grid/Eigen/unsupported/CXX11/Tensor>
 #ifndef HADRONS_A2AM_NAME 
 #define HADRONS_A2AM_NAME "a2aMatrix"
 #endif
 #define HADRONS_A2AM_PARALLEL_IO
 BEGIN_HADRONS_NAMESPACE
-template <typename T, typename MetadataType>
+// general A2A matrix set based on Eigen tensors and Grid-allocated memory
 // Dimensions:
 //   0 - ext - external field (momentum, EM field, ...)
 //   1 - str - spin-color structure
 //   2 - t   - timeslice
 //   3 - i   - left  A2A mode index
 //   4 - j   - right A2A mode index
 template <typename T>
 using A2AMatrixSet = Eigen::TensorMap<Eigen::Tensor<T, 5, Eigen::RowMajor>>;
 /******************************************************************************
 *                      Abstract class for A2A kernels                        *
 ******************************************************************************/
 template <typename T, typename Field>
 class A2AKernel
 {
 public:
    A2AKernel(void) = default;
    virtual ~A2AKernel(void) = default;
    virtual void operator()(A2AMatrixSet<T> &m, const Field *left, const Field *right,
                          const unsigned int orthogDim, double &time) = 0;
    virtual double flops(const unsigned int blockSizei, const unsigned int blockSizej) = 0;
    virtual double bytes(const unsigned int blockSizei, const unsigned int blockSizej) = 0;
 };
 /******************************************************************************
 *                  Class to handle A2A matrix block HDF5 I/O                 *
 ******************************************************************************/
 template <typename T>
 class A2AMatrixIo
 {
 public:
    // constructors
    A2AMatrixIo(void) = default;
    A2AMatrixIo(std::string filename, std::string dataname, 
                const unsigned int nt, const unsigned int ni,
-                const unsigned int nj, const unsigned int blockSize);
+                const unsigned int nj);
    // destructor
    ~A2AMatrixIo(void) = default;
-    void initFile(const MetadataType &d);
+    // file allocation
-    void saveBlock(const T *data, const unsigned int i, const unsigned int j);
+    template <typename MetadataType>
    void initFile(const MetadataType &d, const unsigned int chunkSize);
    // block I/O
    void saveBlock(const T *data, const unsigned int i, const unsigned int j,
                   const unsigned int blockSizei, const unsigned int blockSizej);
    void saveBlock(const A2AMatrixSet<T> &m, const unsigned int ext, const unsigned int str,
                   const unsigned int i, const unsigned int j);
 private:
    std::string  filename_, dataname_;
-    unsigned int nt_, ni_, nj_, blockSize_;
+    unsigned int nt_, ni_, nj_;
 };
-template <typename T, typename MetadataType>
+/******************************************************************************
-A2AMatrixIo<T, MetadataType>::A2AMatrixIo(std::string filename, 
+ *                  Wrapper for A2A matrix block computation                  *
-                                          std::string dataname, 
+ ******************************************************************************/
-                                          const unsigned int nt, 
+template <typename T, typename Field, typename MetadataType, typename TIo = T>
-                                          const unsigned int ni,
+class A2AMatrixBlockComputation
-                                          const unsigned int nj,
+{
-                                          const unsigned int blockSize)
+private:
    struct IoHelper
    {
        A2AMatrixIo<TIo> io;
        MetadataType     md;
        unsigned int     e, s, i, j;
    };
    typedef std::function<std::string(const unsigned int, const unsigned int)>  FilenameFn;
    typedef std::function<MetadataType(const unsigned int, const unsigned int)> MetadataFn;
 public:
    // constructor
    A2AMatrixBlockComputation(GridBase *grid,
                              const unsigned int orthogDim,
                              const unsigned int next,
                              const unsigned int nstr,
                              const unsigned int blockSize,
                              const unsigned int cacheBlockSize,
                              TimerArray *tArray = nullptr);
    // execution
    void execute(const std::vector<Field> &left, 
                 const std::vector<Field> &right,
                 A2AKernel<T, Field> &kernel,
                 const FilenameFn &ionameFn,
                 const FilenameFn &filenameFn,
                 const MetadataFn &metadataFn);
 private:
    // I/O handler
    void saveBlock(const A2AMatrixSet<TIo> &m, IoHelper &h);
 private:
    TimerArray            *tArray_;
    GridBase              *grid_;
    unsigned int          orthogDim_, nt_, next_, nstr_, blockSize_, cacheBlockSize_;
    Vector<T>             mCache_;
    Vector<TIo>           mBuf_;
    std::vector<IoHelper> nodeIo_;
 };
 /******************************************************************************
 *                     A2AMatrixIo template implementation                    *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 template <typename T>
 A2AMatrixIo<T>::A2AMatrixIo(std::string filename, std::string dataname, 
                            const unsigned int nt, const unsigned int ni,
                            const unsigned int nj)
 : filename_(filename), dataname_(dataname)
-, nt_(nt), ni_(ni), nj_(nj), blockSize_(blockSize)
+, nt_(nt), ni_(ni), nj_(nj)
 {}
-template <typename T, typename MetadataType>
+// file allocation /////////////////////////////////////////////////////////////
-void A2AMatrixIo<T, MetadataType>::initFile(const MetadataType &d)
+template <typename T>
 template <typename MetadataType>
 void A2AMatrixIo<T>::initFile(const MetadataType &d, const unsigned int chunkSize)
 {
 #ifdef HAVE_HDF5
    std::vector<hsize_t>    dim = {static_cast<hsize_t>(nt_), 
                                   static_cast<hsize_t>(ni_), 
                                   static_cast<hsize_t>(nj_)},
                            chunk = {static_cast<hsize_t>(nt_), 
-                                     static_cast<hsize_t>(blockSize_), 
+                                     static_cast<hsize_t>(chunkSize), 
-                                     static_cast<hsize_t>(blockSize_)};
+                                     static_cast<hsize_t>(chunkSize)};
    H5NS::DataSpace         dataspace(dim.size(), dim.data());
    H5NS::DataSet           dataset;
    H5NS::DSetCreatPropList plist;
@@ -85,20 +176,23 @@ void A2AMatrixIo<T, MetadataType>::initFile(const MetadataType &d)
    push(reader, dataname_);
    auto &group = reader.getGroup();
    plist.setChunk(chunk.size(), chunk.data());
-    dataset = group.createDataSet("data", Hdf5Type<T>::type(), dataspace, plist);
+    dataset = group.createDataSet(HADRONS_A2AM_NAME, Hdf5Type<T>::type(), dataspace, plist);
 #else
    HADRONS_ERROR(Implementation, "all-to-all matrix I/O needs HDF5 library");
 #endif
 }
-template <typename T, typename MetadataType>
+// block I/O ///////////////////////////////////////////////////////////////////
-void A2AMatrixIo<T, MetadataType>::saveBlock(const T *data, 
+template <typename T>
-                                             const unsigned int i, 
+void A2AMatrixIo<T>::saveBlock(const T *data, 
-                                             const unsigned int j)
+                               const unsigned int i, 
                               const unsigned int j,
                               const unsigned int blockSizei,
                               const unsigned int blockSizej)
 {
 #ifdef HAVE_HDF5
    Hdf5Reader           reader(filename_);
-    std::vector<hsize_t> count = {nt_, blockSize_, blockSize_},
+    std::vector<hsize_t> count = {nt_, blockSizei, blockSizej},
                         offset = {0, static_cast<hsize_t>(i),
                                   static_cast<hsize_t>(j)},
                         stride = {1, 1, 1},
@@ -109,7 +203,7 @@ void A2AMatrixIo<T, MetadataType>::saveBlock(const T *data,
    push(reader, dataname_);
    auto &group = reader.getGroup();
-    dataset     = group.openDataSet("data");
+    dataset     = group.openDataSet(HADRONS_A2AM_NAME);
    dataspace   = dataset.getSpace();
    dataspace.selectHyperslab(H5S_SELECT_SET, count.data(), offset.data(),
                              stride.data(), block.data());
@@ -119,6 +213,193 @@ void A2AMatrixIo<T, MetadataType>::saveBlock(const T *data,
 #endif
 }
 template <typename T>
 void A2AMatrixIo<T>::saveBlock(const A2AMatrixSet<T> &m,
                               const unsigned int ext, const unsigned int str,
                               const unsigned int i, const unsigned int j)
 {
    unsigned int blockSizei = m.dimension(3);
    unsigned int blockSizej = m.dimension(4);
    unsigned int nstr       = m.dimension(1);
    size_t       offset     = (ext*nstr + str)*nt_*blockSizei*blockSizej;
    saveBlock(m.data() + offset, i, j, blockSizei, blockSizej);
 }
 /******************************************************************************
 *               A2AMatrixBlockComputation template implementation            *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 template <typename T, typename Field, typename MetadataType, typename TIo>
 A2AMatrixBlockComputation<T, Field, MetadataType, TIo>
 ::A2AMatrixBlockComputation(GridBase *grid,
                            const unsigned int orthogDim,
                            const unsigned int next, 
                            const unsigned int nstr,
                            const unsigned int blockSize, 
                            const unsigned int cacheBlockSize,
                            TimerArray *tArray)
 : grid_(grid), nt_(grid->GlobalDimensions()[orthogDim]), orthogDim_(orthogDim)
 , next_(next), nstr_(nstr), blockSize_(blockSize), cacheBlockSize_(cacheBlockSize)
 , tArray_(tArray)
 {
    mCache_.resize(nt_*next_*nstr_*cacheBlockSize_*cacheBlockSize_);
    mBuf_.resize(nt_*next_*nstr_*blockSize_*blockSize_);
 }
 #define START_TIMER(name) if (tArray_) tArray_->startTimer(name)
 #define STOP_TIMER(name)  if (tArray_) tArray_->stopTimer(name)
 #define GET_TIMER(name)   ((tArray_ != nullptr) ? tArray_->getDTimer(name) : 0.)
 // execution ///////////////////////////////////////////////////////////////////
 template <typename T, typename Field, typename MetadataType, typename TIo>
 void A2AMatrixBlockComputation<T, Field, MetadataType, TIo>
 ::execute(const std::vector<Field> &left, const std::vector<Field> &right,
          A2AKernel<T, Field> &kernel, const FilenameFn &ionameFn,
          const FilenameFn &filenameFn, const MetadataFn &metadataFn)
 {
    //////////////////////////////////////////////////////////////////////////
    // i,j   is first  loop over blockSize_ factors
    // ii,jj is second loop over cacheBlockSize_ factors for high perf contractions
    // iii,jjj are loops within cacheBlock
    // Total index is sum of these  i+ii+iii etc...
    //////////////////////////////////////////////////////////////////////////
    int    N_i = left.size();
    int    N_j = right.size();
    double flops, bytes, t_kernel;
    double nodes = grid_->NodeCount();
    int NBlock_i = N_i/blockSize_ + (((N_i % blockSize_) != 0) ? 1 : 0);
    int NBlock_j = N_j/blockSize_ + (((N_j % blockSize_) != 0) ? 1 : 0);
    for(int i=0;i<N_i;i+=blockSize_)
    for(int j=0;j<N_j;j+=blockSize_)
    {
        // Get the W and V vectors for this block^2 set of terms
        int N_ii = MIN(N_i-i,blockSize_);
        int N_jj = MIN(N_j-j,blockSize_);
        A2AMatrixSet<TIo> mBlock(mBuf_.data(), next_, nstr_, nt_, N_ii, N_jj);
        LOG(Message) << "All-to-all matrix block " 
                     << j/blockSize_ + NBlock_j*i/blockSize_ + 1 
                     << "/" << NBlock_i*NBlock_j << " [" << i <<" .. " 
                     << i+N_ii-1 << ", " << j <<" .. " << j+N_jj-1 << "]" 
                     << std::endl;
        // Series of cache blocked chunks of the contractions within this block
        flops    = 0.0;
        bytes    = 0.0;
        t_kernel = 0.0;
        for(int ii=0;ii<N_ii;ii+=cacheBlockSize_)
        for(int jj=0;jj<N_jj;jj+=cacheBlockSize_)
        {
            double t;
            int N_iii = MIN(N_ii-ii,cacheBlockSize_);
            int N_jjj = MIN(N_jj-jj,cacheBlockSize_);
            A2AMatrixSet<T> mCacheBlock(mCache_.data(), next_, nstr_, nt_, N_iii, N_jjj);
            START_TIMER("kernel");
            kernel(mCacheBlock, &left[i+ii], &right[j+jj], orthogDim_, t);
            STOP_TIMER("kernel");
            t_kernel += t;
            flops    += kernel.flops(N_iii, N_jjj);
            bytes    += kernel.bytes(N_iii, N_jjj);
            START_TIMER("cache copy");
            parallel_for_nest5(int e =0;e<next_;e++)
            for(int s =0;s< nstr_;s++)
            for(int t =0;t< nt_;t++)
            for(int iii=0;iii< N_iii;iii++)
            for(int jjj=0;jjj< N_jjj;jjj++)
            {
                mBlock(e,s,t,ii+iii,jj+jjj) = mCacheBlock(e,s,t,iii,jjj);
            }
            STOP_TIMER("cache copy");
        }
        // perf
        LOG(Message) << "Kernel perf " << flops/t_kernel/1.0e3/nodes 
                     << " Gflop/s/node " << std::endl;
        LOG(Message) << "Kernel perf " << bytes/t_kernel*1.0e6/1024/1024/1024/nodes 
                     << " GB/s/node "  << std::endl;
        // IO
        double       blockSize, ioTime;
        unsigned int myRank = grid_->ThisRank(), nRank  = grid_->RankCount();
        LOG(Message) << "Writing block to disk" << std::endl;
        ioTime = -GET_TIMER("IO: write block");
        START_TIMER("IO: total");
        makeFileDir(filenameFn(0, 0), grid_);
 #ifdef HADRONS_A2AM_PARALLEL_IO
        grid_->Barrier();
        // make task list for current node
        nodeIo_.clear();
        for(int f = myRank; f < next_*nstr_; f += nRank)
        {
            IoHelper h;
            h.i  = i;
            h.j  = j;
            h.e  = f/nstr_;
            h.s  = f % nstr_;
            h.io = A2AMatrixIo<TIo>(filenameFn(h.e, h.s), 
                                    ionameFn(h.e, h.s), nt_, N_i, N_j);
            h.md = metadataFn(h.e, h.s);
            nodeIo_.push_back(h);
        }
        // parallel IO
        for (auto &h: nodeIo_)
        {
            saveBlock(mBlock, h);
        }
        grid_->Barrier();
 #else
        // serial IO, for testing purposes only
        for(int e = 0; e < next_; e++)
        for(int s = 0; s < nstr_; s++)
        {
            IoHelper h;
            h.i  = i;
            h.j  = j;
            h.e  = e;
            h.s  = s;
            h.io = A2AMatrixIo<TIo>(filenameFn(h.e, h.s), 
                                    ionameFn(h.e, h.s), nt_, N_i, N_j);
            h.md = metadataFn(h.e, h.s);
            saveBlock(mfBlock, h);
        }
 #endif
        STOP_TIMER("IO: total");
        blockSize  = static_cast<double>(next_*nstr_*nt_*N_ii*N_jj*sizeof(TIo));
        ioTime    += GET_TIMER("IO: write block");
        LOG(Message) << "HDF5 IO done " << sizeString(blockSize) << " in "
                     << ioTime  << " us (" 
                     << blockSize/ioTime*1.0e6/1024/1024
                     << " MB/s)" << std::endl;
    }
 }
 // I/O handler /////////////////////////////////////////////////////////////////
 template <typename T, typename Field, typename MetadataType, typename TIo>
 void A2AMatrixBlockComputation<T, Field, MetadataType, TIo>
 ::saveBlock(const A2AMatrixSet<TIo> &m, IoHelper &h)
 {
    if ((h.i == 0) and (h.j == 0))
    {
        START_TIMER("IO: file creation");
        h.io.initFile(h.md, blockSize_);
        STOP_TIMER("IO: file creation");
    }
    START_TIMER("IO: write block");
    h.io.saveBlock(m, h.e, h.s, h.i, h.j);
    STOP_TIMER("IO: write block");
 }
 #undef START_TIMER
 #undef STOP_TIMER
 #undef GET_TIMER
 END_HADRONS_NAMESPACE
 #endif // A2A_Matrix_hpp_
--- a/Hadrons/EigenPack.hpp
+++ b/Hadrons/EigenPack.hpp
@@ -39,230 +39,328 @@ BEGIN_HADRONS_NAMESPACE
 #define HADRONS_DEFAULT_LANCZOS_NBASIS 60
 #endif
 #define HADRONS_DUMP_EP_METADATA(record) \
 LOG(Message) << "Eigenpack metadata:" << std::endl;\
 LOG(Message) << "* operator" << std::endl;\
 LOG(Message) << (record).operatorXml << std::endl;\
 LOG(Message) << "* solver" << std::endl;\
 LOG(Message) << (record).solverXml << std::endl;
 struct PackRecord
 {
    std::string operatorXml, solverXml;
 };
 struct VecRecord: Serializable
 {
    GRID_SERIALIZABLE_CLASS_MEMBERS(VecRecord,
                                    unsigned int, index,
                                    double,       eval);
    VecRecord(void): index(0), eval(0.) {}
 };
 namespace EigenPackIo
 {
    inline void readHeader(PackRecord &record, ScidacReader &binReader)
    {
        std::string recordXml;
        binReader.readLimeObject(recordXml, SCIDAC_FILE_XML);
        XmlReader xmlReader(recordXml, true, "eigenPackPar");
        xmlReader.push();
        xmlReader.readCurrentSubtree(record.operatorXml);
        xmlReader.nextElement();
        xmlReader.readCurrentSubtree(record.solverXml);
    }
    template <typename T, typename TIo = T>
    void readElement(T &evec, RealD &eval, const unsigned int index,
                     ScidacReader &binReader, TIo *ioBuf = nullptr)
    {
        VecRecord vecRecord;
        LOG(Message) << "Reading eigenvector " << index << std::endl;
        if (ioBuf == nullptr)
        {
            binReader.readScidacFieldRecord(evec, vecRecord);
        }
        else
        {
            binReader.readScidacFieldRecord(*ioBuf, vecRecord);
            precisionChange(evec, *ioBuf);
        }
        if (vecRecord.index != index)
        {
            HADRONS_ERROR(Io, "Eigenvector " + std::to_string(index) + " has a"
                            + " wrong index (expected " + std::to_string(vecRecord.index) 
                            + ")");
        }
        eval = vecRecord.eval;
    }
    template <typename T, typename TIo = T>
    static void readPack(std::vector<T> &evec, std::vector<RealD> &eval,
                         PackRecord &record, const std::string filename, 
                         const unsigned int size, bool multiFile, 
                         GridBase *gridIo = nullptr)
    {
        std::unique_ptr<TIo> ioBuf{nullptr};
        ScidacReader         binReader;
        if (typeHash<T>() != typeHash<TIo>())
        {
            if (gridIo == nullptr)
            {
                HADRONS_ERROR(Definition, 
                              "I/O type different from vector type but null I/O grid passed");
            }
            ioBuf.reset(new TIo(gridIo));
        }
        if (multiFile)
        {
            std::string fullFilename;
            for(int k = 0; k < size; ++k) 
            {
                fullFilename = filename + "/v" + std::to_string(k) + ".bin";
                binReader.open(fullFilename);
                readHeader(record, binReader);
                readElement(evec[k], eval[k], k, binReader, ioBuf.get());
                binReader.close();
            }
        }
        else
        {
            binReader.open(filename);
            readHeader(record, binReader);
            for(int k = 0; k < size; ++k) 
            {
                readElement(evec[k], eval[k], k, binReader, ioBuf.get());
            }
            binReader.close();
        }
    }
    inline void writeHeader(ScidacWriter &binWriter, PackRecord &record)
    {
        XmlWriter xmlWriter("", "eigenPackPar");
        xmlWriter.pushXmlString(record.operatorXml);
        xmlWriter.pushXmlString(record.solverXml);
        binWriter.writeLimeObject(1, 1, xmlWriter, "parameters", SCIDAC_FILE_XML);
    }
    template <typename T, typename TIo = T>
    void writeElement(ScidacWriter &binWriter, T &evec, RealD &eval, 
                      const unsigned int index, TIo *ioBuf, 
                      T *testBuf = nullptr)
    {
        VecRecord vecRecord;
        LOG(Message) << "Writing eigenvector " << index << std::endl;
        vecRecord.eval  = eval;
        vecRecord.index = index;
        if ((ioBuf == nullptr) || (testBuf == nullptr))
        {
            binWriter.writeScidacFieldRecord(evec, vecRecord, DEFAULT_ASCII_PREC);
        }
        else
        {
            precisionChange(*ioBuf, evec);
            precisionChange(*testBuf, *ioBuf);
            *testBuf -= evec;
            LOG(Message) << "Precision diff norm^2 " << norm2(*testBuf) << std::endl;
            binWriter.writeScidacFieldRecord(*ioBuf, vecRecord, DEFAULT_ASCII_PREC);
        }   
    }
    template <typename T, typename TIo = T>
    static void writePack(const std::string filename, std::vector<T> &evec, 
                          std::vector<RealD> &eval, PackRecord &record, 
                          const unsigned int size, bool multiFile, 
                          GridBase *gridIo = nullptr)
    {
        GridBase             *grid = evec[0]._grid;
        std::unique_ptr<TIo> ioBuf{nullptr}; 
        std::unique_ptr<T>   testBuf{nullptr};
        ScidacWriter         binWriter(grid->IsBoss());
        if (typeHash<T>() != typeHash<TIo>())
        {
            if (gridIo == nullptr)
            {
                HADRONS_ERROR(Definition, 
                              "I/O type different from vector type but null I/O grid passed");
            }
            ioBuf.reset(new TIo(gridIo));
            testBuf.reset(new T(grid));
        }
        if (multiFile)
        {
            std::string fullFilename;
            for(int k = 0; k < size; ++k) 
            {
                fullFilename = filename + "/v" + std::to_string(k) + ".bin";
                makeFileDir(fullFilename, grid);
                binWriter.open(fullFilename);
                writeHeader(binWriter, record);
                writeElement(binWriter, evec[k], eval[k], k, ioBuf.get(), testBuf.get());
                binWriter.close();
            }
        }
        else
        {
            makeFileDir(filename, grid);
            binWriter.open(filename);
            writeHeader(binWriter, record);
            for(int k = 0; k < size; ++k) 
            {
                writeElement(binWriter, evec[k], eval[k], k, ioBuf.get(), testBuf.get());
            }
            binWriter.close();
        }
    }
 }
 template <typename F>
-class EigenPack
+class BaseEigenPack
 {
 public:
    typedef F Field;
    struct PackRecord
    {
        std::string operatorXml, solverXml;
    };
    struct VecRecord: Serializable
    {
        GRID_SERIALIZABLE_CLASS_MEMBERS(VecRecord,
                                        unsigned int, index,
                                        double,       eval);
        VecRecord(void): index(0), eval(0.) {}
    };
 public:
    std::vector<RealD> eval;
    std::vector<F>     evec;
    PackRecord         record;
 public:
-    EigenPack(void)          = default;
+    BaseEigenPack(void)          = default;
-    virtual ~EigenPack(void) = default;
+    BaseEigenPack(const size_t size, GridBase *grid)
    EigenPack(const size_t size, GridBase *grid)
    {
        resize(size, grid);
    }
-
+    virtual ~BaseEigenPack(void) = default;
    void resize(const size_t size, GridBase *grid)
    {
        eval.resize(size);
        evec.resize(size, grid);
    }
 };
 template <typename F, typename FIo = F>
 class EigenPack: public BaseEigenPack<F>
 {
 public:
    typedef F   Field;
    typedef FIo FieldIo;
 public:
    EigenPack(void)          = default;
    virtual ~EigenPack(void) = default;
    EigenPack(const size_t size, GridBase *grid, GridBase *gridIo = nullptr)
    : BaseEigenPack<F>(size, grid)
    {
        if (typeHash<F>() != typeHash<FIo>())
        {
            if (gridIo == nullptr)
            {
                HADRONS_ERROR(Definition, 
                              "I/O type different from vector type but null I/O grid passed");
            }
        }
        gridIo_ = gridIo;
    }
    virtual void read(const std::string fileStem, const bool multiFile, const int traj = -1)
    {
-        if (multiFile)
+        EigenPackIo::readPack<F, FIo>(this->evec, this->eval, this->record, 
-        {
+                                      evecFilename(fileStem, traj, multiFile), 
-            for(int k = 0; k < evec.size(); ++k)
+                                      this->evec.size(), multiFile, gridIo_);
-            {
+        HADRONS_DUMP_EP_METADATA(this->record);
                basicReadSingle(evec[k], eval[k], evecFilename(fileStem, k, traj), k);
            }
        }
        else
        {
            basicRead(evec, eval, evecFilename(fileStem, -1, traj), evec.size());
        }
    }
    virtual void write(const std::string fileStem, const bool multiFile, const int traj = -1)
    {
-        if (multiFile)
+        EigenPackIo::writePack<F, FIo>(evecFilename(fileStem, traj, multiFile), 
-        {
+                                       this->evec, this->eval, this->record, 
-            for(int k = 0; k < evec.size(); ++k)
+                                       this->evec.size(), multiFile, gridIo_);
            {
                basicWriteSingle(evecFilename(fileStem, k, traj), evec[k], eval[k], k);
            }
        }
        else
        {
            basicWrite(evecFilename(fileStem, -1, traj), evec, eval, evec.size());
        }
    }
 protected:
-    std::string evecFilename(const std::string stem, const int vec, const int traj)
+    std::string evecFilename(const std::string stem, const int traj, const bool multiFile)
    {
        std::string t = (traj < 0) ? "" : ("." + std::to_string(traj));
-        if (vec == -1)
+        if (multiFile)
        {
-            return stem + t + ".bin";
+            return stem + t;
        }
        else
        {
-            return stem + t + "/v" + std::to_string(vec) + ".bin";
+            return stem + t + ".bin";
        }
    }
-
+protected:
-    template <typename T>
+    GridBase *gridIo_;
    void basicRead(std::vector<T> &evec, std::vector<RealD> &eval,
                   const std::string filename, const unsigned int size)
    {
        ScidacReader    binReader;
        binReader.open(filename);
        binReader.skipPastObjectRecord(SCIDAC_FILE_XML);
        for(int k = 0; k < size; ++k) 
        {
            VecRecord vecRecord;
            LOG(Message) << "Reading eigenvector " << k << std::endl;
            binReader.readScidacFieldRecord(evec[k], vecRecord);
            if (vecRecord.index != k)
            {
                HADRONS_ERROR(Io, "Eigenvector " + std::to_string(k) + " has a"
                              + " wrong index (expected " + std::to_string(vecRecord.index) 
                              + ") in file '" + filename + "'");
            }
            eval[k] = vecRecord.eval;
        }
        binReader.close();
    }
    template <typename T>
    void basicReadSingle(T &evec, RealD &eval, const std::string filename, 
                         const unsigned int index)
    {
        ScidacReader binReader;
        VecRecord    vecRecord;
        binReader.open(filename);
        binReader.skipPastObjectRecord(SCIDAC_FILE_XML);
        LOG(Message) << "Reading eigenvector " << index << std::endl;
        binReader.readScidacFieldRecord(evec, vecRecord);
        if (vecRecord.index != index)
        {
            HADRONS_ERROR(Io, "Eigenvector " + std::to_string(index) + " has a"
                          + " wrong index (expected " + std::to_string(vecRecord.index) 
                          + ") in file '" + filename + "'");
        }
        eval = vecRecord.eval;
        binReader.close();
    }
    template <typename T>
    void basicWrite(const std::string filename, std::vector<T> &evec, 
                    const std::vector<RealD> &eval, const unsigned int size)
    {
        ScidacWriter binWriter(evec[0]._grid->IsBoss());
        XmlWriter    xmlWriter("", "eigenPackPar");
        makeFileDir(filename, evec[0]._grid);
        xmlWriter.pushXmlString(record.operatorXml);
        xmlWriter.pushXmlString(record.solverXml);
        binWriter.open(filename);
        binWriter.writeLimeObject(1, 1, xmlWriter, "parameters", SCIDAC_FILE_XML);
        for(int k = 0; k < size; ++k) 
        {
            VecRecord vecRecord;
            vecRecord.index = k;
            vecRecord.eval  = eval[k];
            LOG(Message) << "Writing eigenvector " << k << std::endl;
            binWriter.writeScidacFieldRecord(evec[k], vecRecord, DEFAULT_ASCII_PREC);
        }
        binWriter.close();
    }
    template <typename T>
    void basicWriteSingle(const std::string filename, T &evec, 
                          const RealD eval, const unsigned int index)
    {
        ScidacWriter binWriter(evec._grid->IsBoss());
        XmlWriter    xmlWriter("", "eigenPackPar");
        VecRecord    vecRecord;
        makeFileDir(filename, evec._grid);
        xmlWriter.pushXmlString(record.operatorXml);
        xmlWriter.pushXmlString(record.solverXml);
        binWriter.open(filename);
        binWriter.writeLimeObject(1, 1, xmlWriter, "parameters", SCIDAC_FILE_XML);
        vecRecord.index = index;
        vecRecord.eval  = eval;
        LOG(Message) << "Writing eigenvector " << index << std::endl;
        binWriter.writeScidacFieldRecord(evec, vecRecord, DEFAULT_ASCII_PREC);
        binWriter.close();
    }
 };
-template <typename FineF, typename CoarseF>
+template <typename FineF, typename CoarseF, 
-class CoarseEigenPack: public EigenPack<FineF>
+          typename FineFIo = FineF, typename CoarseFIo = CoarseF>
 class CoarseEigenPack: public EigenPack<FineF, FineFIo>
 {
 public:
-    typedef CoarseF CoarseField;
+    typedef CoarseF CoarseField;         
 public:
    std::vector<RealD>   evalCoarse;
    std::vector<CoarseF> evecCoarse;
    std::vector<RealD>   evalCoarse;
 public:
    CoarseEigenPack(void)          = default;
    virtual ~CoarseEigenPack(void) = default;
    CoarseEigenPack(const size_t sizeFine, const size_t sizeCoarse, 
-                    GridBase *gridFine, GridBase *gridCoarse)
+                    GridBase *gridFine, GridBase *gridCoarse,
                    GridBase *gridFineIo = nullptr, 
                    GridBase *gridCoarseIo = nullptr)
    {
        if (typeHash<FineF>() != typeHash<FineFIo>())
        {
            if (gridFineIo == nullptr)
            {
                HADRONS_ERROR(Definition, 
                              "Fine I/O type different from vector type but null fine I/O grid passed");
            }
        }
        if (typeHash<CoarseF>() != typeHash<CoarseFIo>())
        {
            if (gridCoarseIo == nullptr)
            {
                HADRONS_ERROR(Definition, 
                              "Coarse I/O type different from vector type but null coarse I/O grid passed");
            }
        }
        this->gridIo_ = gridFineIo;
        gridCoarseIo_ = gridCoarseIo;
        resize(sizeFine, sizeCoarse, gridFine, gridCoarse);
    }
    void resize(const size_t sizeFine, const size_t sizeCoarse, 
                GridBase *gridFine, GridBase *gridCoarse)
    {
-        EigenPack<FineF>::resize(sizeFine, gridFine);
+        EigenPack<FineF, FineFIo>::resize(sizeFine, gridFine);
        evalCoarse.resize(sizeCoarse);
        evecCoarse.resize(sizeCoarse, gridCoarse);
    }
    void readFine(const std::string fileStem, const bool multiFile, const int traj = -1)
    {
-        if (multiFile)
+        EigenPack<FineF, FineFIo>::read(fileStem + "_fine", multiFile, traj);
        {
            for(int k = 0; k < this->evec.size(); ++k)
            {
                this->basicReadSingle(this->evec[k], this->eval[k], this->evecFilename(fileStem + "_fine", k, traj), k);
            }
        }
        else
        {
            this->basicRead(this->evec, this->eval, this->evecFilename(fileStem + "_fine", -1, traj), this->evec.size());
        }
    }
    void readCoarse(const std::string fileStem, const bool multiFile, const int traj = -1)
    {
-        if (multiFile)
+        PackRecord dummy;
-        {
+
-            for(int k = 0; k < evecCoarse.size(); ++k)
+        EigenPackIo::readPack<CoarseF, CoarseFIo>(evecCoarse, evalCoarse, dummy, 
-            {
+                              this->evecFilename(fileStem + "_coarse", traj, multiFile), 
-                this->basicReadSingle(evecCoarse[k], evalCoarse[k], this->evecFilename(fileStem + "_coarse", k, traj), k);
+                              evecCoarse.size(), multiFile, gridCoarseIo_);
            }
        }
        else
        {
            this->basicRead(evecCoarse, evalCoarse, this->evecFilename(fileStem + "_coarse", -1, traj), evecCoarse.size());
        }
    }
    virtual void read(const std::string fileStem, const bool multiFile, const int traj = -1)
@@ -273,32 +371,14 @@ public:
    void writeFine(const std::string fileStem, const bool multiFile, const int traj = -1)
    {
-        if (multiFile)
+        EigenPack<FineF, FineFIo>::write(fileStem + "_fine", multiFile, traj);
        {
            for(int k = 0; k < this->evec.size(); ++k)
            {
                this->basicWriteSingle(this->evecFilename(fileStem + "_fine", k, traj), this->evec[k], this->eval[k], k);
            }
        }
        else
        {
            this->basicWrite(this->evecFilename(fileStem + "_fine", -1, traj), this->evec, this->eval, this->evec.size());
        }
    }
    void writeCoarse(const std::string fileStem, const bool multiFile, const int traj = -1)
    {
-        if (multiFile)
+        EigenPackIo::writePack<CoarseF, CoarseFIo>(this->evecFilename(fileStem + "_coarse", traj, multiFile), 
-        {
+                                                   evecCoarse, evalCoarse, this->record, 
-            for(int k = 0; k < evecCoarse.size(); ++k)
+                                                   evecCoarse.size(), multiFile, gridCoarseIo_);
            {
                this->basicWriteSingle(this->evecFilename(fileStem + "_coarse", k, traj), evecCoarse[k], evalCoarse[k], k);
            }
        }
        else
        {
            this->basicWrite(this->evecFilename(fileStem + "_coarse", -1, traj), evecCoarse, evalCoarse, evecCoarse.size());
        }
    }
    virtual void write(const std::string fileStem, const bool multiFile, const int traj = -1)
@@ -306,18 +386,29 @@ public:
        writeFine(fileStem, multiFile, traj);
        writeCoarse(fileStem, multiFile, traj);
    }
 private:
    GridBase *gridCoarseIo_;
 };
 template <typename FImpl>
-using FermionEigenPack = EigenPack<typename FImpl::FermionField>;
+using BaseFermionEigenPack = BaseEigenPack<typename FImpl::FermionField>;
-template <typename FImpl, int nBasis>
+template <typename FImpl, typename FImplIo = FImpl>
 using FermionEigenPack = EigenPack<typename FImpl::FermionField, typename FImplIo::FermionField>;
 template <typename FImpl, int nBasis, typename FImplIo = FImpl>
 using CoarseFermionEigenPack = CoarseEigenPack<
    typename FImpl::FermionField,
    typename LocalCoherenceLanczos<typename FImpl::SiteSpinor, 
                                   typename FImpl::SiteComplex, 
                                   nBasis>::CoarseField,
    typename FImplIo::FermionField,
    typename LocalCoherenceLanczos<typename FImplIo::SiteSpinor, 
                                   typename FImplIo::SiteComplex, 
                                   nBasis>::CoarseField>;
 #undef HADRONS_DUMP_EP_METADATA
 END_HADRONS_NAMESPACE
 #endif // Hadrons_EigenPack_hpp_
--- a/Hadrons/Environment.cc
+++ b/Hadrons/Environment.cc
@@ -45,19 +45,13 @@ Environment::Environment(void)
 {
    dim_         = GridDefaultLatt();
    nd_          = dim_.size();
-    defaultGrid_ = {typeHash<vComplex>(), 1};
+    createGrid<vComplex>(1);
    grid4d_[defaultGrid_].reset(
        SpaceTimeGrid::makeFourDimGrid(dim_, 
                                       GridDefaultSimd(nd_, vComplex::Nsimd()),
                                       GridDefaultMpi()));
    gridRb4d_[defaultGrid_].reset(
        SpaceTimeGrid::makeFourDimRedBlackGrid(grid4d_[defaultGrid_].get()));
    vol_ = 1.;
    for (auto d: dim_)
    {
        vol_ *= d;
    }
-    rng4d_.reset(new GridParallelRNG(grid4d_[defaultGrid_].get()));
+    rng4d_.reset(new GridParallelRNG(getGrid()));
 }
 // grids ///////////////////////////////////////////////////////////////////////
--- a/Hadrons/Environment.hpp
+++ b/Hadrons/Environment.hpp
@@ -94,20 +94,20 @@ public:
    void                    createGrid(const unsigned int Ls);
    template <typename VType = vComplex>
    void                    createCoarseGrid(const std::vector<int> &blockSize,
-                                             const unsigned int Ls = 1);
+                                             const unsigned int Ls);
    template <typename VType = vComplex>
-    GridCartesian *         getGrid(void) const;
+    GridCartesian *         getGrid(void);
    template <typename VType = vComplex>
-    GridRedBlackCartesian * getRbGrid(void) const;
+    GridRedBlackCartesian * getRbGrid(void);
    template <typename VType = vComplex>
-    GridCartesian *         getCoarseGrid(const std::vector<int> &blockSize) const;
+    GridCartesian *         getCoarseGrid(const std::vector<int> &blockSize);
    template <typename VType = vComplex>
-    GridCartesian *         getGrid(const unsigned int Ls) const;
+    GridCartesian *         getGrid(const unsigned int Ls);
    template <typename VType = vComplex>
-    GridRedBlackCartesian * getRbGrid(const unsigned int Ls) const;
+    GridRedBlackCartesian * getRbGrid(const unsigned int Ls);
    template <typename VType = vComplex>
    GridCartesian *         getCoarseGrid(const std::vector<int> &blockSize,
-                                          const unsigned int Ls) const;
+                                          const unsigned int Ls);
    std::vector<int>        getDim(void) const;
    int                     getDim(const unsigned int mu) const;
    unsigned int            getNd(void) const;
@@ -174,7 +174,6 @@ private:
    bool                                protect_{true};
    // grids
    std::vector<int>                    dim_;
    FineGridKey                         defaultGrid_;
    std::map<FineGridKey, GridPt>       grid4d_;
    std::map<FineGridKey, GridPt>       grid5d_;
    std::map<FineGridKey, GridRbPt>     gridRb4d_;
@@ -221,6 +220,14 @@ void Holder<T>::reset(T *pt)
 *                     Environment template implementation                    *
 ******************************************************************************/
 // grids ///////////////////////////////////////////////////////////////////////
 #define HADRONS_DUMP_GRID(...)\
 LOG(Debug) << "New grid " << (__VA_ARGS__) << std::endl;\
 LOG(Debug) << " - cb  : " << (__VA_ARGS__)->_isCheckerBoarded << std::endl;\
 LOG(Debug) << " - fdim: " << (__VA_ARGS__)->_fdimensions << std::endl;\
 LOG(Debug) << " - gdim: " << (__VA_ARGS__)->_gdimensions << std::endl;\
 LOG(Debug) << " - ldim: " << (__VA_ARGS__)->_ldimensions << std::endl;\
 LOG(Debug) << " - rdim: " << (__VA_ARGS__)->_rdimensions << std::endl;
 template <typename VType>
 void Environment::createGrid(const unsigned int Ls)
 {
@@ -232,15 +239,19 @@ void Environment::createGrid(const unsigned int Ls)
            SpaceTimeGrid::makeFourDimGrid(getDim(), 
                                        GridDefaultSimd(getNd(), VType::Nsimd()),
                                        GridDefaultMpi()));
        HADRONS_DUMP_GRID(grid4d_[{hash, 1}].get());
        gridRb4d_[{hash, 1}].reset(
            SpaceTimeGrid::makeFourDimRedBlackGrid(grid4d_[{hash, 1}].get()));
        HADRONS_DUMP_GRID(gridRb4d_[{hash, 1}].get());
    }
    if (grid5d_.find({hash, Ls}) == grid5d_.end())
    {
        auto g = grid4d_[{hash, 1}].get();
        grid5d_[{hash, Ls}].reset(SpaceTimeGrid::makeFiveDimGrid(Ls, g));
        HADRONS_DUMP_GRID(grid5d_[{hash, Ls}].get());
        gridRb5d_[{hash, Ls}].reset(SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, g));
        HADRONS_DUMP_GRID(gridRb5d_[{hash, Ls}].get());
    }
 }
@@ -290,18 +301,24 @@ void Environment::createCoarseGrid(const std::vector<int> &blockSize,
        gridCoarse4d_[hkey4d].reset(
            SpaceTimeGrid::makeFourDimGrid(coarseDim, 
                GridDefaultSimd(nd, VType::Nsimd()), GridDefaultMpi()));
        HADRONS_DUMP_GRID(gridCoarse4d_[hkey4d].get());
    }
    if (gridCoarse5d_.find(hkey5d) == gridCoarse5d_.end())
    {
        gridCoarse5d_[hkey5d].reset(
            SpaceTimeGrid::makeFiveDimGrid(cLs, gridCoarse4d_[hkey4d].get()));
        HADRONS_DUMP_GRID(gridCoarse5d_[hkey5d].get());
    }
 }
 #undef HADRONS_DUMP_GRID
 template <typename VType>
-GridCartesian * Environment::getGrid(void) const
+GridCartesian * Environment::getGrid(void)
 {
-    auto it = grid4d_.find({typeHash<VType>(), 1});
+    FineGridKey key = {typeHash<VType>(), 1};
    auto it = grid4d_.find(key);
    if (it != grid4d_.end())
    {
@@ -309,15 +326,17 @@ GridCartesian * Environment::getGrid(void) const
    }
    else
    {
-        HADRONS_ERROR(Definition, "no 4D grid for SIMD type '" 
+        createGrid<VType>(1);
-                                  + typeName<VType>() + "'");
+
        return grid4d_.at(key).get();
    }
 }
 template <typename VType>
-GridRedBlackCartesian * Environment::getRbGrid(void) const
+GridRedBlackCartesian * Environment::getRbGrid(void)
 {
-    auto it = gridRb4d_.find({typeHash<VType>(), 1});
+    FineGridKey key = {typeHash<VType>(), 1};
    auto        it  = gridRb4d_.find(key);
    if (it != gridRb4d_.end())
    {
@@ -325,34 +344,39 @@ GridRedBlackCartesian * Environment::getRbGrid(void) const
    }
    else
    {
-        HADRONS_ERROR(Definition, "no 4D red-black grid for SIMD type '" 
+        createGrid<VType>(1);
-                                  + typeName<VType>() + "'");
+
        return gridRb4d_.at(key).get();
    }
 }
 template <typename VType>
-GridCartesian * Environment::getCoarseGrid(const std::vector<int> &blockSize) const
+GridCartesian * Environment::getCoarseGrid(const std::vector<int> &blockSize)
 {
    std::vector<int> s = blockSize;
    s.resize(getNd());
-    auto it = gridCoarse4d_.find({typeHash<VType>(), s});
+
    CoarseGridKey key = {typeHash<VType>(), s};
    auto          it  = gridCoarse4d_.find(key);
    if (it != gridCoarse4d_.end())
    {
        return it->second.get();
    }
    else
    {
-        HADRONS_ERROR(Definition, "no 4D coarse grid for SIMD type '" 
+        createCoarseGrid<VType>(blockSize, 1);
-                                  + typeName<VType>() + "' and block size "
+        
-                                  + vecToStr(blockSize));
+        return gridCoarse4d_.at(key).get();
    }
 }
 template <typename VType>
-GridCartesian * Environment::getGrid(const unsigned int Ls) const
+GridCartesian * Environment::getGrid(const unsigned int Ls)
 {
-    auto it = grid5d_.find({typeHash<VType>(), Ls});
+    FineGridKey key = {typeHash<VType>(), Ls};
    auto        it  = grid5d_.find(key);
    if (it != grid5d_.end())
    {
@@ -360,16 +384,17 @@ GridCartesian * Environment::getGrid(const unsigned int Ls) const
    }
    else
    {
-        HADRONS_ERROR(Definition, "no 5D grid for SIMD type '" 
+        createGrid<VType>(Ls);
-                                  + typeName<VType>() + "' and Ls = "
+
-                                  + std::to_string(Ls));
+        return grid5d_.at(key).get();
    }
 }
 template <typename VType>
-GridRedBlackCartesian * Environment::getRbGrid(const unsigned int Ls) const
+GridRedBlackCartesian * Environment::getRbGrid(const unsigned int Ls)
 {
-    auto it = gridRb5d_.find({typeHash<VType>(), Ls});
+    FineGridKey key = {typeHash<VType>(), Ls};
    auto        it  = gridRb5d_.find(key);
    if (it != gridRb5d_.end())
    {
@@ -377,30 +402,32 @@ GridRedBlackCartesian * Environment::getRbGrid(const unsigned int Ls) const
    }
    else
    {
-        HADRONS_ERROR(Definition, "no 5D red-black grid for SIMD type '" 
+        createGrid<VType>(Ls);
-                                  + typeName<VType>() + "' and Ls = "
+
-                                  + std::to_string(Ls));
+        return gridRb5d_.at(key).get();
    }
 }
 template <typename VType>
 GridCartesian * Environment::getCoarseGrid(const std::vector<int> &blockSize,
-                                           const unsigned int Ls) const
+                                           const unsigned int Ls)
 {
    std::vector<int> s = blockSize;
    s.push_back(Ls);
-    auto it = gridCoarse5d_.find({typeHash<VType>(), s});
+
    CoarseGridKey key = {typeHash<VType>(), s};
    auto it = gridCoarse5d_.find(key);
    if (it != gridCoarse5d_.end())
    {
        return it->second.get();
    }
    else
    {
-        HADRONS_ERROR(Definition, "no 5D coarse grid for SIMD type '" 
+        createCoarseGrid<VType>(blockSize, Ls);
-                                  + typeName<VType>() + "', block size "
+
-                                  + vecToStr(blockSize)
+        return gridCoarse5d_.at(key).get();
                                  + " and Ls = " + std::to_string(Ls));
    }
 }
--- a/Hadrons/Global.hpp
+++ b/Hadrons/Global.hpp
@@ -96,19 +96,22 @@ using Grid::operator>>;
 BEGIN_HADRONS_NAMESPACE
 // type aliases
 #define BASIC_TYPE_ALIASES(Impl, suffix)\
 typedef typename Impl::Field                         ScalarField##suffix;\
 typedef typename Impl::PropagatorField               PropagatorField##suffix;\
 typedef typename Impl::SitePropagator::scalar_object SitePropagator##suffix;\
 typedef std::vector<SitePropagator##suffix>          SlicedPropagator##suffix;
 #define FERM_TYPE_ALIASES(FImpl, suffix)\
-typedef FermionOperator<FImpl>                        FMat##suffix;            \
+BASIC_TYPE_ALIASES(FImpl, suffix);\
-typedef typename FImpl::FermionField                  FermionField##suffix;    \
+typedef FermionOperator<FImpl>            FMat##suffix;\
-typedef typename FImpl::PropagatorField               PropagatorField##suffix; \
+typedef typename FImpl::FermionField      FermionField##suffix;\
-typedef typename FImpl::SitePropagator::scalar_object SitePropagator##suffix;  \
+typedef typename FImpl::GaugeField        GaugeField##suffix;\
-typedef std::vector<SitePropagator##suffix>           SlicedPropagator##suffix;
+typedef typename FImpl::DoubledGaugeField DoubledGaugeField##suffix;\
 typedef typename FImpl::ComplexField      ComplexField##suffix;
-#define GAUGE_TYPE_ALIASES(FImpl, suffix)\
+#define GAUGE_TYPE_ALIASES(GImpl, suffix)\
-typedef typename FImpl::DoubledGaugeField DoubledGaugeField##suffix;
+typedef typename GImpl::GaugeField GaugeField##suffix;
 #define SCALAR_TYPE_ALIASES(SImpl, suffix)\
 typedef typename SImpl::Field ScalarField##suffix;\
 typedef typename SImpl::Field PropagatorField##suffix;
 #define SOLVER_TYPE_ALIASES(FImpl, suffix)\
 typedef Solver<FImpl> Solver##suffix;
@@ -117,10 +120,6 @@ typedef Solver<FImpl> Solver##suffix;
 typedef std::function<SlicedPropagator##suffix\
                      (const PropagatorField##suffix &)> SinkFn##suffix;
 #define FG_TYPE_ALIASES(FImpl, suffix)\
 FERM_TYPE_ALIASES(FImpl, suffix)\
 GAUGE_TYPE_ALIASES(FImpl, suffix)
 // logger
 class HadronsLogger: public Logger
 {
--- a/Hadrons/Makefile.am
+++ b/Hadrons/Makefile.am
@@ -1,5 +1,6 @@
 SUBDIRS = . Utilities
 lib_LIBRARIES = libHadrons.a
 bin_PROGRAMS  = HadronsXmlRun
 include modules.inc
@@ -10,6 +11,7 @@ libHadrons_a_SOURCES = \
 	Exceptions.cc      \
    Global.cc          \
    Module.cc		   \
 	TimerArray.cc      \
 	VirtualMachine.cc
 libHadrons_adir = $(includedir)/Hadrons
 nobase_libHadrons_a_HEADERS = \
@@ -30,7 +32,5 @@ nobase_libHadrons_a_HEADERS = \
 	Modules.hpp               \
 	ModuleFactory.hpp         \
 	Solver.hpp                \
 	TimerArray.hpp            \
 	VirtualMachine.hpp
 HadronsXmlRun_SOURCES = HadronsXmlRun.cc
 HadronsXmlRun_LDADD   = libHadrons.a -lGrid
--- a/Hadrons/Module.cc
+++ b/Hadrons/Module.cc
@@ -66,101 +66,6 @@ void ModuleBase::operator()(void)
    stopAllTimers();
 }
 // timers //////////////////////////////////////////////////////////////////////
 void ModuleBase::startTimer(const std::string &name)
 {
    if (!name.empty())
    {
        timer_[name].Start();
    }
 }
 GridTime ModuleBase::getTimer(const std::string &name)
 {
    GridTime t;
    if (!name.empty())
    {
        try
        {
            bool running = timer_.at(name).isRunning();
            if (running) stopTimer(name);
            t = timer_.at(name).Elapsed();
            if (running) startTimer(name);
        }
        catch (std::out_of_range &)
        {
            t = GridTime::zero();
        }
    }
    else
    {
        t = GridTime::zero();
    }
    return t;
 }
 double ModuleBase::getDTimer(const std::string &name)
 {
    return static_cast<double>(getTimer(name).count());
 }
 void ModuleBase::startCurrentTimer(const std::string &name)
 {
    if (!name.empty())
    {
        stopCurrentTimer();
        startTimer(name);
        currentTimer_ = name;
    }
 }
 void ModuleBase::stopTimer(const std::string &name)
 {
    if (timer_.at(name).isRunning())
    {
        timer_.at(name).Stop();
    }
 }
 void ModuleBase::stopCurrentTimer(void)
 {
    if (!currentTimer_.empty())
    {
        stopTimer(currentTimer_);
        currentTimer_ = "";
    }
 }
 void ModuleBase::stopAllTimers(void)
 {
    for (auto &t: timer_)
    {
        stopTimer(t.first);
    }
    currentTimer_ = "";
 }
 void ModuleBase::resetTimers(void)
 {
    timer_.clear();
    currentTimer_ = "";
 }
 std::map<std::string, GridTime> ModuleBase::getTimings(void)
 {
    std::map<std::string, GridTime> timing;
    for (auto &t: timer_)
    {
        timing[t.first] = t.second.Elapsed();
    }
    return timing;
 }
 std::string ModuleBase::makeSeedString(void)
 {
    std::string seed;
--- a/Hadrons/Module.hpp
+++ b/Hadrons/Module.hpp
@@ -30,6 +30,7 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #define Hadrons_Module_hpp_
 #include <Hadrons/Global.hpp>
 #include <Hadrons/TimerArray.hpp>
 #include <Hadrons/VirtualMachine.hpp>
 BEGIN_HADRONS_NAMESPACE
@@ -65,7 +66,26 @@ extern template class base;\
 MODULE_REGISTER(mod, ARG(base), ns);
 #define ARG(...) __VA_ARGS__
-#define MACRO_REDIRECT(arg1, arg2, arg3, macro, ...) macro
+#define HADRONS_MACRO_REDIRECT_12(arg1, arg2, macro, ...) macro
 #define HADRONS_MACRO_REDIRECT_23(arg1, arg2, arg3, macro, ...) macro
 #define envGetGrid4(latticeType)\
 env().template getGrid<typename latticeType::vector_type>()
 #define envGetGrid5(latticeType, Ls)\
 env().template getGrid<typename latticeType::vector_type>(Ls)
 #define envGetGrid(...)\
 HADRONS_MACRO_REDIRECT_12(__VA_ARGS__, envGetGrid5, envGetGrid4)(__VA_ARGS__)
 #define envGetRbGrid4(latticeType)\
 env().template getRbGrid<typename latticeType::vector_type>()
 #define envGetRbGrid5(latticeType, Ls)\
 env().template getRbGrid<typename latticeType::vector_type>(Ls)
 #define envGetRbGrid(...)\
 HADRONS_MACRO_REDIRECT_12(__VA_ARGS__, envGetRbGrid5, envGetRbGrid4)(__VA_ARGS__)
 #define envGet(type, name)\
 *env().template getObject<type>(name)
@@ -86,38 +106,38 @@ env().template createObject<type>(name, Environment::Storage::object, Ls, __VA_A
 env().template createDerivedObject<base, type>(name, Environment::Storage::object, Ls, __VA_ARGS__)
 #define envCreateLat4(type, name)\
-envCreate(type, name, 1, env().getGrid())
+envCreate(type, name, 1, envGetGrid(type))
 #define envCreateLat5(type, name, Ls)\
-envCreate(type, name, Ls, env().getGrid(Ls))
+envCreate(type, name, Ls, envGetGrid(type, Ls))
 #define envCreateLat(...)\
-MACRO_REDIRECT(__VA_ARGS__, envCreateLat5, envCreateLat4)(__VA_ARGS__)
+HADRONS_MACRO_REDIRECT_23(__VA_ARGS__, envCreateLat5, envCreateLat4)(__VA_ARGS__)
 #define envCache(type, name, Ls, ...)\
 env().template createObject<type>(name, Environment::Storage::cache, Ls, __VA_ARGS__)
 #define envCacheLat4(type, name)\
-envCache(type, name, 1, env().getGrid())
+envCache(type, name, 1, envGetGrid(type))
 #define envCacheLat5(type, name, Ls)\
-envCache(type, name, Ls, env().getGrid(Ls))
+envCache(type, name, Ls, envGetGrid(type, Ls))
 #define envCacheLat(...)\
-MACRO_REDIRECT(__VA_ARGS__, envCacheLat5, envCacheLat4)(__VA_ARGS__)
+HADRONS_MACRO_REDIRECT_23(__VA_ARGS__, envCacheLat5, envCacheLat4)(__VA_ARGS__)
 #define envTmp(type, name, Ls, ...)\
 env().template createObject<type>(getName() + "_tmp_" + name,         \
                                  Environment::Storage::temporary, Ls, __VA_ARGS__)
 #define envTmpLat4(type, name)\
-envTmp(type, name, 1, env().getGrid())
+envTmp(type, name, 1, envGetGrid(type))
 #define envTmpLat5(type, name, Ls)\
-envTmp(type, name, Ls, env().getGrid(Ls))
+envTmp(type, name, Ls, envGetGrid(type, Ls))
 #define envTmpLat(...)\
-MACRO_REDIRECT(__VA_ARGS__, envTmpLat5, envTmpLat4)(__VA_ARGS__)
+HADRONS_MACRO_REDIRECT_23(__VA_ARGS__, envTmpLat5, envTmpLat4)(__VA_ARGS__)
 #define saveResult(ioStem, name, result)\
 if (env().getGrid()->IsBoss() and !ioStem.empty())\
@@ -133,7 +153,7 @@ if (env().getGrid()->IsBoss() and !ioStem.empty())\
 *                            Module class                                    *
 ******************************************************************************/
 // base class
-class ModuleBase
+class ModuleBase: public TimerArray
 {
 public:
    // constructor
@@ -161,16 +181,6 @@ public:
    virtual void execute(void) = 0;
    // execution
    void operator()(void);
    // timers
    void                            startTimer(const std::string &name);
    GridTime                        getTimer(const std::string &name);
    double                          getDTimer(const std::string &name);
    void                            startCurrentTimer(const std::string &name);
    void                            stopTimer(const std::string &name);
    void                            stopCurrentTimer(void);
    void                            stopAllTimers(void);
    void                            resetTimers(void);
    std::map<std::string, GridTime> getTimings(void);
 protected:
    // environment shortcut
    DEFINE_ENV_ALIAS;
--- a/Hadrons/Modules.hpp
+++ b/Hadrons/Modules.hpp
@@ -1,33 +1,6 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Modules.hpp
 Copyright (C) 2015-2018
 Author: Antonin Portelli <antonin.portelli@me.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Hadrons/Modules/MContraction/Baryon.hpp>
 #include <Hadrons/Modules/MContraction/A2AAslashField.hpp>
 #include <Hadrons/Modules/MContraction/A2AMesonField.hpp>
 #include <Hadrons/Modules/MContraction/A2AMesonFieldKernels.hpp>
 #include <Hadrons/Modules/MContraction/Meson.hpp>
 #include <Hadrons/Modules/MContraction/WeakHamiltonian.hpp>
 #include <Hadrons/Modules/MContraction/WeakHamiltonianNonEye.hpp>
@@ -57,6 +30,7 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #include <Hadrons/Modules/MGauge/FundtoHirep.hpp>
 #include <Hadrons/Modules/MGauge/StochEm.hpp>
 #include <Hadrons/Modules/MNoise/TimeDilutedSpinColorDiagonal.hpp>
 #include <Hadrons/Modules/MUtilities/PrecisionCast.hpp>
 #include <Hadrons/Modules/MUtilities/RandomVectors.hpp>
 #include <Hadrons/Modules/MUtilities/TestSeqGamma.hpp>
 #include <Hadrons/Modules/MUtilities/TestSeqConserved.hpp>
--- a/Hadrons/Modules/MAction/DWF.cc
+++ b/Hadrons/Modules/MAction/DWF.cc
@@ -32,4 +32,4 @@ using namespace Hadrons;
 using namespace MAction;
 template class Grid::Hadrons::MAction::TDWF<FIMPL>;
-
+template class Grid::Hadrons::MAction::TDWF<FIMPLF>;
--- a/Hadrons/Modules/MAction/DWF.hpp
+++ b/Hadrons/Modules/MAction/DWF.hpp
@@ -56,7 +56,7 @@ template <typename FImpl>
 class TDWF: public Module<DWFPar>
 {
 public:
-    FG_TYPE_ALIASES(FImpl,);
+    FERM_TYPE_ALIASES(FImpl,);
 public:
    // constructor
    TDWF(const std::string name);
@@ -73,6 +73,7 @@ protected:
 };
 MODULE_REGISTER_TMP(DWF, TDWF<FIMPL>, MAction);
 MODULE_REGISTER_TMP(DWFF, TDWF<FIMPLF>, MAction);
 /******************************************************************************
 *                        DWF template implementation                         *
@@ -111,12 +112,11 @@ void TDWF<FImpl>::setup(void)
    LOG(Message) << "Fermion boundary conditions: " << par().boundary
                 << std::endl;
-    env().createGrid(par().Ls);
+    auto &U    = envGet(GaugeField, par().gauge);
-    auto &U    = envGet(LatticeGaugeField, par().gauge);
+    auto &g4   = *envGetGrid(FermionField);
-    auto &g4   = *env().getGrid();
+    auto &grb4 = *envGetRbGrid(FermionField);
-    auto &grb4 = *env().getRbGrid();
+    auto &g5   = *envGetGrid(FermionField, par().Ls);
-    auto &g5   = *env().getGrid(par().Ls);
+    auto &grb5 = *envGetRbGrid(FermionField, par().Ls);
    auto &grb5 = *env().getRbGrid(par().Ls);
    std::vector<Complex> boundary = strToVec<Complex>(par().boundary);
    typename DomainWallFermion<FImpl>::ImplParams implParams(boundary);
    envCreateDerived(FMat, DomainWallFermion<FImpl>, getName(), par().Ls, U, g5,
--- a/Hadrons/Modules/MAction/MobiusDWF.cc
+++ b/Hadrons/Modules/MAction/MobiusDWF.cc
@@ -32,3 +32,4 @@ using namespace Hadrons;
 using namespace MAction;
 template class Grid::Hadrons::MAction::TMobiusDWF<FIMPL>;
 template class Grid::Hadrons::MAction::TMobiusDWF<FIMPLF>;
--- a/Hadrons/Modules/MAction/MobiusDWF.hpp
+++ b/Hadrons/Modules/MAction/MobiusDWF.hpp
@@ -56,7 +56,7 @@ template <typename FImpl>
 class TMobiusDWF: public Module<MobiusDWFPar>
 {
 public:
-    FG_TYPE_ALIASES(FImpl,);
+    FERM_TYPE_ALIASES(FImpl,);
 public:
    // constructor
    TMobiusDWF(const std::string name);
@@ -72,6 +72,7 @@ public:
 };
 MODULE_REGISTER_TMP(MobiusDWF, TMobiusDWF<FIMPL>, MAction);
 MODULE_REGISTER_TMP(MobiusDWFF, TMobiusDWF<FIMPLF>, MAction);
 /******************************************************************************
 *                      TMobiusDWF implementation                             *
@@ -111,12 +112,11 @@ void TMobiusDWF<FImpl>::setup(void)
    LOG(Message) << "Fermion boundary conditions: " << par().boundary
                 << std::endl;
-    env().createGrid(par().Ls);
+    auto &U    = envGet(GaugeField, par().gauge);
-    auto &U    = envGet(LatticeGaugeField, par().gauge);
+    auto &g4   = *envGetGrid(FermionField);
-    auto &g4   = *env().getGrid();
+    auto &grb4 = *envGetRbGrid(FermionField);
-    auto &grb4 = *env().getRbGrid();
+    auto &g5   = *envGetGrid(FermionField, par().Ls);
-    auto &g5   = *env().getGrid(par().Ls);
+    auto &grb5 = *envGetRbGrid(FermionField, par().Ls);
    auto &grb5 = *env().getRbGrid(par().Ls);
    std::vector<Complex> boundary = strToVec<Complex>(par().boundary);
    typename MobiusFermion<FImpl>::ImplParams implParams(boundary);
    envCreateDerived(FMat, MobiusFermion<FImpl>, getName(), par().Ls, U, g5,
--- a/Hadrons/Modules/MAction/ScaledDWF.cc
+++ b/Hadrons/Modules/MAction/ScaledDWF.cc
@@ -32,3 +32,4 @@ using namespace Hadrons;
 using namespace MAction;
 template class Grid::Hadrons::MAction::TScaledDWF<FIMPL>;
 template class Grid::Hadrons::MAction::TScaledDWF<FIMPLF>;
--- a/Hadrons/Modules/MAction/ScaledDWF.hpp
+++ b/Hadrons/Modules/MAction/ScaledDWF.hpp
@@ -55,7 +55,7 @@ template <typename FImpl>
 class TScaledDWF: public Module<ScaledDWFPar>
 {
 public:
-    FG_TYPE_ALIASES(FImpl,);
+    FERM_TYPE_ALIASES(FImpl,);
 public:
    // constructor
    TScaledDWF(const std::string name);
@@ -71,6 +71,7 @@ public:
 };
 MODULE_REGISTER_TMP(ScaledDWF, TScaledDWF<FIMPL>, MAction);
 MODULE_REGISTER_TMP(ScaledDWFF, TScaledDWF<FIMPLF>, MAction);
 /******************************************************************************
 *                      TScaledDWF implementation                             *
@@ -110,12 +111,11 @@ void TScaledDWF<FImpl>::setup(void)
    LOG(Message) << "Fermion boundary conditions: " << par().boundary
                 << std::endl;
-    env().createGrid(par().Ls);
+    auto &U    = envGet(GaugeField, par().gauge);
-    auto &U    = envGet(LatticeGaugeField, par().gauge);
+    auto &g4   = *envGetGrid(FermionField);
-    auto &g4   = *env().getGrid();
+    auto &grb4 = *envGetRbGrid(FermionField);
-    auto &grb4 = *env().getRbGrid();
+    auto &g5   = *envGetGrid(FermionField, par().Ls);
-    auto &g5   = *env().getGrid(par().Ls);
+    auto &grb5 = *envGetRbGrid(FermionField, par().Ls);
    auto &grb5 = *env().getRbGrid(par().Ls);
    std::vector<Complex> boundary = strToVec<Complex>(par().boundary);
    typename MobiusFermion<FImpl>::ImplParams implParams(boundary);
    envCreateDerived(FMat, ScaledShamirFermion<FImpl>, getName(), par().Ls, U, g5,
--- a/Hadrons/Modules/MAction/Wilson.cc
+++ b/Hadrons/Modules/MAction/Wilson.cc
@@ -32,4 +32,4 @@ using namespace Hadrons;
 using namespace MAction;
 template class Grid::Hadrons::MAction::TWilson<FIMPL>;
-
+template class Grid::Hadrons::MAction::TWilson<FIMPLF>;
--- a/Hadrons/Modules/MAction/Wilson.hpp
+++ b/Hadrons/Modules/MAction/Wilson.hpp
@@ -54,7 +54,7 @@ template <typename FImpl>
 class TWilson: public Module<WilsonPar>
 {
 public:
-    FG_TYPE_ALIASES(FImpl,);
+    FERM_TYPE_ALIASES(FImpl,);
 public:
    // constructor
    TWilson(const std::string name);
@@ -71,6 +71,7 @@ protected:
 };
 MODULE_REGISTER_TMP(Wilson, TWilson<FIMPL>, MAction);
 MODULE_REGISTER_TMP(WilsonF, TWilson<FIMPLF>, MAction);
 /******************************************************************************
 *                     TWilson template implementation                        *
@@ -107,9 +108,9 @@ void TWilson<FImpl>::setup(void)
    LOG(Message) << "Fermion boundary conditions: " << par().boundary
                 << std::endl;
-    auto &U      = envGet(LatticeGaugeField, par().gauge);
+    auto &U      = envGet(GaugeField, par().gauge);
-    auto &grid   = *env().getGrid();
+    auto &grid   = *envGetGrid(FermionField);
-    auto &gridRb = *env().getRbGrid();
+    auto &gridRb = *envGetRbGrid(FermionField);
    std::vector<Complex> boundary = strToVec<Complex>(par().boundary);
    typename WilsonFermion<FImpl>::ImplParams implParams(boundary);
    envCreateDerived(FMat, WilsonFermion<FImpl>, getName(), 1, U, grid, gridRb,
--- a/Hadrons/Modules/MAction/WilsonClover.cc
+++ b/Hadrons/Modules/MAction/WilsonClover.cc
@@ -32,4 +32,4 @@ using namespace Hadrons;
 using namespace MAction;
 template class Grid::Hadrons::MAction::TWilsonClover<FIMPL>;
-
+template class Grid::Hadrons::MAction::TWilsonClover<FIMPLF>;
--- a/Hadrons/Modules/MAction/WilsonClover.hpp
+++ b/Hadrons/Modules/MAction/WilsonClover.hpp
@@ -59,7 +59,7 @@ template <typename FImpl>
 class TWilsonClover: public Module<WilsonCloverPar>
 {
 public:
-    FG_TYPE_ALIASES(FImpl,);
+    FERM_TYPE_ALIASES(FImpl,);
 public:
    // constructor
    TWilsonClover(const std::string name);
@@ -75,9 +75,10 @@ public:
 };
 MODULE_REGISTER_TMP(WilsonClover, TWilsonClover<FIMPL>, MAction);
 MODULE_REGISTER_TMP(WilsonCloverF, TWilsonClover<FIMPLF>, MAction);
 /******************************************************************************
- *                     TWilsonClover template implementation                        *
+ *                    TWilsonClover template implementation                   *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 template <typename FImpl>
@@ -113,16 +114,14 @@ void TWilsonClover<FImpl>::setup(void)
    LOG(Message) << "Clover term csw_r: " << par().csw_r
                 << " csw_t: " << par().csw_t
                 << std::endl;
-    auto &U      = envGet(LatticeGaugeField, par().gauge);
+    auto &U      = envGet(GaugeField, par().gauge);
-    auto &grid   = *env().getGrid();
+    auto &grid   = *envGetGrid(FermionField);
-    auto &gridRb = *env().getRbGrid();
+    auto &gridRb = *envGetRbGrid(FermionField);
    std::vector<Complex> boundary = strToVec<Complex>(par().boundary);
    typename WilsonCloverFermion<FImpl>::ImplParams implParams(boundary);
-    envCreateDerived(FMat, WilsonCloverFermion<FImpl>, getName(), 1, U, grid, gridRb, par().mass,
+    envCreateDerived(FMat, WilsonCloverFermion<FImpl>, getName(), 1, U, grid,
-						  par().csw_r,
+                     gridRb, par().mass, par().csw_r, par().csw_t, 
-						  par().csw_t,
+                     par().clover_anisotropy, implParams); 
 					      par().clover_anisotropy,
 						  implParams); 
 }
 // execution ///////////////////////////////////////////////////////////////////
--- a/Hadrons/Modules/MAction/ZMobiusDWF.cc
+++ b/Hadrons/Modules/MAction/ZMobiusDWF.cc
@@ -32,4 +32,4 @@ using namespace Hadrons;
 using namespace MAction;
 template class Grid::Hadrons::MAction::TZMobiusDWF<ZFIMPL>;
-
+template class Grid::Hadrons::MAction::TZMobiusDWF<ZFIMPLF>;
--- a/Hadrons/Modules/MAction/ZMobiusDWF.hpp
+++ b/Hadrons/Modules/MAction/ZMobiusDWF.hpp
@@ -57,7 +57,7 @@ template <typename FImpl>
 class TZMobiusDWF: public Module<ZMobiusDWFPar>
 {
 public:
-    FG_TYPE_ALIASES(FImpl,);
+    FERM_TYPE_ALIASES(FImpl,);
 public:
    // constructor
    TZMobiusDWF(const std::string name);
@@ -73,6 +73,7 @@ public:
 };
 MODULE_REGISTER_TMP(ZMobiusDWF, TZMobiusDWF<ZFIMPL>, MAction);
 MODULE_REGISTER_TMP(ZMobiusDWFF, TZMobiusDWF<ZFIMPLF>, MAction);
 /******************************************************************************
 *                     TZMobiusDWF implementation                             *
@@ -118,11 +119,11 @@ void TZMobiusDWF<FImpl>::setup(void)
                 << std::endl;
    env().createGrid(par().Ls);
-    auto &U    = envGet(LatticeGaugeField, par().gauge);
+    auto &U    = envGet(GaugeField, par().gauge);
-    auto &g4   = *env().getGrid();
+    auto &g4   = *envGetGrid(FermionField);
-    auto &grb4 = *env().getRbGrid();
+    auto &grb4 = *envGetRbGrid(FermionField);
-    auto &g5   = *env().getGrid(par().Ls);
+    auto &g5   = *envGetGrid(FermionField, par().Ls);
-    auto &grb5 = *env().getRbGrid(par().Ls);
+    auto &grb5 = *envGetRbGrid(FermionField, par().Ls);
    auto omega = par().omega;
    std::vector<Complex> boundary = strToVec<Complex>(par().boundary);
    typename ZMobiusFermion<FImpl>::ImplParams implParams(boundary);
--- a/Hadrons/Modules/MContraction/A2AAslashField.cc
+++ b/Hadrons/Modules/MContraction/A2AAslashField.cc
@@ -0,0 +1,7 @@
 #include <Hadrons/Modules/MContraction/A2AAslashField.hpp>
 using namespace Grid;
 using namespace Hadrons;
 using namespace MContraction;
 template class Grid::Hadrons::MContraction::TA2AAslashField<FIMPL, PhotonR>;
--- a/Hadrons/Modules/MContraction/A2AAslashField.hpp
+++ b/Hadrons/Modules/MContraction/A2AAslashField.hpp
@@ -0,0 +1,223 @@
 #ifndef Hadrons_MContraction_A2AAslashField_hpp_
 #define Hadrons_MContraction_A2AAslashField_hpp_
 #include <Hadrons/Global.hpp>
 #include <Hadrons/Module.hpp>
 #include <Hadrons/ModuleFactory.hpp>
 #include <Hadrons/A2AMatrix.hpp>
 #ifndef ASF_IO_TYPE
 #define ASF_IO_TYPE ComplexF
 #endif
 BEGIN_HADRONS_NAMESPACE
 /******************************************************************************
 *                         A2AAslashField                                 *
 ******************************************************************************/
 BEGIN_MODULE_NAMESPACE(MContraction)
 class A2AAslashFieldPar: Serializable
 {
 public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(A2AAslashFieldPar,
                                    int, cacheBlock,
                                    int, block,
                                    std::string, left,
                                    std::string, right,
                                    std::string, output,
                                    std::vector<std::string>, emField);
 };
 class A2AAslashFieldMetadata: Serializable
 {
 public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(A2AAslashFieldMetadata,
                                    std::string, emFieldName);
 };
 template <typename T, typename FImpl>
 class AslashFieldKernel: public A2AKernel<T, typename FImpl::FermionField>
 {
 public:
    typedef typename FImpl::FermionField FermionField;
 public:
    AslashFieldKernel(const std::vector<LatticeComplex> &emB0,
                      const std::vector<LatticeComplex> &emB1,
                      GridBase *grid)
    : emB0_(emB0), emB1_(emB1), grid_(grid)
    {
        vol_ = 1.;
        for (auto &d: grid_->GlobalDimensions())
        {
            vol_ *= d;
        }
    }
    virtual ~AslashFieldKernel(void) = default;
    virtual void operator()(A2AMatrixSet<T> &m, const FermionField *left, 
                            const FermionField *right,
                            const unsigned int orthogDim, double &t)
    {
        A2Autils<FImpl>::AslashField(m, left, right, emB0_, emB1_, orthogDim, &t);
    }
    virtual double flops(const unsigned int blockSizei, const unsigned int blockSizej)
    {
        return 0.;
    }
    virtual double bytes(const unsigned int blockSizei, const unsigned int blockSizej)
    {
        return 0.;
    }
 private:
    const std::vector<LatticeComplex> &emB0_, &emB1_;
    GridBase                          *grid_;
    double                            vol_;
 };
 template <typename FImpl, typename PhotonImpl>
 class TA2AAslashField: public Module<A2AAslashFieldPar>
 {
 public:
    FERM_TYPE_ALIASES(FImpl,);
    typedef typename PhotonImpl::GaugeField EmField;
    typedef A2AMatrixBlockComputation<Complex, 
                                      FermionField, 
                                      A2AAslashFieldMetadata, 
                                      ASF_IO_TYPE> Computation;
    typedef AslashFieldKernel<Complex, FImpl> Kernel;
 public:
    // constructor
    TA2AAslashField(const std::string name);
    // destructor
    virtual ~TA2AAslashField(void) {};
    // dependency relation
    virtual std::vector<std::string> getInput(void);
    virtual std::vector<std::string> getOutput(void);
    // setup
    virtual void setup(void);
    // execution
    virtual void execute(void);
 };
 MODULE_REGISTER_TMP(A2AAslashField, ARG(TA2AAslashField<FIMPL, PhotonR>), MContraction);
 /******************************************************************************
 *                 TA2AAslashField implementation                             *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 template <typename FImpl, typename PhotonImpl>
 TA2AAslashField<FImpl, PhotonImpl>::TA2AAslashField(const std::string name)
 : Module<A2AAslashFieldPar>(name)
 {}
 // dependencies/products ///////////////////////////////////////////////////////
 template <typename FImpl, typename PhotonImpl>
 std::vector<std::string> TA2AAslashField<FImpl, PhotonImpl>::getInput(void)
 {
    std::vector<std::string> in = par().emField;
    in.push_back(par().left);
    in.push_back(par().right);
    return in;
 }
 template <typename FImpl, typename PhotonImpl>
 std::vector<std::string> TA2AAslashField<FImpl, PhotonImpl>::getOutput(void)
 {
    std::vector<std::string> out = {};
    return out;
 }
 // setup ///////////////////////////////////////////////////////////////////////
 template <typename FImpl, typename PhotonImpl>
 void TA2AAslashField<FImpl, PhotonImpl>::setup(void)
 {
    envTmp(Computation, "computation", 1, envGetGrid(FermionField), 
           env().getNd() - 1, par().emField.size(), 1, par().block, 
           par().cacheBlock, this);
    envTmp(std::vector<ComplexField>, "B0", 1, 
           par().emField.size(), envGetGrid(ComplexField));
    envTmp(std::vector<ComplexField>, "B1", 1, 
           par().emField.size(), envGetGrid(ComplexField));
    envTmpLat(ComplexField, "Amu");
 }
 // execution ///////////////////////////////////////////////////////////////////
 template <typename FImpl, typename PhotonImpl>
 void TA2AAslashField<FImpl, PhotonImpl>::execute(void)
 {
    auto &left  = envGet(std::vector<FermionField>, par().left);
    auto &right = envGet(std::vector<FermionField>, par().right);
    int nt         = env().getDim().back();
    int N_i        = left.size();
    int N_j        = right.size();
    int nem        = par().emField.size();
    int block      = par().block;
    int cacheBlock = par().cacheBlock;
    LOG(Message) << "Computing all-to-all A-slash fields" << std::endl;
    LOG(Message) << "Left: '" << par().left << "' Right: '" << par().right << "'" << std::endl;
    LOG(Message) << "EM fields:" << std::endl;
    for (auto &name: par().emField)
    {
        LOG(Message) << "  " << name << std::endl;
    }
    LOG(Message) << "A-slash field size: " << nt << "*" << N_i << "*" << N_j 
                 << " (filesize " << sizeString(nt*N_i*N_j*sizeof(ASF_IO_TYPE)) 
                 << "/EM field)" << std::endl;
    // preparing "B" complexified fields
    startTimer("Complexify EM fields");
    envGetTmp(std::vector<ComplexField>, B0);
    envGetTmp(std::vector<ComplexField>, B1);
    for (unsigned int i = 0; i < par().emField.size(); ++i)
    {
        auto &A = envGet(EmField, par().emField[i]);
        envGetTmp(ComplexField, Amu);
        B0[i]  = peekLorentz(A, 0);
        B0[i] += timesI(peekLorentz(A, 1));
        B1[i]  = peekLorentz(A, 2);
        B1[i] += timesI(peekLorentz(A, 3));
    }
    stopTimer("Complexify EM fields");
    // I/O name & metadata lambdas
    auto ionameFn = [this](const unsigned int em, const unsigned int dummy)
    {
        return par().emField[em];
    };
    auto filenameFn = [this, &ionameFn](const unsigned int em, const unsigned int dummy)
    {
        return par().output + "." + std::to_string(vm().getTrajectory()) 
               + "/" + ionameFn(em, dummy) + ".h5";
    };
    auto metadataFn = [this](const unsigned int em, const unsigned int dummy)
    {
        A2AAslashFieldMetadata md;
        md.emFieldName = par().emField[em];
        return md;
    };
    // executing computation
    Kernel kernel(B0, B1, envGetGrid(FermionField));
    envGetTmp(Computation, computation);
    computation.execute(left, right, kernel, ionameFn, filenameFn, metadataFn);
 }
 END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
 #endif // Hadrons_MContraction_A2AAslashField_hpp_
--- a/Hadrons/Modules/MContraction/A2AMesonField.cc
+++ b/Hadrons/Modules/MContraction/A2AMesonField.cc
@@ -33,4 +33,3 @@ using namespace Hadrons;
 using namespace MContraction;
 template class Grid::Hadrons::MContraction::TA2AMesonField<FIMPL>;
 template class Grid::Hadrons::MContraction::TA2AMesonField<ZFIMPL>;
--- a/Hadrons/Modules/MContraction/A2AMesonField.hpp
+++ b/Hadrons/Modules/MContraction/A2AMesonField.hpp
@@ -33,12 +33,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #include <Hadrons/Global.hpp>
 #include <Hadrons/Module.hpp>
 #include <Hadrons/ModuleFactory.hpp>
 #include <Hadrons/A2AVectors.hpp>
 #include <Hadrons/A2AMatrix.hpp>
 #include <Hadrons/Modules/MSolver/A2AVectors.hpp>
 #include <Hadrons/Modules/MContraction/A2AMesonFieldKernels.hpp>
 #define MF_PARALLEL_IO
 #ifndef MF_IO_TYPE
 #define MF_IO_TYPE ComplexF
 #endif
@@ -56,8 +52,8 @@ public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(A2AMesonFieldPar,
                                    int, cacheBlock,
                                    int, block,
-                                    std::string, v,
+                                    std::string, left,
-                                    std::string, w,
+                                    std::string, right,
                                    std::string, output,
                                    std::string, gammas,
                                    std::vector<std::string>, mom);
@@ -71,21 +67,59 @@ public:
                                    Gamma::Algebra, gamma);
 };
 template <typename T, typename FImpl>
 class MesonFieldKernel: public A2AKernel<T, typename FImpl::FermionField>
 {
 public:
    typedef typename FImpl::FermionField FermionField;
 public:
    MesonFieldKernel(const std::vector<Gamma::Algebra> &gamma,
                     const std::vector<LatticeComplex> &mom,
                     GridBase *grid)
    : gamma_(gamma), mom_(mom), grid_(grid)
    {
        vol_ = 1.;
        for (auto &d: grid_->GlobalDimensions())
        {
            vol_ *= d;
        }
    }
    virtual ~MesonFieldKernel(void) = default;
    virtual void operator()(A2AMatrixSet<T> &m, const FermionField *left, 
                            const FermionField *right,
                            const unsigned int orthogDim, double &t)
    {
        A2Autils<FImpl>::MesonField(m, left, right, gamma_, mom_, orthogDim, &t);
    }
    virtual double flops(const unsigned int blockSizei, const unsigned int blockSizej)
    {
        return vol_*(2*8.0+6.0+8.0*mom_.size())*blockSizei*blockSizej*gamma_.size();
    }
    virtual double bytes(const unsigned int blockSizei, const unsigned int blockSizej)
    {
        return vol_*(12.0*sizeof(T))*blockSizei*blockSizej
               +  vol_*(2.0*sizeof(T)*mom_.size())*blockSizei*blockSizej*gamma_.size();
    }
 private:
    const std::vector<Gamma::Algebra> &gamma_;
    const std::vector<LatticeComplex> &mom_;
    GridBase                          *grid_;
    double                            vol_;
 };
 template <typename FImpl>
 class TA2AMesonField : public Module<A2AMesonFieldPar>
 {
 public:
    FERM_TYPE_ALIASES(FImpl,);
-    SOLVER_TYPE_ALIASES(FImpl,);
+    typedef A2AMatrixBlockComputation<Complex, 
-    typedef Eigen::TensorMap<Eigen::Tensor<Complex, 5, Eigen::RowMajor>>    MesonField;
+                                      FermionField, 
-    typedef Eigen::TensorMap<Eigen::Tensor<MF_IO_TYPE, 5, Eigen::RowMajor>> MesonFieldIo;
+                                      A2AMesonFieldMetadata, 
-    typedef A2AMatrixIo<MF_IO_TYPE, A2AMesonFieldMetadata>                  MatrixIo;
+                                      MF_IO_TYPE> Computation;
-    struct IoHelper
+    typedef MesonFieldKernel<Complex, FImpl> Kernel;
    {
        MatrixIo              io;
        A2AMesonFieldMetadata metadata;
        size_t                offset;
    };
 public:
    // constructor
    TA2AMesonField(const std::string name);
@@ -99,20 +133,13 @@ public:
    // execution
    virtual void execute(void);
 private:
-    // IO
+    bool                               hasPhase_{false};
-    std::string ioname(unsigned int m, unsigned int g) const;
+    std::string                        momphName_;
-    std::string filename(unsigned int m, unsigned int g) const;
+    std::vector<Gamma::Algebra>        gamma_;
-    void saveBlock(const MF_IO_TYPE *data, IoHelper &h, unsigned int i, unsigned int j);
+    std::vector<std::vector<Real>>     mom_;
 private:
    bool                                               hasPhase_{false};
    std::string                                        momphName_;
    std::vector<Gamma::Algebra>                        gamma_;
    std::vector<std::vector<Real>>                     mom_;
    std::vector<IoHelper>                              nodeIo_;
 };
 MODULE_REGISTER(A2AMesonField, ARG(TA2AMesonField<FIMPL>), MContraction);
 MODULE_REGISTER(ZA2AMesonField, ARG(TA2AMesonField<ZFIMPL>), MContraction);
 /******************************************************************************
 *                  TA2AMesonField implementation                             *
@@ -129,7 +156,7 @@ TA2AMesonField<FImpl>::TA2AMesonField(const std::string name)
 template <typename FImpl>
 std::vector<std::string> TA2AMesonField<FImpl>::getInput(void)
 {
-    std::vector<std::string> in = {par().v, par().w};
+    std::vector<std::string> in = {par().left, par().right};
    return in;
 }
@@ -185,34 +212,31 @@ void TA2AMesonField<FImpl>::setup(void)
        }
        mom_.push_back(p);
    }
-    
+    envCache(std::vector<ComplexField>, momphName_, 1, 
-    envCache(std::vector<LatticeComplex>, momphName_, 1, 
+             par().mom.size(), envGetGrid(ComplexField));
-             par().mom.size(), env().getGrid());
+    envTmpLat(ComplexField, "coor");
-    envTmpLat(LatticeComplex, "coor");
+    envTmp(Computation, "computation", 1, envGetGrid(FermionField), 
-    // preallocate memory for meson field block
+           env().getNd() - 1, mom_.size(), gamma_.size(), par().block, 
-    auto tgp = env().getDim().back()*gamma_.size()*mom_.size();
+           par().cacheBlock, this);
    envTmp(Vector<MF_IO_TYPE>, "mfBuf", 1, tgp*par().block*par().block);
    envTmp(Vector<Complex>, "mfCache", 1, tgp*par().cacheBlock*par().cacheBlock);
 }
 // execution ///////////////////////////////////////////////////////////////////
 template <typename FImpl>
 void TA2AMesonField<FImpl>::execute(void)
 {
-    auto &v = envGet(std::vector<FermionField>, par().v);
+    auto &left  = envGet(std::vector<FermionField>, par().left);
-    auto &w = envGet(std::vector<FermionField>, par().w);
+    auto &right = envGet(std::vector<FermionField>, par().right);
    int nt         = env().getDim().back();
-    int N_i        = w.size();
+    int N_i        = left.size();
-    int N_j        = v.size();
+    int N_j        = right.size();
    int ngamma     = gamma_.size();
    int nmom       = mom_.size();
    int block      = par().block;
    int cacheBlock = par().cacheBlock;
    LOG(Message) << "Computing all-to-all meson fields" << std::endl;
-    LOG(Message) << "W: '" << par().w << "' V: '" << par().v << "'" << std::endl;
+    LOG(Message) << "Left: '" << par().left << "' Right: '" << par().right << "'" << std::endl;
    LOG(Message) << "Momenta:" << std::endl;
    for (auto &p: mom_)
    {
@@ -227,10 +251,7 @@ void TA2AMesonField<FImpl>::execute(void)
                 << " (filesize " << sizeString(nt*N_i*N_j*sizeof(MF_IO_TYPE)) 
                 << "/momentum/bilinear)" << std::endl;
-    ///////////////////////////////////////////////
+    auto &ph = envGet(std::vector<ComplexField>, momphName_);
    // Momentum setup
    ///////////////////////////////////////////////
    auto &ph = envGet(std::vector<LatticeComplex>, momphName_);
    if (!hasPhase_)
    {
@@ -240,7 +261,7 @@ void TA2AMesonField<FImpl>::execute(void)
            Complex           i(0.0,1.0);
            std::vector<Real> p;
-            envGetTmp(LatticeComplex, coor);
+            envGetTmp(ComplexField, coor);
            ph[j] = zero;
            for(unsigned int mu = 0; mu < mom_[j].size(); mu++)
            {
@@ -252,182 +273,43 @@ void TA2AMesonField<FImpl>::execute(void)
        hasPhase_ = true;
        stopTimer("Momentum phases");
    }
    //////////////////////////////////////////////////////////////////////////
    // i,j   is first  loop over SchurBlock factors reusing 5D matrices
    // ii,jj is second loop over cacheBlock factors for high perf contractoin
    // iii,jjj are loops within cacheBlock
    // Total index is sum of these  i+ii+iii etc...
    //////////////////////////////////////////////////////////////////////////
    double flops;
    double bytes;
    double vol      = env().getVolume();
    double t_kernel = 0.0;
    double nodes    = env().getGrid()->NodeCount();
    double tot_kernel;
-    envGetTmp(Vector<MF_IO_TYPE>, mfBuf);
+    auto ionameFn = [this](const unsigned int m, const unsigned int g)
    envGetTmp(Vector<Complex>, mfCache);
    double t0    = usecond();
    int NBlock_i = N_i/block + (((N_i % block) != 0) ? 1 : 0);
    int NBlock_j = N_j/block + (((N_j % block) != 0) ? 1 : 0);
    for(int i=0;i<N_i;i+=block)
    for(int j=0;j<N_j;j+=block)
    {
-        // Get the W and V vectors for this block^2 set of terms
+        std::stringstream ss;
        int N_ii = MIN(N_i-i,block);
        int N_jj = MIN(N_j-j,block);
-        LOG(Message) << "Meson field block " 
+        ss << gamma_[g] << "_";
-                    << j/block + NBlock_j*i/block + 1 
+        for (unsigned int mu = 0; mu < mom_[m].size(); ++mu)
                    << "/" << NBlock_i*NBlock_j << " [" << i <<" .. " 
                    << i+N_ii-1 << ", " << j <<" .. " << j+N_jj-1 << "]" 
                    << std::endl;
        MesonFieldIo mfBlock(mfBuf.data(),nmom,ngamma,nt,N_ii,N_jj);
        // Series of cache blocked chunks of the contractions within this block
        flops = 0.0;
        bytes = 0.0;
        for(int ii=0;ii<N_ii;ii+=cacheBlock)
        for(int jj=0;jj<N_jj;jj+=cacheBlock)
        {
-            int N_iii = MIN(N_ii-ii,cacheBlock);
+            ss << mom_[m][mu] << ((mu == mom_[m].size() - 1) ? "" : "_");
            int N_jjj = MIN(N_jj-jj,cacheBlock);
            MesonField mfCacheBlock(mfCache.data(),nmom,ngamma,nt,N_iii,N_jjj);    
            startTimer("contraction: total");
            makeMesonFieldBlock(mfCacheBlock, &w[i+ii], &v[j+jj], gamma_, ph, 
                                env().getNd() - 1, this);
            stopTimer("contraction: total");
            // flops for general N_c & N_s
            flops += vol * ( 2 * 8.0 + 6.0 + 8.0*nmom) * N_iii*N_jjj*ngamma;
            bytes += vol * (12.0 * sizeof(Complex) ) * N_iii*N_jjj
                     +  vol * ( 2.0 * sizeof(Complex) *nmom ) * N_iii*N_jjj* ngamma;
            startTimer("cache copy");
            parallel_for_nest5(int m =0;m< nmom;m++)
            for(int g =0;g< ngamma;g++)
            for(int t =0;t< nt;t++)
            for(int iii=0;iii< N_iii;iii++)
            for(int jjj=0;jjj< N_jjj;jjj++)
            {
                mfBlock(m,g,t,ii+iii,jj+jjj) = mfCacheBlock(m,g,t,iii,jjj);
            }
            stopTimer("cache copy");
        }
-        // perf
+        return ss.str();
-        tot_kernel = getDTimer("contraction: colour trace & mom.")
+    };
                     + getDTimer("contraction: local space sum");
        t_kernel   = tot_kernel - t_kernel;
        LOG(Message) << "Kernel perf " << flops/t_kernel/1.0e3/nodes 
                     << " Gflop/s/node " << std::endl;
        LOG(Message) << "Kernel perf " << bytes/t_kernel*1.0e6/1024/1024/1024/nodes 
                     << " GB/s/node "  << std::endl;
        t_kernel = tot_kernel;
-        // IO
+    auto filenameFn = [this, &ionameFn](const unsigned int m, const unsigned int g)
-        if (!par().output.empty())
+    {
        return par().output + "." + std::to_string(vm().getTrajectory()) 
               + "/" + ionameFn(m, g) + ".h5";
    };
    auto metadataFn = [this](const unsigned int m, const unsigned int g)
    {
        A2AMesonFieldMetadata md;
        for (auto pmu: mom_[m])
        {
-            double       blockSize, ioTime;
+            md.momentum.push_back(pmu);
-            unsigned int myRank = env().getGrid()->ThisRank(),
+        }
-                         nRank  = env().getGrid()->RankCount();
+        md.gamma = gamma_[g];
-            LOG(Message) << "Writing block to disk" << std::endl;
+        return md;
-            ioTime = -getDTimer("IO: write block");
+    };
            startTimer("IO: total");
            makeFileDir(filename(0, 0), env().getGrid());
 #ifdef MF_PARALLEL_IO
            env().getGrid()->Barrier();
            nodeIo_.clear();
            for(int f = myRank; f < nmom*ngamma; f += nRank)
            {
                const unsigned int    m = f/ngamma, g = f % ngamma;
                IoHelper              h;
-                h.io = MatrixIo(filename(m, g), ioname(m, g), nt, N_i, N_j, block);
+    Kernel      kernel(gamma_, ph, envGetGrid(FermionField));
                for (auto pmu: mom_[m])
                {
                    h.metadata.momentum.push_back(pmu);
                }
                h.metadata.gamma = gamma_[g];
                h.offset         = (m*ngamma + g)*nt*block*block;
                nodeIo_.push_back(h);
            }
            // parallel IO
            for (auto &h: nodeIo_)
            {
                saveBlock(mfBlock.data(), h, i, j);
            }
            env().getGrid()->Barrier();
 #else
            // serial IO
            for(int m = 0; m < nmom; m++)
            for(int g = 0; g < ngamma; g++)
            {
                IoHelper h;
-                h.io = MatrixIo(filename(m, g), ioname(m, g), nt, N_i, N_j, block);
+    envGetTmp(Computation, computation);
-                for (auto pmu: mom_[m])
+    computation.execute(left, right, kernel, ionameFn, filenameFn, metadataFn);
                {
                    h.metadata.momentum.push_back(pmu);
                }
                h.metadata.gamma = gamma_[g];
                h.offset         = (m*ngamma + g)*nt*block*block;
                saveBlock(mfBlock.data(), h, i, j);
            }
 #endif
            stopTimer("IO: total");
            blockSize  = static_cast<double>(nmom*ngamma*nt*N_ii*N_jj*sizeof(MF_IO_TYPE));
            ioTime    += getDTimer("IO: write block");
            LOG(Message) << "HDF5 IO done " << sizeString(blockSize) << " in "
                         << ioTime  << " us (" 
                         << blockSize/ioTime*1.0e6/1024/1024
                         << " MB/s)" << std::endl;
        }
    }
 }
 // IO
 template <typename FImpl>
 std::string TA2AMesonField<FImpl>::ioname(unsigned int m, unsigned int g) const
 {
    std::stringstream ss;
    ss << gamma_[g] << "_";
    for (unsigned int mu = 0; mu < mom_[m].size(); ++mu)
    {
        ss << mom_[m][mu] << ((mu == mom_[m].size() - 1) ? "" : "_");
    }
    return ss.str();
 }
 template <typename FImpl>
 std::string TA2AMesonField<FImpl>::filename(unsigned int m, unsigned int g) const
 {
    return par().output + "." + std::to_string(vm().getTrajectory()) 
           + "/" + ioname(m, g) + ".h5";
 }
 template <typename FImpl>
 void TA2AMesonField<FImpl>::saveBlock(const MF_IO_TYPE *data, IoHelper &h, 
                                      unsigned int i, unsigned int j)
 {
    if ((i == 0) and (j == 0))
    {
        startTimer("IO: file creation");
        h.io.initFile(h.metadata);
        stopTimer("IO: file creation");
    }
    startTimer("IO: write block");
    h.io.saveBlock(data + h.offset, i, j);
    stopTimer("IO: write block");
 }
 END_MODULE_NAMESPACE
--- a/Hadrons/Modules/MContraction/A2AMesonFieldKernels.hpp
+++ b/Hadrons/Modules/MContraction/A2AMesonFieldKernels.hpp
@@ -1,224 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Modules/MContraction/A2AMesonFieldKernels.hpp
 Copyright (C) 2015-2018
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef Hadrons_MContraction_A2AMesonFieldKernels_hpp_
 #define Hadrons_MContraction_A2AMesonFieldKernels_hpp_
 #include <Hadrons/Global.hpp>
 #include <Hadrons/Module.hpp>
 #include <Grid/Eigen/unsupported/CXX11/Tensor>
 BEGIN_HADRONS_NAMESPACE
 BEGIN_MODULE_NAMESPACE(MContraction)
 ////////////////////////////////////////////////////////////////////////////////
 // Cache blocked arithmetic routine
 // Could move to Grid ???
 ////////////////////////////////////////////////////////////////////////////////
 template <typename Field, typename MesonField>
 void makeMesonFieldBlock(MesonField &mat, 
                         const Field *lhs_wi,
                         const Field *rhs_vj,
                         std::vector<Gamma::Algebra> gamma,
                         const std::vector<LatticeComplex> &mom,
                         int orthogdim,
                         ModuleBase *caller = nullptr) 
 {
    typedef typename Field::vector_object vobj;
    typedef typename vobj::scalar_object  sobj;
    typedef typename vobj::scalar_type    scalar_type;
    typedef typename vobj::vector_type    vector_type;
    typedef iSpinMatrix<vector_type> SpinMatrix_v;
    typedef iSpinMatrix<scalar_type> SpinMatrix_s;
    int Lblock = mat.dimension(3); 
    int Rblock = mat.dimension(4);
    GridBase *grid = lhs_wi[0]._grid;
    const int    Nd = grid->_ndimension;
    const int Nsimd = grid->Nsimd();
    int Nt     = grid->GlobalDimensions()[orthogdim];
    int Ngamma = gamma.size();
    int Nmom   = mom.size();
    int fd=grid->_fdimensions[orthogdim];
    int ld=grid->_ldimensions[orthogdim];
    int rd=grid->_rdimensions[orthogdim];
    // will locally sum vectors first
    // sum across these down to scalars
    // splitting the SIMD
    int MFrvol = rd*Lblock*Rblock*Nmom;
    int MFlvol = ld*Lblock*Rblock*Nmom;
    Vector<SpinMatrix_v > lvSum(MFrvol);
    parallel_for (int r = 0; r < MFrvol; r++)
    {
        lvSum[r] = zero;
    }
    Vector<SpinMatrix_s > lsSum(MFlvol);             
    parallel_for (int r = 0; r < MFlvol; r++)
    {
        lsSum[r]=scalar_type(0.0);
    }
    int e1=    grid->_slice_nblock[orthogdim];
    int e2=    grid->_slice_block [orthogdim];
    int stride=grid->_slice_stride[orthogdim];
    if (caller) caller->startTimer("contraction: colour trace & mom.");
    // Nested parallelism would be ok
    // Wasting cores here. Test case r
    parallel_for(int r=0;r<rd;r++)
    {
        int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 
        for(int n=0;n<e1;n++)
        for(int b=0;b<e2;b++)
        {
            int ss= so+n*stride+b;
            for(int i=0;i<Lblock;i++)
            {
                auto left = conjugate(lhs_wi[i]._odata[ss]);
                for(int j=0;j<Rblock;j++)
                {
                    SpinMatrix_v vv;
                    auto right = rhs_vj[j]._odata[ss];
                    for(int s1=0;s1<Ns;s1++)
                    for(int s2=0;s2<Ns;s2++)
                    {
                        vv()(s1,s2)() = left()(s2)(0) * right()(s1)(0)
                                        + left()(s2)(1) * right()(s1)(1)
                                        + left()(s2)(2) * right()(s1)(2);
                    }
                    // After getting the sitewise product do the mom phase loop
                    int base = Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*r;
                    for ( int m=0;m<Nmom;m++)
                    {
                        int idx = m+base;
                        auto phase = mom[m]._odata[ss];
                        mac(&lvSum[idx],&vv,&phase);
                    }
                }
            }
        }
    }
    if (caller) caller->stopTimer("contraction: colour trace & mom.");
    // Sum across simd lanes in the plane, breaking out orthog dir.
    if (caller) caller->startTimer("contraction: local space sum");
    parallel_for(int rt=0;rt<rd;rt++)
    {
        std::vector<int> icoor(Nd);
        std::vector<SpinMatrix_s> extracted(Nsimd);               
        for(int i=0;i<Lblock;i++)
        for(int j=0;j<Rblock;j++)
        for(int m=0;m<Nmom;m++)
        {
            int ij_rdx = m+Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*rt;
            extract(lvSum[ij_rdx],extracted);
            for(int idx=0;idx<Nsimd;idx++)
            {
                grid->iCoorFromIindex(icoor,idx);
                int ldx    = rt+icoor[orthogdim]*rd;
                int ij_ldx = m+Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*ldx;
                lsSum[ij_ldx]=lsSum[ij_ldx]+extracted[idx];
            }
        }
    }
    if (caller) caller->stopTimer("contraction: local space sum");
    // ld loop and local only??
    if (caller) caller->startTimer("contraction: spin trace");
    int pd = grid->_processors[orthogdim];
    int pc = grid->_processor_coor[orthogdim];
    parallel_for_nest2(int lt=0;lt<ld;lt++)
    {
        for(int pt=0;pt<pd;pt++)
        {
            int t = lt + pt*ld;
            if (pt == pc)
            {
                for(int i=0;i<Lblock;i++)
                for(int j=0;j<Rblock;j++)
                for(int m=0;m<Nmom;m++)
                {
                    int ij_dx = m+Nmom*i + Nmom*Lblock * j + Nmom*Lblock * Rblock * lt;
                    for(int mu=0;mu<Ngamma;mu++)
                    {
                        // this is a bit slow
                        mat(m,mu,t,i,j) = trace(lsSum[ij_dx]*Gamma(gamma[mu]));
                    }
                }
            } 
            else 
            { 
                const scalar_type zz(0.0);
                for(int i=0;i<Lblock;i++)
                for(int j=0;j<Rblock;j++)
                for(int mu=0;mu<Ngamma;mu++)
                for(int m=0;m<Nmom;m++)
                {
                    mat(m,mu,t,i,j) =zz;
                }
            }
        }
    }
    if (caller) caller->stopTimer("contraction: spin trace");
    ////////////////////////////////////////////////////////////////////
    // This global sum is taking as much as 50% of time on 16 nodes
    // Vector size is 7 x 16 x 32 x 16 x 16 x sizeof(complex) = 2MB - 60MB depending on volume
    // Healthy size that should suffice
    ////////////////////////////////////////////////////////////////////
    if (caller) caller->startTimer("contraction: global sum");
    grid->GlobalSumVector(&mat(0,0,0,0,0),Nmom*Ngamma*Nt*Lblock*Rblock);
    if (caller) caller->stopTimer("contraction: global sum");
 }
 END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
 #endif //Hadrons_MContraction_A2AMesonField_hpp_
--- a/Hadrons/Modules/MContraction/Meson.hpp
+++ b/Hadrons/Modules/MContraction/Meson.hpp
@@ -77,7 +77,7 @@ class TMeson: public Module<MesonPar>
 public:
    FERM_TYPE_ALIASES(FImpl1, 1);
    FERM_TYPE_ALIASES(FImpl2, 2);
-    FERM_TYPE_ALIASES(ScalarImplCR, Scalar);
+    BASIC_TYPE_ALIASES(ScalarImplCR, Scalar);
    SINK_TYPE_ALIASES(Scalar);
    class Result: Serializable
    {
--- a/Hadrons/Modules/MFermion/FreeProp.hpp
+++ b/Hadrons/Modules/MFermion/FreeProp.hpp
@@ -56,7 +56,7 @@ template <typename FImpl>
 class TFreeProp: public Module<FreePropPar>
 {
 public:
-    FG_TYPE_ALIASES(FImpl,);
+    FERM_TYPE_ALIASES(FImpl,);
 public:
    // constructor
    TFreeProp(const std::string name);
--- a/Hadrons/Modules/MFermion/GaugeProp.hpp
+++ b/Hadrons/Modules/MFermion/GaugeProp.hpp
@@ -56,7 +56,7 @@ template <typename FImpl>
 class TGaugeProp: public Module<GaugePropPar>
 {
 public:
-    FG_TYPE_ALIASES(FImpl,);
+    FERM_TYPE_ALIASES(FImpl,);
    SOLVER_TYPE_ALIASES(FImpl,);
 public:
    // constructor
--- a/Hadrons/Modules/MGauge/Random.cc
+++ b/Hadrons/Modules/MGauge/Random.cc
@@ -25,47 +25,10 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Hadrons/Modules/MGauge/Random.hpp>
 using namespace Grid;
 using namespace Hadrons;
 using namespace MGauge;
-/******************************************************************************
+template class Grid::Hadrons::MGauge::TRandom<GIMPL>;
 *                           TRandom implementation                            *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 TRandom::TRandom(const std::string name)
 : Module<NoPar>(name)
 {}
 // dependencies/products ///////////////////////////////////////////////////////
 std::vector<std::string> TRandom::getInput(void)
 {
    std::vector<std::string> in;
    return in;
 }
 std::vector<std::string> TRandom::getOutput(void)
 {
    std::vector<std::string> out = {getName()};
    return out;
 }
 // setup ///////////////////////////////////////////////////////////////////////
 void TRandom::setup(void)
 {
    envCreateLat(LatticeGaugeField, getName());
 }
 // execution ///////////////////////////////////////////////////////////////////
 void TRandom::execute(void)
 {
    LOG(Message) << "Generating random gauge configuration" << std::endl;
    auto &U = envGet(LatticeGaugeField, getName());
    SU3::HotConfiguration(rng4d(), U);
 }
--- a/Hadrons/Modules/MGauge/Random.hpp
+++ b/Hadrons/Modules/MGauge/Random.hpp
@@ -40,8 +40,11 @@ BEGIN_HADRONS_NAMESPACE
 ******************************************************************************/
 BEGIN_MODULE_NAMESPACE(MGauge)
 template <typename GImpl>
 class TRandom: public Module<NoPar>
 {
 public:
    GAUGE_TYPE_ALIASES(GImpl,);
 public:
    // constructor
    TRandom(const std::string name);
@@ -57,7 +60,50 @@ protected:
    virtual void execute(void);
 };
-MODULE_REGISTER(Random, TRandom, MGauge);
+MODULE_REGISTER_TMP(Random, TRandom<GIMPL>, MGauge);
 /******************************************************************************
 *                           TRandom implementation                            *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 template <typename GImpl>
 TRandom<GImpl>::TRandom(const std::string name)
 : Module<NoPar>(name)
 {}
 // dependencies/products ///////////////////////////////////////////////////////
 template <typename GImpl>
 std::vector<std::string> TRandom<GImpl>::getInput(void)
 {
    std::vector<std::string> in;
    return in;
 }
 template <typename GImpl>
 std::vector<std::string> TRandom<GImpl>::getOutput(void)
 {
    std::vector<std::string> out = {getName()};
    return out;
 }
 // setup ///////////////////////////////////////////////////////////////////////
 template <typename GImpl>
 void TRandom<GImpl>::setup(void)
 {
    envCreateLat(GaugeField, getName());
 }
 // execution ///////////////////////////////////////////////////////////////////
 template <typename GImpl>
 void TRandom<GImpl>::execute(void)
 {
    LOG(Message) << "Generating random gauge configuration" << std::endl;
    auto &U = envGet(GaugeField, getName());
    GImpl::HotConfiguration(rng4d(), U);
 }
 END_MODULE_NAMESPACE
--- a/Hadrons/Modules/MGauge/StoutSmearing.hpp
+++ b/Hadrons/Modules/MGauge/StoutSmearing.hpp
@@ -52,7 +52,7 @@ template <typename GImpl>
 class TStoutSmearing: public Module<StoutSmearingPar>
 {
 public:
-    typedef typename GImpl::Field GaugeField;
+    GAUGE_TYPE_ALIASES(GImpl,);
 public:
    // constructor
    TStoutSmearing(const std::string name);
--- a/Hadrons/Modules/MGauge/Unit.cc
+++ b/Hadrons/Modules/MGauge/Unit.cc
@@ -25,45 +25,10 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Hadrons/Modules/MGauge/Unit.hpp>
 using namespace Grid;
 using namespace Hadrons;
 using namespace MGauge;
-/******************************************************************************
+template class Grid::Hadrons::MGauge::TUnit<GIMPL>;
 *                            TUnit implementation                             *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 TUnit::TUnit(const std::string name)
 : Module<NoPar>(name)
 {}
 // dependencies/products ///////////////////////////////////////////////////////
 std::vector<std::string> TUnit::getInput(void)
 {
    return std::vector<std::string>();
 }
 std::vector<std::string> TUnit::getOutput(void)
 {
    std::vector<std::string> out = {getName()};
    return out;
 }
 // setup ///////////////////////////////////////////////////////////////////////
 void TUnit::setup(void)
 {
    envCreateLat(LatticeGaugeField, getName());
 }
 // execution ///////////////////////////////////////////////////////////////////
 void TUnit::execute(void)
 {
    LOG(Message) << "Creating unit gauge configuration" << std::endl;
    auto &U = envGet(LatticeGaugeField, getName());
    SU3::ColdConfiguration(rng4d(), U);
 }
--- a/Hadrons/Modules/MGauge/Unit.hpp
+++ b/Hadrons/Modules/MGauge/Unit.hpp
@@ -40,8 +40,11 @@ BEGIN_HADRONS_NAMESPACE
 ******************************************************************************/
 BEGIN_MODULE_NAMESPACE(MGauge)
 template <typename GImpl>
 class TUnit: public Module<NoPar>
 {
 public:
    GAUGE_TYPE_ALIASES(GImpl,);
 public:
    // constructor
    TUnit(const std::string name);
@@ -57,7 +60,48 @@ protected:
    virtual void execute(void);
 };
-MODULE_REGISTER(Unit, TUnit, MGauge);
+MODULE_REGISTER_TMP(Unit, TUnit<GIMPL>, MGauge);
 /******************************************************************************
 *                            TUnit implementation                             *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 template <typename GImpl>
 TUnit<GImpl>::TUnit(const std::string name)
 : Module<NoPar>(name)
 {}
 // dependencies/products ///////////////////////////////////////////////////////
 template <typename GImpl>
 std::vector<std::string> TUnit<GImpl>::getInput(void)
 {
    return std::vector<std::string>();
 }
 template <typename GImpl>
 std::vector<std::string> TUnit<GImpl>::getOutput(void)
 {
    std::vector<std::string> out = {getName()};
    return out;
 }
 // setup ///////////////////////////////////////////////////////////////////////
 template <typename GImpl>
 void TUnit<GImpl>::setup(void)
 {
    envCreateLat(GaugeField, getName());
 }
 // execution ///////////////////////////////////////////////////////////////////
 template <typename GImpl>
 void TUnit<GImpl>::execute(void)
 {
    LOG(Message) << "Creating unit gauge configuration" << std::endl;
    auto &U = envGet(GaugeField, getName());
    GImpl::ColdConfiguration(rng4d(), U);
 }
 END_MODULE_NAMESPACE
--- a/Hadrons/Modules/MIO/LoadEigenPack.cc
+++ b/Hadrons/Modules/MIO/LoadEigenPack.cc
@@ -32,4 +32,4 @@ using namespace Hadrons;
 using namespace MIO;
 template class Grid::Hadrons::MIO::TLoadEigenPack<FermionEigenPack<FIMPL>>;
-
+template class Grid::Hadrons::MIO::TLoadEigenPack<FermionEigenPack<FIMPL, FIMPLF>>;
--- a/Hadrons/Modules/MIO/LoadEigenPack.hpp
+++ b/Hadrons/Modules/MIO/LoadEigenPack.hpp
@@ -54,7 +54,9 @@ template <typename Pack>
 class TLoadEigenPack: public Module<LoadEigenPackPar>
 {
 public:
-    typedef EigenPack<typename Pack::Field> BasePack;
+    typedef typename Pack::Field   Field;
    typedef typename Pack::FieldIo FieldIo;
    typedef BaseEigenPack<Field>   BasePack;
 public:
    // constructor
    TLoadEigenPack(const std::string name);
@@ -70,6 +72,7 @@ public:
 };
 MODULE_REGISTER_TMP(LoadFermionEigenPack, TLoadEigenPack<FermionEigenPack<FIMPL>>, MIO);
 MODULE_REGISTER_TMP(LoadFermionEigenPackIo32, ARG(TLoadEigenPack<FermionEigenPack<FIMPL, FIMPLF>>), MIO);
 /******************************************************************************
 *                    TLoadEigenPack implementation                           *
@@ -101,9 +104,14 @@ std::vector<std::string> TLoadEigenPack<Pack>::getOutput(void)
 template <typename Pack>
 void TLoadEigenPack<Pack>::setup(void)
 {
-    env().createGrid(par().Ls);
+    GridBase *gridIo = nullptr;
    if (typeHash<Field>() != typeHash<FieldIo>())
    {
        gridIo = envGetRbGrid(FieldIo, par().Ls);
    }
    envCreateDerived(BasePack, Pack, getName(), par().Ls, par().size, 
-                     env().getRbGrid(par().Ls));
+                     envGetRbGrid(Field, par().Ls), gridIo);
 }
 // execution ///////////////////////////////////////////////////////////////////
--- a/Hadrons/Modules/MIO/LoadNersc.cc
+++ b/Hadrons/Modules/MIO/LoadNersc.cc
@@ -31,44 +31,4 @@ using namespace Grid;
 using namespace Hadrons;
 using namespace MIO;
-/******************************************************************************
+template class Grid::Hadrons::MIO::TLoadNersc<GIMPL>;
 *                       TLoadNersc implementation                             *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 TLoadNersc::TLoadNersc(const std::string name)
 : Module<LoadNerscPar>(name)
 {}
 // dependencies/products ///////////////////////////////////////////////////////
 std::vector<std::string> TLoadNersc::getInput(void)
 {
    std::vector<std::string> in;
    return in;
 }
 std::vector<std::string> TLoadNersc::getOutput(void)
 {
    std::vector<std::string> out = {getName()};
    return out;
 }
 // setup ///////////////////////////////////////////////////////////////////////
 void TLoadNersc::setup(void)
 {
    envCreateLat(LatticeGaugeField, getName());
 }
 // execution ///////////////////////////////////////////////////////////////////
 void TLoadNersc::execute(void)
 {
    FieldMetaData header;
    std::string   fileName = par().file + "."
                             + std::to_string(vm().getTrajectory());
    LOG(Message) << "Loading NERSC configuration from file '" << fileName
                 << "'" << std::endl;
    auto &U = envGet(LatticeGaugeField, getName());
    NerscIO::readConfiguration(U, header, fileName);
 }
--- a/Hadrons/Modules/MIO/LoadNersc.hpp
+++ b/Hadrons/Modules/MIO/LoadNersc.hpp
@@ -46,8 +46,11 @@ public:
                                    std::string, file);
 };
 template <typename GImpl>
 class TLoadNersc: public Module<LoadNerscPar>
 {
 public:
    GAUGE_TYPE_ALIASES(GImpl,);
 public:
    // constructor
    TLoadNersc(const std::string name);
@@ -62,7 +65,54 @@ public:
    virtual void execute(void);
 };
-MODULE_REGISTER(LoadNersc, TLoadNersc, MIO);
+MODULE_REGISTER_TMP(LoadNersc,  TLoadNersc<GIMPL>,  MIO);
 /******************************************************************************
 *                       TLoadNersc implementation                             *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 template <typename GImpl>
 TLoadNersc<GImpl>::TLoadNersc(const std::string name)
 : Module<LoadNerscPar>(name)
 {}
 // dependencies/products ///////////////////////////////////////////////////////
 template <typename GImpl>
 std::vector<std::string> TLoadNersc<GImpl>::getInput(void)
 {
    std::vector<std::string> in;
    return in;
 }
 template <typename GImpl>
 std::vector<std::string> TLoadNersc<GImpl>::getOutput(void)
 {
    std::vector<std::string> out = {getName()};
    return out;
 }
 // setup ///////////////////////////////////////////////////////////////////////
 template <typename GImpl>
 void TLoadNersc<GImpl>::setup(void)
 {
    envCreateLat(GaugeField, getName());
 }
 // execution ///////////////////////////////////////////////////////////////////
 template <typename GImpl>
 void TLoadNersc<GImpl>::execute(void)
 {
    FieldMetaData header;
    std::string   fileName = par().file + "."
                             + std::to_string(vm().getTrajectory());
    LOG(Message) << "Loading NERSC configuration from file '" << fileName
                 << "'" << std::endl;
    auto &U = envGet(GaugeField, getName());
    NerscIO::readConfiguration(U, header, fileName);
 }
 END_MODULE_NAMESPACE
--- a/Hadrons/Modules/MNoise/TimeDilutedSpinColorDiagonal.hpp
+++ b/Hadrons/Modules/MNoise/TimeDilutedSpinColorDiagonal.hpp
@@ -43,6 +43,8 @@ BEGIN_MODULE_NAMESPACE(MNoise)
 template <typename FImpl>
 class TTimeDilutedSpinColorDiagonal: public Module<NoPar>
 {
 public:
    FERM_TYPE_ALIASES(FImpl,);
 public:
    // constructor
    TTimeDilutedSpinColorDiagonal(const std::string name);
@@ -92,7 +94,7 @@ void TTimeDilutedSpinColorDiagonal<FImpl>::setup(void)
 {
    envCreateDerived(DilutedNoise<FImpl>, 
                     TimeDilutedSpinColorDiagonalNoise<FImpl>,
-                     getName(), 1, env().getGrid());
+                     getName(), 1, envGetGrid(FermionField));
 }
 // execution ///////////////////////////////////////////////////////////////////
--- a/Hadrons/Modules/MScalar/ChargedProp.hpp
+++ b/Hadrons/Modules/MScalar/ChargedProp.hpp
@@ -55,7 +55,7 @@ public:
 class TChargedProp: public Module<ChargedPropPar>
 {
 public:
-    SCALAR_TYPE_ALIASES(SIMPL,);
+    BASIC_TYPE_ALIASES(SIMPL,);
    typedef PhotonR::GaugeField     EmField;
    typedef PhotonR::GaugeLinkField EmComp;
    class Result: Serializable
--- a/Hadrons/Modules/MScalar/FreeProp.hpp
+++ b/Hadrons/Modules/MScalar/FreeProp.hpp
@@ -51,7 +51,7 @@ public:
 class TFreeProp: public Module<FreePropPar>
 {
 public:
-    SCALAR_TYPE_ALIASES(SIMPL,);
+    BASIC_TYPE_ALIASES(SIMPL,);
 public:
    // constructor
    TFreeProp(const std::string name);
--- a/Hadrons/Modules/MScalar/ScalarVP.hpp
+++ b/Hadrons/Modules/MScalar/ScalarVP.hpp
@@ -53,7 +53,7 @@ public:
 class TScalarVP: public Module<ScalarVPPar>
 {
 public:
-    SCALAR_TYPE_ALIASES(SIMPL,);
+    BASIC_TYPE_ALIASES(SIMPL,);
    typedef PhotonR::GaugeField     EmField;
    typedef PhotonR::GaugeLinkField EmComp;
    class Result: Serializable
--- a/Hadrons/Modules/MScalar/VPCounterTerms.hpp
+++ b/Hadrons/Modules/MScalar/VPCounterTerms.hpp
@@ -53,7 +53,7 @@ public:
 class TVPCounterTerms: public Module<VPCounterTermsPar>
 {
 public:
-    SCALAR_TYPE_ALIASES(SIMPL,);
+    BASIC_TYPE_ALIASES(SIMPL,);
    class Result: Serializable
    {
    public:
--- a/Hadrons/Modules/MScalarSUN/StochFreeField.hpp
+++ b/Hadrons/Modules/MScalarSUN/StochFreeField.hpp
@@ -132,7 +132,7 @@ void TStochFreeField<SImpl>::execute(void)
    auto    &w                = envGet(ComplexField, "_" + getName() + "_weight");
    auto    &rng              = rng4d();
    double  trphi2;
-    FFT     fft(env().getGrid());
+    FFT     fft(envGetGrid(Field));
    Integer vol;
    vol = 1;
@@ -169,11 +169,6 @@ void TStochFreeField<SImpl>::execute(void)
    phi = 0.5*(phi - adj(phi));
    trphi2 = -TensorRemove(sum(trace(phi*phi))).real()/vol;
    LOG(Message) << "tr(phi^2)= " << trphi2 << std::endl;
    // ComplexField phi2(env().getGrid());
    // phi2=trace(phi*phi);
    // std::cout << phi2 << std::endl;
 }
 END_MODULE_NAMESPACE
--- a/Hadrons/Modules/MScalarSUN/TimeMomProbe.hpp
+++ b/Hadrons/Modules/MScalarSUN/TimeMomProbe.hpp
@@ -146,7 +146,7 @@ void TTimeMomProbe<SImpl>::execute(void)
    std::set<std::vector<int>>                   timeMomSet;
    std::vector<std::vector<std::vector<int>>>   timeMom;
    std::vector<std::vector<int>>                transferMom;
-    FFT                                          fft(env().getGrid());
+    FFT                                          fft(envGetGrid(Field));
    std::vector<int>                             dMask(nd, 1);
    std::vector<TimeMomProbeResult>              result;
    std::map<std::string, std::vector<SlicedOp>> slicedOp;
--- a/Hadrons/Modules/MScalarSUN/TrKinetic.hpp
+++ b/Hadrons/Modules/MScalarSUN/TrKinetic.hpp
@@ -126,7 +126,7 @@ void TTrKinetic<SImpl>::setup(void)
        envCreateLat(ComplexField, varName(getName(), mu, nu));
    }
    envCreateLat(ComplexField, varName(getName(), "sum"));
-    envTmp(std::vector<Field>, "der", 1, env().getNd(), env().getGrid());
+    envTmp(std::vector<Field>, "der", 1, env().getNd(), envGetGrid(Field));
 }
 // execution ///////////////////////////////////////////////////////////////////
--- a/Hadrons/Modules/MScalarSUN/TwoPoint.hpp
+++ b/Hadrons/Modules/MScalarSUN/TwoPoint.hpp
@@ -168,7 +168,7 @@ void TTwoPoint<SImpl>::execute(void)
    std::set<std::string>                        ops;
    std::vector<TwoPointResult>                  result;
    std::map<std::string, std::vector<SlicedOp>> slicedOp;
-    FFT                                          fft(env().getGrid());
+    FFT                                          fft(envGetGrid(Field));
    TComplex                                     buf;
    envGetTmp(ComplexField, ftBuf);
--- a/Hadrons/Modules/MScalarSUN/TwoPointNPR.hpp
+++ b/Hadrons/Modules/MScalarSUN/TwoPointNPR.hpp
@@ -136,7 +136,7 @@ void TTwoPointNPR<SImpl>::execute(void)
    const unsigned int             nd   = env().getNd();
    const unsigned int             nl   = env().getDim(0);
    const Real                     invV = 1./env().getVolume();
-    FFT                            fft(env().getGrid());
+    FFT                            fft(envGetGrid(Field));
    std::vector<TwoPointNPRResult> result;
    TwoPointNPRResult              twoPtp1, twoPtp2, twoPtDisc;
    auto                           &phi    = envGet(Field, par().field);
--- a/Hadrons/Modules/MSink/Point.hpp
+++ b/Hadrons/Modules/MSink/Point.hpp
@@ -53,7 +53,7 @@ template <typename FImpl>
 class TPoint: public Module<PointPar>
 {
 public:
-    FERM_TYPE_ALIASES(FImpl,);
+    BASIC_TYPE_ALIASES(FImpl,);
    SINK_TYPE_ALIASES();
 public:
    // constructor
@@ -132,7 +132,7 @@ void TPoint<FImpl>::execute(void)
        for(unsigned int mu = 0; mu < p.size(); mu++)
        {
            LatticeCoordinate(coor, mu);
-            ph = ph + (p[mu]/env().getGrid()->_fdimensions[mu])*coor;
+            ph = ph + (p[mu]/env().getDim(mu))*coor;
        }
        ph = exp((Real)(2*M_PI)*i*ph);
        hasPhase_ = true;
--- a/Hadrons/Modules/MSolver/A2AVectors.cc
+++ b/Hadrons/Modules/MSolver/A2AVectors.cc
@@ -32,5 +32,5 @@ using namespace Grid;
 using namespace Hadrons;
 using namespace MSolver;
-template class Grid::Hadrons::MSolver::TA2AVectors<FIMPL, FermionEigenPack<FIMPL>>;
+template class Grid::Hadrons::MSolver::TA2AVectors<FIMPL, BaseFermionEigenPack<FIMPL>>;
-template class Grid::Hadrons::MSolver::TA2AVectors<ZFIMPL, FermionEigenPack<ZFIMPL>>;
+template class Grid::Hadrons::MSolver::TA2AVectors<ZFIMPL, BaseFermionEigenPack<ZFIMPL>>;
--- a/Hadrons/Modules/MSolver/A2AVectors.hpp
+++ b/Hadrons/Modules/MSolver/A2AVectors.hpp
@@ -79,9 +79,9 @@ private:
 };
 MODULE_REGISTER_TMP(A2AVectors, 
-    ARG(TA2AVectors<FIMPL, FermionEigenPack<FIMPL>>), MSolver);
+    ARG(TA2AVectors<FIMPL, BaseFermionEigenPack<FIMPL>>), MSolver);
 MODULE_REGISTER_TMP(ZA2AVectors, 
-    ARG(TA2AVectors<ZFIMPL, FermionEigenPack<ZFIMPL>>), MSolver);
+    ARG(TA2AVectors<ZFIMPL, BaseFermionEigenPack<ZFIMPL>>), MSolver);
 /******************************************************************************
 *                       TA2AVectors implementation                           *
@@ -135,9 +135,9 @@ void TA2AVectors<FImpl, Pack>::setup(void)
        Nl_ = epack.evec.size();
    }
    envCreate(std::vector<FermionField>, getName() + "_v", 1, 
-              Nl_ + noise.size(), FermionField(env().getGrid()));
+              Nl_ + noise.size(), envGetGrid(FermionField));
    envCreate(std::vector<FermionField>, getName() + "_w", 1, 
-              Nl_ + noise.size(), FermionField(env().getGrid()));
+              Nl_ + noise.size(), envGetGrid(FermionField));
    if (Ls > 1)
    {
        envTmpLat(FermionField, "f5", Ls);
--- a/Hadrons/Modules/MSolver/Guesser.hpp
+++ b/Hadrons/Modules/MSolver/Guesser.hpp
@@ -39,7 +39,7 @@ std::shared_ptr<LinearFunction<typename FImpl::FermionField>>
 makeGuesser(const std::string epackName)
 {
    typedef typename FImpl::FermionField                  FermionField;
-    typedef FermionEigenPack<FImpl>                       EPack;
+    typedef BaseFermionEigenPack<FImpl>                   EPack;
    typedef CoarseFermionEigenPack<FImpl, nBasis>         CoarseEPack;
    typedef DeflatedGuesser<FermionField>                 FineGuesser;
    typedef LocalCoherenceDeflatedGuesser<
--- a/Hadrons/Modules/MSolver/LocalCoherenceLanczos.hpp
+++ b/Hadrons/Modules/MSolver/LocalCoherenceLanczos.hpp
@@ -63,7 +63,7 @@ public:
    typedef LocalCoherenceLanczos<typename FImpl::SiteSpinor, 
                                  typename FImpl::SiteComplex, 
                                  nBasis>                LCL;
-    typedef FermionEigenPack<FImpl>                      BasePack;
+    typedef BaseFermionEigenPack<FImpl>                  BasePack;
    typedef CoarseFermionEigenPack<FImpl, nBasis>        CoarsePack;
    typedef HADRONS_DEFAULT_SCHUR_OP<FMat, FermionField> SchurFMat;
 public:
--- a/Hadrons/Modules/MSolver/MixedPrecisionRBPrecCG.hpp
+++ b/Hadrons/Modules/MSolver/MixedPrecisionRBPrecCG.hpp
@@ -58,8 +58,8 @@ template <typename FImplInner, typename FImplOuter, int nBasis>
 class TMixedPrecisionRBPrecCG: public Module<MixedPrecisionRBPrecCGPar>
 {
 public:
-    FG_TYPE_ALIASES(FImplInner, Inner);
+    FERM_TYPE_ALIASES(FImplInner, Inner);
-    FG_TYPE_ALIASES(FImplOuter, Outer);
+    FERM_TYPE_ALIASES(FImplOuter, Outer);
    SOLVER_TYPE_ALIASES(FImplOuter,);
    typedef HADRONS_DEFAULT_SCHUR_OP<FMatInner, FermionFieldInner> SchurFMatInner;
    typedef HADRONS_DEFAULT_SCHUR_OP<FMatOuter, FermionFieldOuter> SchurFMatOuter;
@@ -170,7 +170,7 @@ void TMixedPrecisionRBPrecCG<FImplInner, FImplOuter, nBasis>
            MixedPrecisionConjugateGradient<FermionFieldOuter, FermionFieldInner> 
                mpcg(par().residual, par().maxInnerIteration, 
                     par().maxOuterIteration, 
-                     env().template getGrid<VTypeInner>(Ls),
+                     env().template getRbGrid<VTypeInner>(Ls),
                     simat, somat);
            OperatorFunctionWrapper<FermionFieldOuter> wmpcg(mpcg);
            HADRONS_DEFAULT_SCHUR_SOLVE<FermionFieldOuter> schurSolver(wmpcg);
--- a/Hadrons/Modules/MSolver/RBPrecCG.hpp
+++ b/Hadrons/Modules/MSolver/RBPrecCG.hpp
@@ -58,7 +58,7 @@ template <typename FImpl, int nBasis>
 class TRBPrecCG: public Module<RBPrecCGPar>
 {
 public:
-    FG_TYPE_ALIASES(FImpl,);
+    FERM_TYPE_ALIASES(FImpl,);
    SOLVER_TYPE_ALIASES(FImpl,);
 public:
    // constructor
--- a/Hadrons/Modules/MSource/Point.hpp
+++ b/Hadrons/Modules/MSource/Point.hpp
@@ -63,7 +63,7 @@ template <typename FImpl>
 class TPoint: public Module<PointPar>
 {
 public:
-    FERM_TYPE_ALIASES(FImpl,);
+    BASIC_TYPE_ALIASES(FImpl,);
 public:
    // constructor
    TPoint(const std::string name);
@@ -126,6 +126,11 @@ void TPoint<FImpl>::execute(void)
    auto             &src     = envGet(PropagatorField, getName());
    SitePropagator   id;
    if (position.size() != env().getNd())
    {
        HADRONS_ERROR(Size, "position has " + std::to_string(position.size())
                      + " components (must have " + std::to_string(env().getNd()) + ")");
    }
    id  = 1.;
    src = zero;
    pokeSite(id, src, position);
--- a/Hadrons/Modules/MSource/SeqConserved.hpp
+++ b/Hadrons/Modules/MSource/SeqConserved.hpp
@@ -187,7 +187,7 @@ void TSeqConserved<FImpl>::execute(void)
        for(unsigned int mu = 0; mu < env().getNd(); mu++)
        {
            LatticeCoordinate(coor, mu);
-            mom_phase = mom_phase + (mom[mu]/env().getGrid()->_fdimensions[mu])*coor;
+            mom_phase = mom_phase + (mom[mu]/env().getDim(mu))*coor;
        }
        mom_phase = exp((Real)(2*M_PI)*i*mom_phase);
        SeqhasPhase_ = true;
--- a/Hadrons/Modules/MSource/SeqGamma.hpp
+++ b/Hadrons/Modules/MSource/SeqGamma.hpp
@@ -71,7 +71,7 @@ template <typename FImpl>
 class TSeqGamma: public Module<SeqGammaPar>
 {
 public:
-    FG_TYPE_ALIASES(FImpl,);
+    FERM_TYPE_ALIASES(FImpl,);
 public:
    // constructor
    TSeqGamma(const std::string name);
@@ -125,7 +125,7 @@ template <typename FImpl>
 void TSeqGamma<FImpl>::setup(void)
 {
    envCreateLat(PropagatorField, getName());
-    envCacheLat(Lattice<iScalar<vInteger>>, tName_);
+    envCache(Lattice<iScalar<vInteger>>, tName_, 1, envGetGrid(LatticeComplex));
    envCacheLat(LatticeComplex, momphName_);
    envTmpLat(LatticeComplex, "coor");
 }
@@ -162,7 +162,7 @@ void TSeqGamma<FImpl>::execute(void)
        for(unsigned int mu = 0; mu < env().getNd(); mu++)
        {
            LatticeCoordinate(coor, mu);
-            ph = ph + (p[mu]/env().getGrid()->_fdimensions[mu])*coor;
+            ph = ph + (p[mu]/env().getDim(mu))*coor;
        }
        ph = exp((Real)(2*M_PI)*i*ph);
        LatticeCoordinate(t, Tp);
--- a/Hadrons/Modules/MSource/Wall.hpp
+++ b/Hadrons/Modules/MSource/Wall.hpp
@@ -143,7 +143,7 @@ void TWall<FImpl>::execute(void)
        for(unsigned int mu = 0; mu < env().getNd(); mu++)
        {
            LatticeCoordinate(coor, mu);
-            ph = ph + (p[mu]/env().getGrid()->_fdimensions[mu])*coor;
+            ph = ph + (p[mu]/env().getDim(mu))*coor;
        }
        ph = exp((Real)(2*M_PI)*i*ph);
        LatticeCoordinate(t, Tp);
--- a/Hadrons/Modules/MSource/Z2.hpp
+++ b/Hadrons/Modules/MSource/Z2.hpp
@@ -66,7 +66,7 @@ template <typename FImpl>
 class TZ2: public Module<Z2Par>
 {
 public:
-    FERM_TYPE_ALIASES(FImpl,);
+    BASIC_TYPE_ALIASES(FImpl,);
 public:
    // constructor
    TZ2(const std::string name);
@@ -120,7 +120,7 @@ template <typename FImpl>
 void TZ2<FImpl>::setup(void)
 {
    envCreateLat(PropagatorField, getName());
-    envCacheLat(Lattice<iScalar<vInteger>>, tName_);
+    envCache(Lattice<iScalar<vInteger>>, tName_, 1, envGetGrid(LatticeComplex));
    envTmpLat(LatticeComplex, "eta");
 }
--- a/Hadrons/Modules/MUtilities/PrecisionCast.cc
+++ b/Hadrons/Modules/MUtilities/PrecisionCast.cc
@@ -0,0 +1,35 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Modules/MUtilities/PrecisionCast.cc
 Copyright (C) 2015-2018
 Author: Antonin Portelli <antonin.portelli@me.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Hadrons/Modules/MUtilities/PrecisionCast.hpp>
 using namespace Grid;
 using namespace Hadrons;
 using namespace MUtilities;
 template class Grid::Hadrons::MUtilities::TPrecisionCast<GIMPLD::GaugeField, GIMPLF::GaugeField>;
 template class Grid::Hadrons::MUtilities::TPrecisionCast<FIMPLD::FermionField, FIMPLF::FermionField>;
--- a/Hadrons/Modules/MUtilities/PrecisionCast.hpp
+++ b/Hadrons/Modules/MUtilities/PrecisionCast.hpp
@@ -0,0 +1,124 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Modules/MUtilities/PrecisionCast.hpp
 Copyright (C) 2015-2018
 Author: Antonin Portelli <antonin.portelli@me.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef Hadrons_MUtilities_PrecisionCast_hpp_
 #define Hadrons_MUtilities_PrecisionCast_hpp_
 #include <Hadrons/Global.hpp>
 #include <Hadrons/Module.hpp>
 #include <Hadrons/ModuleFactory.hpp>
 BEGIN_HADRONS_NAMESPACE
 /******************************************************************************
 *                          Precision cast module                             *
 ******************************************************************************/
 BEGIN_MODULE_NAMESPACE(MUtilities)
 class PrecisionCastPar: Serializable
 {
 public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(PrecisionCastPar,
                                    std::string, field);
 };
 template <typename FieldIn, typename FieldOut>
 class TPrecisionCast: public Module<PrecisionCastPar>
 {
 public:
    // constructor
    TPrecisionCast(const std::string name);
    // destructor
    virtual ~TPrecisionCast(void) {};
    // dependency relation
    virtual std::vector<std::string> getInput(void);
    virtual std::vector<std::string> getOutput(void);
    // setup
    virtual void setup(void);
    // execution
    virtual void execute(void);
 };
 MODULE_REGISTER_TMP(GaugeSinglePrecisionCast, 
                    ARG(TPrecisionCast<GIMPLD::GaugeField, GIMPLF::GaugeField>),
                    MUtilities);
 MODULE_REGISTER_TMP(FermionSinglePrecisionCast, 
                    ARG(TPrecisionCast<FIMPLD::FermionField, FIMPLF::FermionField>),
                    MUtilities);
 /******************************************************************************
 *                     TPrecisionCast implementation                          *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 template <typename FieldIn, typename FieldOut>
 TPrecisionCast<FieldIn, FieldOut>::TPrecisionCast(const std::string name)
 : Module<PrecisionCastPar>(name)
 {}
 // dependencies/products ///////////////////////////////////////////////////////
 template <typename FieldIn, typename FieldOut>
 std::vector<std::string> TPrecisionCast<FieldIn, FieldOut>::getInput(void)
 {
    std::vector<std::string> in = {par().field};
    return in;
 }
 template <typename FieldIn, typename FieldOut>
 std::vector<std::string> TPrecisionCast<FieldIn, FieldOut>::getOutput(void)
 {
    std::vector<std::string> out = {getName()};
    return out;
 }
 // setup ///////////////////////////////////////////////////////////////////////
 template <typename FieldIn, typename FieldOut>
 void TPrecisionCast<FieldIn, FieldOut>::setup(void)
 {
    envCreateLat(FieldOut, getName());
 }
 // execution ///////////////////////////////////////////////////////////////////
 template <typename FieldIn, typename FieldOut>
 void TPrecisionCast<FieldIn, FieldOut>::execute(void)
 {
    LOG(Message) << "Casting field '" << par().field << "'" << std::endl;
    LOG(Message) << "In  type: " << typeName<FieldIn>() << std::endl;
    LOG(Message) << "Out type: " << typeName<FieldOut>() << std::endl;
    auto &in  = envGet(FieldIn,  par().field);
    auto &out = envGet(FieldOut, getName());
    precisionChange(out, in);
 }
 END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
 #endif // Hadrons_MUtilities_PrecisionCast_hpp_
--- a/Hadrons/Modules/MUtilities/RandomVectors.hpp
+++ b/Hadrons/Modules/MUtilities/RandomVectors.hpp
@@ -96,8 +96,15 @@ std::vector<std::string> TRandomVectors<Field>::getOutput(void)
 template <typename Field>
 void TRandomVectors<Field>::setup(void)
 {
-    envCreate(std::vector<Field>, getName(), par().Ls, par().size, 
+    if (par().Ls > 1)
-              env().getGrid(par().Ls));
+    {
        envCreate(std::vector<Field>, getName(), par().Ls, par().size, 
                  envGetGrid(Field, par().Ls));
    }
    else
    {
        envCreate(std::vector<Field>, getName(), 1, par().size, envGetGrid(Field));
    }
 }
 // execution ///////////////////////////////////////////////////////////////////
--- a/Hadrons/TimerArray.cc
+++ b/Hadrons/TimerArray.cc
@@ -0,0 +1,99 @@
 #include <Hadrons/TimerArray.hpp>
 using namespace Grid;
 using namespace QCD;
 using namespace Hadrons;
 void TimerArray::startTimer(const std::string &name)
 {
    if (!name.empty())
    {
        timer_[name].Start();
    }
 }
 GridTime TimerArray::getTimer(const std::string &name)
 {
    GridTime t;
    if (!name.empty())
    {
        try
        {
            bool running = timer_.at(name).isRunning();
            if (running) stopTimer(name);
            t = timer_.at(name).Elapsed();
            if (running) startTimer(name);
        }
        catch (std::out_of_range &)
        {
            t = GridTime::zero();
        }
    }
    else
    {
        t = GridTime::zero();
    }
    return t;
 }
 double TimerArray::getDTimer(const std::string &name)
 {
    return static_cast<double>(getTimer(name).count());
 }
 void TimerArray::startCurrentTimer(const std::string &name)
 {
    if (!name.empty())
    {
        stopCurrentTimer();
        startTimer(name);
        currentTimer_ = name;
    }
 }
 void TimerArray::stopTimer(const std::string &name)
 {
    if (timer_.at(name).isRunning())
    {
        timer_.at(name).Stop();
    }
 }
 void TimerArray::stopCurrentTimer(void)
 {
    if (!currentTimer_.empty())
    {
        stopTimer(currentTimer_);
        currentTimer_ = "";
    }
 }
 void TimerArray::stopAllTimers(void)
 {
    for (auto &t: timer_)
    {
        stopTimer(t.first);
    }
    currentTimer_ = "";
 }
 void TimerArray::resetTimers(void)
 {
    timer_.clear();
    currentTimer_ = "";
 }
 std::map<std::string, GridTime> TimerArray::getTimings(void)
 {
    std::map<std::string, GridTime> timing;
    for (auto &t: timer_)
    {
        timing[t.first] = t.second.Elapsed();
    }
    return timing;
 }
--- a/Hadrons/TimerArray.hpp
+++ b/Hadrons/TimerArray.hpp
@@ -0,0 +1,29 @@
 #ifndef Hadrons_TimerArray_hpp_
 #define Hadrons_TimerArray_hpp_
 #include <Hadrons/Global.hpp>
 BEGIN_HADRONS_NAMESPACE
 class TimerArray
 {
 public:
    TimerArray(void) = default;
    virtual ~TimerArray(void) = default;
    void                            startTimer(const std::string &name);
    GridTime                        getTimer(const std::string &name);
    double                          getDTimer(const std::string &name);
    void                            startCurrentTimer(const std::string &name);
    void                            stopTimer(const std::string &name);
    void                            stopCurrentTimer(void);
    void                            stopAllTimers(void);
    void                            resetTimers(void);
    std::map<std::string, GridTime> getTimings(void);
 private:
    std::string                          currentTimer_;
    std::map<std::string, GridStopWatch> timer_; 
 };
 END_HADRONS_NAMESPACE
 #endif // Hadrons_TimerArray_hpp_
--- a/Hadrons/Utilities/EigenPackCast.cc
+++ b/Hadrons/Utilities/EigenPackCast.cc
@@ -0,0 +1,164 @@
 #include <Hadrons/EigenPack.hpp>
 #include <Hadrons/Environment.hpp>
 using namespace Grid;
 using namespace QCD;
 using namespace Hadrons;
 template <typename FOut, typename FIn>
 void convert(const std::string outFilename, const std::string inFilename, 
             const unsigned int Ls, const bool rb, const unsigned int size, 
             const bool multiFile)
 {
    assert(outFilename != inFilename);
    typedef EigenPack<FOut>            EPOut;
    typedef EigenPack<FIn>             EPIn;
    typedef typename FOut::vector_type VTypeOut;
    typedef typename FIn::vector_type  VTypeIn;
    std::shared_ptr<GridCartesian>         gInBase, gOutBase, gIn5, gOut5;
    std::shared_ptr<GridRedBlackCartesian> rbgIn, rbgOut;
    GridBase                               *gIn, *gOut;
    auto         dim     = GridDefaultLatt();
    unsigned int nd      = dim.size();
    auto         simdOut = GridDefaultSimd(nd, VTypeOut::Nsimd());
    auto         simdIn  = GridDefaultSimd(nd, VTypeIn::Nsimd());
    gOutBase.reset(SpaceTimeGrid::makeFourDimGrid(dim, simdOut, GridDefaultMpi()));
    gInBase.reset(SpaceTimeGrid::makeFourDimGrid(dim, simdIn, GridDefaultMpi()));
    if (rb)
    {
        if (Ls > 1)
        {
            rbgOut.reset(SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, gOutBase.get()));
            rbgIn.reset(SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, gInBase.get()));
        }
        else
        {
            rbgOut.reset(SpaceTimeGrid::makeFourDimRedBlackGrid(gOutBase.get()));
            rbgIn.reset(SpaceTimeGrid::makeFourDimRedBlackGrid(gInBase.get()));
        }
        gOut = rbgOut.get();
        gIn  = rbgIn.get();
    }
    else
    {
        if (Ls > 1)
        {
            gOut5.reset(SpaceTimeGrid::makeFiveDimGrid(Ls, gOutBase.get()));
            gIn5.reset(SpaceTimeGrid::makeFiveDimGrid(Ls, gInBase.get()));
            gOut = gOut5.get();
            gIn  = gIn5.get();
        }
        else
        {
            gOut = gOutBase.get();
            gIn  = gInBase.get();
        }
    }
    FOut         bufOut(gOut);
    FIn          bufIn(gIn), testIn(gIn);
    ScidacWriter binWriter(gOut->IsBoss());
    ScidacReader binReader;
    PackRecord   record;
    RealD        eval;
    LOG(Message) << "==== EIGENPACK CONVERSION" << std::endl;
    LOG(Message) << "Lattice       : " << gIn->GlobalDimensions() << std::endl;
    LOG(Message) << "Checkerboarded: " << (rb ? "yes" : "no") << std::endl;
    LOG(Message) << "In path       : " << inFilename  << std::endl;
    LOG(Message) << "In type       : " << typeName<FIn>() << std::endl;
    LOG(Message) << "Out path      : " << outFilename << std::endl;
    LOG(Message) << "Out type      : " << typeName<FOut>() << std::endl;
    LOG(Message) << "#vectors      : " << size << std::endl;
    LOG(Message) << "Multifile     : " << (multiFile ? "yes" : "no") << std::endl;
    if (multiFile)
    {
        for(unsigned int k = 0; k < size; ++k)
        {
            std::string  outV = outFilename + "/v" + std::to_string(k) + ".bin";
            std::string  inV  = inFilename + "/v" + std::to_string(k) + ".bin";
            LOG(Message) << "==== Converting vector " << k << std::endl;
            LOG(Message) << "In : " << inV  << std::endl;
            LOG(Message) << "Out: " << outV << std::endl;
            makeFileDir(outV, gOut);
            binWriter.open(outV);
            binReader.open(inV);
            EigenPackIo::readHeader(record, binReader);
            EigenPackIo::writeHeader(binWriter, record);
            EigenPackIo::readElement<FIn>(bufIn, eval, k, binReader);
            EigenPackIo::writeElement<FIn, FOut>(binWriter, bufIn, eval, k, &bufOut, &testIn);
            binWriter.close();
            binReader.close();
        }
    }
    else
    {
        makeFileDir(outFilename, gOut);
        binWriter.open(outFilename);
        binReader.open(inFilename);
        EigenPackIo::readHeader(record, binReader);
        EigenPackIo::writeHeader(binWriter, record);
        for(unsigned int k = 0; k < size; ++k)
        {
            EigenPackIo::readElement<FIn>(bufIn, eval, k, binReader);
            EigenPackIo::writeElement<FIn, FOut>(binWriter, bufIn, eval, k, &bufOut, &testIn);
        }
        binWriter.close();
        binReader.close();
    }
 }
 #ifndef FOUT
 #warning "FOUT undefined (set to WilsonImplF::FermionField by default)"
 #define FOUT WilsonImplF::FermionField
 #endif
 #ifndef FIN
 #warning "FIN undefined (set to WilsonImplD::FermionField by default)"
 #define FIN WilsonImplD::FermionField
 #endif
 int main(int argc, char *argv[])
 {
    // parse command line
    std::string  outFilename, inFilename;
    unsigned int size, Ls;
    bool         rb, multiFile;
    if (argc < 7)
    {
        std::cerr << "usage: " << argv[0] << " <out eigenpack> <in eigenpack> <Ls> <red-black (0|1)> <#vector> <multifile (0|1)> [Grid options]";
        std::cerr << std::endl;
        std::exit(EXIT_FAILURE);
    }
    outFilename = argv[1];
    inFilename  = argv[2];
    Ls          = std::stoi(std::string(argv[3]));
    rb          = (std::string(argv[4]) != "0");
    size        = std::stoi(std::string(argv[5]));
    multiFile   = (std::string(argv[6]) != "0");
    // initialization
    Grid_init(&argc, &argv);
    initLogger();
    // execution
    try
    {
        convert<FOUT, FIN>(outFilename, inFilename, Ls, rb, size, multiFile);
    }
    catch (const std::exception& e)
    {
        Exceptions::abort(e);
    }
    // epilogue
    LOG(Message) << "Grid is finalizing now" << std::endl;
    Grid_finalize();
    return EXIT_SUCCESS;
 }
--- a/Hadrons/Utilities/HadronsXmlRun.cc
+++ b/Hadrons/Utilities/HadronsXmlRun.cc
--- a/Hadrons/Utilities/Makefile.am
+++ b/Hadrons/Utilities/Makefile.am
@@ -0,0 +1,8 @@
 bin_PROGRAMS = HadronsXmlRun HadronsFermionEP64To32
 HadronsXmlRun_SOURCES = HadronsXmlRun.cc
 HadronsXmlRun_LDADD   = ../libHadrons.a ../../Grid/libGrid.a
 HadronsFermionEP64To32_SOURCES  = EigenPackCast.cc
 HadronsFermionEP64To32_CXXFLAGS = $(AM_CXXFLAGS) -DFIN=WilsonImplD::FermionField -DFOUT=WilsonImplF::FermionField
 HadronsFermionEP64To32_LDADD    = ../libHadrons.a ../../Grid/libGrid.a
--- a/Hadrons/modules.inc
+++ b/Hadrons/modules.inc
@@ -4,6 +4,7 @@ modules_cc =\
  Modules/MContraction/Meson.cc \
  Modules/MContraction/WeakNeutral4ptDisc.cc \
  Modules/MContraction/WeakHamiltonianNonEye.cc \
  Modules/MContraction/A2AAslashField.cc \
  Modules/MContraction/WardIdentity.cc \
  Modules/MContraction/A2AMesonField.cc \
  Modules/MContraction/DiscLoop.cc \
@@ -30,6 +31,7 @@ modules_cc =\
  Modules/MNoise/TimeDilutedSpinColorDiagonal.cc \
  Modules/MUtilities/RandomVectors.cc \
  Modules/MUtilities/TestSeqGamma.cc \
  Modules/MUtilities/PrecisionCast.cc \
  Modules/MUtilities/TestSeqConserved.cc \
  Modules/MLoop/NoiseLoop.cc \
  Modules/MScalar/FreeProp.cc \
@@ -62,8 +64,8 @@ modules_cc =\
 modules_hpp =\
  Modules/MContraction/Baryon.hpp \
  Modules/MContraction/A2AAslashField.hpp \
  Modules/MContraction/A2AMesonField.hpp \
  Modules/MContraction/A2AMesonFieldKernels.hpp \
  Modules/MContraction/Meson.hpp \
  Modules/MContraction/WeakHamiltonian.hpp \
  Modules/MContraction/WeakHamiltonianNonEye.hpp \
@@ -93,6 +95,7 @@ modules_hpp =\
  Modules/MGauge/FundtoHirep.hpp \
  Modules/MGauge/StochEm.hpp \
  Modules/MNoise/TimeDilutedSpinColorDiagonal.hpp \
  Modules/MUtilities/PrecisionCast.hpp \
  Modules/MUtilities/RandomVectors.hpp \
  Modules/MUtilities/TestSeqGamma.hpp \
  Modules/MUtilities/TestSeqConserved.hpp \
--- a/configure.ac
+++ b/configure.ac
@@ -562,6 +562,7 @@ AC_CONFIG_FILES(tests/qdpxx/Makefile)
 AC_CONFIG_FILES(tests/testu01/Makefile)
 AC_CONFIG_FILES(benchmarks/Makefile)
 AC_CONFIG_FILES(Hadrons/Makefile)
 AC_CONFIG_FILES(Hadrons/Utilities/Makefile)
 AC_OUTPUT
 echo ""
--- a/documentation/Grid.pdf
+++ b/documentation/Grid.pdf
--- a/documentation/_build/latex/Grid.pdf
+++ b/documentation/_build/latex/Grid.pdf
--- a/documentation/conf.py
+++ b/documentation/conf.py
@@ -80,7 +80,8 @@ primary_domain = 'cpp'
 # a list of builtin themes.
 #
 html_theme = 'alabaster'
-
+html_use_smartypants = False
 smart_quotes = False
 # Theme options are theme-specific and customize the look and feel of a theme
 # further.  For a list of options available for each theme, see the
 # documentation.
--- a/documentation/interfacing.rst
+++ b/documentation/interfacing.rst
@@ -0,0 +1,232 @@
 Interfacing with external software
 ========================================
 Grid provides a number of important modules, such as solvers and
 eigensolvers, that are highly optimized for complex vector/SIMD
 architectures, such as the Intel Xeon Phi KNL and Skylake processors.
 This growing library, with appropriate interfacing, can be accessed
 from existing code. Here we describe interfacing issues and provide
 examples.
 MPI initialization
 --------------------
 Grid supports threaded MPI sends and receives and, if running with
 more than one thread, requires the MPI_THREAD_MULTIPLE mode of message
 passing. If the user initializes MPI before starting Grid, the
 appropriate initialization call is::
  MPI_Init_thread(argc, argv, MPI_THREAD_MULTIPLE, &provided);
  assert(MPI_THREAD_MULTIPLE == provided);
 Grid Initialization
 ---------------------
 Grid itself is initialized with a call::
  Grid_init(&argc, &argv);
 Command line options include::
  --mpi n.n.n.n   : default MPI decomposition
  --threads n     : default number of OMP threads
  --grid n.n.n.n  : default Grid size
 where `argc` and `argv` are constructed to simulate the command-line
 options described above.  At a minimum one usually provides the
 `--grid` and `--mpi` parameters.  The former specifies the lattice
 dimensions and the latter specifies the grid of processors (MPI
 ranks).  If these parameters are not specified with the `Grid_init`
 call, they need to be supplied later when creating Grid fields.
 The following Grid procedures are useful for verifying that Grid
 "default" values are properly initialized.
 =============================================================   ===========================================================================================================
  Grid procedure                                                  returns 
 =============================================================   ===========================================================================================================
  std::vector<int> GridDefaultLatt();                            lattice size
  std::vector<int> GridDefaultSimd(int Nd,vComplex::Nsimd());    SIMD layout
  std::vector<int> GridDefaultMpi();                             MPI layout
  int Grid::GridThread::GetThreads();                            number of threads
 =============================================================   ===========================================================================================================
 MPI coordination
 ----------------
 Grid wants to use its own numbering of MPI ranks and its own
 assignment of the lattice coordinates with each rank.  Obviously, the
 calling program and Grid must agree on these conventions.  One should
 use Grid's Cartesian communicator class to discover the processor
 assignments. For a four-dimensional processor grid one can define::
  static Grid::CartesianCommunicator *grid_cart = NULL;
  grid_cart = new Grid::CartesianCommunicator(processors);
 where `processors` is of type `std::vector<int>`, with values matching
 the MPI processor-layout dimensions specified with the `--mpi`
 argument in the `Grid_Init` call.  Then each MPI rank can obtain its
 processor coordinate using the Cartesian communicator instantiated
 above.  For example, in four dimensions::
  std::vector<int> pePos(4);    
  for(int i=0; i<4; i++)
     pePos[i] = grid_cart->_processor_coor[i];
 and each MPI process can get its world rank from its processor
 coordinates using::
  int peRank = grid_cart->RankFromProcessorCoor(pePos)
 Conversely, each MPI process can get its processor coordinates from
 its world rank using::
  grid_cart->ProcessorCoorFromRank(peRank, pePos);
 If the calling program initialized MPI before initializing Grid, it is
 then important for each MPI process in the calling program to reset
 its rank number so it agrees with Grid::
   MPI_Comm comm;
   MPI_Comm_split(MPI_COMM_THISJOB,jobid,peRank,&comm);
   MPI_COMM_THISJOB = comm;
 where `MPI_COMM_THISJOB` is initially a copy of `MPI_COMM_WORLD` (with
 `jobid = 0`), or it is a split communicator with `jobid` equal to the
 index number of the subcommunicator.  Once this is done,::
  MPI_Comm_rank(MPI_COMM_THISJOB, &myrank);
 returns a rank that agrees with Grid's `peRank`.
 QMP coordination
 ----------------
 If the calling program uses the SciDAC QMP message-passing package, a
 call to QMP_comm_split() instead can be used to reassign the ranks.
 In the example below, `peGrid` gives the processor-grid dimensions,
 usually set on the command line with `-qmp-geom`.
 **Example**::
  int NDIM = QMP_get_allocated_number_of_dimensions();
  Grid::Grid_init(argc,argv);
  FgridBase::grid_initted=true;
  std::vector<int> processors;
  for(int i=0;i<NDIM;i++) processors.push_back(peGrid[i]);
  Grid::CartesianCommunicator grid_cart(processors);
  std::vector<int> pePos(NDIM);
  for(int i=NDIM-1;i>=0;i--)
     pePos[i] = grid_cart._processor_coor[i];
  int peRank = grid_cart->RankFromProcessorCoor(pePos);
  QMP_comm_split(QMP_comm_get_default(),0,peRank,&qmp_comm);
  QMP_comm_set_default(qmp_comm);
 Mapping fields between Grid and user layouts
 ---------------------------------------------
 In order to map data between calling-program and Grid layouts, it is
 important to know how the lattice sites are distributed across the
 processor grid.  A lattice site with coordinates `r[mu]` is assigned
 to the processor with processor coordinates `pePos[mu]` according to
 the rule::
  pePos[mu] = r[mu]/dim[mu]
 where `dim[mu]` is the lattice dimension in the `mu` direction.  For
 performance reasons, it is important that the external data layout
 follow the same rule.  Then data mapping can be done without
 requiring costly communication between ranks.  We assume this is the
 case here.
 When mapping data to and from Grid, one must choose a lattice object
 defined on the appropriate grid, whether it be a full lattice (4D
 `GridCartesian`), one of the checkerboards (4D
 `GridRedBlackCartesian`), a five-dimensional full grid (5D
 `GridCartesian`), or a five-dimensional checkerboard (5D
 `GridRedBlackCartesian`).  For example, an improved staggered-fermion
 color-vector field `cv` on a single checkerboard would be constructed
 using
 **Example**::
  std::vector<int> latt_size   = GridDefaultLatt();
  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
  std::vector<int> mpi_layout  = GridDefaultMpi();
  GridCartesian               Grid(latt_size,simd_layout,mpi_layout);
  GridRedBlackCartesian       RBGrid(&Grid);
  typename ImprovedStaggeredFermion::FermionField  cv(RBGrid);
 The example above assumes that the grid default values were set in the
 `Grid_init` call.  If not, they can be set at this point and passed
 when `GridCartesian` is instantiated here.  To map data within an MPI
 rank, the external code must iterate over the sites belonging to that
 rank (full or checkerboard as appropriate).  Note that the site
 coordinates are specified relative to the origin of the lattice
 subvolume on that rank. To import data into Grid, the external data on
 a single site with coordinates `r` is first copied into the
 appropriate Grid scalar object `s`.  Then it is copied into the Grid
 lattice field `l` with `pokeLocalSite`::
  pokeLocalSite(const sobj &s, Lattice<vobj> &l, Coordinate &r);
 To export data from Grid, the reverse operation starts with::
  peekLocalSite(const sobj &s, Lattice<vobj> &l, Coordinate &r);
 and then copies the single-site data from `s` into the corresponding
 external type.
 Here is an example that maps a single site's worth of data in a MILC
 color-vector field to a Grid scalar ColourVector object `cVec` and from
 there to the lattice colour-vector field `cv`, as defined above.
 **Example**::
  indexToCoords(idx,r);
  ColourVector cVec;
  for(int col=0; col<Nc; col++)
      cVec()()(col) = 
          Complex(src[idx].c[col].real, src[idx].c[col].imag);
  pokeLocalSite(cVec, cv, r);
 Here the `indexToCoords()` function is a MILC mapping of the MILC site
 index `idx` to the 4D lattice coordinate `r`.
 Grid provides block- and multiple-rhs conjugate-gradient solvers. For
 this purpose it uses a 5D lattice. To map data to and from Grid data
 types, the index for the right-hand-side vector becomes the zeroth
 coordinate of a five-dimensional vector `r5`.  The remaining
 components of `r5` contain the 4D space-time coordinates.  The
 `pokeLocalSite/peekLocalSite` operations then accept the coordinate
 `r5`, provided the destination/source lattice object is also 5D.  In
 the example below data from a single site specified by `idx`,
 belonging to a set of `Ls` MILC color-vector fields, are copied into a
 Grid 5D fermion field `cv5`.
 **Example**::
    GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt();
    GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid)  typename ImprovedStaggeredFermion5D::FermionField  cv5(FrbGrid);
    std::vector<int> r(4);
    indexToCoords(idx,r);
    std::vector<int> r5(1,0);
    for( int d = 0; d < 4; d++ ) r5.push_back(r[d]);
    for( int j = 0; j < Ls; j++ ){
      r5[0] = j;
      ColourVector cVec;
      for(int col=0; col<Nc; col++){
 	  cVec()()(col) = 
 	      Complex(src[j][idx].c[col].real, src[j][idx].c[col].imag);
      }
      pokeLocalSite(cVec, *(out->cv), r5);
    }
--- a/documentation/manual.rst
+++ b/documentation/manual.rst
--- a/tests/hmc/Test_hmc_EODWFRatio.cc
+++ b/tests/hmc/Test_hmc_EODWFRatio.cc
@@ -146,13 +146,7 @@ int main(int argc, char **argv) {
  std::cout << GridLogMessage << "Denominator report, Dw(m) term (includes CG) : " << std::endl;
  DenOp.Report();
  Grid_finalize();
 } // main