Merge branch 'develop' of https://github.com/paboyle/Grid into develop

2026-02-13 18:30:54 +00:00 · 2018-10-08 10:12:11 +01:00
parent d9de8fd5c9 148fc052bd
commit dac9f8622e
84 changed files with 12100 additions and 4646 deletions
--- a/Grid/Grid_Eigen_Dense.h
+++ b/Grid/Grid_Eigen_Dense.h
@@ -1,4 +1,9 @@
 #pragma once
+// Force Eigen to use MKL if Grid has been configured with --enable-mkl
+#ifdef USE_MKL
+#define EIGEN_USE_MKL_ALL
+#endif
+
 #if defined __GNUC__
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
--- a/Grid/json/json.hpp
+++ b/Grid/json/json.hpp
--- a/Grid/parallelIO/IldgIO.h
+++ b/Grid/parallelIO/IldgIO.h
@@ -250,8 +250,7 @@ class GridLimeReader : public BinaryIO {
  ////////////////////////////////////////////
  // Read a generic serialisable object
  ////////////////////////////////////////////
-  template<class serialisable_object>
-  void readLimeObject(serialisable_object &object,std::string object_name,std::string record_name)
+  void readLimeObject(std::string &xmlstring,std::string record_name)
  {
    // should this be a do while; can we miss a first record??
    while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) { 
@@ -266,15 +265,23 @@ class GridLimeReader : public BinaryIO {
 	limeReaderReadData((void *)&xmlc[0], &nbytes, LimeR);    
 	//	std::cout << GridLogMessage<< " readLimeObject matches XML " << &xmlc[0] <<std::endl;

-  std::string xmlstring(&xmlc[0]);
-	XmlReader RD(xmlstring, true, "");
-	read(RD,object_name,object);
+   xmlstring = std::string(&xmlc[0]);
 	return;
      }

    }  
    assert(0);
  }
+
+  template<class serialisable_object>
+  void readLimeObject(serialisable_object &object,std::string object_name,std::string record_name)
+  {
+    std::string xmlstring;
+
+    readLimeObject(xmlstring, record_name);
+	  XmlReader RD(xmlstring, true, "");
+	  read(RD,object_name,object);
+  }
 };

 class GridLimeWriter : public BinaryIO 
--- a/Grid/qcd/utils/A2Autils.h
+++ b/Grid/qcd/utils/A2Autils.h
--- a/Grid/qcd/utils/Utils.h
+++ b/Grid/qcd/utils/Utils.h
@@ -12,4 +12,10 @@
 #include <Grid/qcd/utils/SUnAdjoint.h>
 #include <Grid/qcd/utils/SUnTwoIndex.h>

+// All-to-all contraction kernels that touch the 
+// internal lattice structure
+#include <Grid/qcd/utils/A2Autils.h>
+
+
+
 #endif
--- a/Grid/serialisation/XmlIO.cc
+++ b/Grid/serialisation/XmlIO.cc
@@ -121,17 +121,26 @@ XmlReader::XmlReader(const std::string &s,  const bool isBuffer,
  }
 }

+#define XML_SAFE_NODE(expr)\
+if (expr)\
+{\
+  node_ = expr;\
+  return true;\
+}\
+else\
+{\
+  return false;\
+}
+
 bool XmlReader::push(const std::string &s)
 {
-  if (node_.child(s.c_str()))
+  if (s.empty())
  {
-    node_ = node_.child(s.c_str());
-
-    return true;
+    XML_SAFE_NODE(node_.first_child());
  }
  else
  {
-    return false;
+    XML_SAFE_NODE(node_.child(s.c_str()));
  }
 }

@@ -142,18 +151,26 @@ void XmlReader::pop(void)

 bool XmlReader::nextElement(const std::string &s)
 {
-  if (node_.next_sibling(s.c_str()))
+  if (s.empty())
  {
-    node_ = node_.next_sibling(s.c_str());
-    
-    return true;
+    XML_SAFE_NODE(node_.next_sibling());
  }
  else
  {
-    return false;
+    XML_SAFE_NODE(node_.next_sibling(s.c_str()));
  }
 }

+void XmlReader::readCurrentSubtree(std::string &s)
+{
+  std::ostringstream oss; 
+  pugi::xml_document doc;
+
+  doc.append_copy(node_);
+  doc.save(oss, indent_.c_str(), pugi::format_default | pugi::format_no_declaration);
+  s = oss.str();
+}
+
 template <>
 void XmlReader::readDefault(const std::string &s, std::string &output)
 {
--- a/Grid/serialisation/XmlIO.h
+++ b/Grid/serialisation/XmlIO.h
@@ -72,16 +72,18 @@ namespace Grid
    XmlReader(const std::string &fileName, const bool isBuffer = false, 
              std::string toplev = std::string("grid") );
    virtual ~XmlReader(void) = default;
-    bool push(const std::string &s);
+    bool push(const std::string &s = "");
    void pop(void);
-    bool nextElement(const std::string &s);
+    bool nextElement(const std::string &s = "");
    template <typename U>
    void readDefault(const std::string &s, U &output);
    template <typename U>
    void readDefault(const std::string &s, std::vector<U> &output);
+    void readCurrentSubtree(std::string &s);
  private:
    void checkParse(const pugi::xml_parse_result &result, const std::string name);
  private:
+    const std::string  indent_{"  "};
    pugi::xml_document doc_;
    pugi::xml_node     node_;
    std::string        fileName_;
--- a/Hadrons/A2AMatrix.hpp
+++ b/Hadrons/A2AMatrix.hpp
@@ -7,6 +7,7 @@ Source file: Hadrons/A2AMatrix.hpp
 Copyright (C) 2015-2018

 Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>

 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -29,46 +30,136 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #define A2A_Matrix_hpp_

 #include <Hadrons/Global.hpp>
+#include <Hadrons/TimerArray.hpp>
+#include <Grid/Eigen/unsupported/CXX11/Tensor>
+
+#ifndef HADRONS_A2AM_NAME 
+#define HADRONS_A2AM_NAME "a2aMatrix"
+#endif
+
+#define HADRONS_A2AM_PARALLEL_IO

 BEGIN_HADRONS_NAMESPACE

-template <typename T, typename MetadataType>
+// general A2A matrix set based on Eigen tensors and Grid-allocated memory
+// Dimensions:
+//   0 - ext - external field (momentum, EM field, ...)
+//   1 - str - spin-color structure
+//   2 - t   - timeslice
+//   3 - i   - left  A2A mode index
+//   4 - j   - right A2A mode index
+template <typename T>
+using A2AMatrixSet = Eigen::TensorMap<Eigen::Tensor<T, 5, Eigen::RowMajor>>;
+
+/******************************************************************************
+ *                      Abstract class for A2A kernels                        *
+ ******************************************************************************/
+template <typename T, typename Field>
+class A2AKernel
+{
+public:
+    A2AKernel(void) = default;
+    virtual ~A2AKernel(void) = default;
+    virtual void operator()(A2AMatrixSet<T> &m, const Field *left, const Field *right,
+                          const unsigned int orthogDim, double &time) = 0;
+    virtual double flops(const unsigned int blockSizei, const unsigned int blockSizej) = 0;
+    virtual double bytes(const unsigned int blockSizei, const unsigned int blockSizej) = 0;
+};
+
+/******************************************************************************
+ *                  Class to handle A2A matrix block HDF5 I/O                 *
+ ******************************************************************************/
+template <typename T>
 class A2AMatrixIo
 {
 public:
+    // constructors
    A2AMatrixIo(void) = default;
    A2AMatrixIo(std::string filename, std::string dataname, 
                const unsigned int nt, const unsigned int ni,
-                const unsigned int nj, const unsigned int blockSize);
+                const unsigned int nj);
+    // destructor
    ~A2AMatrixIo(void) = default;
-    void initFile(const MetadataType &d);
-    void saveBlock(const T *data, const unsigned int i, const unsigned int j);
+    // file allocation
+    template <typename MetadataType>
+    void initFile(const MetadataType &d, const unsigned int chunkSize);
+    // block I/O
+    void saveBlock(const T *data, const unsigned int i, const unsigned int j,
+                   const unsigned int blockSizei, const unsigned int blockSizej);
+    void saveBlock(const A2AMatrixSet<T> &m, const unsigned int ext, const unsigned int str,
+                   const unsigned int i, const unsigned int j);
 private:
    std::string  filename_, dataname_;
-    unsigned int nt_, ni_, nj_, blockSize_;
+    unsigned int nt_, ni_, nj_;
 };

-template <typename T, typename MetadataType>
-A2AMatrixIo<T, MetadataType>::A2AMatrixIo(std::string filename, 
-                                          std::string dataname, 
-                                          const unsigned int nt, 
-                                          const unsigned int ni,
-                                          const unsigned int nj,
-                                          const unsigned int blockSize)
+/******************************************************************************
+ *                  Wrapper for A2A matrix block computation                  *
+ ******************************************************************************/
+template <typename T, typename Field, typename MetadataType, typename TIo = T>
+class A2AMatrixBlockComputation
+{
+private:
+    struct IoHelper
+    {
+        A2AMatrixIo<TIo> io;
+        MetadataType     md;
+        unsigned int     e, s, i, j;
+    };
+    typedef std::function<std::string(const unsigned int, const unsigned int)>  FilenameFn;
+    typedef std::function<MetadataType(const unsigned int, const unsigned int)> MetadataFn;
+public:
+    // constructor
+    A2AMatrixBlockComputation(GridBase *grid,
+                              const unsigned int orthogDim,
+                              const unsigned int next,
+                              const unsigned int nstr,
+                              const unsigned int blockSize,
+                              const unsigned int cacheBlockSize,
+                              TimerArray *tArray = nullptr);
+    // execution
+    void execute(const std::vector<Field> &left, 
+                 const std::vector<Field> &right,
+                 A2AKernel<T, Field> &kernel,
+                 const FilenameFn &ionameFn,
+                 const FilenameFn &filenameFn,
+                 const MetadataFn &metadataFn);
+private:
+    // I/O handler
+    void saveBlock(const A2AMatrixSet<TIo> &m, IoHelper &h);
+private:
+    TimerArray            *tArray_;
+    GridBase              *grid_;
+    unsigned int          orthogDim_, nt_, next_, nstr_, blockSize_, cacheBlockSize_;
+    Vector<T>             mCache_;
+    Vector<TIo>           mBuf_;
+    std::vector<IoHelper> nodeIo_;
+};
+
+/******************************************************************************
+ *                     A2AMatrixIo template implementation                    *
+ ******************************************************************************/
+// constructor /////////////////////////////////////////////////////////////////
+template <typename T>
+A2AMatrixIo<T>::A2AMatrixIo(std::string filename, std::string dataname, 
+                            const unsigned int nt, const unsigned int ni,
+                            const unsigned int nj)
 : filename_(filename), dataname_(dataname)
-, nt_(nt), ni_(ni), nj_(nj), blockSize_(blockSize)
+, nt_(nt), ni_(ni), nj_(nj)
 {}

-template <typename T, typename MetadataType>
-void A2AMatrixIo<T, MetadataType>::initFile(const MetadataType &d)
+// file allocation /////////////////////////////////////////////////////////////
+template <typename T>
+template <typename MetadataType>
+void A2AMatrixIo<T>::initFile(const MetadataType &d, const unsigned int chunkSize)
 {
 #ifdef HAVE_HDF5
    std::vector<hsize_t>    dim = {static_cast<hsize_t>(nt_), 
                                   static_cast<hsize_t>(ni_), 
                                   static_cast<hsize_t>(nj_)},
                            chunk = {static_cast<hsize_t>(nt_), 
-                                     static_cast<hsize_t>(blockSize_), 
-                                     static_cast<hsize_t>(blockSize_)};
+                                     static_cast<hsize_t>(chunkSize), 
+                                     static_cast<hsize_t>(chunkSize)};
    H5NS::DataSpace         dataspace(dim.size(), dim.data());
    H5NS::DataSet           dataset;
    H5NS::DSetCreatPropList plist;
@@ -85,20 +176,23 @@ void A2AMatrixIo<T, MetadataType>::initFile(const MetadataType &d)
    push(reader, dataname_);
    auto &group = reader.getGroup();
    plist.setChunk(chunk.size(), chunk.data());
-    dataset = group.createDataSet("data", Hdf5Type<T>::type(), dataspace, plist);
+    dataset = group.createDataSet(HADRONS_A2AM_NAME, Hdf5Type<T>::type(), dataspace, plist);
 #else
    HADRONS_ERROR(Implementation, "all-to-all matrix I/O needs HDF5 library");
 #endif
 }

-template <typename T, typename MetadataType>
-void A2AMatrixIo<T, MetadataType>::saveBlock(const T *data, 
+// block I/O ///////////////////////////////////////////////////////////////////
+template <typename T>
+void A2AMatrixIo<T>::saveBlock(const T *data, 
                               const unsigned int i, 
-                                             const unsigned int j)
+                               const unsigned int j,
+                               const unsigned int blockSizei,
+                               const unsigned int blockSizej)
 {
 #ifdef HAVE_HDF5
    Hdf5Reader           reader(filename_);
-    std::vector<hsize_t> count = {nt_, blockSize_, blockSize_},
+    std::vector<hsize_t> count = {nt_, blockSizei, blockSizej},
                         offset = {0, static_cast<hsize_t>(i),
                                   static_cast<hsize_t>(j)},
                         stride = {1, 1, 1},
@@ -109,7 +203,7 @@ void A2AMatrixIo<T, MetadataType>::saveBlock(const T *data,

    push(reader, dataname_);
    auto &group = reader.getGroup();
-    dataset     = group.openDataSet("data");
+    dataset     = group.openDataSet(HADRONS_A2AM_NAME);
    dataspace   = dataset.getSpace();
    dataspace.selectHyperslab(H5S_SELECT_SET, count.data(), offset.data(),
                              stride.data(), block.data());
@@ -119,6 +213,193 @@ void A2AMatrixIo<T, MetadataType>::saveBlock(const T *data,
 #endif
 }

+template <typename T>
+void A2AMatrixIo<T>::saveBlock(const A2AMatrixSet<T> &m,
+                               const unsigned int ext, const unsigned int str,
+                               const unsigned int i, const unsigned int j)
+{
+    unsigned int blockSizei = m.dimension(3);
+    unsigned int blockSizej = m.dimension(4);
+    unsigned int nstr       = m.dimension(1);
+    size_t       offset     = (ext*nstr + str)*nt_*blockSizei*blockSizej;
+
+    saveBlock(m.data() + offset, i, j, blockSizei, blockSizej);
+}
+
+/******************************************************************************
+ *               A2AMatrixBlockComputation template implementation            *
+ ******************************************************************************/
+// constructor /////////////////////////////////////////////////////////////////
+template <typename T, typename Field, typename MetadataType, typename TIo>
+A2AMatrixBlockComputation<T, Field, MetadataType, TIo>
+::A2AMatrixBlockComputation(GridBase *grid,
+                            const unsigned int orthogDim,
+                            const unsigned int next, 
+                            const unsigned int nstr,
+                            const unsigned int blockSize, 
+                            const unsigned int cacheBlockSize,
+                            TimerArray *tArray)
+: grid_(grid), nt_(grid->GlobalDimensions()[orthogDim]), orthogDim_(orthogDim)
+, next_(next), nstr_(nstr), blockSize_(blockSize), cacheBlockSize_(cacheBlockSize)
+, tArray_(tArray)
+{
+    mCache_.resize(nt_*next_*nstr_*cacheBlockSize_*cacheBlockSize_);
+    mBuf_.resize(nt_*next_*nstr_*blockSize_*blockSize_);
+}
+
+#define START_TIMER(name) if (tArray_) tArray_->startTimer(name)
+#define STOP_TIMER(name)  if (tArray_) tArray_->stopTimer(name)
+#define GET_TIMER(name)   ((tArray_ != nullptr) ? tArray_->getDTimer(name) : 0.)
+
+// execution ///////////////////////////////////////////////////////////////////
+template <typename T, typename Field, typename MetadataType, typename TIo>
+void A2AMatrixBlockComputation<T, Field, MetadataType, TIo>
+::execute(const std::vector<Field> &left, const std::vector<Field> &right,
+          A2AKernel<T, Field> &kernel, const FilenameFn &ionameFn,
+          const FilenameFn &filenameFn, const MetadataFn &metadataFn)
+{
+    //////////////////////////////////////////////////////////////////////////
+    // i,j   is first  loop over blockSize_ factors
+    // ii,jj is second loop over cacheBlockSize_ factors for high perf contractions
+    // iii,jjj are loops within cacheBlock
+    // Total index is sum of these  i+ii+iii etc...
+    //////////////////////////////////////////////////////////////////////////
+    int    N_i = left.size();
+    int    N_j = right.size();
+    double flops, bytes, t_kernel;
+    double nodes = grid_->NodeCount();
+    
+    int NBlock_i = N_i/blockSize_ + (((N_i % blockSize_) != 0) ? 1 : 0);
+    int NBlock_j = N_j/blockSize_ + (((N_j % blockSize_) != 0) ? 1 : 0);
+
+    for(int i=0;i<N_i;i+=blockSize_)
+    for(int j=0;j<N_j;j+=blockSize_)
+    {
+        // Get the W and V vectors for this block^2 set of terms
+        int N_ii = MIN(N_i-i,blockSize_);
+        int N_jj = MIN(N_j-j,blockSize_);
+        A2AMatrixSet<TIo> mBlock(mBuf_.data(), next_, nstr_, nt_, N_ii, N_jj);
+
+        LOG(Message) << "All-to-all matrix block " 
+                     << j/blockSize_ + NBlock_j*i/blockSize_ + 1 
+                     << "/" << NBlock_i*NBlock_j << " [" << i <<" .. " 
+                     << i+N_ii-1 << ", " << j <<" .. " << j+N_jj-1 << "]" 
+                     << std::endl;
+        // Series of cache blocked chunks of the contractions within this block
+        flops    = 0.0;
+        bytes    = 0.0;
+        t_kernel = 0.0;
+        for(int ii=0;ii<N_ii;ii+=cacheBlockSize_)
+        for(int jj=0;jj<N_jj;jj+=cacheBlockSize_)
+        {
+            double t;
+            int N_iii = MIN(N_ii-ii,cacheBlockSize_);
+            int N_jjj = MIN(N_jj-jj,cacheBlockSize_);
+            A2AMatrixSet<T> mCacheBlock(mCache_.data(), next_, nstr_, nt_, N_iii, N_jjj);
+
+            START_TIMER("kernel");
+            kernel(mCacheBlock, &left[i+ii], &right[j+jj], orthogDim_, t);
+            STOP_TIMER("kernel");
+            t_kernel += t;
+            flops    += kernel.flops(N_iii, N_jjj);
+            bytes    += kernel.bytes(N_iii, N_jjj);
+
+            START_TIMER("cache copy");
+            parallel_for_nest5(int e =0;e<next_;e++)
+            for(int s =0;s< nstr_;s++)
+            for(int t =0;t< nt_;t++)
+            for(int iii=0;iii< N_iii;iii++)
+            for(int jjj=0;jjj< N_jjj;jjj++)
+            {
+                mBlock(e,s,t,ii+iii,jj+jjj) = mCacheBlock(e,s,t,iii,jjj);
+            }
+            STOP_TIMER("cache copy");
+        }
+
+        // perf
+        LOG(Message) << "Kernel perf " << flops/t_kernel/1.0e3/nodes 
+                     << " Gflop/s/node " << std::endl;
+        LOG(Message) << "Kernel perf " << bytes/t_kernel*1.0e6/1024/1024/1024/nodes 
+                     << " GB/s/node "  << std::endl;
+
+        // IO
+        double       blockSize, ioTime;
+        unsigned int myRank = grid_->ThisRank(), nRank  = grid_->RankCount();
+    
+        LOG(Message) << "Writing block to disk" << std::endl;
+        ioTime = -GET_TIMER("IO: write block");
+        START_TIMER("IO: total");
+        makeFileDir(filenameFn(0, 0), grid_);
+#ifdef HADRONS_A2AM_PARALLEL_IO
+        grid_->Barrier();
+        // make task list for current node
+        nodeIo_.clear();
+        for(int f = myRank; f < next_*nstr_; f += nRank)
+        {
+            IoHelper h;
+
+            h.i  = i;
+            h.j  = j;
+            h.e  = f/nstr_;
+            h.s  = f % nstr_;
+            h.io = A2AMatrixIo<TIo>(filenameFn(h.e, h.s), 
+                                    ionameFn(h.e, h.s), nt_, N_i, N_j);
+            h.md = metadataFn(h.e, h.s);
+            nodeIo_.push_back(h);
+        }
+        // parallel IO
+        for (auto &h: nodeIo_)
+        {
+            saveBlock(mBlock, h);
+        }
+        grid_->Barrier();
+#else
+        // serial IO, for testing purposes only
+        for(int e = 0; e < next_; e++)
+        for(int s = 0; s < nstr_; s++)
+        {
+            IoHelper h;
+
+            h.i  = i;
+            h.j  = j;
+            h.e  = e;
+            h.s  = s;
+            h.io = A2AMatrixIo<TIo>(filenameFn(h.e, h.s), 
+                                    ionameFn(h.e, h.s), nt_, N_i, N_j);
+            h.md = metadataFn(h.e, h.s);
+            saveBlock(mfBlock, h);
+        }
+#endif
+        STOP_TIMER("IO: total");
+        blockSize  = static_cast<double>(next_*nstr_*nt_*N_ii*N_jj*sizeof(TIo));
+        ioTime    += GET_TIMER("IO: write block");
+        LOG(Message) << "HDF5 IO done " << sizeString(blockSize) << " in "
+                     << ioTime  << " us (" 
+                     << blockSize/ioTime*1.0e6/1024/1024
+                     << " MB/s)" << std::endl;
+    }
+}
+
+// I/O handler /////////////////////////////////////////////////////////////////
+template <typename T, typename Field, typename MetadataType, typename TIo>
+void A2AMatrixBlockComputation<T, Field, MetadataType, TIo>
+::saveBlock(const A2AMatrixSet<TIo> &m, IoHelper &h)
+{
+    if ((h.i == 0) and (h.j == 0))
+    {
+        START_TIMER("IO: file creation");
+        h.io.initFile(h.md, blockSize_);
+        STOP_TIMER("IO: file creation");
+    }
+    START_TIMER("IO: write block");
+    h.io.saveBlock(m, h.e, h.s, h.i, h.j);
+    STOP_TIMER("IO: write block");
+}
+
+#undef START_TIMER
+#undef STOP_TIMER
+#undef GET_TIMER
+
 END_HADRONS_NAMESPACE

 #endif // A2A_Matrix_hpp_
--- a/Hadrons/EigenPack.hpp
+++ b/Hadrons/EigenPack.hpp
@@ -39,230 +39,328 @@ BEGIN_HADRONS_NAMESPACE
 #define HADRONS_DEFAULT_LANCZOS_NBASIS 60
 #endif

-template <typename F>
-class EigenPack
+#define HADRONS_DUMP_EP_METADATA(record) \
+LOG(Message) << "Eigenpack metadata:" << std::endl;\
+LOG(Message) << "* operator" << std::endl;\
+LOG(Message) << (record).operatorXml << std::endl;\
+LOG(Message) << "* solver" << std::endl;\
+LOG(Message) << (record).solverXml << std::endl;
+
+struct PackRecord
 {
-public:
-    typedef F Field;
-    struct PackRecord
-    {
    std::string operatorXml, solverXml;
-    };
-    struct VecRecord: Serializable
-    {
+};
+
+struct VecRecord: Serializable
+{
    GRID_SERIALIZABLE_CLASS_MEMBERS(VecRecord,
                                    unsigned int, index,
                                    double,       eval);
    VecRecord(void): index(0), eval(0.) {}
-    };
+};
+
+namespace EigenPackIo
+{
+    inline void readHeader(PackRecord &record, ScidacReader &binReader)
+    {
+        std::string recordXml;
+
+        binReader.readLimeObject(recordXml, SCIDAC_FILE_XML);
+        XmlReader xmlReader(recordXml, true, "eigenPackPar");
+        xmlReader.push();
+        xmlReader.readCurrentSubtree(record.operatorXml);
+        xmlReader.nextElement();
+        xmlReader.readCurrentSubtree(record.solverXml);
+    }
+
+    template <typename T, typename TIo = T>
+    void readElement(T &evec, RealD &eval, const unsigned int index,
+                     ScidacReader &binReader, TIo *ioBuf = nullptr)
+    {
+        VecRecord vecRecord;
+
+        LOG(Message) << "Reading eigenvector " << index << std::endl;
+        if (ioBuf == nullptr)
+        {
+            binReader.readScidacFieldRecord(evec, vecRecord);
+        }
+        else
+        {
+            binReader.readScidacFieldRecord(*ioBuf, vecRecord);
+            precisionChange(evec, *ioBuf);
+        }
+        if (vecRecord.index != index)
+        {
+            HADRONS_ERROR(Io, "Eigenvector " + std::to_string(index) + " has a"
+                            + " wrong index (expected " + std::to_string(vecRecord.index) 
+                            + ")");
+        }
+        eval = vecRecord.eval;
+    }
+
+    template <typename T, typename TIo = T>
+    static void readPack(std::vector<T> &evec, std::vector<RealD> &eval,
+                         PackRecord &record, const std::string filename, 
+                         const unsigned int size, bool multiFile, 
+                         GridBase *gridIo = nullptr)
+    {
+        std::unique_ptr<TIo> ioBuf{nullptr};
+        ScidacReader         binReader;
+
+        if (typeHash<T>() != typeHash<TIo>())
+        {
+            if (gridIo == nullptr)
+            {
+                HADRONS_ERROR(Definition, 
+                              "I/O type different from vector type but null I/O grid passed");
+            }
+            ioBuf.reset(new TIo(gridIo));
+        }
+        if (multiFile)
+        {
+            std::string fullFilename;
+
+            for(int k = 0; k < size; ++k) 
+            {
+                fullFilename = filename + "/v" + std::to_string(k) + ".bin";
+                binReader.open(fullFilename);
+                readHeader(record, binReader);
+                readElement(evec[k], eval[k], k, binReader, ioBuf.get());
+                binReader.close();
+            }
+        }
+        else
+        {
+            binReader.open(filename);
+            readHeader(record, binReader);
+            for(int k = 0; k < size; ++k) 
+            {
+                readElement(evec[k], eval[k], k, binReader, ioBuf.get());
+            }
+            binReader.close();
+        }
+    }
+
+    inline void writeHeader(ScidacWriter &binWriter, PackRecord &record)
+    {
+        XmlWriter xmlWriter("", "eigenPackPar");
+
+        xmlWriter.pushXmlString(record.operatorXml);
+        xmlWriter.pushXmlString(record.solverXml);
+        binWriter.writeLimeObject(1, 1, xmlWriter, "parameters", SCIDAC_FILE_XML);
+    }
+
+    template <typename T, typename TIo = T>
+    void writeElement(ScidacWriter &binWriter, T &evec, RealD &eval, 
+                      const unsigned int index, TIo *ioBuf, 
+                      T *testBuf = nullptr)
+    {
+        VecRecord vecRecord;
+
+        LOG(Message) << "Writing eigenvector " << index << std::endl;
+        vecRecord.eval  = eval;
+        vecRecord.index = index;
+        if ((ioBuf == nullptr) || (testBuf == nullptr))
+        {
+            binWriter.writeScidacFieldRecord(evec, vecRecord, DEFAULT_ASCII_PREC);
+        }
+        else
+        {
+            precisionChange(*ioBuf, evec);
+            precisionChange(*testBuf, *ioBuf);
+            *testBuf -= evec;
+            LOG(Message) << "Precision diff norm^2 " << norm2(*testBuf) << std::endl;
+            binWriter.writeScidacFieldRecord(*ioBuf, vecRecord, DEFAULT_ASCII_PREC);
+        }   
+    }
+    
+    template <typename T, typename TIo = T>
+    static void writePack(const std::string filename, std::vector<T> &evec, 
+                          std::vector<RealD> &eval, PackRecord &record, 
+                          const unsigned int size, bool multiFile, 
+                          GridBase *gridIo = nullptr)
+    {
+        GridBase             *grid = evec[0]._grid;
+        std::unique_ptr<TIo> ioBuf{nullptr}; 
+        std::unique_ptr<T>   testBuf{nullptr};
+        ScidacWriter         binWriter(grid->IsBoss());
+
+        if (typeHash<T>() != typeHash<TIo>())
+        {
+            if (gridIo == nullptr)
+            {
+                HADRONS_ERROR(Definition, 
+                              "I/O type different from vector type but null I/O grid passed");
+            }
+            ioBuf.reset(new TIo(gridIo));
+            testBuf.reset(new T(grid));
+        }
+        if (multiFile)
+        {
+            std::string fullFilename;
+
+            for(int k = 0; k < size; ++k) 
+            {
+                fullFilename = filename + "/v" + std::to_string(k) + ".bin";
+
+                makeFileDir(fullFilename, grid);
+                binWriter.open(fullFilename);
+                writeHeader(binWriter, record);
+                writeElement(binWriter, evec[k], eval[k], k, ioBuf.get(), testBuf.get());
+                binWriter.close();
+            }
+        }
+        else
+        {
+            makeFileDir(filename, grid);
+            binWriter.open(filename);
+            writeHeader(binWriter, record);
+            for(int k = 0; k < size; ++k) 
+            {
+                writeElement(binWriter, evec[k], eval[k], k, ioBuf.get(), testBuf.get());
+            }
+            binWriter.close();
+        }
+    }
+}
+
+template <typename F>
+class BaseEigenPack
+{
+public:
+    typedef F Field;
 public:
    std::vector<RealD> eval;
    std::vector<F>     evec;
    PackRecord         record;
 public:
-    EigenPack(void)          = default;
-    virtual ~EigenPack(void) = default;
-
-    EigenPack(const size_t size, GridBase *grid)
+    BaseEigenPack(void)          = default;
+    BaseEigenPack(const size_t size, GridBase *grid)
    {
        resize(size, grid);
    }
-
+    virtual ~BaseEigenPack(void) = default;
    void resize(const size_t size, GridBase *grid)
    {
        eval.resize(size);
        evec.resize(size, grid);
    }
+};
+
+template <typename F, typename FIo = F>
+class EigenPack: public BaseEigenPack<F>
+{
+public:
+    typedef F   Field;
+    typedef FIo FieldIo;
+public:
+    EigenPack(void)          = default;
+    virtual ~EigenPack(void) = default;
+
+    EigenPack(const size_t size, GridBase *grid, GridBase *gridIo = nullptr)
+    : BaseEigenPack<F>(size, grid)
+    {
+        if (typeHash<F>() != typeHash<FIo>())
+        {
+            if (gridIo == nullptr)
+            {
+                HADRONS_ERROR(Definition, 
+                              "I/O type different from vector type but null I/O grid passed");
+            }
+        }
+        gridIo_ = gridIo;
+    }

    virtual void read(const std::string fileStem, const bool multiFile, const int traj = -1)
    {
-        if (multiFile)
-        {
-            for(int k = 0; k < evec.size(); ++k)
-            {
-                basicReadSingle(evec[k], eval[k], evecFilename(fileStem, k, traj), k);
-            }
-        }
-        else
-        {
-            basicRead(evec, eval, evecFilename(fileStem, -1, traj), evec.size());
-        }
+        EigenPackIo::readPack<F, FIo>(this->evec, this->eval, this->record, 
+                                      evecFilename(fileStem, traj, multiFile), 
+                                      this->evec.size(), multiFile, gridIo_);
+        HADRONS_DUMP_EP_METADATA(this->record);
    }

    virtual void write(const std::string fileStem, const bool multiFile, const int traj = -1)
    {
-        if (multiFile)
-        {
-            for(int k = 0; k < evec.size(); ++k)
-            {
-                basicWriteSingle(evecFilename(fileStem, k, traj), evec[k], eval[k], k);
-            }
-        }
-        else
-        {
-            basicWrite(evecFilename(fileStem, -1, traj), evec, eval, evec.size());
-        }
+        EigenPackIo::writePack<F, FIo>(evecFilename(fileStem, traj, multiFile), 
+                                       this->evec, this->eval, this->record, 
+                                       this->evec.size(), multiFile, gridIo_);
    }
 protected:
-    std::string evecFilename(const std::string stem, const int vec, const int traj)
+    std::string evecFilename(const std::string stem, const int traj, const bool multiFile)
    {
        std::string t = (traj < 0) ? "" : ("." + std::to_string(traj));

-        if (vec == -1)
+        if (multiFile)
        {
-            return stem + t + ".bin";
+            return stem + t;
        }
        else
        {
-            return stem + t + "/v" + std::to_string(vec) + ".bin";
+            return stem + t + ".bin";
        }
    }
-
-    template <typename T>
-    void basicRead(std::vector<T> &evec, std::vector<RealD> &eval,
-                   const std::string filename, const unsigned int size)
-    {
-        ScidacReader    binReader;
-
-        binReader.open(filename);
-        binReader.skipPastObjectRecord(SCIDAC_FILE_XML);
-        for(int k = 0; k < size; ++k) 
-        {
-            VecRecord vecRecord;
-
-            LOG(Message) << "Reading eigenvector " << k << std::endl;
-            binReader.readScidacFieldRecord(evec[k], vecRecord);
-            if (vecRecord.index != k)
-            {
-                HADRONS_ERROR(Io, "Eigenvector " + std::to_string(k) + " has a"
-                              + " wrong index (expected " + std::to_string(vecRecord.index) 
-                              + ") in file '" + filename + "'");
-            }
-            eval[k] = vecRecord.eval;
-        }
-        binReader.close();
-    }
-
-    template <typename T>
-    void basicReadSingle(T &evec, RealD &eval, const std::string filename, 
-                         const unsigned int index)
-    {
-        ScidacReader binReader;
-        VecRecord    vecRecord;
-
-        binReader.open(filename);
-        binReader.skipPastObjectRecord(SCIDAC_FILE_XML);
-        LOG(Message) << "Reading eigenvector " << index << std::endl;
-        binReader.readScidacFieldRecord(evec, vecRecord);
-        if (vecRecord.index != index)
-        {
-            HADRONS_ERROR(Io, "Eigenvector " + std::to_string(index) + " has a"
-                          + " wrong index (expected " + std::to_string(vecRecord.index) 
-                          + ") in file '" + filename + "'");
-        }
-        eval = vecRecord.eval;
-        binReader.close();
-    }
-
-    template <typename T>
-    void basicWrite(const std::string filename, std::vector<T> &evec, 
-                    const std::vector<RealD> &eval, const unsigned int size)
-    {
-        ScidacWriter binWriter(evec[0]._grid->IsBoss());
-        XmlWriter    xmlWriter("", "eigenPackPar");
-
-        makeFileDir(filename, evec[0]._grid);
-        xmlWriter.pushXmlString(record.operatorXml);
-        xmlWriter.pushXmlString(record.solverXml);
-        binWriter.open(filename);
-        binWriter.writeLimeObject(1, 1, xmlWriter, "parameters", SCIDAC_FILE_XML);
-        for(int k = 0; k < size; ++k) 
-        {
-            VecRecord vecRecord;
-
-            vecRecord.index = k;
-            vecRecord.eval  = eval[k];
-            LOG(Message) << "Writing eigenvector " << k << std::endl;
-            binWriter.writeScidacFieldRecord(evec[k], vecRecord, DEFAULT_ASCII_PREC);
-        }
-        binWriter.close();
-    }
-
-    template <typename T>
-    void basicWriteSingle(const std::string filename, T &evec, 
-                          const RealD eval, const unsigned int index)
-    {
-        ScidacWriter binWriter(evec._grid->IsBoss());
-        XmlWriter    xmlWriter("", "eigenPackPar");
-        VecRecord    vecRecord;
-
-        makeFileDir(filename, evec._grid);
-        xmlWriter.pushXmlString(record.operatorXml);
-        xmlWriter.pushXmlString(record.solverXml);
-        binWriter.open(filename);
-        binWriter.writeLimeObject(1, 1, xmlWriter, "parameters", SCIDAC_FILE_XML);
-        vecRecord.index = index;
-        vecRecord.eval  = eval;
-        LOG(Message) << "Writing eigenvector " << index << std::endl;
-        binWriter.writeScidacFieldRecord(evec, vecRecord, DEFAULT_ASCII_PREC);
-        binWriter.close();
-    }
+protected:
+    GridBase *gridIo_;
 };

-template <typename FineF, typename CoarseF>
-class CoarseEigenPack: public EigenPack<FineF>
+template <typename FineF, typename CoarseF, 
+          typename FineFIo = FineF, typename CoarseFIo = CoarseF>
+class CoarseEigenPack: public EigenPack<FineF, FineFIo>
 {
 public:
    typedef CoarseF CoarseField;         
-public:
-    std::vector<RealD>   evalCoarse;
    std::vector<CoarseF> evecCoarse;
+    std::vector<RealD>   evalCoarse;
 public:
    CoarseEigenPack(void)          = default;
    virtual ~CoarseEigenPack(void) = default;

    CoarseEigenPack(const size_t sizeFine, const size_t sizeCoarse, 
-                    GridBase *gridFine, GridBase *gridCoarse)
+                    GridBase *gridFine, GridBase *gridCoarse,
+                    GridBase *gridFineIo = nullptr, 
+                    GridBase *gridCoarseIo = nullptr)
    {
+        if (typeHash<FineF>() != typeHash<FineFIo>())
+        {
+            if (gridFineIo == nullptr)
+            {
+                HADRONS_ERROR(Definition, 
+                              "Fine I/O type different from vector type but null fine I/O grid passed");
+            }
+        }
+        if (typeHash<CoarseF>() != typeHash<CoarseFIo>())
+        {
+            if (gridCoarseIo == nullptr)
+            {
+                HADRONS_ERROR(Definition, 
+                              "Coarse I/O type different from vector type but null coarse I/O grid passed");
+            }
+        }
+        this->gridIo_ = gridFineIo;
+        gridCoarseIo_ = gridCoarseIo;
        resize(sizeFine, sizeCoarse, gridFine, gridCoarse);
    }

    void resize(const size_t sizeFine, const size_t sizeCoarse, 
                GridBase *gridFine, GridBase *gridCoarse)
    {
-        EigenPack<FineF>::resize(sizeFine, gridFine);
+        EigenPack<FineF, FineFIo>::resize(sizeFine, gridFine);
        evalCoarse.resize(sizeCoarse);
        evecCoarse.resize(sizeCoarse, gridCoarse);
    }

    void readFine(const std::string fileStem, const bool multiFile, const int traj = -1)
    {
-        if (multiFile)
-        {
-            for(int k = 0; k < this->evec.size(); ++k)
-            {
-                this->basicReadSingle(this->evec[k], this->eval[k], this->evecFilename(fileStem + "_fine", k, traj), k);
-            }
-        }
-        else
-        {
-            this->basicRead(this->evec, this->eval, this->evecFilename(fileStem + "_fine", -1, traj), this->evec.size());
-        }
+        EigenPack<FineF, FineFIo>::read(fileStem + "_fine", multiFile, traj);
    }

    void readCoarse(const std::string fileStem, const bool multiFile, const int traj = -1)
    {
-        if (multiFile)
-        {
-            for(int k = 0; k < evecCoarse.size(); ++k)
-            {
-                this->basicReadSingle(evecCoarse[k], evalCoarse[k], this->evecFilename(fileStem + "_coarse", k, traj), k);
-            }
-        }
-        else
-        {
-            this->basicRead(evecCoarse, evalCoarse, this->evecFilename(fileStem + "_coarse", -1, traj), evecCoarse.size());
-        }
+        PackRecord dummy;
+
+        EigenPackIo::readPack<CoarseF, CoarseFIo>(evecCoarse, evalCoarse, dummy, 
+                              this->evecFilename(fileStem + "_coarse", traj, multiFile), 
+                              evecCoarse.size(), multiFile, gridCoarseIo_);
    }

    virtual void read(const std::string fileStem, const bool multiFile, const int traj = -1)
@@ -273,32 +371,14 @@ public:

    void writeFine(const std::string fileStem, const bool multiFile, const int traj = -1)
    {
-        if (multiFile)
-        {
-            for(int k = 0; k < this->evec.size(); ++k)
-            {
-                this->basicWriteSingle(this->evecFilename(fileStem + "_fine", k, traj), this->evec[k], this->eval[k], k);
-            }
-        }
-        else
-        {
-            this->basicWrite(this->evecFilename(fileStem + "_fine", -1, traj), this->evec, this->eval, this->evec.size());
-        }
+        EigenPack<FineF, FineFIo>::write(fileStem + "_fine", multiFile, traj);
    }

    void writeCoarse(const std::string fileStem, const bool multiFile, const int traj = -1)
    {
-        if (multiFile)
-        {
-            for(int k = 0; k < evecCoarse.size(); ++k)
-            {
-                this->basicWriteSingle(this->evecFilename(fileStem + "_coarse", k, traj), evecCoarse[k], evalCoarse[k], k);
-            }
-        }
-        else
-        {
-            this->basicWrite(this->evecFilename(fileStem + "_coarse", -1, traj), evecCoarse, evalCoarse, evecCoarse.size());
-        }
+        EigenPackIo::writePack<CoarseF, CoarseFIo>(this->evecFilename(fileStem + "_coarse", traj, multiFile), 
+                                                   evecCoarse, evalCoarse, this->record, 
+                                                   evecCoarse.size(), multiFile, gridCoarseIo_);
    }
    
    virtual void write(const std::string fileStem, const bool multiFile, const int traj = -1)
@@ -306,18 +386,29 @@ public:
        writeFine(fileStem, multiFile, traj);
        writeCoarse(fileStem, multiFile, traj);
    }
+private:
+    GridBase *gridCoarseIo_;
 };

 template <typename FImpl>
-using FermionEigenPack = EigenPack<typename FImpl::FermionField>;
+using BaseFermionEigenPack = BaseEigenPack<typename FImpl::FermionField>;

-template <typename FImpl, int nBasis>
+template <typename FImpl, typename FImplIo = FImpl>
+using FermionEigenPack = EigenPack<typename FImpl::FermionField, typename FImplIo::FermionField>;
+
+template <typename FImpl, int nBasis, typename FImplIo = FImpl>
 using CoarseFermionEigenPack = CoarseEigenPack<
    typename FImpl::FermionField,
    typename LocalCoherenceLanczos<typename FImpl::SiteSpinor, 
                                   typename FImpl::SiteComplex, 
+                                   nBasis>::CoarseField,
+    typename FImplIo::FermionField,
+    typename LocalCoherenceLanczos<typename FImplIo::SiteSpinor, 
+                                   typename FImplIo::SiteComplex, 
                                   nBasis>::CoarseField>;

+#undef HADRONS_DUMP_EP_METADATA
+
 END_HADRONS_NAMESPACE

 #endif // Hadrons_EigenPack_hpp_
--- a/Hadrons/Environment.cc
+++ b/Hadrons/Environment.cc
@@ -45,19 +45,13 @@ Environment::Environment(void)
 {
    dim_         = GridDefaultLatt();
    nd_          = dim_.size();
-    defaultGrid_ = {typeHash<vComplex>(), 1};
-    grid4d_[defaultGrid_].reset(
-        SpaceTimeGrid::makeFourDimGrid(dim_, 
-                                       GridDefaultSimd(nd_, vComplex::Nsimd()),
-                                       GridDefaultMpi()));
-    gridRb4d_[defaultGrid_].reset(
-        SpaceTimeGrid::makeFourDimRedBlackGrid(grid4d_[defaultGrid_].get()));
+    createGrid<vComplex>(1);
    vol_ = 1.;
    for (auto d: dim_)
    {
        vol_ *= d;
    }
-    rng4d_.reset(new GridParallelRNG(grid4d_[defaultGrid_].get()));
+    rng4d_.reset(new GridParallelRNG(getGrid()));
 }

 // grids ///////////////////////////////////////////////////////////////////////
--- a/Hadrons/Environment.hpp
+++ b/Hadrons/Environment.hpp
@@ -94,20 +94,20 @@ public:
    void                    createGrid(const unsigned int Ls);
    template <typename VType = vComplex>
    void                    createCoarseGrid(const std::vector<int> &blockSize,
-                                             const unsigned int Ls = 1);
+                                             const unsigned int Ls);
    template <typename VType = vComplex>
-    GridCartesian *         getGrid(void) const;
+    GridCartesian *         getGrid(void);
    template <typename VType = vComplex>
-    GridRedBlackCartesian * getRbGrid(void) const;
+    GridRedBlackCartesian * getRbGrid(void);
    template <typename VType = vComplex>
-    GridCartesian *         getCoarseGrid(const std::vector<int> &blockSize) const;
+    GridCartesian *         getCoarseGrid(const std::vector<int> &blockSize);
    template <typename VType = vComplex>
-    GridCartesian *         getGrid(const unsigned int Ls) const;
+    GridCartesian *         getGrid(const unsigned int Ls);
    template <typename VType = vComplex>
-    GridRedBlackCartesian * getRbGrid(const unsigned int Ls) const;
+    GridRedBlackCartesian * getRbGrid(const unsigned int Ls);
    template <typename VType = vComplex>
    GridCartesian *         getCoarseGrid(const std::vector<int> &blockSize,
-                                          const unsigned int Ls) const;
+                                          const unsigned int Ls);
    std::vector<int>        getDim(void) const;
    int                     getDim(const unsigned int mu) const;
    unsigned int            getNd(void) const;
@@ -174,7 +174,6 @@ private:
    bool                                protect_{true};
    // grids
    std::vector<int>                    dim_;
-    FineGridKey                         defaultGrid_;
    std::map<FineGridKey, GridPt>       grid4d_;
    std::map<FineGridKey, GridPt>       grid5d_;
    std::map<FineGridKey, GridRbPt>     gridRb4d_;
@@ -221,6 +220,14 @@ void Holder<T>::reset(T *pt)
 *                     Environment template implementation                    *
 ******************************************************************************/
 // grids ///////////////////////////////////////////////////////////////////////
+#define HADRONS_DUMP_GRID(...)\
+LOG(Debug) << "New grid " << (__VA_ARGS__) << std::endl;\
+LOG(Debug) << " - cb  : " << (__VA_ARGS__)->_isCheckerBoarded << std::endl;\
+LOG(Debug) << " - fdim: " << (__VA_ARGS__)->_fdimensions << std::endl;\
+LOG(Debug) << " - gdim: " << (__VA_ARGS__)->_gdimensions << std::endl;\
+LOG(Debug) << " - ldim: " << (__VA_ARGS__)->_ldimensions << std::endl;\
+LOG(Debug) << " - rdim: " << (__VA_ARGS__)->_rdimensions << std::endl;
+
 template <typename VType>
 void Environment::createGrid(const unsigned int Ls)
 {
@@ -232,15 +239,19 @@ void Environment::createGrid(const unsigned int Ls)
            SpaceTimeGrid::makeFourDimGrid(getDim(), 
                                        GridDefaultSimd(getNd(), VType::Nsimd()),
                                        GridDefaultMpi()));
+        HADRONS_DUMP_GRID(grid4d_[{hash, 1}].get());
        gridRb4d_[{hash, 1}].reset(
            SpaceTimeGrid::makeFourDimRedBlackGrid(grid4d_[{hash, 1}].get()));
+        HADRONS_DUMP_GRID(gridRb4d_[{hash, 1}].get());
    }
    if (grid5d_.find({hash, Ls}) == grid5d_.end())
    {
        auto g = grid4d_[{hash, 1}].get();
        
        grid5d_[{hash, Ls}].reset(SpaceTimeGrid::makeFiveDimGrid(Ls, g));
+        HADRONS_DUMP_GRID(grid5d_[{hash, Ls}].get());
        gridRb5d_[{hash, Ls}].reset(SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, g));
+        HADRONS_DUMP_GRID(gridRb5d_[{hash, Ls}].get());
    }
 }

@@ -290,18 +301,24 @@ void Environment::createCoarseGrid(const std::vector<int> &blockSize,
        gridCoarse4d_[hkey4d].reset(
            SpaceTimeGrid::makeFourDimGrid(coarseDim, 
                GridDefaultSimd(nd, VType::Nsimd()), GridDefaultMpi()));
+        HADRONS_DUMP_GRID(gridCoarse4d_[hkey4d].get());
    }
    if (gridCoarse5d_.find(hkey5d) == gridCoarse5d_.end())
    {
        gridCoarse5d_[hkey5d].reset(
            SpaceTimeGrid::makeFiveDimGrid(cLs, gridCoarse4d_[hkey4d].get()));
+        HADRONS_DUMP_GRID(gridCoarse5d_[hkey5d].get());
    }
 }

+#undef HADRONS_DUMP_GRID
+
 template <typename VType>
-GridCartesian * Environment::getGrid(void) const
+GridCartesian * Environment::getGrid(void)
 {
-    auto it = grid4d_.find({typeHash<VType>(), 1});
+    FineGridKey key = {typeHash<VType>(), 1};
+
+    auto it = grid4d_.find(key);

    if (it != grid4d_.end())
    {
@@ -309,15 +326,17 @@ GridCartesian * Environment::getGrid(void) const
    }
    else
    {
-        HADRONS_ERROR(Definition, "no 4D grid for SIMD type '" 
-                                  + typeName<VType>() + "'");
+        createGrid<VType>(1);
+
+        return grid4d_.at(key).get();
    }
 }

 template <typename VType>
-GridRedBlackCartesian * Environment::getRbGrid(void) const
+GridRedBlackCartesian * Environment::getRbGrid(void)
 {
-    auto it = gridRb4d_.find({typeHash<VType>(), 1});
+    FineGridKey key = {typeHash<VType>(), 1};
+    auto        it  = gridRb4d_.find(key);

    if (it != gridRb4d_.end())
    {
@@ -325,34 +344,39 @@ GridRedBlackCartesian * Environment::getRbGrid(void) const
    }
    else
    {
-        HADRONS_ERROR(Definition, "no 4D red-black grid for SIMD type '" 
-                                  + typeName<VType>() + "'");
+        createGrid<VType>(1);
+
+        return gridRb4d_.at(key).get();
    }
 }

 template <typename VType>
-GridCartesian * Environment::getCoarseGrid(const std::vector<int> &blockSize) const
+GridCartesian * Environment::getCoarseGrid(const std::vector<int> &blockSize)
 {
    std::vector<int> s = blockSize;

    s.resize(getNd());
-    auto it = gridCoarse4d_.find({typeHash<VType>(), s});
+
+    CoarseGridKey key = {typeHash<VType>(), s};
+    auto          it  = gridCoarse4d_.find(key);
+
    if (it != gridCoarse4d_.end())
    {
        return it->second.get();
    }
    else
    {
-        HADRONS_ERROR(Definition, "no 4D coarse grid for SIMD type '" 
-                                  + typeName<VType>() + "' and block size "
-                                  + vecToStr(blockSize));
+        createCoarseGrid<VType>(blockSize, 1);
+        
+        return gridCoarse4d_.at(key).get();
    }
 }

 template <typename VType>
-GridCartesian * Environment::getGrid(const unsigned int Ls) const
+GridCartesian * Environment::getGrid(const unsigned int Ls)
 {
-    auto it = grid5d_.find({typeHash<VType>(), Ls});
+    FineGridKey key = {typeHash<VType>(), Ls};
+    auto        it  = grid5d_.find(key);

    if (it != grid5d_.end())
    {
@@ -360,16 +384,17 @@ GridCartesian * Environment::getGrid(const unsigned int Ls) const
    }
    else
    {
-        HADRONS_ERROR(Definition, "no 5D grid for SIMD type '" 
-                                  + typeName<VType>() + "' and Ls = "
-                                  + std::to_string(Ls));
+        createGrid<VType>(Ls);
+
+        return grid5d_.at(key).get();
    }
 }

 template <typename VType>
-GridRedBlackCartesian * Environment::getRbGrid(const unsigned int Ls) const
+GridRedBlackCartesian * Environment::getRbGrid(const unsigned int Ls)
 {
-    auto it = gridRb5d_.find({typeHash<VType>(), Ls});
+    FineGridKey key = {typeHash<VType>(), Ls};
+    auto        it  = gridRb5d_.find(key);

    if (it != gridRb5d_.end())
    {
@@ -377,30 +402,32 @@ GridRedBlackCartesian * Environment::getRbGrid(const unsigned int Ls) const
    }
    else
    {
-        HADRONS_ERROR(Definition, "no 5D red-black grid for SIMD type '" 
-                                  + typeName<VType>() + "' and Ls = "
-                                  + std::to_string(Ls));
+        createGrid<VType>(Ls);
+
+        return gridRb5d_.at(key).get();
    }
 }

 template <typename VType>
 GridCartesian * Environment::getCoarseGrid(const std::vector<int> &blockSize,
-                                           const unsigned int Ls) const
+                                           const unsigned int Ls)
 {
    std::vector<int> s = blockSize;

    s.push_back(Ls);
-    auto it = gridCoarse5d_.find({typeHash<VType>(), s});
+
+    CoarseGridKey key = {typeHash<VType>(), s};
+
+    auto it = gridCoarse5d_.find(key);
    if (it != gridCoarse5d_.end())
    {
        return it->second.get();
    }
    else
    {
-        HADRONS_ERROR(Definition, "no 5D coarse grid for SIMD type '" 
-                                  + typeName<VType>() + "', block size "
-                                  + vecToStr(blockSize)
-                                  + " and Ls = " + std::to_string(Ls));
+        createCoarseGrid<VType>(blockSize, Ls);
+
+        return gridCoarse5d_.at(key).get();
    }
 }

--- a/Hadrons/Global.hpp
+++ b/Hadrons/Global.hpp
@@ -96,19 +96,22 @@ using Grid::operator>>;
 BEGIN_HADRONS_NAMESPACE

 // type aliases
-#define FERM_TYPE_ALIASES(FImpl, suffix)\
-typedef FermionOperator<FImpl>                        FMat##suffix;            \
-typedef typename FImpl::FermionField                  FermionField##suffix;    \
-typedef typename FImpl::PropagatorField               PropagatorField##suffix; \
-typedef typename FImpl::SitePropagator::scalar_object SitePropagator##suffix;  \
+#define BASIC_TYPE_ALIASES(Impl, suffix)\
+typedef typename Impl::Field                         ScalarField##suffix;\
+typedef typename Impl::PropagatorField               PropagatorField##suffix;\
+typedef typename Impl::SitePropagator::scalar_object SitePropagator##suffix;\
 typedef std::vector<SitePropagator##suffix>          SlicedPropagator##suffix;

-#define GAUGE_TYPE_ALIASES(FImpl, suffix)\
-typedef typename FImpl::DoubledGaugeField DoubledGaugeField##suffix;
+#define FERM_TYPE_ALIASES(FImpl, suffix)\
+BASIC_TYPE_ALIASES(FImpl, suffix);\
+typedef FermionOperator<FImpl>            FMat##suffix;\
+typedef typename FImpl::FermionField      FermionField##suffix;\
+typedef typename FImpl::GaugeField        GaugeField##suffix;\
+typedef typename FImpl::DoubledGaugeField DoubledGaugeField##suffix;\
+typedef typename FImpl::ComplexField      ComplexField##suffix;

-#define SCALAR_TYPE_ALIASES(SImpl, suffix)\
-typedef typename SImpl::Field ScalarField##suffix;\
-typedef typename SImpl::Field PropagatorField##suffix;
+#define GAUGE_TYPE_ALIASES(GImpl, suffix)\
+typedef typename GImpl::GaugeField GaugeField##suffix;

 #define SOLVER_TYPE_ALIASES(FImpl, suffix)\
 typedef Solver<FImpl> Solver##suffix;
@@ -117,10 +120,6 @@ typedef Solver<FImpl> Solver##suffix;
 typedef std::function<SlicedPropagator##suffix\
                      (const PropagatorField##suffix &)> SinkFn##suffix;

-#define FG_TYPE_ALIASES(FImpl, suffix)\
-FERM_TYPE_ALIASES(FImpl, suffix)\
-GAUGE_TYPE_ALIASES(FImpl, suffix)
-
 // logger
 class HadronsLogger: public Logger
 {
--- a/Hadrons/Makefile.am
+++ b/Hadrons/Makefile.am
@@ -1,5 +1,6 @@
+SUBDIRS = . Utilities
+
 lib_LIBRARIES = libHadrons.a
-bin_PROGRAMS  = HadronsXmlRun

 include modules.inc

@@ -10,6 +11,7 @@ libHadrons_a_SOURCES = \
 	Exceptions.cc      \
    Global.cc          \
    Module.cc		   \
+	TimerArray.cc      \
 	VirtualMachine.cc
 libHadrons_adir = $(includedir)/Hadrons
 nobase_libHadrons_a_HEADERS = \
@@ -30,7 +32,5 @@ nobase_libHadrons_a_HEADERS = \
 	Modules.hpp               \
 	ModuleFactory.hpp         \
 	Solver.hpp                \
+	TimerArray.hpp            \
 	VirtualMachine.hpp
-
-HadronsXmlRun_SOURCES = HadronsXmlRun.cc
-HadronsXmlRun_LDADD   = libHadrons.a -lGrid
--- a/Hadrons/Module.cc
+++ b/Hadrons/Module.cc
@@ -66,101 +66,6 @@ void ModuleBase::operator()(void)
    stopAllTimers();
 }

-// timers //////////////////////////////////////////////////////////////////////
-void ModuleBase::startTimer(const std::string &name)
-{
-    if (!name.empty())
-    {
-        timer_[name].Start();
-    }
-}
-
-GridTime ModuleBase::getTimer(const std::string &name)
-{
-    GridTime t;
-    
-    if (!name.empty())
-    {
-        try
-        {
-            bool running = timer_.at(name).isRunning();
-
-            if (running) stopTimer(name);
-            t = timer_.at(name).Elapsed();
-            if (running) startTimer(name);
-        }
-        catch (std::out_of_range &)
-        {
-            t = GridTime::zero();
-        }
-    }
-    else
-    {
-        t = GridTime::zero();
-    }
-
-    return t;
-}
-
-double ModuleBase::getDTimer(const std::string &name)
-{
-    return static_cast<double>(getTimer(name).count());
-}
-
-void ModuleBase::startCurrentTimer(const std::string &name)
-{
-    if (!name.empty())
-    {
-        stopCurrentTimer();
-        startTimer(name);
-        currentTimer_ = name;
-    }
-}
-
-void ModuleBase::stopTimer(const std::string &name)
-{
-    if (timer_.at(name).isRunning())
-    {
-        timer_.at(name).Stop();
-    }
-}
-
-void ModuleBase::stopCurrentTimer(void)
-{
-    if (!currentTimer_.empty())
-    {
-        stopTimer(currentTimer_);
-        currentTimer_ = "";
-    }
-}
-
-void ModuleBase::stopAllTimers(void)
-{
-    for (auto &t: timer_)
-    {
-        stopTimer(t.first);
-    }
-    currentTimer_ = "";
-}
-
-void ModuleBase::resetTimers(void)
-{
-    timer_.clear();
-    currentTimer_ = "";
-}
-
-std::map<std::string, GridTime> ModuleBase::getTimings(void)
-{
-    std::map<std::string, GridTime> timing;
-
-    for (auto &t: timer_)
-    {
-        timing[t.first] = t.second.Elapsed();
-    }
-
-    return timing;
-}
-
 std::string ModuleBase::makeSeedString(void)
 {
    std::string seed;
--- a/Hadrons/Module.hpp
+++ b/Hadrons/Module.hpp
@@ -30,6 +30,7 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #define Hadrons_Module_hpp_

 #include <Hadrons/Global.hpp>
+#include <Hadrons/TimerArray.hpp>
 #include <Hadrons/VirtualMachine.hpp>

 BEGIN_HADRONS_NAMESPACE
@@ -65,7 +66,26 @@ extern template class base;\
 MODULE_REGISTER(mod, ARG(base), ns);

 #define ARG(...) __VA_ARGS__
-#define MACRO_REDIRECT(arg1, arg2, arg3, macro, ...) macro
+#define HADRONS_MACRO_REDIRECT_12(arg1, arg2, macro, ...) macro
+#define HADRONS_MACRO_REDIRECT_23(arg1, arg2, arg3, macro, ...) macro
+
+#define envGetGrid4(latticeType)\
+env().template getGrid<typename latticeType::vector_type>()
+
+#define envGetGrid5(latticeType, Ls)\
+env().template getGrid<typename latticeType::vector_type>(Ls)
+
+#define envGetGrid(...)\
+HADRONS_MACRO_REDIRECT_12(__VA_ARGS__, envGetGrid5, envGetGrid4)(__VA_ARGS__)
+
+#define envGetRbGrid4(latticeType)\
+env().template getRbGrid<typename latticeType::vector_type>()
+
+#define envGetRbGrid5(latticeType, Ls)\
+env().template getRbGrid<typename latticeType::vector_type>(Ls)
+
+#define envGetRbGrid(...)\
+HADRONS_MACRO_REDIRECT_12(__VA_ARGS__, envGetRbGrid5, envGetRbGrid4)(__VA_ARGS__)

 #define envGet(type, name)\
 *env().template getObject<type>(name)
@@ -86,38 +106,38 @@ env().template createObject<type>(name, Environment::Storage::object, Ls, __VA_A
 env().template createDerivedObject<base, type>(name, Environment::Storage::object, Ls, __VA_ARGS__)

 #define envCreateLat4(type, name)\
-envCreate(type, name, 1, env().getGrid())
+envCreate(type, name, 1, envGetGrid(type))

 #define envCreateLat5(type, name, Ls)\
-envCreate(type, name, Ls, env().getGrid(Ls))
+envCreate(type, name, Ls, envGetGrid(type, Ls))

 #define envCreateLat(...)\
-MACRO_REDIRECT(__VA_ARGS__, envCreateLat5, envCreateLat4)(__VA_ARGS__)
+HADRONS_MACRO_REDIRECT_23(__VA_ARGS__, envCreateLat5, envCreateLat4)(__VA_ARGS__)

 #define envCache(type, name, Ls, ...)\
 env().template createObject<type>(name, Environment::Storage::cache, Ls, __VA_ARGS__)

 #define envCacheLat4(type, name)\
-envCache(type, name, 1, env().getGrid())
+envCache(type, name, 1, envGetGrid(type))

 #define envCacheLat5(type, name, Ls)\
-envCache(type, name, Ls, env().getGrid(Ls))
+envCache(type, name, Ls, envGetGrid(type, Ls))

 #define envCacheLat(...)\
-MACRO_REDIRECT(__VA_ARGS__, envCacheLat5, envCacheLat4)(__VA_ARGS__)
+HADRONS_MACRO_REDIRECT_23(__VA_ARGS__, envCacheLat5, envCacheLat4)(__VA_ARGS__)

 #define envTmp(type, name, Ls, ...)\
 env().template createObject<type>(getName() + "_tmp_" + name,         \
                                  Environment::Storage::temporary, Ls, __VA_ARGS__)

 #define envTmpLat4(type, name)\
-envTmp(type, name, 1, env().getGrid())
+envTmp(type, name, 1, envGetGrid(type))

 #define envTmpLat5(type, name, Ls)\
-envTmp(type, name, Ls, env().getGrid(Ls))
+envTmp(type, name, Ls, envGetGrid(type, Ls))

 #define envTmpLat(...)\
-MACRO_REDIRECT(__VA_ARGS__, envTmpLat5, envTmpLat4)(__VA_ARGS__)
+HADRONS_MACRO_REDIRECT_23(__VA_ARGS__, envTmpLat5, envTmpLat4)(__VA_ARGS__)

 #define saveResult(ioStem, name, result)\
 if (env().getGrid()->IsBoss() and !ioStem.empty())\
@@ -133,7 +153,7 @@ if (env().getGrid()->IsBoss() and !ioStem.empty())\
 *                            Module class                                    *
 ******************************************************************************/
 // base class
-class ModuleBase
+class ModuleBase: public TimerArray
 {
 public:
    // constructor
@@ -161,16 +181,6 @@ public:
    virtual void execute(void) = 0;
    // execution
    void operator()(void);
-    // timers
-    void                            startTimer(const std::string &name);
-    GridTime                        getTimer(const std::string &name);
-    double                          getDTimer(const std::string &name);
-    void                            startCurrentTimer(const std::string &name);
-    void                            stopTimer(const std::string &name);
-    void                            stopCurrentTimer(void);
-    void                            stopAllTimers(void);
-    void                            resetTimers(void);
-    std::map<std::string, GridTime> getTimings(void);
 protected:
    // environment shortcut
    DEFINE_ENV_ALIAS;
--- a/Hadrons/Modules.hpp
+++ b/Hadrons/Modules.hpp
@@ -1,33 +1,6 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid 
-
-Source file: Hadrons/Modules.hpp
-
-Copyright (C) 2015-2018
-
-Author: Antonin Portelli <antonin.portelli@me.com>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
 #include <Hadrons/Modules/MContraction/Baryon.hpp>
+#include <Hadrons/Modules/MContraction/A2AAslashField.hpp>
 #include <Hadrons/Modules/MContraction/A2AMesonField.hpp>
-#include <Hadrons/Modules/MContraction/A2AMesonFieldKernels.hpp>
 #include <Hadrons/Modules/MContraction/Meson.hpp>
 #include <Hadrons/Modules/MContraction/WeakHamiltonian.hpp>
 #include <Hadrons/Modules/MContraction/WeakHamiltonianNonEye.hpp>
@@ -57,6 +30,7 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #include <Hadrons/Modules/MGauge/FundtoHirep.hpp>
 #include <Hadrons/Modules/MGauge/StochEm.hpp>
 #include <Hadrons/Modules/MNoise/TimeDilutedSpinColorDiagonal.hpp>
+#include <Hadrons/Modules/MUtilities/PrecisionCast.hpp>
 #include <Hadrons/Modules/MUtilities/RandomVectors.hpp>
 #include <Hadrons/Modules/MUtilities/TestSeqGamma.hpp>
 #include <Hadrons/Modules/MUtilities/TestSeqConserved.hpp>
--- a/Hadrons/Modules/MAction/DWF.cc
+++ b/Hadrons/Modules/MAction/DWF.cc
@@ -32,4 +32,4 @@ using namespace Hadrons;
 using namespace MAction;

 template class Grid::Hadrons::MAction::TDWF<FIMPL>;
-
+template class Grid::Hadrons::MAction::TDWF<FIMPLF>;
--- a/Hadrons/Modules/MAction/DWF.hpp
+++ b/Hadrons/Modules/MAction/DWF.hpp
@@ -56,7 +56,7 @@ template <typename FImpl>
 class TDWF: public Module<DWFPar>
 {
 public:
-    FG_TYPE_ALIASES(FImpl,);
+    FERM_TYPE_ALIASES(FImpl,);
 public:
    // constructor
    TDWF(const std::string name);
@@ -73,6 +73,7 @@ protected:
 };

 MODULE_REGISTER_TMP(DWF, TDWF<FIMPL>, MAction);
+MODULE_REGISTER_TMP(DWFF, TDWF<FIMPLF>, MAction);

 /******************************************************************************
 *                        DWF template implementation                         *
@@ -111,12 +112,11 @@ void TDWF<FImpl>::setup(void)
    LOG(Message) << "Fermion boundary conditions: " << par().boundary
                 << std::endl;
                 
-    env().createGrid(par().Ls);
-    auto &U    = envGet(LatticeGaugeField, par().gauge);
-    auto &g4   = *env().getGrid();
-    auto &grb4 = *env().getRbGrid();
-    auto &g5   = *env().getGrid(par().Ls);
-    auto &grb5 = *env().getRbGrid(par().Ls);
+    auto &U    = envGet(GaugeField, par().gauge);
+    auto &g4   = *envGetGrid(FermionField);
+    auto &grb4 = *envGetRbGrid(FermionField);
+    auto &g5   = *envGetGrid(FermionField, par().Ls);
+    auto &grb5 = *envGetRbGrid(FermionField, par().Ls);
    std::vector<Complex> boundary = strToVec<Complex>(par().boundary);
    typename DomainWallFermion<FImpl>::ImplParams implParams(boundary);
    envCreateDerived(FMat, DomainWallFermion<FImpl>, getName(), par().Ls, U, g5,
--- a/Hadrons/Modules/MAction/MobiusDWF.cc
+++ b/Hadrons/Modules/MAction/MobiusDWF.cc
@@ -32,3 +32,4 @@ using namespace Hadrons;
 using namespace MAction;

 template class Grid::Hadrons::MAction::TMobiusDWF<FIMPL>;
+template class Grid::Hadrons::MAction::TMobiusDWF<FIMPLF>;
--- a/Hadrons/Modules/MAction/MobiusDWF.hpp
+++ b/Hadrons/Modules/MAction/MobiusDWF.hpp
@@ -56,7 +56,7 @@ template <typename FImpl>
 class TMobiusDWF: public Module<MobiusDWFPar>
 {
 public:
-    FG_TYPE_ALIASES(FImpl,);
+    FERM_TYPE_ALIASES(FImpl,);
 public:
    // constructor
    TMobiusDWF(const std::string name);
@@ -72,6 +72,7 @@ public:
 };

 MODULE_REGISTER_TMP(MobiusDWF, TMobiusDWF<FIMPL>, MAction);
+MODULE_REGISTER_TMP(MobiusDWFF, TMobiusDWF<FIMPLF>, MAction);

 /******************************************************************************
 *                      TMobiusDWF implementation                             *
@@ -111,12 +112,11 @@ void TMobiusDWF<FImpl>::setup(void)
    LOG(Message) << "Fermion boundary conditions: " << par().boundary
                 << std::endl;

-    env().createGrid(par().Ls);
-    auto &U    = envGet(LatticeGaugeField, par().gauge);
-    auto &g4   = *env().getGrid();
-    auto &grb4 = *env().getRbGrid();
-    auto &g5   = *env().getGrid(par().Ls);
-    auto &grb5 = *env().getRbGrid(par().Ls);
+    auto &U    = envGet(GaugeField, par().gauge);
+    auto &g4   = *envGetGrid(FermionField);
+    auto &grb4 = *envGetRbGrid(FermionField);
+    auto &g5   = *envGetGrid(FermionField, par().Ls);
+    auto &grb5 = *envGetRbGrid(FermionField, par().Ls);
    std::vector<Complex> boundary = strToVec<Complex>(par().boundary);
    typename MobiusFermion<FImpl>::ImplParams implParams(boundary);
    envCreateDerived(FMat, MobiusFermion<FImpl>, getName(), par().Ls, U, g5,
--- a/Hadrons/Modules/MAction/ScaledDWF.cc
+++ b/Hadrons/Modules/MAction/ScaledDWF.cc
@@ -32,3 +32,4 @@ using namespace Hadrons;
 using namespace MAction;

 template class Grid::Hadrons::MAction::TScaledDWF<FIMPL>;
+template class Grid::Hadrons::MAction::TScaledDWF<FIMPLF>;
--- a/Hadrons/Modules/MAction/ScaledDWF.hpp
+++ b/Hadrons/Modules/MAction/ScaledDWF.hpp
@@ -55,7 +55,7 @@ template <typename FImpl>
 class TScaledDWF: public Module<ScaledDWFPar>
 {
 public:
-    FG_TYPE_ALIASES(FImpl,);
+    FERM_TYPE_ALIASES(FImpl,);
 public:
    // constructor
    TScaledDWF(const std::string name);
@@ -71,6 +71,7 @@ public:
 };

 MODULE_REGISTER_TMP(ScaledDWF, TScaledDWF<FIMPL>, MAction);
+MODULE_REGISTER_TMP(ScaledDWFF, TScaledDWF<FIMPLF>, MAction);

 /******************************************************************************
 *                      TScaledDWF implementation                             *
@@ -110,12 +111,11 @@ void TScaledDWF<FImpl>::setup(void)
    LOG(Message) << "Fermion boundary conditions: " << par().boundary
                 << std::endl;

-    env().createGrid(par().Ls);
-    auto &U    = envGet(LatticeGaugeField, par().gauge);
-    auto &g4   = *env().getGrid();
-    auto &grb4 = *env().getRbGrid();
-    auto &g5   = *env().getGrid(par().Ls);
-    auto &grb5 = *env().getRbGrid(par().Ls);
+    auto &U    = envGet(GaugeField, par().gauge);
+    auto &g4   = *envGetGrid(FermionField);
+    auto &grb4 = *envGetRbGrid(FermionField);
+    auto &g5   = *envGetGrid(FermionField, par().Ls);
+    auto &grb5 = *envGetRbGrid(FermionField, par().Ls);
    std::vector<Complex> boundary = strToVec<Complex>(par().boundary);
    typename MobiusFermion<FImpl>::ImplParams implParams(boundary);
    envCreateDerived(FMat, ScaledShamirFermion<FImpl>, getName(), par().Ls, U, g5,
--- a/Hadrons/Modules/MAction/Wilson.cc
+++ b/Hadrons/Modules/MAction/Wilson.cc
@@ -32,4 +32,4 @@ using namespace Hadrons;
 using namespace MAction;

 template class Grid::Hadrons::MAction::TWilson<FIMPL>;
-
+template class Grid::Hadrons::MAction::TWilson<FIMPLF>;
--- a/Hadrons/Modules/MAction/Wilson.hpp
+++ b/Hadrons/Modules/MAction/Wilson.hpp
@@ -54,7 +54,7 @@ template <typename FImpl>
 class TWilson: public Module<WilsonPar>
 {
 public:
-    FG_TYPE_ALIASES(FImpl,);
+    FERM_TYPE_ALIASES(FImpl,);
 public:
    // constructor
    TWilson(const std::string name);
@@ -71,6 +71,7 @@ protected:
 };

 MODULE_REGISTER_TMP(Wilson, TWilson<FIMPL>, MAction);
+MODULE_REGISTER_TMP(WilsonF, TWilson<FIMPLF>, MAction);

 /******************************************************************************
 *                     TWilson template implementation                        *
@@ -107,9 +108,9 @@ void TWilson<FImpl>::setup(void)
    LOG(Message) << "Fermion boundary conditions: " << par().boundary
                 << std::endl;
                 
-    auto &U      = envGet(LatticeGaugeField, par().gauge);
-    auto &grid   = *env().getGrid();
-    auto &gridRb = *env().getRbGrid();
+    auto &U      = envGet(GaugeField, par().gauge);
+    auto &grid   = *envGetGrid(FermionField);
+    auto &gridRb = *envGetRbGrid(FermionField);
    std::vector<Complex> boundary = strToVec<Complex>(par().boundary);
    typename WilsonFermion<FImpl>::ImplParams implParams(boundary);
    envCreateDerived(FMat, WilsonFermion<FImpl>, getName(), 1, U, grid, gridRb,
--- a/Hadrons/Modules/MAction/WilsonClover.cc
+++ b/Hadrons/Modules/MAction/WilsonClover.cc
@@ -32,4 +32,4 @@ using namespace Hadrons;
 using namespace MAction;

 template class Grid::Hadrons::MAction::TWilsonClover<FIMPL>;
-
+template class Grid::Hadrons::MAction::TWilsonClover<FIMPLF>;
--- a/Hadrons/Modules/MAction/WilsonClover.hpp
+++ b/Hadrons/Modules/MAction/WilsonClover.hpp
@@ -59,7 +59,7 @@ template <typename FImpl>
 class TWilsonClover: public Module<WilsonCloverPar>
 {
 public:
-    FG_TYPE_ALIASES(FImpl,);
+    FERM_TYPE_ALIASES(FImpl,);
 public:
    // constructor
    TWilsonClover(const std::string name);
@@ -75,6 +75,7 @@ public:
 };

 MODULE_REGISTER_TMP(WilsonClover, TWilsonClover<FIMPL>, MAction);
+MODULE_REGISTER_TMP(WilsonCloverF, TWilsonClover<FIMPLF>, MAction);

 /******************************************************************************
 *                    TWilsonClover template implementation                   *
@@ -113,16 +114,14 @@ void TWilsonClover<FImpl>::setup(void)
    LOG(Message) << "Clover term csw_r: " << par().csw_r
                 << " csw_t: " << par().csw_t
                 << std::endl;
-    auto &U      = envGet(LatticeGaugeField, par().gauge);
-    auto &grid   = *env().getGrid();
-    auto &gridRb = *env().getRbGrid();
+    auto &U      = envGet(GaugeField, par().gauge);
+    auto &grid   = *envGetGrid(FermionField);
+    auto &gridRb = *envGetRbGrid(FermionField);
    std::vector<Complex> boundary = strToVec<Complex>(par().boundary);
    typename WilsonCloverFermion<FImpl>::ImplParams implParams(boundary);
-    envCreateDerived(FMat, WilsonCloverFermion<FImpl>, getName(), 1, U, grid, gridRb, par().mass,
-						  par().csw_r,
-						  par().csw_t,
-					      par().clover_anisotropy,
-						  implParams); 
+    envCreateDerived(FMat, WilsonCloverFermion<FImpl>, getName(), 1, U, grid,
+                     gridRb, par().mass, par().csw_r, par().csw_t, 
+                     par().clover_anisotropy, implParams); 
 }

 // execution ///////////////////////////////////////////////////////////////////
--- a/Hadrons/Modules/MAction/ZMobiusDWF.cc
+++ b/Hadrons/Modules/MAction/ZMobiusDWF.cc
@@ -32,4 +32,4 @@ using namespace Hadrons;
 using namespace MAction;

 template class Grid::Hadrons::MAction::TZMobiusDWF<ZFIMPL>;
-
+template class Grid::Hadrons::MAction::TZMobiusDWF<ZFIMPLF>;
--- a/Hadrons/Modules/MAction/ZMobiusDWF.hpp
+++ b/Hadrons/Modules/MAction/ZMobiusDWF.hpp
@@ -57,7 +57,7 @@ template <typename FImpl>
 class TZMobiusDWF: public Module<ZMobiusDWFPar>
 {
 public:
-    FG_TYPE_ALIASES(FImpl,);
+    FERM_TYPE_ALIASES(FImpl,);
 public:
    // constructor
    TZMobiusDWF(const std::string name);
@@ -73,6 +73,7 @@ public:
 };

 MODULE_REGISTER_TMP(ZMobiusDWF, TZMobiusDWF<ZFIMPL>, MAction);
+MODULE_REGISTER_TMP(ZMobiusDWFF, TZMobiusDWF<ZFIMPLF>, MAction);

 /******************************************************************************
 *                     TZMobiusDWF implementation                             *
@@ -118,11 +119,11 @@ void TZMobiusDWF<FImpl>::setup(void)
                 << std::endl;

    env().createGrid(par().Ls);
-    auto &U    = envGet(LatticeGaugeField, par().gauge);
-    auto &g4   = *env().getGrid();
-    auto &grb4 = *env().getRbGrid();
-    auto &g5   = *env().getGrid(par().Ls);
-    auto &grb5 = *env().getRbGrid(par().Ls);
+    auto &U    = envGet(GaugeField, par().gauge);
+    auto &g4   = *envGetGrid(FermionField);
+    auto &grb4 = *envGetRbGrid(FermionField);
+    auto &g5   = *envGetGrid(FermionField, par().Ls);
+    auto &grb5 = *envGetRbGrid(FermionField, par().Ls);
    auto omega = par().omega;
    std::vector<Complex> boundary = strToVec<Complex>(par().boundary);
    typename ZMobiusFermion<FImpl>::ImplParams implParams(boundary);
--- a/Hadrons/Modules/MContraction/A2AAslashField.cc
+++ b/Hadrons/Modules/MContraction/A2AAslashField.cc
@@ -0,0 +1,7 @@
+#include <Hadrons/Modules/MContraction/A2AAslashField.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MContraction;
+
+template class Grid::Hadrons::MContraction::TA2AAslashField<FIMPL, PhotonR>;
--- a/Hadrons/Modules/MContraction/A2AAslashField.hpp
+++ b/Hadrons/Modules/MContraction/A2AAslashField.hpp
@@ -0,0 +1,223 @@
+#ifndef Hadrons_MContraction_A2AAslashField_hpp_
+#define Hadrons_MContraction_A2AAslashField_hpp_
+
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
+#include <Hadrons/A2AMatrix.hpp>
+
+#ifndef ASF_IO_TYPE
+#define ASF_IO_TYPE ComplexF
+#endif
+
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *                         A2AAslashField                                 *
+ ******************************************************************************/
+BEGIN_MODULE_NAMESPACE(MContraction)
+
+class A2AAslashFieldPar: Serializable
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(A2AAslashFieldPar,
+                                    int, cacheBlock,
+                                    int, block,
+                                    std::string, left,
+                                    std::string, right,
+                                    std::string, output,
+                                    std::vector<std::string>, emField);
+};
+
+class A2AAslashFieldMetadata: Serializable
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(A2AAslashFieldMetadata,
+                                    std::string, emFieldName);
+};
+
+template <typename T, typename FImpl>
+class AslashFieldKernel: public A2AKernel<T, typename FImpl::FermionField>
+{
+public:
+    typedef typename FImpl::FermionField FermionField;
+public:
+    AslashFieldKernel(const std::vector<LatticeComplex> &emB0,
+                      const std::vector<LatticeComplex> &emB1,
+                      GridBase *grid)
+    : emB0_(emB0), emB1_(emB1), grid_(grid)
+    {
+        vol_ = 1.;
+        for (auto &d: grid_->GlobalDimensions())
+        {
+            vol_ *= d;
+        }
+    }
+
+    virtual ~AslashFieldKernel(void) = default;
+    virtual void operator()(A2AMatrixSet<T> &m, const FermionField *left, 
+                            const FermionField *right,
+                            const unsigned int orthogDim, double &t)
+    {
+        A2Autils<FImpl>::AslashField(m, left, right, emB0_, emB1_, orthogDim, &t);
+    }
+
+    virtual double flops(const unsigned int blockSizei, const unsigned int blockSizej)
+    {
+        return 0.;
+    }
+
+    virtual double bytes(const unsigned int blockSizei, const unsigned int blockSizej)
+    {
+        return 0.;
+    }
+private:
+    const std::vector<LatticeComplex> &emB0_, &emB1_;
+    GridBase                          *grid_;
+    double                            vol_;
+};
+
+template <typename FImpl, typename PhotonImpl>
+class TA2AAslashField: public Module<A2AAslashFieldPar>
+{
+public:
+    FERM_TYPE_ALIASES(FImpl,);
+    typedef typename PhotonImpl::GaugeField EmField;
+    typedef A2AMatrixBlockComputation<Complex, 
+                                      FermionField, 
+                                      A2AAslashFieldMetadata, 
+                                      ASF_IO_TYPE> Computation;
+    typedef AslashFieldKernel<Complex, FImpl> Kernel;
+public:
+    // constructor
+    TA2AAslashField(const std::string name);
+    // destructor
+    virtual ~TA2AAslashField(void) {};
+    // dependency relation
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+    // setup
+    virtual void setup(void);
+    // execution
+    virtual void execute(void);
+};
+
+MODULE_REGISTER_TMP(A2AAslashField, ARG(TA2AAslashField<FIMPL, PhotonR>), MContraction);
+
+/******************************************************************************
+ *                 TA2AAslashField implementation                             *
+ ******************************************************************************/
+// constructor /////////////////////////////////////////////////////////////////
+template <typename FImpl, typename PhotonImpl>
+TA2AAslashField<FImpl, PhotonImpl>::TA2AAslashField(const std::string name)
+: Module<A2AAslashFieldPar>(name)
+{}
+
+// dependencies/products ///////////////////////////////////////////////////////
+template <typename FImpl, typename PhotonImpl>
+std::vector<std::string> TA2AAslashField<FImpl, PhotonImpl>::getInput(void)
+{
+    std::vector<std::string> in = par().emField;
+    
+    in.push_back(par().left);
+    in.push_back(par().right);
+
+    return in;
+}
+
+template <typename FImpl, typename PhotonImpl>
+std::vector<std::string> TA2AAslashField<FImpl, PhotonImpl>::getOutput(void)
+{
+    std::vector<std::string> out = {};
+    
+    return out;
+}
+
+// setup ///////////////////////////////////////////////////////////////////////
+template <typename FImpl, typename PhotonImpl>
+void TA2AAslashField<FImpl, PhotonImpl>::setup(void)
+{
+    envTmp(Computation, "computation", 1, envGetGrid(FermionField), 
+           env().getNd() - 1, par().emField.size(), 1, par().block, 
+           par().cacheBlock, this);
+    envTmp(std::vector<ComplexField>, "B0", 1, 
+           par().emField.size(), envGetGrid(ComplexField));
+    envTmp(std::vector<ComplexField>, "B1", 1, 
+           par().emField.size(), envGetGrid(ComplexField));
+    envTmpLat(ComplexField, "Amu");
+}
+
+// execution ///////////////////////////////////////////////////////////////////
+template <typename FImpl, typename PhotonImpl>
+void TA2AAslashField<FImpl, PhotonImpl>::execute(void)
+{
+    auto &left  = envGet(std::vector<FermionField>, par().left);
+    auto &right = envGet(std::vector<FermionField>, par().right);
+
+    int nt         = env().getDim().back();
+    int N_i        = left.size();
+    int N_j        = right.size();
+    int nem        = par().emField.size();
+    int block      = par().block;
+    int cacheBlock = par().cacheBlock;
+
+    LOG(Message) << "Computing all-to-all A-slash fields" << std::endl;
+    LOG(Message) << "Left: '" << par().left << "' Right: '" << par().right << "'" << std::endl;
+    LOG(Message) << "EM fields:" << std::endl;
+    for (auto &name: par().emField)
+    {
+        LOG(Message) << "  " << name << std::endl;
+    }
+    LOG(Message) << "A-slash field size: " << nt << "*" << N_i << "*" << N_j 
+                 << " (filesize " << sizeString(nt*N_i*N_j*sizeof(ASF_IO_TYPE)) 
+                 << "/EM field)" << std::endl;
+    
+    // preparing "B" complexified fields
+    startTimer("Complexify EM fields");
+    envGetTmp(std::vector<ComplexField>, B0);
+    envGetTmp(std::vector<ComplexField>, B1);
+    for (unsigned int i = 0; i < par().emField.size(); ++i)
+    {
+        auto &A = envGet(EmField, par().emField[i]);
+        envGetTmp(ComplexField, Amu);
+
+        B0[i]  = peekLorentz(A, 0);
+        B0[i] += timesI(peekLorentz(A, 1));
+        B1[i]  = peekLorentz(A, 2);
+        B1[i] += timesI(peekLorentz(A, 3));
+    }
+    stopTimer("Complexify EM fields");
+
+    // I/O name & metadata lambdas
+    auto ionameFn = [this](const unsigned int em, const unsigned int dummy)
+    {
+        return par().emField[em];
+    };
+
+    auto filenameFn = [this, &ionameFn](const unsigned int em, const unsigned int dummy)
+    {
+        return par().output + "." + std::to_string(vm().getTrajectory()) 
+               + "/" + ionameFn(em, dummy) + ".h5";
+    };
+
+    auto metadataFn = [this](const unsigned int em, const unsigned int dummy)
+    {
+        A2AAslashFieldMetadata md;
+
+        md.emFieldName = par().emField[em];
+        
+        return md;
+    };
+
+    // executing computation
+    Kernel kernel(B0, B1, envGetGrid(FermionField));
+
+    envGetTmp(Computation, computation);
+    computation.execute(left, right, kernel, ionameFn, filenameFn, metadataFn);
+}
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_MContraction_A2AAslashField_hpp_
--- a/Hadrons/Modules/MContraction/A2AMesonField.cc
+++ b/Hadrons/Modules/MContraction/A2AMesonField.cc
@@ -33,4 +33,3 @@ using namespace Hadrons;
 using namespace MContraction;

 template class Grid::Hadrons::MContraction::TA2AMesonField<FIMPL>;
-template class Grid::Hadrons::MContraction::TA2AMesonField<ZFIMPL>;
--- a/Hadrons/Modules/MContraction/A2AMesonField.hpp
+++ b/Hadrons/Modules/MContraction/A2AMesonField.hpp
@@ -33,12 +33,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #include <Hadrons/Global.hpp>
 #include <Hadrons/Module.hpp>
 #include <Hadrons/ModuleFactory.hpp>
-#include <Hadrons/A2AVectors.hpp>
 #include <Hadrons/A2AMatrix.hpp>
-#include <Hadrons/Modules/MSolver/A2AVectors.hpp>
-#include <Hadrons/Modules/MContraction/A2AMesonFieldKernels.hpp>

-#define MF_PARALLEL_IO
 #ifndef MF_IO_TYPE
 #define MF_IO_TYPE ComplexF
 #endif
@@ -56,8 +52,8 @@ public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(A2AMesonFieldPar,
                                    int, cacheBlock,
                                    int, block,
-                                    std::string, v,
-                                    std::string, w,
+                                    std::string, left,
+                                    std::string, right,
                                    std::string, output,
                                    std::string, gammas,
                                    std::vector<std::string>, mom);
@@ -71,21 +67,59 @@ public:
                                    Gamma::Algebra, gamma);
 };

+template <typename T, typename FImpl>
+class MesonFieldKernel: public A2AKernel<T, typename FImpl::FermionField>
+{
+public:
+    typedef typename FImpl::FermionField FermionField;
+public:
+    MesonFieldKernel(const std::vector<Gamma::Algebra> &gamma,
+                     const std::vector<LatticeComplex> &mom,
+                     GridBase *grid)
+    : gamma_(gamma), mom_(mom), grid_(grid)
+    {
+        vol_ = 1.;
+        for (auto &d: grid_->GlobalDimensions())
+        {
+            vol_ *= d;
+        }
+    }
+
+    virtual ~MesonFieldKernel(void) = default;
+    virtual void operator()(A2AMatrixSet<T> &m, const FermionField *left, 
+                            const FermionField *right,
+                            const unsigned int orthogDim, double &t)
+    {
+        A2Autils<FImpl>::MesonField(m, left, right, gamma_, mom_, orthogDim, &t);
+    }
+
+    virtual double flops(const unsigned int blockSizei, const unsigned int blockSizej)
+    {
+        return vol_*(2*8.0+6.0+8.0*mom_.size())*blockSizei*blockSizej*gamma_.size();
+    }
+
+    virtual double bytes(const unsigned int blockSizei, const unsigned int blockSizej)
+    {
+        return vol_*(12.0*sizeof(T))*blockSizei*blockSizej
+               +  vol_*(2.0*sizeof(T)*mom_.size())*blockSizei*blockSizej*gamma_.size();
+    }
+private:
+    const std::vector<Gamma::Algebra> &gamma_;
+    const std::vector<LatticeComplex> &mom_;
+    GridBase                          *grid_;
+    double                            vol_;
+};
+
 template <typename FImpl>
 class TA2AMesonField : public Module<A2AMesonFieldPar>
 {
 public:
    FERM_TYPE_ALIASES(FImpl,);
-    SOLVER_TYPE_ALIASES(FImpl,);
-    typedef Eigen::TensorMap<Eigen::Tensor<Complex, 5, Eigen::RowMajor>>    MesonField;
-    typedef Eigen::TensorMap<Eigen::Tensor<MF_IO_TYPE, 5, Eigen::RowMajor>> MesonFieldIo;
-    typedef A2AMatrixIo<MF_IO_TYPE, A2AMesonFieldMetadata>                  MatrixIo;
-    struct IoHelper
-    {
-        MatrixIo              io;
-        A2AMesonFieldMetadata metadata;
-        size_t                offset;
-    };
+    typedef A2AMatrixBlockComputation<Complex, 
+                                      FermionField, 
+                                      A2AMesonFieldMetadata, 
+                                      MF_IO_TYPE> Computation;
+    typedef MesonFieldKernel<Complex, FImpl> Kernel;
 public:
    // constructor
    TA2AMesonField(const std::string name);
@@ -98,21 +132,14 @@ public:
    virtual void setup(void);
    // execution
    virtual void execute(void);
-private:
-    // IO
-    std::string ioname(unsigned int m, unsigned int g) const;
-    std::string filename(unsigned int m, unsigned int g) const;
-    void saveBlock(const MF_IO_TYPE *data, IoHelper &h, unsigned int i, unsigned int j);
 private:
    bool                               hasPhase_{false};
    std::string                        momphName_;
    std::vector<Gamma::Algebra>        gamma_;
    std::vector<std::vector<Real>>     mom_;
-    std::vector<IoHelper>                              nodeIo_;
 };

 MODULE_REGISTER(A2AMesonField, ARG(TA2AMesonField<FIMPL>), MContraction);
-MODULE_REGISTER(ZA2AMesonField, ARG(TA2AMesonField<ZFIMPL>), MContraction);

 /******************************************************************************
 *                  TA2AMesonField implementation                             *
@@ -129,7 +156,7 @@ TA2AMesonField<FImpl>::TA2AMesonField(const std::string name)
 template <typename FImpl>
 std::vector<std::string> TA2AMesonField<FImpl>::getInput(void)
 {
-    std::vector<std::string> in = {par().v, par().w};
+    std::vector<std::string> in = {par().left, par().right};

    return in;
 }
@@ -185,34 +212,31 @@ void TA2AMesonField<FImpl>::setup(void)
        }
        mom_.push_back(p);
    }
-    
-    envCache(std::vector<LatticeComplex>, momphName_, 1, 
-             par().mom.size(), env().getGrid());
-    envTmpLat(LatticeComplex, "coor");
-    // preallocate memory for meson field block
-    auto tgp = env().getDim().back()*gamma_.size()*mom_.size();
-
-    envTmp(Vector<MF_IO_TYPE>, "mfBuf", 1, tgp*par().block*par().block);
-    envTmp(Vector<Complex>, "mfCache", 1, tgp*par().cacheBlock*par().cacheBlock);
+    envCache(std::vector<ComplexField>, momphName_, 1, 
+             par().mom.size(), envGetGrid(ComplexField));
+    envTmpLat(ComplexField, "coor");
+    envTmp(Computation, "computation", 1, envGetGrid(FermionField), 
+           env().getNd() - 1, mom_.size(), gamma_.size(), par().block, 
+           par().cacheBlock, this);
 }

 // execution ///////////////////////////////////////////////////////////////////
 template <typename FImpl>
 void TA2AMesonField<FImpl>::execute(void)
 {
-    auto &v = envGet(std::vector<FermionField>, par().v);
-    auto &w = envGet(std::vector<FermionField>, par().w);
+    auto &left  = envGet(std::vector<FermionField>, par().left);
+    auto &right = envGet(std::vector<FermionField>, par().right);

    int nt         = env().getDim().back();
-    int N_i        = w.size();
-    int N_j        = v.size();
+    int N_i        = left.size();
+    int N_j        = right.size();
    int ngamma     = gamma_.size();
    int nmom       = mom_.size();
    int block      = par().block;
    int cacheBlock = par().cacheBlock;

    LOG(Message) << "Computing all-to-all meson fields" << std::endl;
-    LOG(Message) << "W: '" << par().w << "' V: '" << par().v << "'" << std::endl;
+    LOG(Message) << "Left: '" << par().left << "' Right: '" << par().right << "'" << std::endl;
    LOG(Message) << "Momenta:" << std::endl;
    for (auto &p: mom_)
    {
@@ -227,10 +251,7 @@ void TA2AMesonField<FImpl>::execute(void)
                 << " (filesize " << sizeString(nt*N_i*N_j*sizeof(MF_IO_TYPE)) 
                 << "/momentum/bilinear)" << std::endl;

-    ///////////////////////////////////////////////
-    // Momentum setup
-    ///////////////////////////////////////////////
-    auto &ph = envGet(std::vector<LatticeComplex>, momphName_);
+    auto &ph = envGet(std::vector<ComplexField>, momphName_);

    if (!hasPhase_)
    {
@@ -240,7 +261,7 @@ void TA2AMesonField<FImpl>::execute(void)
            Complex           i(0.0,1.0);
            std::vector<Real> p;

-            envGetTmp(LatticeComplex, coor);
+            envGetTmp(ComplexField, coor);
            ph[j] = zero;
            for(unsigned int mu = 0; mu < mom_[j].size(); mu++)
            {
@@ -253,150 +274,8 @@ void TA2AMesonField<FImpl>::execute(void)
        stopTimer("Momentum phases");
    }

-    //////////////////////////////////////////////////////////////////////////
-    // i,j   is first  loop over SchurBlock factors reusing 5D matrices
-    // ii,jj is second loop over cacheBlock factors for high perf contractoin
-    // iii,jjj are loops within cacheBlock
-    // Total index is sum of these  i+ii+iii etc...
-    //////////////////////////////////////////////////////////////////////////
-    
-    double flops;
-    double bytes;
-    double vol      = env().getVolume();
-    double t_kernel = 0.0;
-    double nodes    = env().getGrid()->NodeCount();
-    double tot_kernel;
-
-    envGetTmp(Vector<MF_IO_TYPE>, mfBuf);
-    envGetTmp(Vector<Complex>, mfCache);
-    
-    double t0    = usecond();
-    int NBlock_i = N_i/block + (((N_i % block) != 0) ? 1 : 0);
-    int NBlock_j = N_j/block + (((N_j % block) != 0) ? 1 : 0);
-
-    for(int i=0;i<N_i;i+=block)
-    for(int j=0;j<N_j;j+=block)
+    auto ionameFn = [this](const unsigned int m, const unsigned int g)
    {
-        // Get the W and V vectors for this block^2 set of terms
-        int N_ii = MIN(N_i-i,block);
-        int N_jj = MIN(N_j-j,block);
-
-        LOG(Message) << "Meson field block " 
-                    << j/block + NBlock_j*i/block + 1 
-                    << "/" << NBlock_i*NBlock_j << " [" << i <<" .. " 
-                    << i+N_ii-1 << ", " << j <<" .. " << j+N_jj-1 << "]" 
-                    << std::endl;
-
-        MesonFieldIo mfBlock(mfBuf.data(),nmom,ngamma,nt,N_ii,N_jj);
-
-        // Series of cache blocked chunks of the contractions within this block
-        flops = 0.0;
-        bytes = 0.0;
-        for(int ii=0;ii<N_ii;ii+=cacheBlock)
-        for(int jj=0;jj<N_jj;jj+=cacheBlock)
-        {
-            int N_iii = MIN(N_ii-ii,cacheBlock);
-            int N_jjj = MIN(N_jj-jj,cacheBlock);
-            MesonField mfCacheBlock(mfCache.data(),nmom,ngamma,nt,N_iii,N_jjj);    
-
-            startTimer("contraction: total");
-            makeMesonFieldBlock(mfCacheBlock, &w[i+ii], &v[j+jj], gamma_, ph, 
-                                env().getNd() - 1, this);
-            stopTimer("contraction: total");
-            
-            // flops for general N_c & N_s
-            flops += vol * ( 2 * 8.0 + 6.0 + 8.0*nmom) * N_iii*N_jjj*ngamma;
-            bytes += vol * (12.0 * sizeof(Complex) ) * N_iii*N_jjj
-                     +  vol * ( 2.0 * sizeof(Complex) *nmom ) * N_iii*N_jjj* ngamma;
-
-            startTimer("cache copy");
-            parallel_for_nest5(int m =0;m< nmom;m++)
-            for(int g =0;g< ngamma;g++)
-            for(int t =0;t< nt;t++)
-            for(int iii=0;iii< N_iii;iii++)
-            for(int jjj=0;jjj< N_jjj;jjj++)
-            {
-                mfBlock(m,g,t,ii+iii,jj+jjj) = mfCacheBlock(m,g,t,iii,jjj);
-            }
-            stopTimer("cache copy");
-        }
-
-        // perf
-        tot_kernel = getDTimer("contraction: colour trace & mom.")
-                     + getDTimer("contraction: local space sum");
-        t_kernel   = tot_kernel - t_kernel;
-        LOG(Message) << "Kernel perf " << flops/t_kernel/1.0e3/nodes 
-                     << " Gflop/s/node " << std::endl;
-        LOG(Message) << "Kernel perf " << bytes/t_kernel*1.0e6/1024/1024/1024/nodes 
-                     << " GB/s/node "  << std::endl;
-        t_kernel = tot_kernel;
-
-        // IO
-        if (!par().output.empty())
-        {
-            double       blockSize, ioTime;
-            unsigned int myRank = env().getGrid()->ThisRank(),
-                         nRank  = env().getGrid()->RankCount();
-        
-            LOG(Message) << "Writing block to disk" << std::endl;
-            ioTime = -getDTimer("IO: write block");
-            startTimer("IO: total");
-            makeFileDir(filename(0, 0), env().getGrid());
-#ifdef MF_PARALLEL_IO
-            env().getGrid()->Barrier();
-            nodeIo_.clear();
-            for(int f = myRank; f < nmom*ngamma; f += nRank)
-            {
-                const unsigned int    m = f/ngamma, g = f % ngamma;
-                IoHelper              h;
-
-                h.io = MatrixIo(filename(m, g), ioname(m, g), nt, N_i, N_j, block);
-                for (auto pmu: mom_[m])
-                {
-                    h.metadata.momentum.push_back(pmu);
-                }
-                h.metadata.gamma = gamma_[g];
-                h.offset         = (m*ngamma + g)*nt*block*block;
-                nodeIo_.push_back(h);
-            }
-            // parallel IO
-            for (auto &h: nodeIo_)
-            {
-                saveBlock(mfBlock.data(), h, i, j);
-            }
-            env().getGrid()->Barrier();
-#else
-            // serial IO
-            for(int m = 0; m < nmom; m++)
-            for(int g = 0; g < ngamma; g++)
-            {
-                IoHelper h;
-
-                h.io = MatrixIo(filename(m, g), ioname(m, g), nt, N_i, N_j, block);
-                for (auto pmu: mom_[m])
-                {
-                    h.metadata.momentum.push_back(pmu);
-                }
-                h.metadata.gamma = gamma_[g];
-                h.offset         = (m*ngamma + g)*nt*block*block;
-                saveBlock(mfBlock.data(), h, i, j);
-            }
-#endif
-            stopTimer("IO: total");
-            blockSize  = static_cast<double>(nmom*ngamma*nt*N_ii*N_jj*sizeof(MF_IO_TYPE));
-            ioTime    += getDTimer("IO: write block");
-            LOG(Message) << "HDF5 IO done " << sizeString(blockSize) << " in "
-                         << ioTime  << " us (" 
-                         << blockSize/ioTime*1.0e6/1024/1024
-                         << " MB/s)" << std::endl;
-        }
-    }
-}
-
-// IO
-template <typename FImpl>
-std::string TA2AMesonField<FImpl>::ioname(unsigned int m, unsigned int g) const
-{
        std::stringstream ss;

        ss << gamma_[g] << "_";
@@ -406,28 +285,31 @@ std::string TA2AMesonField<FImpl>::ioname(unsigned int m, unsigned int g) const
        }

        return ss.str();
-}
+    };

-template <typename FImpl>
-std::string TA2AMesonField<FImpl>::filename(unsigned int m, unsigned int g) const
-{
-    return par().output + "." + std::to_string(vm().getTrajectory()) 
-           + "/" + ioname(m, g) + ".h5";
-}
-
-template <typename FImpl>
-void TA2AMesonField<FImpl>::saveBlock(const MF_IO_TYPE *data, IoHelper &h, 
-                                      unsigned int i, unsigned int j)
-{
-    if ((i == 0) and (j == 0))
+    auto filenameFn = [this, &ionameFn](const unsigned int m, const unsigned int g)
    {
-        startTimer("IO: file creation");
-        h.io.initFile(h.metadata);
-        stopTimer("IO: file creation");
+        return par().output + "." + std::to_string(vm().getTrajectory()) 
+               + "/" + ionameFn(m, g) + ".h5";
+    };
+
+    auto metadataFn = [this](const unsigned int m, const unsigned int g)
+    {
+        A2AMesonFieldMetadata md;
+
+        for (auto pmu: mom_[m])
+        {
+            md.momentum.push_back(pmu);
        }
-    startTimer("IO: write block");
-    h.io.saveBlock(data + h.offset, i, j);
-    stopTimer("IO: write block");
+        md.gamma = gamma_[g];
+        
+        return md;
+    };
+
+    Kernel      kernel(gamma_, ph, envGetGrid(FermionField));
+
+    envGetTmp(Computation, computation);
+    computation.execute(left, right, kernel, ionameFn, filenameFn, metadataFn);
 }

 END_MODULE_NAMESPACE
--- a/Hadrons/Modules/MContraction/A2AMesonFieldKernels.hpp
+++ b/Hadrons/Modules/MContraction/A2AMesonFieldKernels.hpp
@@ -1,224 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid 
-
-Source file: Hadrons/Modules/MContraction/A2AMesonFieldKernels.hpp
-
-Copyright (C) 2015-2018
-
-Author: Antonin Portelli <antonin.portelli@me.com>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-#ifndef Hadrons_MContraction_A2AMesonFieldKernels_hpp_
-#define Hadrons_MContraction_A2AMesonFieldKernels_hpp_
-
-#include <Hadrons/Global.hpp>
-#include <Hadrons/Module.hpp>
-#include <Grid/Eigen/unsupported/CXX11/Tensor>
-
-BEGIN_HADRONS_NAMESPACE
-
-BEGIN_MODULE_NAMESPACE(MContraction)
-
-////////////////////////////////////////////////////////////////////////////////
-// Cache blocked arithmetic routine
-// Could move to Grid ???
-////////////////////////////////////////////////////////////////////////////////
-template <typename Field, typename MesonField>
-void makeMesonFieldBlock(MesonField &mat, 
-                         const Field *lhs_wi,
-                         const Field *rhs_vj,
-                         std::vector<Gamma::Algebra> gamma,
-                         const std::vector<LatticeComplex> &mom,
-                         int orthogdim,
-                         ModuleBase *caller = nullptr) 
-{
-    typedef typename Field::vector_object vobj;
-    typedef typename vobj::scalar_object  sobj;
-    typedef typename vobj::scalar_type    scalar_type;
-    typedef typename vobj::vector_type    vector_type;
-
-    typedef iSpinMatrix<vector_type> SpinMatrix_v;
-    typedef iSpinMatrix<scalar_type> SpinMatrix_s;
-    
-    int Lblock = mat.dimension(3); 
-    int Rblock = mat.dimension(4);
-
-    GridBase *grid = lhs_wi[0]._grid;
-    
-    const int    Nd = grid->_ndimension;
-    const int Nsimd = grid->Nsimd();
-
-    int Nt     = grid->GlobalDimensions()[orthogdim];
-    int Ngamma = gamma.size();
-    int Nmom   = mom.size();
-
-    int fd=grid->_fdimensions[orthogdim];
-    int ld=grid->_ldimensions[orthogdim];
-    int rd=grid->_rdimensions[orthogdim];
-
-    // will locally sum vectors first
-    // sum across these down to scalars
-    // splitting the SIMD
-    int MFrvol = rd*Lblock*Rblock*Nmom;
-    int MFlvol = ld*Lblock*Rblock*Nmom;
-
-    Vector<SpinMatrix_v > lvSum(MFrvol);
-    parallel_for (int r = 0; r < MFrvol; r++)
-    {
-        lvSum[r] = zero;
-    }
-
-    Vector<SpinMatrix_s > lsSum(MFlvol);             
-    parallel_for (int r = 0; r < MFlvol; r++)
-    {
-        lsSum[r]=scalar_type(0.0);
-    }
-
-    int e1=    grid->_slice_nblock[orthogdim];
-    int e2=    grid->_slice_block [orthogdim];
-    int stride=grid->_slice_stride[orthogdim];
-
-    if (caller) caller->startTimer("contraction: colour trace & mom.");
-    // Nested parallelism would be ok
-    // Wasting cores here. Test case r
-    parallel_for(int r=0;r<rd;r++)
-    {
-        int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 
-
-        for(int n=0;n<e1;n++)
-        for(int b=0;b<e2;b++)
-        {
-            int ss= so+n*stride+b;
-
-            for(int i=0;i<Lblock;i++)
-            {
-                auto left = conjugate(lhs_wi[i]._odata[ss]);
-
-                for(int j=0;j<Rblock;j++)
-                {
-                    SpinMatrix_v vv;
-                    auto right = rhs_vj[j]._odata[ss];
-
-                    for(int s1=0;s1<Ns;s1++)
-                    for(int s2=0;s2<Ns;s2++)
-                    {
-                        vv()(s1,s2)() = left()(s2)(0) * right()(s1)(0)
-                                        + left()(s2)(1) * right()(s1)(1)
-                                        + left()(s2)(2) * right()(s1)(2);
-                    }
-                    
-                    // After getting the sitewise product do the mom phase loop
-                    int base = Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*r;
-
-                    for ( int m=0;m<Nmom;m++)
-                    {
-                        int idx = m+base;
-                        auto phase = mom[m]._odata[ss];
-                        mac(&lvSum[idx],&vv,&phase);
-                    }
-                }
-            }
-        }
-    }
-    if (caller) caller->stopTimer("contraction: colour trace & mom.");
-
-    // Sum across simd lanes in the plane, breaking out orthog dir.
-    if (caller) caller->startTimer("contraction: local space sum");
-    parallel_for(int rt=0;rt<rd;rt++)
-    {
-        std::vector<int> icoor(Nd);
-        std::vector<SpinMatrix_s> extracted(Nsimd);               
-
-        for(int i=0;i<Lblock;i++)
-        for(int j=0;j<Rblock;j++)
-        for(int m=0;m<Nmom;m++)
-        {
-
-            int ij_rdx = m+Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*rt;
-
-            extract(lvSum[ij_rdx],extracted);
-            for(int idx=0;idx<Nsimd;idx++)
-            {
-                grid->iCoorFromIindex(icoor,idx);
-
-                int ldx    = rt+icoor[orthogdim]*rd;
-                int ij_ldx = m+Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*ldx;
-
-                lsSum[ij_ldx]=lsSum[ij_ldx]+extracted[idx];
-            }
-        }
-    }
-    if (caller) caller->stopTimer("contraction: local space sum");
-
-    // ld loop and local only??
-    if (caller) caller->startTimer("contraction: spin trace");
-    int pd = grid->_processors[orthogdim];
-    int pc = grid->_processor_coor[orthogdim];
-    parallel_for_nest2(int lt=0;lt<ld;lt++)
-    {
-        for(int pt=0;pt<pd;pt++)
-        {
-            int t = lt + pt*ld;
-            if (pt == pc)
-            {
-                for(int i=0;i<Lblock;i++)
-                for(int j=0;j<Rblock;j++)
-                for(int m=0;m<Nmom;m++)
-                {
-                    int ij_dx = m+Nmom*i + Nmom*Lblock * j + Nmom*Lblock * Rblock * lt;
-
-                    for(int mu=0;mu<Ngamma;mu++)
-                    {
-                        // this is a bit slow
-                        mat(m,mu,t,i,j) = trace(lsSum[ij_dx]*Gamma(gamma[mu]));
-                    }
-                }
-            } 
-            else 
-            { 
-                const scalar_type zz(0.0);
-
-                for(int i=0;i<Lblock;i++)
-                for(int j=0;j<Rblock;j++)
-                for(int mu=0;mu<Ngamma;mu++)
-                for(int m=0;m<Nmom;m++)
-                {
-                    mat(m,mu,t,i,j) =zz;
-                }
-            }
-        }
-    }
-    if (caller) caller->stopTimer("contraction: spin trace");
-    ////////////////////////////////////////////////////////////////////
-    // This global sum is taking as much as 50% of time on 16 nodes
-    // Vector size is 7 x 16 x 32 x 16 x 16 x sizeof(complex) = 2MB - 60MB depending on volume
-    // Healthy size that should suffice
-    ////////////////////////////////////////////////////////////////////
-    if (caller) caller->startTimer("contraction: global sum");
-    grid->GlobalSumVector(&mat(0,0,0,0,0),Nmom*Ngamma*Nt*Lblock*Rblock);
-    if (caller) caller->stopTimer("contraction: global sum");
-}
-
-END_MODULE_NAMESPACE
-
-END_HADRONS_NAMESPACE
-
-#endif //Hadrons_MContraction_A2AMesonField_hpp_
--- a/Hadrons/Modules/MContraction/Meson.hpp
+++ b/Hadrons/Modules/MContraction/Meson.hpp
@@ -77,7 +77,7 @@ class TMeson: public Module<MesonPar>
 public:
    FERM_TYPE_ALIASES(FImpl1, 1);
    FERM_TYPE_ALIASES(FImpl2, 2);
-    FERM_TYPE_ALIASES(ScalarImplCR, Scalar);
+    BASIC_TYPE_ALIASES(ScalarImplCR, Scalar);
    SINK_TYPE_ALIASES(Scalar);
    class Result: Serializable
    {
--- a/Hadrons/Modules/MFermion/FreeProp.hpp
+++ b/Hadrons/Modules/MFermion/FreeProp.hpp
@@ -56,7 +56,7 @@ template <typename FImpl>
 class TFreeProp: public Module<FreePropPar>
 {
 public:
-    FG_TYPE_ALIASES(FImpl,);
+    FERM_TYPE_ALIASES(FImpl,);
 public:
    // constructor
    TFreeProp(const std::string name);
--- a/Hadrons/Modules/MFermion/GaugeProp.hpp
+++ b/Hadrons/Modules/MFermion/GaugeProp.hpp
@@ -56,7 +56,7 @@ template <typename FImpl>
 class TGaugeProp: public Module<GaugePropPar>
 {
 public:
-    FG_TYPE_ALIASES(FImpl,);
+    FERM_TYPE_ALIASES(FImpl,);
    SOLVER_TYPE_ALIASES(FImpl,);
 public:
    // constructor
--- a/Hadrons/Modules/MGauge/Random.cc
+++ b/Hadrons/Modules/MGauge/Random.cc
@@ -25,47 +25,10 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
-
 #include <Hadrons/Modules/MGauge/Random.hpp>

 using namespace Grid;
 using namespace Hadrons;
 using namespace MGauge;

-/******************************************************************************
-*                           TRandom implementation                            *
-******************************************************************************/
-// constructor /////////////////////////////////////////////////////////////////
-TRandom::TRandom(const std::string name)
-: Module<NoPar>(name)
-{}
-
-// dependencies/products ///////////////////////////////////////////////////////
-std::vector<std::string> TRandom::getInput(void)
-{
-    std::vector<std::string> in;
-    
-    return in;
-}
-
-std::vector<std::string> TRandom::getOutput(void)
-{
-    std::vector<std::string> out = {getName()};
-    
-    return out;
-}
-
-// setup ///////////////////////////////////////////////////////////////////////
-void TRandom::setup(void)
-{
-    envCreateLat(LatticeGaugeField, getName());
-}
-
-// execution ///////////////////////////////////////////////////////////////////
-void TRandom::execute(void)
-{
-    LOG(Message) << "Generating random gauge configuration" << std::endl;
-    
-    auto &U = envGet(LatticeGaugeField, getName());
-    SU3::HotConfiguration(rng4d(), U);
-}
+template class Grid::Hadrons::MGauge::TRandom<GIMPL>;
--- a/Hadrons/Modules/MGauge/Random.hpp
+++ b/Hadrons/Modules/MGauge/Random.hpp
@@ -40,8 +40,11 @@ BEGIN_HADRONS_NAMESPACE
 ******************************************************************************/
 BEGIN_MODULE_NAMESPACE(MGauge)

+template <typename GImpl>
 class TRandom: public Module<NoPar>
 {
+public:
+    GAUGE_TYPE_ALIASES(GImpl,);
 public:
    // constructor
    TRandom(const std::string name);
@@ -57,7 +60,50 @@ protected:
    virtual void execute(void);
 };

-MODULE_REGISTER(Random, TRandom, MGauge);
+MODULE_REGISTER_TMP(Random, TRandom<GIMPL>, MGauge);
+
+/******************************************************************************
+*                           TRandom implementation                            *
+******************************************************************************/
+// constructor /////////////////////////////////////////////////////////////////
+template <typename GImpl>
+TRandom<GImpl>::TRandom(const std::string name)
+: Module<NoPar>(name)
+{}
+
+// dependencies/products ///////////////////////////////////////////////////////
+template <typename GImpl>
+std::vector<std::string> TRandom<GImpl>::getInput(void)
+{
+    std::vector<std::string> in;
+    
+    return in;
+}
+
+template <typename GImpl>
+std::vector<std::string> TRandom<GImpl>::getOutput(void)
+{
+    std::vector<std::string> out = {getName()};
+    
+    return out;
+}
+
+// setup ///////////////////////////////////////////////////////////////////////
+template <typename GImpl>
+void TRandom<GImpl>::setup(void)
+{
+    envCreateLat(GaugeField, getName());
+}
+
+// execution ///////////////////////////////////////////////////////////////////
+template <typename GImpl>
+void TRandom<GImpl>::execute(void)
+{
+    LOG(Message) << "Generating random gauge configuration" << std::endl;
+    
+    auto &U = envGet(GaugeField, getName());
+    GImpl::HotConfiguration(rng4d(), U);
+}

 END_MODULE_NAMESPACE

--- a/Hadrons/Modules/MGauge/StoutSmearing.hpp
+++ b/Hadrons/Modules/MGauge/StoutSmearing.hpp
@@ -52,7 +52,7 @@ template <typename GImpl>
 class TStoutSmearing: public Module<StoutSmearingPar>
 {
 public:
-    typedef typename GImpl::Field GaugeField;
+    GAUGE_TYPE_ALIASES(GImpl,);
 public:
    // constructor
    TStoutSmearing(const std::string name);
--- a/Hadrons/Modules/MGauge/Unit.cc
+++ b/Hadrons/Modules/MGauge/Unit.cc
@@ -25,45 +25,10 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
-
 #include <Hadrons/Modules/MGauge/Unit.hpp>

 using namespace Grid;
 using namespace Hadrons;
 using namespace MGauge;

-/******************************************************************************
-*                            TUnit implementation                             *
-******************************************************************************/
-// constructor /////////////////////////////////////////////////////////////////
-TUnit::TUnit(const std::string name)
-: Module<NoPar>(name)
-{}
-
-// dependencies/products ///////////////////////////////////////////////////////
-std::vector<std::string> TUnit::getInput(void)
-{
-    return std::vector<std::string>();
-}
-
-std::vector<std::string> TUnit::getOutput(void)
-{
-    std::vector<std::string> out = {getName()};
-    
-    return out;
-}
-
-// setup ///////////////////////////////////////////////////////////////////////
-void TUnit::setup(void)
-{
-    envCreateLat(LatticeGaugeField, getName());
-}
-
-// execution ///////////////////////////////////////////////////////////////////
-void TUnit::execute(void)
-{
-    LOG(Message) << "Creating unit gauge configuration" << std::endl;
-    
-    auto &U = envGet(LatticeGaugeField, getName());
-    SU3::ColdConfiguration(rng4d(), U);
-}
+template class Grid::Hadrons::MGauge::TUnit<GIMPL>;
--- a/Hadrons/Modules/MGauge/Unit.hpp
+++ b/Hadrons/Modules/MGauge/Unit.hpp
@@ -40,8 +40,11 @@ BEGIN_HADRONS_NAMESPACE
 ******************************************************************************/
 BEGIN_MODULE_NAMESPACE(MGauge)

+template <typename GImpl>
 class TUnit: public Module<NoPar>
 {
+public:
+    GAUGE_TYPE_ALIASES(GImpl,);
 public:
    // constructor
    TUnit(const std::string name);
@@ -57,7 +60,48 @@ protected:
    virtual void execute(void);
 };

-MODULE_REGISTER(Unit, TUnit, MGauge);
+MODULE_REGISTER_TMP(Unit, TUnit<GIMPL>, MGauge);
+
+/******************************************************************************
+*                            TUnit implementation                             *
+******************************************************************************/
+// constructor /////////////////////////////////////////////////////////////////
+template <typename GImpl>
+TUnit<GImpl>::TUnit(const std::string name)
+: Module<NoPar>(name)
+{}
+
+// dependencies/products ///////////////////////////////////////////////////////
+template <typename GImpl>
+std::vector<std::string> TUnit<GImpl>::getInput(void)
+{
+    return std::vector<std::string>();
+}
+
+template <typename GImpl>
+std::vector<std::string> TUnit<GImpl>::getOutput(void)
+{
+    std::vector<std::string> out = {getName()};
+    
+    return out;
+}
+
+// setup ///////////////////////////////////////////////////////////////////////
+template <typename GImpl>
+void TUnit<GImpl>::setup(void)
+{
+    envCreateLat(GaugeField, getName());
+}
+
+// execution ///////////////////////////////////////////////////////////////////
+template <typename GImpl>
+void TUnit<GImpl>::execute(void)
+{
+    LOG(Message) << "Creating unit gauge configuration" << std::endl;
+    
+    auto &U = envGet(GaugeField, getName());
+    GImpl::ColdConfiguration(rng4d(), U);
+}

 END_MODULE_NAMESPACE

--- a/Hadrons/Modules/MIO/LoadEigenPack.cc
+++ b/Hadrons/Modules/MIO/LoadEigenPack.cc
@@ -32,4 +32,4 @@ using namespace Hadrons;
 using namespace MIO;

 template class Grid::Hadrons::MIO::TLoadEigenPack<FermionEigenPack<FIMPL>>;
-
+template class Grid::Hadrons::MIO::TLoadEigenPack<FermionEigenPack<FIMPL, FIMPLF>>;
--- a/Hadrons/Modules/MIO/LoadEigenPack.hpp
+++ b/Hadrons/Modules/MIO/LoadEigenPack.hpp
@@ -54,7 +54,9 @@ template <typename Pack>
 class TLoadEigenPack: public Module<LoadEigenPackPar>
 {
 public:
-    typedef EigenPack<typename Pack::Field> BasePack;
+    typedef typename Pack::Field   Field;
+    typedef typename Pack::FieldIo FieldIo;
+    typedef BaseEigenPack<Field>   BasePack;
 public:
    // constructor
    TLoadEigenPack(const std::string name);
@@ -70,6 +72,7 @@ public:
 };

 MODULE_REGISTER_TMP(LoadFermionEigenPack, TLoadEigenPack<FermionEigenPack<FIMPL>>, MIO);
+MODULE_REGISTER_TMP(LoadFermionEigenPackIo32, ARG(TLoadEigenPack<FermionEigenPack<FIMPL, FIMPLF>>), MIO);

 /******************************************************************************
 *                    TLoadEigenPack implementation                           *
@@ -101,9 +104,14 @@ std::vector<std::string> TLoadEigenPack<Pack>::getOutput(void)
 template <typename Pack>
 void TLoadEigenPack<Pack>::setup(void)
 {
-    env().createGrid(par().Ls);
+    GridBase *gridIo = nullptr;
+
+    if (typeHash<Field>() != typeHash<FieldIo>())
+    {
+        gridIo = envGetRbGrid(FieldIo, par().Ls);
+    }
    envCreateDerived(BasePack, Pack, getName(), par().Ls, par().size, 
-                     env().getRbGrid(par().Ls));
+                     envGetRbGrid(Field, par().Ls), gridIo);
 }

 // execution ///////////////////////////////////////////////////////////////////
--- a/Hadrons/Modules/MIO/LoadNersc.cc
+++ b/Hadrons/Modules/MIO/LoadNersc.cc
@@ -31,44 +31,4 @@ using namespace Grid;
 using namespace Hadrons;
 using namespace MIO;

-/******************************************************************************
-*                       TLoadNersc implementation                             *
-******************************************************************************/
-// constructor /////////////////////////////////////////////////////////////////
-TLoadNersc::TLoadNersc(const std::string name)
-: Module<LoadNerscPar>(name)
-{}
-
-// dependencies/products ///////////////////////////////////////////////////////
-std::vector<std::string> TLoadNersc::getInput(void)
-{
-    std::vector<std::string> in;
-    
-    return in;
-}
-
-std::vector<std::string> TLoadNersc::getOutput(void)
-{
-    std::vector<std::string> out = {getName()};
-    
-    return out;
-}
-
-// setup ///////////////////////////////////////////////////////////////////////
-void TLoadNersc::setup(void)
-{
-    envCreateLat(LatticeGaugeField, getName());
-}
-
-// execution ///////////////////////////////////////////////////////////////////
-void TLoadNersc::execute(void)
-{
-    FieldMetaData header;
-    std::string   fileName = par().file + "."
-                             + std::to_string(vm().getTrajectory());
-    LOG(Message) << "Loading NERSC configuration from file '" << fileName
-                 << "'" << std::endl;
-
-    auto &U = envGet(LatticeGaugeField, getName());
-    NerscIO::readConfiguration(U, header, fileName);
-}
+template class Grid::Hadrons::MIO::TLoadNersc<GIMPL>;
--- a/Hadrons/Modules/MIO/LoadNersc.hpp
+++ b/Hadrons/Modules/MIO/LoadNersc.hpp
@@ -46,8 +46,11 @@ public:
                                    std::string, file);
 };

+template <typename GImpl>
 class TLoadNersc: public Module<LoadNerscPar>
 {
+public:
+    GAUGE_TYPE_ALIASES(GImpl,);
 public:
    // constructor
    TLoadNersc(const std::string name);
@@ -62,7 +65,54 @@ public:
    virtual void execute(void);
 };

-MODULE_REGISTER(LoadNersc, TLoadNersc, MIO);
+MODULE_REGISTER_TMP(LoadNersc,  TLoadNersc<GIMPL>,  MIO);
+
+/******************************************************************************
+*                       TLoadNersc implementation                             *
+******************************************************************************/
+// constructor /////////////////////////////////////////////////////////////////
+template <typename GImpl>
+TLoadNersc<GImpl>::TLoadNersc(const std::string name)
+: Module<LoadNerscPar>(name)
+{}
+
+// dependencies/products ///////////////////////////////////////////////////////
+template <typename GImpl>
+std::vector<std::string> TLoadNersc<GImpl>::getInput(void)
+{
+    std::vector<std::string> in;
+    
+    return in;
+}
+
+template <typename GImpl>
+std::vector<std::string> TLoadNersc<GImpl>::getOutput(void)
+{
+    std::vector<std::string> out = {getName()};
+    
+    return out;
+}
+
+// setup ///////////////////////////////////////////////////////////////////////
+template <typename GImpl>
+void TLoadNersc<GImpl>::setup(void)
+{
+    envCreateLat(GaugeField, getName());
+}
+
+// execution ///////////////////////////////////////////////////////////////////
+template <typename GImpl>
+void TLoadNersc<GImpl>::execute(void)
+{
+    FieldMetaData header;
+    std::string   fileName = par().file + "."
+                             + std::to_string(vm().getTrajectory());
+    LOG(Message) << "Loading NERSC configuration from file '" << fileName
+                 << "'" << std::endl;
+
+    auto &U = envGet(GaugeField, getName());
+    NerscIO::readConfiguration(U, header, fileName);
+}

 END_MODULE_NAMESPACE

--- a/Hadrons/Modules/MNoise/TimeDilutedSpinColorDiagonal.hpp
+++ b/Hadrons/Modules/MNoise/TimeDilutedSpinColorDiagonal.hpp
@@ -43,6 +43,8 @@ BEGIN_MODULE_NAMESPACE(MNoise)
 template <typename FImpl>
 class TTimeDilutedSpinColorDiagonal: public Module<NoPar>
 {
+public:
+    FERM_TYPE_ALIASES(FImpl,);
 public:
    // constructor
    TTimeDilutedSpinColorDiagonal(const std::string name);
@@ -92,7 +94,7 @@ void TTimeDilutedSpinColorDiagonal<FImpl>::setup(void)
 {
    envCreateDerived(DilutedNoise<FImpl>, 
                     TimeDilutedSpinColorDiagonalNoise<FImpl>,
-                     getName(), 1, env().getGrid());
+                     getName(), 1, envGetGrid(FermionField));
 }

 // execution ///////////////////////////////////////////////////////////////////
--- a/Hadrons/Modules/MScalar/ChargedProp.hpp
+++ b/Hadrons/Modules/MScalar/ChargedProp.hpp
@@ -55,7 +55,7 @@ public:
 class TChargedProp: public Module<ChargedPropPar>
 {
 public:
-    SCALAR_TYPE_ALIASES(SIMPL,);
+    BASIC_TYPE_ALIASES(SIMPL,);
    typedef PhotonR::GaugeField     EmField;
    typedef PhotonR::GaugeLinkField EmComp;
    class Result: Serializable
--- a/Hadrons/Modules/MScalar/FreeProp.hpp
+++ b/Hadrons/Modules/MScalar/FreeProp.hpp
@@ -51,7 +51,7 @@ public:
 class TFreeProp: public Module<FreePropPar>
 {
 public:
-    SCALAR_TYPE_ALIASES(SIMPL,);
+    BASIC_TYPE_ALIASES(SIMPL,);
 public:
    // constructor
    TFreeProp(const std::string name);
--- a/Hadrons/Modules/MScalar/ScalarVP.hpp
+++ b/Hadrons/Modules/MScalar/ScalarVP.hpp
@@ -53,7 +53,7 @@ public:
 class TScalarVP: public Module<ScalarVPPar>
 {
 public:
-    SCALAR_TYPE_ALIASES(SIMPL,);
+    BASIC_TYPE_ALIASES(SIMPL,);
    typedef PhotonR::GaugeField     EmField;
    typedef PhotonR::GaugeLinkField EmComp;
    class Result: Serializable
--- a/Hadrons/Modules/MScalar/VPCounterTerms.hpp
+++ b/Hadrons/Modules/MScalar/VPCounterTerms.hpp
@@ -53,7 +53,7 @@ public:
 class TVPCounterTerms: public Module<VPCounterTermsPar>
 {
 public:
-    SCALAR_TYPE_ALIASES(SIMPL,);
+    BASIC_TYPE_ALIASES(SIMPL,);
    class Result: Serializable
    {
    public:
--- a/Hadrons/Modules/MScalarSUN/StochFreeField.hpp
+++ b/Hadrons/Modules/MScalarSUN/StochFreeField.hpp
@@ -132,7 +132,7 @@ void TStochFreeField<SImpl>::execute(void)
    auto    &w                = envGet(ComplexField, "_" + getName() + "_weight");
    auto    &rng              = rng4d();
    double  trphi2;
-    FFT     fft(env().getGrid());
+    FFT     fft(envGetGrid(Field));
    Integer vol;

    vol = 1;
@@ -169,11 +169,6 @@ void TStochFreeField<SImpl>::execute(void)
    phi = 0.5*(phi - adj(phi));
    trphi2 = -TensorRemove(sum(trace(phi*phi))).real()/vol;
    LOG(Message) << "tr(phi^2)= " << trphi2 << std::endl;
-
-    // ComplexField phi2(env().getGrid());
-
-    // phi2=trace(phi*phi);
-    // std::cout << phi2 << std::endl;
 }

 END_MODULE_NAMESPACE
--- a/Hadrons/Modules/MScalarSUN/TimeMomProbe.hpp
+++ b/Hadrons/Modules/MScalarSUN/TimeMomProbe.hpp
@@ -146,7 +146,7 @@ void TTimeMomProbe<SImpl>::execute(void)
    std::set<std::vector<int>>                   timeMomSet;
    std::vector<std::vector<std::vector<int>>>   timeMom;
    std::vector<std::vector<int>>                transferMom;
-    FFT                                          fft(env().getGrid());
+    FFT                                          fft(envGetGrid(Field));
    std::vector<int>                             dMask(nd, 1);
    std::vector<TimeMomProbeResult>              result;
    std::map<std::string, std::vector<SlicedOp>> slicedOp;
--- a/Hadrons/Modules/MScalarSUN/TrKinetic.hpp
+++ b/Hadrons/Modules/MScalarSUN/TrKinetic.hpp
@@ -126,7 +126,7 @@ void TTrKinetic<SImpl>::setup(void)
        envCreateLat(ComplexField, varName(getName(), mu, nu));
    }
    envCreateLat(ComplexField, varName(getName(), "sum"));
-    envTmp(std::vector<Field>, "der", 1, env().getNd(), env().getGrid());
+    envTmp(std::vector<Field>, "der", 1, env().getNd(), envGetGrid(Field));
 }

 // execution ///////////////////////////////////////////////////////////////////
--- a/Hadrons/Modules/MScalarSUN/TwoPoint.hpp
+++ b/Hadrons/Modules/MScalarSUN/TwoPoint.hpp
@@ -168,7 +168,7 @@ void TTwoPoint<SImpl>::execute(void)
    std::set<std::string>                        ops;
    std::vector<TwoPointResult>                  result;
    std::map<std::string, std::vector<SlicedOp>> slicedOp;
-    FFT                                          fft(env().getGrid());
+    FFT                                          fft(envGetGrid(Field));
    TComplex                                     buf;

    envGetTmp(ComplexField, ftBuf);
--- a/Hadrons/Modules/MScalarSUN/TwoPointNPR.hpp
+++ b/Hadrons/Modules/MScalarSUN/TwoPointNPR.hpp
@@ -136,7 +136,7 @@ void TTwoPointNPR<SImpl>::execute(void)
    const unsigned int             nd   = env().getNd();
    const unsigned int             nl   = env().getDim(0);
    const Real                     invV = 1./env().getVolume();
-    FFT                            fft(env().getGrid());
+    FFT                            fft(envGetGrid(Field));
    std::vector<TwoPointNPRResult> result;
    TwoPointNPRResult              twoPtp1, twoPtp2, twoPtDisc;
    auto                           &phi    = envGet(Field, par().field);
--- a/Hadrons/Modules/MSink/Point.hpp
+++ b/Hadrons/Modules/MSink/Point.hpp
@@ -53,7 +53,7 @@ template <typename FImpl>
 class TPoint: public Module<PointPar>
 {
 public:
-    FERM_TYPE_ALIASES(FImpl,);
+    BASIC_TYPE_ALIASES(FImpl,);
    SINK_TYPE_ALIASES();
 public:
    // constructor
@@ -132,7 +132,7 @@ void TPoint<FImpl>::execute(void)
        for(unsigned int mu = 0; mu < p.size(); mu++)
        {
            LatticeCoordinate(coor, mu);
-            ph = ph + (p[mu]/env().getGrid()->_fdimensions[mu])*coor;
+            ph = ph + (p[mu]/env().getDim(mu))*coor;
        }
        ph = exp((Real)(2*M_PI)*i*ph);
        hasPhase_ = true;
--- a/Hadrons/Modules/MSolver/A2AVectors.cc
+++ b/Hadrons/Modules/MSolver/A2AVectors.cc
@@ -32,5 +32,5 @@ using namespace Grid;
 using namespace Hadrons;
 using namespace MSolver;

-template class Grid::Hadrons::MSolver::TA2AVectors<FIMPL, FermionEigenPack<FIMPL>>;
-template class Grid::Hadrons::MSolver::TA2AVectors<ZFIMPL, FermionEigenPack<ZFIMPL>>;
+template class Grid::Hadrons::MSolver::TA2AVectors<FIMPL, BaseFermionEigenPack<FIMPL>>;
+template class Grid::Hadrons::MSolver::TA2AVectors<ZFIMPL, BaseFermionEigenPack<ZFIMPL>>;
--- a/Hadrons/Modules/MSolver/A2AVectors.hpp
+++ b/Hadrons/Modules/MSolver/A2AVectors.hpp
@@ -79,9 +79,9 @@ private:
 };

 MODULE_REGISTER_TMP(A2AVectors, 
-    ARG(TA2AVectors<FIMPL, FermionEigenPack<FIMPL>>), MSolver);
+    ARG(TA2AVectors<FIMPL, BaseFermionEigenPack<FIMPL>>), MSolver);
 MODULE_REGISTER_TMP(ZA2AVectors, 
-    ARG(TA2AVectors<ZFIMPL, FermionEigenPack<ZFIMPL>>), MSolver);
+    ARG(TA2AVectors<ZFIMPL, BaseFermionEigenPack<ZFIMPL>>), MSolver);

 /******************************************************************************
 *                       TA2AVectors implementation                           *
@@ -135,9 +135,9 @@ void TA2AVectors<FImpl, Pack>::setup(void)
        Nl_ = epack.evec.size();
    }
    envCreate(std::vector<FermionField>, getName() + "_v", 1, 
-              Nl_ + noise.size(), FermionField(env().getGrid()));
+              Nl_ + noise.size(), envGetGrid(FermionField));
    envCreate(std::vector<FermionField>, getName() + "_w", 1, 
-              Nl_ + noise.size(), FermionField(env().getGrid()));
+              Nl_ + noise.size(), envGetGrid(FermionField));
    if (Ls > 1)
    {
        envTmpLat(FermionField, "f5", Ls);
--- a/Hadrons/Modules/MSolver/Guesser.hpp
+++ b/Hadrons/Modules/MSolver/Guesser.hpp
@@ -39,7 +39,7 @@ std::shared_ptr<LinearFunction<typename FImpl::FermionField>>
 makeGuesser(const std::string epackName)
 {
    typedef typename FImpl::FermionField                  FermionField;
-    typedef FermionEigenPack<FImpl>                       EPack;
+    typedef BaseFermionEigenPack<FImpl>                   EPack;
    typedef CoarseFermionEigenPack<FImpl, nBasis>         CoarseEPack;
    typedef DeflatedGuesser<FermionField>                 FineGuesser;
    typedef LocalCoherenceDeflatedGuesser<
--- a/Hadrons/Modules/MSolver/LocalCoherenceLanczos.hpp
+++ b/Hadrons/Modules/MSolver/LocalCoherenceLanczos.hpp
@@ -63,7 +63,7 @@ public:
    typedef LocalCoherenceLanczos<typename FImpl::SiteSpinor, 
                                  typename FImpl::SiteComplex, 
                                  nBasis>                LCL;
-    typedef FermionEigenPack<FImpl>                      BasePack;
+    typedef BaseFermionEigenPack<FImpl>                  BasePack;
    typedef CoarseFermionEigenPack<FImpl, nBasis>        CoarsePack;
    typedef HADRONS_DEFAULT_SCHUR_OP<FMat, FermionField> SchurFMat;
 public:
--- a/Hadrons/Modules/MSolver/MixedPrecisionRBPrecCG.hpp
+++ b/Hadrons/Modules/MSolver/MixedPrecisionRBPrecCG.hpp
@@ -58,8 +58,8 @@ template <typename FImplInner, typename FImplOuter, int nBasis>
 class TMixedPrecisionRBPrecCG: public Module<MixedPrecisionRBPrecCGPar>
 {
 public:
-    FG_TYPE_ALIASES(FImplInner, Inner);
-    FG_TYPE_ALIASES(FImplOuter, Outer);
+    FERM_TYPE_ALIASES(FImplInner, Inner);
+    FERM_TYPE_ALIASES(FImplOuter, Outer);
    SOLVER_TYPE_ALIASES(FImplOuter,);
    typedef HADRONS_DEFAULT_SCHUR_OP<FMatInner, FermionFieldInner> SchurFMatInner;
    typedef HADRONS_DEFAULT_SCHUR_OP<FMatOuter, FermionFieldOuter> SchurFMatOuter;
@@ -170,7 +170,7 @@ void TMixedPrecisionRBPrecCG<FImplInner, FImplOuter, nBasis>
            MixedPrecisionConjugateGradient<FermionFieldOuter, FermionFieldInner> 
                mpcg(par().residual, par().maxInnerIteration, 
                     par().maxOuterIteration, 
-                     env().template getGrid<VTypeInner>(Ls),
+                     env().template getRbGrid<VTypeInner>(Ls),
                     simat, somat);
            OperatorFunctionWrapper<FermionFieldOuter> wmpcg(mpcg);
            HADRONS_DEFAULT_SCHUR_SOLVE<FermionFieldOuter> schurSolver(wmpcg);
--- a/Hadrons/Modules/MSolver/RBPrecCG.hpp
+++ b/Hadrons/Modules/MSolver/RBPrecCG.hpp
@@ -58,7 +58,7 @@ template <typename FImpl, int nBasis>
 class TRBPrecCG: public Module<RBPrecCGPar>
 {
 public:
-    FG_TYPE_ALIASES(FImpl,);
+    FERM_TYPE_ALIASES(FImpl,);
    SOLVER_TYPE_ALIASES(FImpl,);
 public:
    // constructor
--- a/Hadrons/Modules/MSource/Point.hpp
+++ b/Hadrons/Modules/MSource/Point.hpp
@@ -63,7 +63,7 @@ template <typename FImpl>
 class TPoint: public Module<PointPar>
 {
 public:
-    FERM_TYPE_ALIASES(FImpl,);
+    BASIC_TYPE_ALIASES(FImpl,);
 public:
    // constructor
    TPoint(const std::string name);
@@ -126,6 +126,11 @@ void TPoint<FImpl>::execute(void)
    auto             &src     = envGet(PropagatorField, getName());
    SitePropagator   id;
    
+    if (position.size() != env().getNd())
+    {
+        HADRONS_ERROR(Size, "position has " + std::to_string(position.size())
+                      + " components (must have " + std::to_string(env().getNd()) + ")");
+    }
    id  = 1.;
    src = zero;
    pokeSite(id, src, position);
--- a/Hadrons/Modules/MSource/SeqConserved.hpp
+++ b/Hadrons/Modules/MSource/SeqConserved.hpp
@@ -187,7 +187,7 @@ void TSeqConserved<FImpl>::execute(void)
        for(unsigned int mu = 0; mu < env().getNd(); mu++)
        {
            LatticeCoordinate(coor, mu);
-            mom_phase = mom_phase + (mom[mu]/env().getGrid()->_fdimensions[mu])*coor;
+            mom_phase = mom_phase + (mom[mu]/env().getDim(mu))*coor;
        }
        mom_phase = exp((Real)(2*M_PI)*i*mom_phase);
        SeqhasPhase_ = true;
--- a/Hadrons/Modules/MSource/SeqGamma.hpp
+++ b/Hadrons/Modules/MSource/SeqGamma.hpp
@@ -71,7 +71,7 @@ template <typename FImpl>
 class TSeqGamma: public Module<SeqGammaPar>
 {
 public:
-    FG_TYPE_ALIASES(FImpl,);
+    FERM_TYPE_ALIASES(FImpl,);
 public:
    // constructor
    TSeqGamma(const std::string name);
@@ -125,7 +125,7 @@ template <typename FImpl>
 void TSeqGamma<FImpl>::setup(void)
 {
    envCreateLat(PropagatorField, getName());
-    envCacheLat(Lattice<iScalar<vInteger>>, tName_);
+    envCache(Lattice<iScalar<vInteger>>, tName_, 1, envGetGrid(LatticeComplex));
    envCacheLat(LatticeComplex, momphName_);
    envTmpLat(LatticeComplex, "coor");
 }
@@ -162,7 +162,7 @@ void TSeqGamma<FImpl>::execute(void)
        for(unsigned int mu = 0; mu < env().getNd(); mu++)
        {
            LatticeCoordinate(coor, mu);
-            ph = ph + (p[mu]/env().getGrid()->_fdimensions[mu])*coor;
+            ph = ph + (p[mu]/env().getDim(mu))*coor;
        }
        ph = exp((Real)(2*M_PI)*i*ph);
        LatticeCoordinate(t, Tp);
--- a/Hadrons/Modules/MSource/Wall.hpp
+++ b/Hadrons/Modules/MSource/Wall.hpp
@@ -143,7 +143,7 @@ void TWall<FImpl>::execute(void)
        for(unsigned int mu = 0; mu < env().getNd(); mu++)
        {
            LatticeCoordinate(coor, mu);
-            ph = ph + (p[mu]/env().getGrid()->_fdimensions[mu])*coor;
+            ph = ph + (p[mu]/env().getDim(mu))*coor;
        }
        ph = exp((Real)(2*M_PI)*i*ph);
        LatticeCoordinate(t, Tp);
--- a/Hadrons/Modules/MSource/Z2.hpp
+++ b/Hadrons/Modules/MSource/Z2.hpp
@@ -66,7 +66,7 @@ template <typename FImpl>
 class TZ2: public Module<Z2Par>
 {
 public:
-    FERM_TYPE_ALIASES(FImpl,);
+    BASIC_TYPE_ALIASES(FImpl,);
 public:
    // constructor
    TZ2(const std::string name);
@@ -120,7 +120,7 @@ template <typename FImpl>
 void TZ2<FImpl>::setup(void)
 {
    envCreateLat(PropagatorField, getName());
-    envCacheLat(Lattice<iScalar<vInteger>>, tName_);
+    envCache(Lattice<iScalar<vInteger>>, tName_, 1, envGetGrid(LatticeComplex));
    envTmpLat(LatticeComplex, "eta");
 }

--- a/Hadrons/Modules/MUtilities/PrecisionCast.cc
+++ b/Hadrons/Modules/MUtilities/PrecisionCast.cc
@@ -0,0 +1,35 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MUtilities/PrecisionCast.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MUtilities/PrecisionCast.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MUtilities;
+
+template class Grid::Hadrons::MUtilities::TPrecisionCast<GIMPLD::GaugeField, GIMPLF::GaugeField>;
+template class Grid::Hadrons::MUtilities::TPrecisionCast<FIMPLD::FermionField, FIMPLF::FermionField>;
--- a/Hadrons/Modules/MUtilities/PrecisionCast.hpp
+++ b/Hadrons/Modules/MUtilities/PrecisionCast.hpp
@@ -0,0 +1,124 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MUtilities/PrecisionCast.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef Hadrons_MUtilities_PrecisionCast_hpp_
+#define Hadrons_MUtilities_PrecisionCast_hpp_
+
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *                          Precision cast module                             *
+ ******************************************************************************/
+BEGIN_MODULE_NAMESPACE(MUtilities)
+
+class PrecisionCastPar: Serializable
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(PrecisionCastPar,
+                                    std::string, field);
+};
+
+template <typename FieldIn, typename FieldOut>
+class TPrecisionCast: public Module<PrecisionCastPar>
+{
+public:
+    // constructor
+    TPrecisionCast(const std::string name);
+    // destructor
+    virtual ~TPrecisionCast(void) {};
+    // dependency relation
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+    // setup
+    virtual void setup(void);
+    // execution
+    virtual void execute(void);
+};
+
+MODULE_REGISTER_TMP(GaugeSinglePrecisionCast, 
+                    ARG(TPrecisionCast<GIMPLD::GaugeField, GIMPLF::GaugeField>),
+                    MUtilities);
+MODULE_REGISTER_TMP(FermionSinglePrecisionCast, 
+                    ARG(TPrecisionCast<FIMPLD::FermionField, FIMPLF::FermionField>),
+                    MUtilities);
+
+/******************************************************************************
+ *                     TPrecisionCast implementation                          *
+ ******************************************************************************/
+// constructor /////////////////////////////////////////////////////////////////
+template <typename FieldIn, typename FieldOut>
+TPrecisionCast<FieldIn, FieldOut>::TPrecisionCast(const std::string name)
+: Module<PrecisionCastPar>(name)
+{}
+
+// dependencies/products ///////////////////////////////////////////////////////
+template <typename FieldIn, typename FieldOut>
+std::vector<std::string> TPrecisionCast<FieldIn, FieldOut>::getInput(void)
+{
+    std::vector<std::string> in = {par().field};
+    
+    return in;
+}
+
+template <typename FieldIn, typename FieldOut>
+std::vector<std::string> TPrecisionCast<FieldIn, FieldOut>::getOutput(void)
+{
+    std::vector<std::string> out = {getName()};
+    
+    return out;
+}
+
+// setup ///////////////////////////////////////////////////////////////////////
+template <typename FieldIn, typename FieldOut>
+void TPrecisionCast<FieldIn, FieldOut>::setup(void)
+{
+    envCreateLat(FieldOut, getName());
+}
+
+// execution ///////////////////////////////////////////////////////////////////
+template <typename FieldIn, typename FieldOut>
+void TPrecisionCast<FieldIn, FieldOut>::execute(void)
+{
+    LOG(Message) << "Casting field '" << par().field << "'" << std::endl;
+    LOG(Message) << "In  type: " << typeName<FieldIn>() << std::endl;
+    LOG(Message) << "Out type: " << typeName<FieldOut>() << std::endl;
+
+    auto &in  = envGet(FieldIn,  par().field);
+    auto &out = envGet(FieldOut, getName());
+
+    precisionChange(out, in);
+}
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_MUtilities_PrecisionCast_hpp_
--- a/Hadrons/Modules/MUtilities/RandomVectors.hpp
+++ b/Hadrons/Modules/MUtilities/RandomVectors.hpp
@@ -96,8 +96,15 @@ std::vector<std::string> TRandomVectors<Field>::getOutput(void)
 template <typename Field>
 void TRandomVectors<Field>::setup(void)
 {
+    if (par().Ls > 1)
+    {
        envCreate(std::vector<Field>, getName(), par().Ls, par().size, 
-              env().getGrid(par().Ls));
+                  envGetGrid(Field, par().Ls));
+    }
+    else
+    {
+        envCreate(std::vector<Field>, getName(), 1, par().size, envGetGrid(Field));
+    }
 }

 // execution ///////////////////////////////////////////////////////////////////
--- a/Hadrons/TimerArray.cc
+++ b/Hadrons/TimerArray.cc
@@ -0,0 +1,99 @@
+#include <Hadrons/TimerArray.hpp>
+
+using namespace Grid;
+using namespace QCD;
+using namespace Hadrons;
+
+void TimerArray::startTimer(const std::string &name)
+{
+    if (!name.empty())
+    {
+        timer_[name].Start();
+    }
+}
+
+GridTime TimerArray::getTimer(const std::string &name)
+{
+    GridTime t;
+    
+    if (!name.empty())
+    {
+        try
+        {
+            bool running = timer_.at(name).isRunning();
+
+            if (running) stopTimer(name);
+            t = timer_.at(name).Elapsed();
+            if (running) startTimer(name);
+        }
+        catch (std::out_of_range &)
+        {
+            t = GridTime::zero();
+        }
+    }
+    else
+    {
+        t = GridTime::zero();
+    }
+
+    return t;
+}
+
+double TimerArray::getDTimer(const std::string &name)
+{
+    return static_cast<double>(getTimer(name).count());
+}
+
+void TimerArray::startCurrentTimer(const std::string &name)
+{
+    if (!name.empty())
+    {
+        stopCurrentTimer();
+        startTimer(name);
+        currentTimer_ = name;
+    }
+}
+
+void TimerArray::stopTimer(const std::string &name)
+{
+    if (timer_.at(name).isRunning())
+    {
+        timer_.at(name).Stop();
+    }
+}
+
+void TimerArray::stopCurrentTimer(void)
+{
+    if (!currentTimer_.empty())
+    {
+        stopTimer(currentTimer_);
+        currentTimer_ = "";
+    }
+}
+
+void TimerArray::stopAllTimers(void)
+{
+    for (auto &t: timer_)
+    {
+        stopTimer(t.first);
+    }
+    currentTimer_ = "";
+}
+
+void TimerArray::resetTimers(void)
+{
+    timer_.clear();
+    currentTimer_ = "";
+}
+
+std::map<std::string, GridTime> TimerArray::getTimings(void)
+{
+    std::map<std::string, GridTime> timing;
+
+    for (auto &t: timer_)
+    {
+        timing[t.first] = t.second.Elapsed();
+    }
+
+    return timing;
+}
--- a/Hadrons/TimerArray.hpp
+++ b/Hadrons/TimerArray.hpp
@@ -0,0 +1,29 @@
+#ifndef Hadrons_TimerArray_hpp_
+#define Hadrons_TimerArray_hpp_
+
+#include <Hadrons/Global.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+class TimerArray
+{
+public:
+    TimerArray(void) = default;
+    virtual ~TimerArray(void) = default;
+    void                            startTimer(const std::string &name);
+    GridTime                        getTimer(const std::string &name);
+    double                          getDTimer(const std::string &name);
+    void                            startCurrentTimer(const std::string &name);
+    void                            stopTimer(const std::string &name);
+    void                            stopCurrentTimer(void);
+    void                            stopAllTimers(void);
+    void                            resetTimers(void);
+    std::map<std::string, GridTime> getTimings(void);
+private:
+    std::string                          currentTimer_;
+    std::map<std::string, GridStopWatch> timer_; 
+};
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_TimerArray_hpp_
--- a/Hadrons/Utilities/EigenPackCast.cc
+++ b/Hadrons/Utilities/EigenPackCast.cc
@@ -0,0 +1,164 @@
+#include <Hadrons/EigenPack.hpp>
+#include <Hadrons/Environment.hpp>
+
+using namespace Grid;
+using namespace QCD;
+using namespace Hadrons;
+
+template <typename FOut, typename FIn>
+void convert(const std::string outFilename, const std::string inFilename, 
+             const unsigned int Ls, const bool rb, const unsigned int size, 
+             const bool multiFile)
+{
+    assert(outFilename != inFilename);
+    
+    typedef EigenPack<FOut>            EPOut;
+    typedef EigenPack<FIn>             EPIn;
+    typedef typename FOut::vector_type VTypeOut;
+    typedef typename FIn::vector_type  VTypeIn;
+
+    std::shared_ptr<GridCartesian>         gInBase, gOutBase, gIn5, gOut5;
+    std::shared_ptr<GridRedBlackCartesian> rbgIn, rbgOut;
+    GridBase                               *gIn, *gOut;
+
+    auto         dim     = GridDefaultLatt();
+    unsigned int nd      = dim.size();
+    auto         simdOut = GridDefaultSimd(nd, VTypeOut::Nsimd());
+    auto         simdIn  = GridDefaultSimd(nd, VTypeIn::Nsimd());
+
+    gOutBase.reset(SpaceTimeGrid::makeFourDimGrid(dim, simdOut, GridDefaultMpi()));
+    gInBase.reset(SpaceTimeGrid::makeFourDimGrid(dim, simdIn, GridDefaultMpi()));
+    if (rb)
+    {
+        if (Ls > 1)
+        {
+            rbgOut.reset(SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, gOutBase.get()));
+            rbgIn.reset(SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, gInBase.get()));
+        }
+        else
+        {
+            rbgOut.reset(SpaceTimeGrid::makeFourDimRedBlackGrid(gOutBase.get()));
+            rbgIn.reset(SpaceTimeGrid::makeFourDimRedBlackGrid(gInBase.get()));
+        }
+        gOut = rbgOut.get();
+        gIn  = rbgIn.get();
+    }
+    else
+    {
+        if (Ls > 1)
+        {
+            gOut5.reset(SpaceTimeGrid::makeFiveDimGrid(Ls, gOutBase.get()));
+            gIn5.reset(SpaceTimeGrid::makeFiveDimGrid(Ls, gInBase.get()));
+            gOut = gOut5.get();
+            gIn  = gIn5.get();
+        }
+        else
+        {
+            gOut = gOutBase.get();
+            gIn  = gInBase.get();
+        }
+    }
+
+    FOut         bufOut(gOut);
+    FIn          bufIn(gIn), testIn(gIn);
+    ScidacWriter binWriter(gOut->IsBoss());
+    ScidacReader binReader;
+    PackRecord   record;
+    RealD        eval;
+
+    LOG(Message) << "==== EIGENPACK CONVERSION" << std::endl;
+    LOG(Message) << "Lattice       : " << gIn->GlobalDimensions() << std::endl;
+    LOG(Message) << "Checkerboarded: " << (rb ? "yes" : "no") << std::endl;
+    LOG(Message) << "In path       : " << inFilename  << std::endl;
+    LOG(Message) << "In type       : " << typeName<FIn>() << std::endl;
+    LOG(Message) << "Out path      : " << outFilename << std::endl;
+    LOG(Message) << "Out type      : " << typeName<FOut>() << std::endl;
+    LOG(Message) << "#vectors      : " << size << std::endl;
+    LOG(Message) << "Multifile     : " << (multiFile ? "yes" : "no") << std::endl;
+    if (multiFile)
+    {
+        for(unsigned int k = 0; k < size; ++k)
+        {
+            std::string  outV = outFilename + "/v" + std::to_string(k) + ".bin";
+            std::string  inV  = inFilename + "/v" + std::to_string(k) + ".bin";
+
+            LOG(Message) << "==== Converting vector " << k << std::endl;
+            LOG(Message) << "In : " << inV  << std::endl;
+            LOG(Message) << "Out: " << outV << std::endl;
+            makeFileDir(outV, gOut);
+            binWriter.open(outV);
+            binReader.open(inV);
+            EigenPackIo::readHeader(record, binReader);
+            EigenPackIo::writeHeader(binWriter, record);
+            EigenPackIo::readElement<FIn>(bufIn, eval, k, binReader);
+            EigenPackIo::writeElement<FIn, FOut>(binWriter, bufIn, eval, k, &bufOut, &testIn);
+            binWriter.close();
+            binReader.close();
+        }
+    }
+    else
+    {
+        makeFileDir(outFilename, gOut);
+        binWriter.open(outFilename);
+        binReader.open(inFilename);
+        EigenPackIo::readHeader(record, binReader);
+        EigenPackIo::writeHeader(binWriter, record);
+        for(unsigned int k = 0; k < size; ++k)
+        {
+            EigenPackIo::readElement<FIn>(bufIn, eval, k, binReader);
+            EigenPackIo::writeElement<FIn, FOut>(binWriter, bufIn, eval, k, &bufOut, &testIn);
+        }
+        binWriter.close();
+        binReader.close();
+    }
+}
+
+#ifndef FOUT
+#warning "FOUT undefined (set to WilsonImplF::FermionField by default)"
+#define FOUT WilsonImplF::FermionField
+#endif
+#ifndef FIN
+#warning "FIN undefined (set to WilsonImplD::FermionField by default)"
+#define FIN WilsonImplD::FermionField
+#endif
+
+int main(int argc, char *argv[])
+{
+    // parse command line
+    std::string  outFilename, inFilename;
+    unsigned int size, Ls;
+    bool         rb, multiFile;
+    
+    if (argc < 7)
+    {
+        std::cerr << "usage: " << argv[0] << " <out eigenpack> <in eigenpack> <Ls> <red-black (0|1)> <#vector> <multifile (0|1)> [Grid options]";
+        std::cerr << std::endl;
+        std::exit(EXIT_FAILURE);
+    }
+    outFilename = argv[1];
+    inFilename  = argv[2];
+    Ls          = std::stoi(std::string(argv[3]));
+    rb          = (std::string(argv[4]) != "0");
+    size        = std::stoi(std::string(argv[5]));
+    multiFile   = (std::string(argv[6]) != "0");
+    
+    // initialization
+    Grid_init(&argc, &argv);
+    initLogger();
+
+    // execution
+    try
+    {
+        convert<FOUT, FIN>(outFilename, inFilename, Ls, rb, size, multiFile);
+    }
+    catch (const std::exception& e)
+    {
+        Exceptions::abort(e);
+    }
+
+    // epilogue
+    LOG(Message) << "Grid is finalizing now" << std::endl;
+    Grid_finalize();
+    
+    return EXIT_SUCCESS;
+}
--- a/Hadrons/Utilities/HadronsXmlRun.cc
+++ b/Hadrons/Utilities/HadronsXmlRun.cc
--- a/Hadrons/Utilities/Makefile.am
+++ b/Hadrons/Utilities/Makefile.am
@@ -0,0 +1,8 @@
+bin_PROGRAMS = HadronsXmlRun HadronsFermionEP64To32
+
+HadronsXmlRun_SOURCES = HadronsXmlRun.cc
+HadronsXmlRun_LDADD   = ../libHadrons.a ../../Grid/libGrid.a
+
+HadronsFermionEP64To32_SOURCES  = EigenPackCast.cc
+HadronsFermionEP64To32_CXXFLAGS = $(AM_CXXFLAGS) -DFIN=WilsonImplD::FermionField -DFOUT=WilsonImplF::FermionField
+HadronsFermionEP64To32_LDADD    = ../libHadrons.a ../../Grid/libGrid.a
--- a/Hadrons/modules.inc
+++ b/Hadrons/modules.inc
@@ -4,6 +4,7 @@ modules_cc =\
  Modules/MContraction/Meson.cc \
  Modules/MContraction/WeakNeutral4ptDisc.cc \
  Modules/MContraction/WeakHamiltonianNonEye.cc \
+  Modules/MContraction/A2AAslashField.cc \
  Modules/MContraction/WardIdentity.cc \
  Modules/MContraction/A2AMesonField.cc \
  Modules/MContraction/DiscLoop.cc \
@@ -30,6 +31,7 @@ modules_cc =\
  Modules/MNoise/TimeDilutedSpinColorDiagonal.cc \
  Modules/MUtilities/RandomVectors.cc \
  Modules/MUtilities/TestSeqGamma.cc \
+  Modules/MUtilities/PrecisionCast.cc \
  Modules/MUtilities/TestSeqConserved.cc \
  Modules/MLoop/NoiseLoop.cc \
  Modules/MScalar/FreeProp.cc \
@@ -62,8 +64,8 @@ modules_cc =\

 modules_hpp =\
  Modules/MContraction/Baryon.hpp \
+  Modules/MContraction/A2AAslashField.hpp \
  Modules/MContraction/A2AMesonField.hpp \
-  Modules/MContraction/A2AMesonFieldKernels.hpp \
  Modules/MContraction/Meson.hpp \
  Modules/MContraction/WeakHamiltonian.hpp \
  Modules/MContraction/WeakHamiltonianNonEye.hpp \
@@ -93,6 +95,7 @@ modules_hpp =\
  Modules/MGauge/FundtoHirep.hpp \
  Modules/MGauge/StochEm.hpp \
  Modules/MNoise/TimeDilutedSpinColorDiagonal.hpp \
+  Modules/MUtilities/PrecisionCast.hpp \
  Modules/MUtilities/RandomVectors.hpp \
  Modules/MUtilities/TestSeqGamma.hpp \
  Modules/MUtilities/TestSeqConserved.hpp \
--- a/configure.ac
+++ b/configure.ac
@@ -562,6 +562,7 @@ AC_CONFIG_FILES(tests/qdpxx/Makefile)
 AC_CONFIG_FILES(tests/testu01/Makefile)
 AC_CONFIG_FILES(benchmarks/Makefile)
 AC_CONFIG_FILES(Hadrons/Makefile)
+AC_CONFIG_FILES(Hadrons/Utilities/Makefile)
 AC_OUTPUT

 echo ""
--- a/documentation/Grid.pdf
+++ b/documentation/Grid.pdf
--- a/documentation/_build/latex/Grid.pdf
+++ b/documentation/_build/latex/Grid.pdf
--- a/documentation/conf.py
+++ b/documentation/conf.py
@@ -80,7 +80,8 @@ primary_domain = 'cpp'
 # a list of builtin themes.
 #
 html_theme = 'alabaster'
-
+html_use_smartypants = False
+smart_quotes = False
 # Theme options are theme-specific and customize the look and feel of a theme
 # further.  For a list of options available for each theme, see the
 # documentation.
--- a/documentation/interfacing.rst
+++ b/documentation/interfacing.rst
@@ -0,0 +1,232 @@
+Interfacing with external software
+========================================
+
+Grid provides a number of important modules, such as solvers and
+eigensolvers, that are highly optimized for complex vector/SIMD
+architectures, such as the Intel Xeon Phi KNL and Skylake processors.
+This growing library, with appropriate interfacing, can be accessed
+from existing code. Here we describe interfacing issues and provide
+examples.
+
+	  
+MPI initialization
+--------------------
+
+Grid supports threaded MPI sends and receives and, if running with
+more than one thread, requires the MPI_THREAD_MULTIPLE mode of message
+passing. If the user initializes MPI before starting Grid, the
+appropriate initialization call is::
+
+  MPI_Init_thread(argc, argv, MPI_THREAD_MULTIPLE, &provided);
+  assert(MPI_THREAD_MULTIPLE == provided);
+
+Grid Initialization
+---------------------
+
+Grid itself is initialized with a call::
+
+  Grid_init(&argc, &argv);
+
+Command line options include::
+
+  --mpi n.n.n.n   : default MPI decomposition
+  --threads n     : default number of OMP threads
+  --grid n.n.n.n  : default Grid size
+  
+where `argc` and `argv` are constructed to simulate the command-line
+options described above.  At a minimum one usually provides the
+`--grid` and `--mpi` parameters.  The former specifies the lattice
+dimensions and the latter specifies the grid of processors (MPI
+ranks).  If these parameters are not specified with the `Grid_init`
+call, they need to be supplied later when creating Grid fields.
+
+The following Grid procedures are useful for verifying that Grid
+"default" values are properly initialized.
+
+=============================================================   ===========================================================================================================
+  Grid procedure                                                  returns 
+=============================================================   ===========================================================================================================
+  std::vector<int> GridDefaultLatt();                            lattice size
+  std::vector<int> GridDefaultSimd(int Nd,vComplex::Nsimd());    SIMD layout
+  std::vector<int> GridDefaultMpi();                             MPI layout
+  int Grid::GridThread::GetThreads();                            number of threads
+=============================================================   ===========================================================================================================
+
+
+MPI coordination
+----------------
+
+Grid wants to use its own numbering of MPI ranks and its own
+assignment of the lattice coordinates with each rank.  Obviously, the
+calling program and Grid must agree on these conventions.  One should
+use Grid's Cartesian communicator class to discover the processor
+assignments. For a four-dimensional processor grid one can define::
+
+  static Grid::CartesianCommunicator *grid_cart = NULL;
+  grid_cart = new Grid::CartesianCommunicator(processors);
+
+where `processors` is of type `std::vector<int>`, with values matching
+the MPI processor-layout dimensions specified with the `--mpi`
+argument in the `Grid_Init` call.  Then each MPI rank can obtain its
+processor coordinate using the Cartesian communicator instantiated
+above.  For example, in four dimensions::
+
+  std::vector<int> pePos(4);    
+  for(int i=0; i<4; i++)
+     pePos[i] = grid_cart->_processor_coor[i];
+
+and each MPI process can get its world rank from its processor
+coordinates using::
+
+  int peRank = grid_cart->RankFromProcessorCoor(pePos)
+	  
+Conversely, each MPI process can get its processor coordinates from
+its world rank using::
+
+  grid_cart->ProcessorCoorFromRank(peRank, pePos);
+
+If the calling program initialized MPI before initializing Grid, it is
+then important for each MPI process in the calling program to reset
+its rank number so it agrees with Grid::
+
+   MPI_Comm comm;
+   MPI_Comm_split(MPI_COMM_THISJOB,jobid,peRank,&comm);
+   MPI_COMM_THISJOB = comm;
+
+where `MPI_COMM_THISJOB` is initially a copy of `MPI_COMM_WORLD` (with
+`jobid = 0`), or it is a split communicator with `jobid` equal to the
+index number of the subcommunicator.  Once this is done,::
+
+  MPI_Comm_rank(MPI_COMM_THISJOB, &myrank);
+
+returns a rank that agrees with Grid's `peRank`.
+
+QMP coordination
+----------------
+
+If the calling program uses the SciDAC QMP message-passing package, a
+call to QMP_comm_split() instead can be used to reassign the ranks.
+In the example below, `peGrid` gives the processor-grid dimensions,
+usually set on the command line with `-qmp-geom`.
+
+**Example**::
+  
+  int NDIM = QMP_get_allocated_number_of_dimensions();
+  Grid::Grid_init(argc,argv);
+  FgridBase::grid_initted=true;
+  std::vector<int> processors;
+  for(int i=0;i<NDIM;i++) processors.push_back(peGrid[i]);
+  Grid::CartesianCommunicator grid_cart(processors);
+  std::vector<int> pePos(NDIM);
+  for(int i=NDIM-1;i>=0;i--)
+     pePos[i] = grid_cart._processor_coor[i];
+  int peRank = grid_cart->RankFromProcessorCoor(pePos);
+  QMP_comm_split(QMP_comm_get_default(),0,peRank,&qmp_comm);
+  QMP_comm_set_default(qmp_comm);
+
+  
+Mapping fields between Grid and user layouts
+---------------------------------------------
+
+In order to map data between calling-program and Grid layouts, it is
+important to know how the lattice sites are distributed across the
+processor grid.  A lattice site with coordinates `r[mu]` is assigned
+to the processor with processor coordinates `pePos[mu]` according to
+the rule::
+
+  pePos[mu] = r[mu]/dim[mu]
+
+where `dim[mu]` is the lattice dimension in the `mu` direction.  For
+performance reasons, it is important that the external data layout
+follow the same rule.  Then data mapping can be done without
+requiring costly communication between ranks.  We assume this is the
+case here.
+
+When mapping data to and from Grid, one must choose a lattice object
+defined on the appropriate grid, whether it be a full lattice (4D
+`GridCartesian`), one of the checkerboards (4D
+`GridRedBlackCartesian`), a five-dimensional full grid (5D
+`GridCartesian`), or a five-dimensional checkerboard (5D
+`GridRedBlackCartesian`).  For example, an improved staggered-fermion
+color-vector field `cv` on a single checkerboard would be constructed
+using
+
+**Example**::
+
+  std::vector<int> latt_size   = GridDefaultLatt();
+  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
+  std::vector<int> mpi_layout  = GridDefaultMpi();
+
+  GridCartesian               Grid(latt_size,simd_layout,mpi_layout);
+  GridRedBlackCartesian       RBGrid(&Grid);
+
+  typename ImprovedStaggeredFermion::FermionField  cv(RBGrid);
+
+The example above assumes that the grid default values were set in the
+`Grid_init` call.  If not, they can be set at this point and passed
+when `GridCartesian` is instantiated here.  To map data within an MPI
+rank, the external code must iterate over the sites belonging to that
+rank (full or checkerboard as appropriate).  Note that the site
+coordinates are specified relative to the origin of the lattice
+subvolume on that rank. To import data into Grid, the external data on
+a single site with coordinates `r` is first copied into the
+appropriate Grid scalar object `s`.  Then it is copied into the Grid
+lattice field `l` with `pokeLocalSite`::
+
+  pokeLocalSite(const sobj &s, Lattice<vobj> &l, Coordinate &r);
+
+To export data from Grid, the reverse operation starts with::
+
+  peekLocalSite(const sobj &s, Lattice<vobj> &l, Coordinate &r);
+
+and then copies the single-site data from `s` into the corresponding
+external type.
+
+Here is an example that maps a single site's worth of data in a MILC
+color-vector field to a Grid scalar ColourVector object `cVec` and from
+there to the lattice colour-vector field `cv`, as defined above.
+
+**Example**::
+
+  indexToCoords(idx,r);
+  ColourVector cVec;
+  for(int col=0; col<Nc; col++)
+      cVec()()(col) = 
+          Complex(src[idx].c[col].real, src[idx].c[col].imag);
+
+  pokeLocalSite(cVec, cv, r);
+
+Here the `indexToCoords()` function is a MILC mapping of the MILC site
+index `idx` to the 4D lattice coordinate `r`.
+
+Grid provides block- and multiple-rhs conjugate-gradient solvers. For
+this purpose it uses a 5D lattice. To map data to and from Grid data
+types, the index for the right-hand-side vector becomes the zeroth
+coordinate of a five-dimensional vector `r5`.  The remaining
+components of `r5` contain the 4D space-time coordinates.  The
+`pokeLocalSite/peekLocalSite` operations then accept the coordinate
+`r5`, provided the destination/source lattice object is also 5D.  In
+the example below data from a single site specified by `idx`,
+belonging to a set of `Ls` MILC color-vector fields, are copied into a
+Grid 5D fermion field `cv5`.
+
+**Example**::
+
+    GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt();
+    GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid)  typename ImprovedStaggeredFermion5D::FermionField  cv5(FrbGrid);
+
+    std::vector<int> r(4);
+    indexToCoords(idx,r);
+    std::vector<int> r5(1,0);
+    for( int d = 0; d < 4; d++ ) r5.push_back(r[d]);
+
+    for( int j = 0; j < Ls; j++ ){
+      r5[0] = j;
+      ColourVector cVec;
+      for(int col=0; col<Nc; col++){
+	  cVec()()(col) = 
+	      Complex(src[j][idx].c[col].real, src[j][idx].c[col].imag);
+      }
+      pokeLocalSite(cVec, *(out->cv), r5);
+    }
+
--- a/documentation/manual.rst
+++ b/documentation/manual.rst
--- a/tests/hmc/Test_hmc_EODWFRatio.cc
+++ b/tests/hmc/Test_hmc_EODWFRatio.cc
@@ -146,13 +146,7 @@ int main(int argc, char **argv) {
  std::cout << GridLogMessage << "Denominator report, Dw(m) term (includes CG) : " << std::endl;
  DenOp.Report();

-
-
-
  Grid_finalize();

-
-
-
 } // main