Merge branch 'develop' of https://github.com/paboyle/Grid into develop

2026-05-31 06:24:18 +01:00 · 2018-11-23 14:08:29 +00:00
parent 17b3f47b1e e232257cb6
commit b74940b3d4
25 changed files with 1506 additions and 145 deletions
@@ -487,7 +487,7 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int


 template<class vobj>
-void ExtractSliceLocal(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
+void ExtractSliceLocal(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
 {
  typedef typename vobj::scalar_object sobj;

@@ -61,9 +61,9 @@ Group & Hdf5Writer::getGroup(void)
 }

 // Reader implementation ///////////////////////////////////////////////////////
-Hdf5Reader::Hdf5Reader(const std::string &fileName)
+Hdf5Reader::Hdf5Reader(const std::string &fileName, const bool readOnly)
 : fileName_(fileName)
-, file_(fileName.c_str(), H5F_ACC_RDWR)
+, file_(fileName.c_str(), readOnly ? H5F_ACC_RDONLY : H5F_ACC_RDWR)
 {
  group_ = file_.openGroup("/");
  readSingleAttribute(dataSetThres_, HDF5_GRID_GUARD "dataset_threshold",
@@ -54,7 +54,7 @@ namespace Grid
  class Hdf5Reader: public Reader<Hdf5Reader>
  {
  public:
-    Hdf5Reader(const std::string &fileName);
+    Hdf5Reader(const std::string &fileName, const bool readOnly = true);
    virtual ~Hdf5Reader(void) = default;
    bool push(const std::string &s);
    void pop(void);
@@ -47,6 +47,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #else
 #define PARALLEL_FOR_LOOP
 #define PARALLEL_FOR_LOOP_INTERN
+#define PARALLEL_FOR_LOOP_REDUCE(op, var)
 #define PARALLEL_NESTED_LOOP2
 #define PARALLEL_NESTED_LOOP5
 #define PARALLEL_REGION
@@ -58,6 +59,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define parallel_for_internal PARALLEL_FOR_LOOP_INTERN for
 #define parallel_for_nest2 PARALLEL_NESTED_LOOP2 for
 #define parallel_for_nest5 PARALLEL_NESTED_LOOP5 for
+#define parallel_critical PARALLEL_CRITICAL

 namespace Grid {

@@ -28,16 +28,31 @@
 extern "C" {
 #include <openssl/sha.h>
 }
+#ifdef USE_IPP
+#include "ipp.h"
+#endif

 #pragma once

 class GridChecksum
 {
 public:
-  static inline uint32_t crc32(void *data,size_t bytes)
+  static inline uint32_t crc32(const void *data, size_t bytes)
  {
    return ::crc32(0L,(unsigned char *)data,bytes);
  }
+
+#ifdef USE_IPP
+  // CRC-32C of `bytes` bytes starting at `data`, computed with Intel IPP.
+  // The running value starts as all ones, is byte-swapped after the IPP call
+  // and is returned bitwise-inverted.
+  static inline uint32_t crc32c(const void* data, size_t bytes)
+  {
+      const unsigned char *buf = reinterpret_cast<const unsigned char *>(data);
+      uint32_t            crc  = ~static_cast<uint32_t>(0);
+
+      ippsCRC32C_8u(buf, bytes, &crc);
+      ippsSwapBytes_32u_I(&crc, 1);
+
+      return ~crc;
+  }
+#endif
+
  template <typename T>
  static inline std::string sha256_string(const std::vector<T> &hash)
  {
@@ -32,11 +32,19 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #include <Hadrons/Global.hpp>
 #include <Hadrons/TimerArray.hpp>
 #include <Grid/Eigen/unsupported/CXX11/Tensor>
+#ifdef USE_MKL
+#include "mkl.h"
+#include "mkl_cblas.h"
+#endif

 #ifndef HADRONS_A2AM_NAME 
 #define HADRONS_A2AM_NAME "a2aMatrix"
 #endif

+#ifndef HADRONS_A2AM_IO_TYPE
+#define HADRONS_A2AM_IO_TYPE ComplexF
+#endif
+
 #define HADRONS_A2AM_PARALLEL_IO

 BEGIN_HADRONS_NAMESPACE
@@ -51,6 +59,12 @@ BEGIN_HADRONS_NAMESPACE
 template <typename T>
 using A2AMatrixSet = Eigen::TensorMap<Eigen::Tensor<T, 5, Eigen::RowMajor>>;

+// dynamically-sized matrix of T stored in row-major order
+template <typename T>
+using A2AMatrix = Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
+
+// dynamically-sized matrix of T stored in column-major order
+template <typename T>
+using A2AMatrixTr = Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::ColMajor>;
+
 /******************************************************************************
 *                      Abstract class for A2A kernels                        *
 ******************************************************************************/
@@ -76,10 +90,15 @@ public:
    // constructors
    A2AMatrixIo(void) = default;
    A2AMatrixIo(std::string filename, std::string dataname, 
-                const unsigned int nt, const unsigned int ni,
-                const unsigned int nj);
+                const unsigned int nt, const unsigned int ni = 0,
+                const unsigned int nj = 0);
    // destructor
    ~A2AMatrixIo(void) = default;
+    // access
+    unsigned int getNi(void) const;
+    unsigned int getNj(void) const;
+    unsigned int getNt(void) const;
+    size_t       getSize(void) const;
    // file allocation
    template <typename MetadataType>
    void initFile(const MetadataType &d, const unsigned int chunkSize);
@@ -88,9 +107,11 @@ public:
                   const unsigned int blockSizei, const unsigned int blockSizej);
    void saveBlock(const A2AMatrixSet<T> &m, const unsigned int ext, const unsigned int str,
                   const unsigned int i, const unsigned int j);
+    template <template <class> class Vec, typename VecT>
+    void load(Vec<VecT> &v, double *tRead = nullptr);
 private:
-    std::string  filename_, dataname_;
-    unsigned int nt_, ni_, nj_;
+    std::string  filename_{""}, dataname_{""};
+    unsigned int nt_{0}, ni_{0}, nj_{0};
 };

 /******************************************************************************
@@ -136,6 +157,226 @@ private:
    std::vector<IoHelper> nodeIo_;
 };

+/******************************************************************************
+ *                       A2A matrix contraction kernels                       *
+ ******************************************************************************/
+// Collection of static kernels operating on Eigen matrices: accumulation of
+// tr(a*b), matrix product, and the matching flop-count estimates. When USE_MKL
+// is defined the work is delegated to CBLAS routines, otherwise to Eigen
+// expressions.
+class A2AContraction
+{
+public:
+    // accTrMul(acc, a, b): acc += tr(a*b)
+    // The trace is accumulated one dot product at a time: row r of a against
+    // column r of b when a is row-major and b is column-major, column c of a
+    // against row c of b in every other case.
+    // NOTE(review): the storage-order test compares the whole Options value
+    // with Eigen::RowMajor/ColMajor, so it only matches matrix types whose
+    // Options are exactly those values.
+    template <typename C, typename MatLeft, typename MatRight>
+    static inline void accTrMul(C &acc, const MatLeft &a, const MatRight &b)
+    {
+        if ((MatLeft::Options == Eigen::RowMajor) and
+            (MatRight::Options == Eigen::ColMajor))
+        {
+            parallel_for (unsigned int r = 0; r < a.rows(); ++r)
+            {
+                C tmp;
+#ifdef USE_MKL
+                dotuRow(tmp, r, a, b);
+#else
+                // Eigen's dot() conjugates its first operand; the explicit
+                // conjugate() presumably cancels that to match the
+                // unconjugated MKL dotu product -- confirm against Eigen docs
+                tmp = a.row(r).conjugate().dot(b.col(r));
+#endif
+                // acc is shared between iterations, hence the guarded update
+                // (parallel_critical is presumably an OpenMP critical section)
+                parallel_critical
+                {
+                    acc += tmp;
+                }
+            }
+        }
+        else
+        {
+            parallel_for (unsigned int c = 0; c < a.cols(); ++c)
+            {
+                C tmp;
+#ifdef USE_MKL 
+                dotuCol(tmp, c, a, b);
+#else
+                tmp = a.col(c).conjugate().dot(b.row(c));
+#endif
+                parallel_critical
+                {
+                    acc += tmp;
+                }
+            }
+        }
+    }
+
+    // Flop estimate for accTrMul: 8 flops per element of a. b is not used.
+    template <typename MatLeft, typename MatRight>
+    static inline double accTrMulFlops(const MatLeft &a, const MatRight &b)
+    {
+        double n = a.rows()*a.cols();
+
+        return 8.*n;
+    }
+
+    // mul(res, a, b): res = a*b
+#ifdef USE_MKL
+    // Double-precision product through cblas_zgemm. res is resized to
+    // a.rows() x b.cols() when its shape differs. The CBLAS layout and the
+    // leading dimensions follow the storage order of the matrix type
+    // (cols for row-major, rows for column-major).
+    template <template <class, int...> class Mat, int... Opts>
+    static inline void mul(Mat<ComplexD, Opts...> &res, 
+                           const Mat<ComplexD, Opts...> &a, 
+                           const Mat<ComplexD, Opts...> &b)
+    {
+        // alpha and beta of the gemm call: res = one*a*b + zero*res
+        static const ComplexD one(1., 0.), zero(0., 0.);
+
+        if ((res.rows() != a.rows()) or (res.cols() != b.cols()))
+        {
+            res.resize(a.rows(), b.cols());
+        }
+        if (Mat<ComplexD, Opts...>::Options == Eigen::RowMajor)
+        {
+            cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
+                        a.cols(), &one, a.data(), a.cols(), b.data(), b.cols(), &zero,
+                        res.data(), res.cols());
+        }
+        else if (Mat<ComplexD, Opts...>::Options == Eigen::ColMajor)
+        {
+            cblas_zgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
+                        a.cols(), &one, a.data(), a.rows(), b.data(), b.rows(), &zero,
+                        res.data(), res.rows());
+        }
+    }
+
+    // Single-precision counterpart of the overload above, through cblas_cgemm.
+    template <template <class, int...> class Mat, int... Opts>
+    static inline void mul(Mat<ComplexF, Opts...> &res, 
+                           const Mat<ComplexF, Opts...> &a, 
+                           const Mat<ComplexF, Opts...> &b)
+    {
+        static const ComplexF one(1., 0.), zero(0., 0.);
+
+        if ((res.rows() != a.rows()) or (res.cols() != b.cols()))
+        {
+            res.resize(a.rows(), b.cols());
+        }
+        if (Mat<ComplexF, Opts...>::Options == Eigen::RowMajor)
+        {
+            cblas_cgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
+                        a.cols(), &one, a.data(), a.cols(), b.data(), b.cols(), &zero,
+                        res.data(), res.cols());
+        }
+        else if (Mat<ComplexF, Opts...>::Options == Eigen::ColMajor)
+        {
+            cblas_cgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
+                        a.cols(), &one, a.data(), a.rows(), b.data(), b.rows(), &zero,
+                        res.data(), res.rows());
+        }
+    }
+#else
+    // Fallback without MKL: plain Eigen product.
+    template <typename Mat>
+    static inline void mul(Mat &res, const Mat &a, const Mat &b)
+    {
+        res = a*b;
+    }
+#endif
+    // Flop estimate for mul. Only the dimensions of a enter the formula
+    // (nr*nr output elements are counted); b is not used.
+    template <typename Mat>
+    static inline double mulFlops(const Mat &a, const Mat &b)
+    {
+        double nr = a.rows(), nc = a.cols();
+
+        return nr*nr*(6.*nc + 2.*(nc - 1.));
+    }
+private:
+    // Set (aPt, aInc) to the start and element stride of row aRow of a, and
+    // (bPt, bInc) to the start and element stride of column aRow of b, for
+    // either storage order. Only called by dotuRow, which is inside the
+    // USE_MKL guard below.
+    template <typename C, typename MatLeft, typename MatRight>
+    static inline void makeDotRowPt(C * &aPt, unsigned int &aInc, C * &bPt, 
+                                    unsigned int &bInc, const unsigned int aRow, 
+                                    const MatLeft &a, const MatRight &b)
+    {
+        if (MatLeft::Options == Eigen::RowMajor)
+        {
+            aPt  = a.data() + aRow*a.cols();
+            aInc = 1;
+        }
+        else if (MatLeft::Options == Eigen::ColMajor)
+        {
+            aPt  = a.data() + aRow;
+            aInc = a.rows();
+        }
+        if (MatRight::Options == Eigen::RowMajor)
+        {
+            bPt  = b.data() + aRow;
+            bInc = b.cols();
+        }
+        else if (MatRight::Options == Eigen::ColMajor)
+        {
+            bPt  = b.data() + aRow*b.rows();
+            bInc = 1;
+        }
+    }
+
+#ifdef USE_MKL
+    // Set (aPt, aInc) to the start and element stride of column aCol of a, and
+    // (bPt, bInc) to the start and element stride of row aCol of b, for either
+    // storage order.
+    template <typename C, typename MatLeft, typename MatRight>
+    static inline void makeDotColPt(C * &aPt, unsigned int &aInc, C * &bPt, 
+                                    unsigned int &bInc, const unsigned int aCol, 
+                                    const MatLeft &a, const MatRight &b)
+    {
+        if (MatLeft::Options == Eigen::RowMajor)
+        {
+            aPt  = a.data() + aCol;
+            aInc = a.cols();
+        }
+        else if (MatLeft::Options == Eigen::ColMajor)
+        {
+            aPt  = a.data() + aCol*a.rows();
+            aInc = 1;
+        }
+        if (MatRight::Options == Eigen::RowMajor)
+        {
+            bPt  = b.data() + aCol*b.cols();
+            bInc = 1;
+        }
+        else if (MatRight::Options == Eigen::ColMajor)
+        {
+            bPt  = b.data() + aCol;
+            bInc = b.rows();
+        }
+    }
+
+    // res = (row aRow of a) . (column aRow of b), single precision, through
+    // cblas_cdotu_sub over a.cols() elements.
+    template <typename MatLeft, typename MatRight>
+    static inline void dotuRow(ComplexF &res, const unsigned int aRow,
+                               const MatLeft &a, const MatRight &b)
+    {
+        const ComplexF *aPt, *bPt;
+        unsigned int   aInc, bInc;
+
+        makeDotRowPt(aPt, aInc, bPt, bInc, aRow, a, b);
+        cblas_cdotu_sub(a.cols(), aPt, aInc, bPt, bInc, &res);
+    }
+
+    // res = (column aCol of a) . (row aCol of b), single precision, through
+    // cblas_cdotu_sub over a.rows() elements.
+    template <typename MatLeft, typename MatRight>
+    static inline void dotuCol(ComplexF &res, const unsigned int aCol,
+                               const MatLeft &a, const MatRight &b)
+    {
+        const ComplexF *aPt, *bPt;
+        unsigned int   aInc, bInc;
+
+        makeDotColPt(aPt, aInc, bPt, bInc, aCol, a, b);
+        cblas_cdotu_sub(a.rows(), aPt, aInc, bPt, bInc, &res);
+    }
+
+    // Double-precision counterpart of dotuRow, through cblas_zdotu_sub.
+    template <typename MatLeft, typename MatRight>
+    static inline void dotuRow(ComplexD &res, const unsigned int aRow,
+                               const MatLeft &a, const MatRight &b)
+    {
+        const ComplexD *aPt, *bPt;
+        unsigned int   aInc, bInc;
+
+        makeDotRowPt(aPt, aInc, bPt, bInc, aRow, a, b);
+        cblas_zdotu_sub(a.cols(), aPt, aInc, bPt, bInc, &res);
+    }
+
+    // Double-precision counterpart of dotuCol, through cblas_zdotu_sub.
+    template <typename MatLeft, typename MatRight>
+    static inline void dotuCol(ComplexD &res, const unsigned int aCol,
+                               const MatLeft &a, const MatRight &b)
+    {
+        const ComplexD *aPt, *bPt;
+        unsigned int   aInc, bInc;
+
+        makeDotColPt(aPt, aInc, bPt, bInc, aCol, a, b);
+        cblas_zdotu_sub(a.rows(), aPt, aInc, bPt, bInc, &res);
+    }
+#endif
+};
+
 /******************************************************************************
 *                     A2AMatrixIo template implementation                    *
 ******************************************************************************/
@@ -148,6 +389,31 @@ A2AMatrixIo<T>::A2AMatrixIo(std::string filename, std::string dataname,
 , nt_(nt), ni_(ni), nj_(nj)
 {}

+// access //////////////////////////////////////////////////////////////////////
+// returns the stored nt_ value
+template <typename T>
+unsigned int A2AMatrixIo<T>::getNt(void) const
+{
+    return nt_;
+}
+
+// returns the stored ni_ value
+template <typename T>
+unsigned int A2AMatrixIo<T>::getNi(void) const
+{
+    return ni_;
+}
+
+// returns the stored nj_ value
+template <typename T>
+unsigned int A2AMatrixIo<T>::getNj(void) const
+{
+    return nj_;
+}
+
+// Size in bytes of nt_*ni_*nj_ elements of type T.
+// nt_, ni_ and nj_ are unsigned int: the first factor is promoted to size_t
+// so that the whole product is evaluated in size_t instead of wrapping in
+// unsigned int arithmetic before the multiplication by sizeof(T).
+template <typename T>
+size_t A2AMatrixIo<T>::getSize(void) const
+{
+    return static_cast<size_t>(nt_)*ni_*nj_*sizeof(T);
+}
+
 // file allocation /////////////////////////////////////////////////////////////
 template <typename T>
 template <typename MetadataType>
@@ -171,7 +437,7 @@ void A2AMatrixIo<T>::initFile(const MetadataType &d, const unsigned int chunkSiz
    }

    // create the dataset
-    Hdf5Reader reader(filename_);
+    Hdf5Reader reader(filename_, false);

    push(reader, dataname_);
    auto &group = reader.getGroup();
@@ -191,7 +457,7 @@ void A2AMatrixIo<T>::saveBlock(const T *data,
                               const unsigned int blockSizej)
 {
 #ifdef HAVE_HDF5
-    Hdf5Reader           reader(filename_);
+    Hdf5Reader           reader(filename_, false);
    std::vector<hsize_t> count = {nt_, blockSizei, blockSizej},
                         offset = {0, static_cast<hsize_t>(i),
                                   static_cast<hsize_t>(j)},
@@ -226,6 +492,82 @@ void A2AMatrixIo<T>::saveBlock(const A2AMatrixSet<T> &m,
    saveBlock(m.data() + offset, i, j, blockSizei, blockSizej);
 }

+// Load the HADRONS_A2AM_NAME dataset found under group dataname_ of file
+// filename_ into v, one time slice per element: v[t] receives slice t with its
+// entries cast to VecT.
+// - If ni_ or nj_ is zero, both are set from the dataset extent (the time
+//   extent must still equal nt_); if nt_, ni_ and nj_ are all non-zero the
+//   dataset extent must match them exactly, otherwise a Size error is raised.
+// - tRead is optional: when non-null it is reset and then accumulates the time
+//   spent inside the HDF5 read calls, as measured by usecond().
+// Without HAVE_HDF5 the function only raises an Implementation error.
+template <typename T>
+template <template <class> class Vec, typename VecT>
+void A2AMatrixIo<T>::load(Vec<VecT> &v, double *tRead)
+{
+#ifdef HAVE_HDF5
+    Hdf5Reader           reader(filename_);
+    std::vector<hsize_t> hdim;
+    H5NS::DataSet        dataset;
+    H5NS::DataSpace      dataspace;
+    H5NS::CompType       datatype;
+
+    push(reader, dataname_);
+    auto &group = reader.getGroup();
+    dataset     = group.openDataSet(HADRONS_A2AM_NAME);
+    datatype    = dataset.getCompType();
+    dataspace   = dataset.getSpace();
+    hdim.resize(dataspace.getSimpleExtentNdims());
+    dataspace.getSimpleExtentDims(hdim.data());
+    // hdim[0..2] are indexed below: refuse anything that is not rank 3
+    if (hdim.size() != 3)
+    {
+        HADRONS_ERROR(Size, "all-to-all matrix dataset has rank "
+            + std::to_string(hdim.size()) + ", expected 3");
+    }
+    // sizes are tested individually: a product of unsigned int could wrap
+    if ((nt_ != 0) and (ni_ != 0) and (nj_ != 0) and
+        ((hdim[0] != nt_) or (hdim[1] != ni_) or (hdim[2] != nj_)))
+    {
+        HADRONS_ERROR(Size, "all-to-all matrix size mismatch (got "
+            + std::to_string(hdim[0]) + "x" + std::to_string(hdim[1]) + "x"
+            + std::to_string(hdim[2]) + ", expected "
+            + std::to_string(nt_) + "x" + std::to_string(ni_) + "x"
+            + std::to_string(nj_) + ")");
+    }
+    else if ((ni_ == 0) or (nj_ == 0))
+    {
+        if (hdim[0] != nt_)
+        {
+            HADRONS_ERROR(Size, "all-to-all time size mismatch (got "
+                + std::to_string(hdim[0]) + ", expected "
+                + std::to_string(nt_) + ")");
+        }
+        // matrix dimensions were left unspecified: take them from the file
+        ni_ = hdim[1];
+        nj_ = hdim[2];
+    }
+
+    // one ni_ x nj_ slice is read at a time through this buffer
+    A2AMatrix<T>         buf(ni_, nj_);
+    std::vector<hsize_t> count    = {1, static_cast<hsize_t>(ni_),
+                                     static_cast<hsize_t>(nj_)},
+                         stride   = {1, 1, 1},
+                         block    = {1, 1, 1},
+                         memCount = {static_cast<hsize_t>(ni_),
+                                     static_cast<hsize_t>(nj_)};
+    H5NS::DataSpace      memspace(memCount.size(), memCount.data());
+
+    std::cout << "Loading timeslice";
+    std::cout.flush();
+    // tRead defaults to nullptr: only touch it when the caller asked for timing
+    if (tRead) *tRead = 0.;
+    // slices are visited from nt_ - 1 down to 0; tp1 = t + 1 keeps the
+    // unsigned loop counter from wrapping below zero
+    for (unsigned int tp1 = nt_; tp1 > 0; --tp1)
+    {
+        unsigned int         t      = tp1 - 1;
+        std::vector<hsize_t> offset = {static_cast<hsize_t>(t), 0, 0};
+
+        if (t % 10 == 0)
+        {
+            std::cout << " " << t;
+            std::cout.flush();
+        }
+        dataspace.selectHyperslab(H5S_SELECT_SET, count.data(), offset.data(),
+                                  stride.data(), block.data());
+        if (tRead) *tRead -= usecond();
+        dataset.read(buf.data(), datatype, memspace, dataspace);
+        if (tRead) *tRead += usecond();
+        v[t] = buf.template cast<VecT>();
+    }
+    std::cout << std::endl;
+#else
+    HADRONS_ERROR(Implementation, "all-to-all matrix I/O needs HDF5 library");
+#endif
+}
+
 /******************************************************************************
 *               A2AMatrixBlockComputation template implementation            *
 ******************************************************************************/
@@ -29,6 +29,7 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #define Hadrons_DiskVector_hpp_

 #include <Hadrons/Global.hpp>
+#include <Hadrons/A2AMatrix.hpp>
 #include <deque>
 #include <sys/stat.h>
 #include <ftw.h>
@@ -59,14 +60,18 @@ public:
        : master_(master), cmaster_(master), i_(i) {}

        // operator=: somebody is trying to store a vector element
-        // write to disk and cache
+        // write to cache and tag as modified
        T &operator=(const T &obj) const
        {
+            auto &cache    = *master_.cachePtr_;
+            auto &modified = *master_.modifiedPtr_;
+            auto &index    = *master_.indexPtr_;
+
            DV_DEBUG_MSG(&master_, "writing to " << i_);
            master_.cacheInsert(i_, obj);
-            master_.save(master_.filename(i_), obj);
+            modified[index.at(i_)] = true;
            
-            return master_.cachePtr_->at(i_);
+            return cache[index.at(i_)];
        }

        // implicit cast to const object reference and redirection
@@ -83,6 +88,7 @@ public:
 public:
    DiskVectorBase(const std::string dirname, const unsigned int size = 0,
                   const unsigned int cacheSize = 1, const bool clean = true);
+    DiskVectorBase(DiskVectorBase<T> &&v) = default;
    virtual ~DiskVectorBase(void);
    const T & operator[](const unsigned int i) const;
    RwAccessHelper operator[](const unsigned int i);
@@ -97,14 +103,17 @@ private:
    void cacheInsert(const unsigned int i, const T &obj) const;
    void clean(void);
 private:
-    std::string                                dirname_;
-    unsigned int                               size_, cacheSize_;
-    double                                     access_{0.}, hit_{0.};
-    bool                                       clean_;
+    std::string                                           dirname_;
+    unsigned int                                          size_, cacheSize_;
+    double                                                access_{0.}, hit_{0.};
+    bool                                                  clean_;
    // using pointers to allow modifications when class is const
    // semantic: const means data unmodified, but cache modification allowed
-    std::unique_ptr<std::map<unsigned int, T>> cachePtr_;
-    std::unique_ptr<std::deque<unsigned int>>  loadsPtr_;                
+    std::unique_ptr<std::vector<T>>                       cachePtr_;
+    std::unique_ptr<std::vector<bool>>                    modifiedPtr_;
+    std::unique_ptr<std::map<unsigned int, unsigned int>> indexPtr_;
+    std::unique_ptr<std::stack<unsigned int>>             freePtr_;
+    std::unique_ptr<std::deque<unsigned int>>             loadsPtr_;                
 };

 /******************************************************************************
@@ -135,7 +144,7 @@ private:
 *                      Specialisation for Eigen matrices                     *
 ******************************************************************************/
 template <typename T>
-using EigenDiskVectorMat = Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>;
+using EigenDiskVectorMat = A2AMatrix<T>;

 template <typename T>
 class EigenDiskVector: public DiskVectorBase<EigenDiskVectorMat<T>>
@@ -152,24 +161,31 @@ public:
 private:
    virtual void load(EigenDiskVectorMat<T> &obj, const std::string filename) const
    {
-        std::ifstream              f(filename, std::ios::binary);
-        std::vector<unsigned char> hash(SHA256_DIGEST_LENGTH);
-        Eigen::Index               nRow, nCol;
-        size_t                     matSize;
-        double                     t;
+        std::ifstream f(filename, std::ios::binary);
+        uint32_t      crc, check;
+        Eigen::Index  nRow, nCol;
+        size_t        matSize;
+        double        tRead, tHash;

-        f.read(reinterpret_cast<char *>(hash.data()), hash.size()*sizeof(unsigned char));
-        f.read(reinterpret_cast<char *>(&nRow), sizeof(Eigen::Index));
-        f.read(reinterpret_cast<char *>(&nCol), sizeof(Eigen::Index));
+        f.read(reinterpret_cast<char *>(&crc), sizeof(crc));
+        f.read(reinterpret_cast<char *>(&nRow), sizeof(nRow));
+        f.read(reinterpret_cast<char *>(&nCol), sizeof(nCol));
        obj.resize(nRow, nCol);
        matSize = nRow*nCol*sizeof(T);
-        t  = -usecond();
+        tRead  = -usecond();
        f.read(reinterpret_cast<char *>(obj.data()), matSize);
-        t += usecond();
-        DV_DEBUG_MSG(this, "Eigen read " << matSize/t*1.0e6/1024/1024 << " MB/s");
-        auto check = GridChecksum::sha256(obj.data(), matSize);
-        DV_DEBUG_MSG(this, "Eigen sha256 " << GridChecksum::sha256_string(check));
-        if (hash != check)
+        tRead += usecond();
+        tHash  = -usecond();
+#ifdef USE_IPP
+        check  = GridChecksum::crc32c(obj.data(), matSize);
+#else
+        check  = GridChecksum::crc32(obj.data(), matSize);
+#endif
+        tHash += usecond();
+        DV_DEBUG_MSG(this, "Eigen read " << tRead/1.0e6 << " sec " << matSize/tRead*1.0e6/1024/1024 << " MB/s");
+        DV_DEBUG_MSG(this, "Eigen crc32 " << std::hex << check << std::dec 
+                     << " " << tHash/1.0e6 << " sec " << matSize/tHash*1.0e6/1024/1024 << " MB/s");
+        if (crc != check)
        {
            HADRONS_ERROR(Io, "checksum failed")
        }
@@ -177,24 +193,31 @@ private:

    virtual void save(const std::string filename, const EigenDiskVectorMat<T> &obj) const
    {
-        std::ofstream              f(filename, std::ios::binary);
-        std::vector<unsigned char> hash(SHA256_DIGEST_LENGTH);
-        Eigen::Index               nRow, nCol;
-        size_t                     matSize;
-        double                     t;
+        std::ofstream f(filename, std::ios::binary);
+        uint32_t      crc;
+        Eigen::Index  nRow, nCol;
+        size_t        matSize;
+        double        tWrite, tHash;
        
        nRow    = obj.rows();
        nCol    = obj.cols();
        matSize = nRow*nCol*sizeof(T);
-        hash    = GridChecksum::sha256(obj.data(), matSize);
-        DV_DEBUG_MSG(this, "Eigen sha256 " << GridChecksum::sha256_string(hash));
-        f.write(reinterpret_cast<char *>(hash.data()), hash.size()*sizeof(unsigned char));
-        f.write(reinterpret_cast<char *>(&nRow), sizeof(Eigen::Index));
-        f.write(reinterpret_cast<char *>(&nCol), sizeof(Eigen::Index));
-        t  = -usecond();
+        tHash   = -usecond();
+#ifdef USE_IPP
+        crc     = GridChecksum::crc32c(obj.data(), matSize);
+#else
+        crc     = GridChecksum::crc32(obj.data(), matSize);
+#endif
+        tHash  += usecond();
+        f.write(reinterpret_cast<char *>(&crc), sizeof(crc));
+        f.write(reinterpret_cast<char *>(&nRow), sizeof(nRow));
+        f.write(reinterpret_cast<char *>(&nCol), sizeof(nCol));
+        tWrite = -usecond();
        f.write(reinterpret_cast<const char *>(obj.data()), matSize);
-        t += usecond();
-        DV_DEBUG_MSG(this, "Eigen write " << matSize/t*1.0e6/1024/1024 << " MB/s");
+        tWrite += usecond();
+        DV_DEBUG_MSG(this, "Eigen write " << tWrite/1.0e6 << " sec " << matSize/tWrite*1.0e6/1024/1024 << " MB/s");
+        DV_DEBUG_MSG(this, "Eigen crc32 " << std::hex << crc << std::dec
+                     << " " << tHash/1.0e6 << " sec " << matSize/tHash*1.0e6/1024/1024 << " MB/s");
    }
 };

@@ -207,7 +230,10 @@ DiskVectorBase<T>::DiskVectorBase(const std::string dirname,
                                  const unsigned int cacheSize,
                                  const bool clean)
 : dirname_(dirname), size_(size), cacheSize_(cacheSize), clean_(clean)
-, cachePtr_(new std::map<unsigned int, T>())
+, cachePtr_(new std::vector<T>(size))
+, modifiedPtr_(new std::vector<bool>(size, false))
+, indexPtr_(new std::map<unsigned int, unsigned int>())
+, freePtr_(new std::stack<unsigned int>)
 , loadsPtr_(new std::deque<unsigned int>())
 {
    struct stat s;
@@ -217,6 +243,10 @@ DiskVectorBase<T>::DiskVectorBase(const std::string dirname,
        HADRONS_ERROR(Io, "directory '" + dirname + "' already exists")
    }
    mkdir(dirname);
+    for (unsigned int i = 0; i < cacheSize_; ++i)
+    {
+        freePtr_->push(i);
+    }
 }

 template <typename T>
@@ -231,8 +261,10 @@ DiskVectorBase<T>::~DiskVectorBase(void)
 template <typename T>
 const T & DiskVectorBase<T>::operator[](const unsigned int i) const
 {
-    auto &cache  = *cachePtr_;
-    auto &loads  = *loadsPtr_;
+    auto &cache   = *cachePtr_;
+    auto &index   = *indexPtr_;
+    auto &freeInd = *freePtr_;
+    auto &loads   = *loadsPtr_;

    DV_DEBUG_MSG(this, "accessing " << i << " (RO)");

@@ -241,7 +273,7 @@ const T & DiskVectorBase<T>::operator[](const unsigned int i) const
        HADRONS_ERROR(Size, "index out of range");
    }
    const_cast<double &>(access_)++;
-    if (cache.find(i) == cache.end())
+    if (index.find(i) == index.end())
    {
        // cache miss
        DV_DEBUG_MSG(this, "cache miss");
@@ -268,7 +300,7 @@ const T & DiskVectorBase<T>::operator[](const unsigned int i) const
    DV_DEBUG_MSG(this, "in cache: " << msg);
 #endif

-    return cache.at(i);
+    return cache[index.at(i)];
 }

 template <typename T>
@@ -306,13 +338,24 @@ std::string DiskVectorBase<T>::filename(const unsigned int i) const
 template <typename T>
 void DiskVectorBase<T>::evict(void) const
 {
-    auto &cache = *cachePtr_;
-    auto &loads = *loadsPtr_;
+    auto &cache    = *cachePtr_;
+    auto &modified = *modifiedPtr_;
+    auto &index    = *indexPtr_;
+    auto &freeInd  = *freePtr_;
+    auto &loads    = *loadsPtr_;

-    if (cache.size() >= cacheSize_)
+    if (index.size() >= cacheSize_)
    {
-        DV_DEBUG_MSG(this, "evicting " << loads.front());
-        cache.erase(loads.front());
+        unsigned int i = loads.front();
+        
+        DV_DEBUG_MSG(this, "evicting " << i);
+        if (modified[index.at(i)])
+        {
+            DV_DEBUG_MSG(this, "element " << i << " modified, saving to disk");
+            save(filename(i), cache[index.at(i)]);
+        }
+        freeInd.push(index.at(i));
+        index.erase(i);
        loads.pop_front();
    }
 }
@@ -320,30 +363,44 @@ void DiskVectorBase<T>::evict(void) const
 template <typename T>
 void DiskVectorBase<T>::fetch(const unsigned int i) const
 {
-    auto &cache = *cachePtr_;
-    auto &loads = *loadsPtr_;
+    auto &cache    = *cachePtr_;
+    auto &modified = *modifiedPtr_;
+    auto &index    = *indexPtr_;
+    auto &freeInd  = *freePtr_;
+    auto &loads    = *loadsPtr_;
+
    struct stat s;

    DV_DEBUG_MSG(this, "loading " << i << " from disk");

    evict();
+    
    if(stat(filename(i).c_str(), &s) != 0)
    {
        HADRONS_ERROR(Io, "disk vector element " + std::to_string(i) + " uninitialised");
    }
-    load(cache[i], filename(i));
+    index[i] = freeInd.top();
+    freeInd.pop();
+    load(cache[index.at(i)], filename(i));
    loads.push_back(i);
+    modified[index.at(i)] = false;
 }

 template <typename T>
 void DiskVectorBase<T>::cacheInsert(const unsigned int i, const T &obj) const
 {
-    auto &cache = *cachePtr_;
-    auto &loads = *loadsPtr_;
+    auto &cache    = *cachePtr_;
+    auto &modified = *modifiedPtr_;
+    auto &index    = *indexPtr_;
+    auto &freeInd  = *freePtr_;
+    auto &loads    = *loadsPtr_;

    evict();
-    cache[i] = obj;
+    index[i] = freeInd.top();
+    freeInd.pop();
+    cache[index.at(i)] = obj;
    loads.push_back(i);
+    modified[index.at(i)] = false;

 #ifdef DV_DEBUG
    std::string msg;
@@ -166,7 +166,13 @@ std::string Hadrons::dirname(const std::string &s)

 void Hadrons::makeFileDir(const std::string filename, GridBase *g)
 {
-    if (g->IsBoss())
+    bool doIt = true;
+
+    if (g)
+    {
+        doIt = g->IsBoss();
+    }
+    if (doIt)
    {
        std::string dir    = dirname(filename);
        int         status = mkdir(dir);
@@ -32,6 +32,7 @@ See the full license in the file "LICENSE" in the top level distribution directo

 #include <set>
 #include <stack>
+#include <regex>
 #include <Grid/Grid.h>
 #include <cxxabi.h>

@@ -217,15 +218,15 @@ typedef XmlReader ResultReader;
 typedef XmlWriter ResultWriter;
 #endif

-#define RESULT_FILE_NAME(name) \
-name + "." + std::to_string(vm().getTrajectory()) + "." + resultFileExt
+#define RESULT_FILE_NAME(name, traj) \
+name + "." + std::to_string(traj) + "." + resultFileExt

 // recursive mkdir
 #define MAX_PATH_LENGTH 512u
 int         mkdir(const std::string dirName);
 std::string basename(const std::string &s);
 std::string dirname(const std::string &s);
-void        makeFileDir(const std::string filename, GridBase *g);
+void        makeFileDir(const std::string filename, GridBase *g = nullptr);

 // default Schur convention
 #ifndef HADRONS_DEFAULT_SCHUR 
@@ -248,6 +249,20 @@ void        makeFileDir(const std::string filename, GridBase *g);
 // pretty print time profile
 void printTimeProfile(const std::map<std::string, GridTime> &timing, GridTime total);

+// token replacement utility: substitutes the first occurrence of
+// <mark><token><mark> in str with std::to_string(x); str is left untouched
+// when the marked token is not found
+template <typename T>
+void tokenReplace(std::string &str, const std::string token,
+                  const T &x, const std::string mark = "@")
+{
+    const std::string            pattern = mark + token + mark;
+    const std::string::size_type where   = str.find(pattern);
+
+    if (where == std::string::npos)
+    {
+        return;
+    }
+    str.replace(where, pattern.size(), std::to_string(x));
+}
+
 END_HADRONS_NAMESPACE

 #include <Hadrons/Exceptions.hpp>
@@ -5,17 +5,17 @@ lib_LIBRARIES = libHadrons.a
 include modules.inc

 libHadrons_a_SOURCES = \
-    $(modules_cc)      \
    Application.cc     \
    Environment.cc     \
 	Exceptions.cc      \
    Global.cc          \
    Module.cc		   \
 	TimerArray.cc      \
-	VirtualMachine.cc
+	VirtualMachine.cc  \
+	$(modules_cc)
+	
 libHadrons_adir = $(includedir)/Hadrons
 nobase_libHadrons_a_HEADERS = \
-	$(modules_hpp)            \
 	A2AVectors.hpp            \
 	A2AMatrix.hpp             \
 	Application.hpp           \
@@ -33,4 +33,5 @@ nobase_libHadrons_a_HEADERS = \
 	ModuleFactory.hpp         \
 	Solver.hpp                \
 	TimerArray.hpp            \
-	VirtualMachine.hpp
+	VirtualMachine.hpp        \
+	$(modules_hpp)
@@ -144,7 +144,7 @@ if (env().getGrid()->IsBoss() and !ioStem.empty())\
 {\
    makeFileDir(ioStem, env().getGrid());\
    {\
-        ResultWriter _writer(RESULT_FILE_NAME(ioStem));\
+        ResultWriter _writer(RESULT_FILE_NAME(ioStem, vm().getTrajectory()));\
        write(_writer, name, result);\
    }\
 }
@@ -24,7 +24,7 @@
 #include <Hadrons/Modules/MSolver/Guesser.hpp>
 #include <Hadrons/Modules/MSolver/RBPrecCG.hpp>
 #include <Hadrons/Modules/MSolver/A2AVectors.hpp>
-#include <Hadrons/Modules/MSolver/A2AAslashVector.hpp>
+#include <Hadrons/Modules/MSolver/A2AAslashVectors.hpp>
 #include <Hadrons/Modules/MGauge/UnitEm.hpp>
 #include <Hadrons/Modules/MGauge/StoutSmearing.hpp>
 #include <Hadrons/Modules/MGauge/Unit.hpp>
@@ -33,10 +33,6 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #include <Hadrons/ModuleFactory.hpp>
 #include <Hadrons/A2AMatrix.hpp>

-#ifndef ASF_IO_TYPE
-#define ASF_IO_TYPE ComplexF
-#endif
-
 BEGIN_HADRONS_NAMESPACE

 /******************************************************************************
@@ -113,7 +109,7 @@ public:
    typedef A2AMatrixBlockComputation<Complex, 
                                      FermionField, 
                                      A2AAslashFieldMetadata, 
-                                      ASF_IO_TYPE> Computation;
+                                      HADRONS_A2AM_IO_TYPE> Computation;
    typedef AslashFieldKernel<Complex, FImpl> Kernel;
 public:
    // constructor
@@ -196,7 +192,7 @@ void TA2AAslashField<FImpl, PhotonImpl>::execute(void)
        LOG(Message) << "  " << name << std::endl;
    }
    LOG(Message) << "A-slash field size: " << nt << "*" << N_i << "*" << N_j 
-                 << " (filesize " << sizeString(nt*N_i*N_j*sizeof(ASF_IO_TYPE)) 
+                 << " (filesize " << sizeString(nt*N_i*N_j*sizeof(HADRONS_A2AM_IO_TYPE)) 
                 << "/EM field)" << std::endl;
    
    // preparing "B" complexified fields
@@ -35,10 +35,6 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #include <Hadrons/ModuleFactory.hpp>
 #include <Hadrons/A2AMatrix.hpp>

-#ifndef MF_IO_TYPE
-#define MF_IO_TYPE ComplexF
-#endif
-
 BEGIN_HADRONS_NAMESPACE

 /******************************************************************************
@@ -118,7 +114,7 @@ public:
    typedef A2AMatrixBlockComputation<Complex, 
                                      FermionField, 
                                      A2AMesonFieldMetadata, 
-                                      MF_IO_TYPE> Computation;
+                                      HADRONS_A2AM_IO_TYPE> Computation;
    typedef MesonFieldKernel<Complex, FImpl> Kernel;
 public:
    // constructor
@@ -248,7 +244,7 @@ void TA2AMesonField<FImpl>::execute(void)
        LOG(Message) << "  " << g << std::endl;
    }
    LOG(Message) << "Meson field size: " << nt << "*" << N_i << "*" << N_j 
-                 << " (filesize " << sizeString(nt*N_i*N_j*sizeof(MF_IO_TYPE)) 
+                 << " (filesize " << sizeString(nt*N_i*N_j*sizeof(HADRONS_A2AM_IO_TYPE)) 
                 << "/momentum/bilinear)" << std::endl;

    auto &ph = envGet(std::vector<ComplexField>, momphName_);
@@ -146,7 +146,7 @@ void TChargedProp::execute(void)
        std::vector<int>    siteCoor;

        LOG(Message) << "Saving momentum-projected propagator to '"
-                     << RESULT_FILE_NAME(par().output) << "'..."
+                     << RESULT_FILE_NAME(par().output, vm().getTrajectory()) << "'..."
                     << std::endl;
        result.projection.resize(par().outputMom.size());
        result.lattice_size = env().getGrid()->_fdimensions;
@@ -462,7 +462,7 @@ void TScalarVP::execute(void)
    if (!par().output.empty())
    {
        LOG(Message) << "Saving momentum-projected HVP to '"
-                     << RESULT_FILE_NAME(par().output) << "'..."
+                     << RESULT_FILE_NAME(par().output, vm().getTrajectory()) << "'..."
                     << std::endl;
        saveResult(par().output, "HVP", outputData);
    }
@@ -239,7 +239,7 @@ void TVPCounterTerms::execute(void)
    if (!par().output.empty())
    {
        LOG(Message) << "Saving momentum-projected correlators to '"
-                     << RESULT_FILE_NAME(par().output) << "'..."
+                     << RESULT_FILE_NAME(par().output, vm().getTrajectory()) << "'..."
                     << std::endl;
        saveResult(par().output, "scalar_loops", outputData);
    }
@@ -2,7 +2,7 @@

 Grid physics library, www.github.com/paboyle/Grid 

-Source file: Hadrons/Modules/MSolver/A2AAslashVector.cc
+Source file: Hadrons/Modules/MSolver/A2AAslashVectors.cc

 Copyright (C) 2015-2018

@@ -25,11 +25,11 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
-#include <Hadrons/Modules/MSolver/A2AAslashVector.hpp>
+#include <Hadrons/Modules/MSolver/A2AAslashVectors.hpp>

 using namespace Grid;
 using namespace Hadrons;
 using namespace MSolver;

-template class Grid::Hadrons::MSolver::TA2AAslashVector<FIMPL>;
-template class Grid::Hadrons::MSolver::TA2AAslashVector<ZFIMPL>;
+template class Grid::Hadrons::MSolver::TA2AAslashVectors<FIMPL>;
+template class Grid::Hadrons::MSolver::TA2AAslashVectors<ZFIMPL>;
@@ -2,7 +2,7 @@

 Grid physics library, www.github.com/paboyle/Grid 

-Source file: Hadrons/Modules/MSolver/A2AAslashVector.hpp
+Source file: Hadrons/Modules/MSolver/A2AAslashVectors.hpp

 Copyright (C) 2015-2018

@@ -25,13 +25,14 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
-#ifndef Hadrons_MSolver_A2AAslashVector_hpp_
-#define Hadrons_MSolver_A2AAslashVector_hpp_
+#ifndef Hadrons_MSolver_A2AAslashVectors_hpp_
+#define Hadrons_MSolver_A2AAslashVectors_hpp_

 #include <Hadrons/Global.hpp>
 #include <Hadrons/Module.hpp>
 #include <Hadrons/ModuleFactory.hpp>
 #include <Hadrons/Solver.hpp>
+#include <Hadrons/A2AVectors.hpp>

 BEGIN_HADRONS_NAMESPACE

@@ -54,29 +55,30 @@ BEGIN_MODULE_NAMESPACE(MSolver)
 *
 *****************************************************************************/

-
-class A2AAslashVectorPar: Serializable
+class A2AAslashVectorsPar: Serializable
 {
 public:
-  GRID_SERIALIZABLE_CLASS_MEMBERS(A2AAslashVectorPar,
+  GRID_SERIALIZABLE_CLASS_MEMBERS(A2AAslashVectorsPar,
                                  std::string, vector,
                                  std::string, emField,
-                                  std::string, solver);
+                                  std::string, solver,
+                                  std::string, output,
+                                  bool,        multiFile);
 };

 template <typename FImpl>
-class TA2AAslashVector : public Module<A2AAslashVectorPar>
+class TA2AAslashVectors : public Module<A2AAslashVectorsPar>
 {
 public:
    FERM_TYPE_ALIASES(FImpl,);
    SOLVER_TYPE_ALIASES(FImpl,);
 public:
-    typedef PhotonR::GaugeField     EmField;
+    typedef PhotonR::GaugeField EmField;
 public:
    // constructor
-    TA2AAslashVector(const std::string name);
+    TA2AAslashVectors(const std::string name);
    // destructor
-    virtual ~TA2AAslashVector(void) {};
+    virtual ~TA2AAslashVectors(void) {};
    // dependency relation
    virtual std::vector<std::string> getInput(void);
    virtual std::vector<std::string> getOutput(void);
@@ -88,21 +90,21 @@ private:
    unsigned int Ls_;
 };

-MODULE_REGISTER_TMP(A2AAslashVector,TA2AAslashVector<FIMPL>, MSolver);
-MODULE_REGISTER_TMP(ZA2AAslashVector,TA2AAslashVector<ZFIMPL>, MSolver);
+MODULE_REGISTER_TMP(A2AAslashVectors, TA2AAslashVectors<FIMPL>, MSolver);
+MODULE_REGISTER_TMP(ZA2AAslashVectors, TA2AAslashVectors<ZFIMPL>, MSolver);

 /******************************************************************************
- *                       TA2AAslashVector implementation                       *
+ *                       TA2AAslashVectors implementation                       *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 template <typename FImpl>
-TA2AAslashVector<FImpl>::TA2AAslashVector(const std::string name)
-: Module<A2AAslashVectorPar>(name)
+TA2AAslashVectors<FImpl>::TA2AAslashVectors(const std::string name)
+: Module<A2AAslashVectorsPar>(name)
 {}

 // dependencies/products ///////////////////////////////////////////////////////
 template <typename FImpl>
-std::vector<std::string> TA2AAslashVector<FImpl>::getInput(void)
+std::vector<std::string> TA2AAslashVectors<FImpl>::getInput(void)
 {
    std::vector<std::string> in = {par().vector, par().emField, par().solver};

@@ -110,7 +112,7 @@ std::vector<std::string> TA2AAslashVector<FImpl>::getInput(void)
 }

 template <typename FImpl>
-std::vector<std::string> TA2AAslashVector<FImpl>::getOutput(void)
+std::vector<std::string> TA2AAslashVectors<FImpl>::getOutput(void)
 {
    std::vector<std::string> out = {getName()};

@@ -119,7 +121,7 @@ std::vector<std::string> TA2AAslashVector<FImpl>::getOutput(void)

 // setup ///////////////////////////////////////////////////////////////////////
 template <typename FImpl>
-void TA2AAslashVector<FImpl>::setup(void)
+void TA2AAslashVectors<FImpl>::setup(void)
 {
    Ls_  = env().getObjectLs(par().solver);
    auto &vvector = envGet(std::vector<FermionField>, par().vector);
@@ -134,7 +136,7 @@ void TA2AAslashVector<FImpl>::setup(void)

 // execution ///////////////////////////////////////////////////////////////////
 template <typename FImpl>
-void TA2AAslashVector<FImpl>::execute(void)
+void TA2AAslashVectors<FImpl>::execute(void)
 {
    auto &solver = envGet(Solver, par().solver);
    auto &stoch_photon = envGet(EmField,  par().emField);
@@ -148,42 +150,45 @@ void TA2AAslashVector<FImpl>::execute(void)

    Complex ci(0.0,1.0);

-
    startTimer("Seq Aslash");
-
-    LOG(Message) << "Calculate Sequential propagator on Aslash * v with the A2A vector " << par().vector
-                  << " and the photon field " << par().emField << std::endl;
-
-
+    LOG(Message) << "Calculate Sequential propagator on Aslash * v with the A2A vector " 
+                 << par().vector << " and the photon field " << par().emField << std::endl;
    for(unsigned int i=0; i<Nmodes; i++)
    {
-	v4dtmp = zero;
-	startTimer("Multiply Aslash");
-	for(unsigned int mu=0;mu<=3;mu++)
-    	{
-	    Gamma gmu(Gamma::gmu[mu]);
-		 v4dtmp +=  ci * PeekIndex<LorentzIndex>(stoch_photon, mu) * (gmu * vvector[i]);
-	}
-	stopTimer("Multiply Aslash");
+        v4dtmp = zero;
+        startTimer("Multiply Aslash");
+        for(unsigned int mu=0;mu<=3;mu++)
+        {
+            Gamma gmu(Gamma::gmu[mu]);
+            v4dtmp +=  ci * PeekIndex<LorentzIndex>(stoch_photon, mu) * (gmu * vvector[i]);
+        }
+        stopTimer("Multiply Aslash");

-	if (Ls_ == 1)
-	{
-	    solver(Aslashv[i], v4dtmp);
-	}
-	else
-	{
-	    mat.ImportPhysicalFermionSource(v4dtmp, v5dtmp);
-	    solver(v5dtmp_sol, v5dtmp);
-	    mat.ExportPhysicalFermionSolution(v5dtmp_sol, v4dtmp);
-	    Aslashv[i] = v4dtmp;
-	}
+        startTimer("Inversion");
+        if (Ls_ == 1)
+        {
+            solver(Aslashv[i], v4dtmp);
+        }
+        else
+        {
+            mat.ImportPhysicalFermionSource(v4dtmp, v5dtmp);
+            solver(v5dtmp_sol, v5dtmp);
+            mat.ExportPhysicalFermionSolution(v5dtmp_sol, v4dtmp);
+            Aslashv[i] = v4dtmp;
+        }
+        stopTimer("Inversion");
    }
-
    stopTimer("Seq Aslash");
+    if (!par().output.empty())
+    {
+        startTimer("I/O");
+        A2AVectorsIo::write(par().output, Aslashv, par().multiFile, vm().getTrajectory());
+        stopTimer("I/O");
+    }
 }

 END_MODULE_NAMESPACE

 END_HADRONS_NAMESPACE

-#endif // Hadrons_MSolver_A2AAslashVector_hpp_
+#endif // Hadrons_MSolver_A2AAslashVectors_hpp_
@@ -0,0 +1,454 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Utilities/Contractor.cc
+
+Copyright (C) 2015-2018
+
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Global.hpp>
+#include <Hadrons/A2AMatrix.hpp>
+#include <Hadrons/DiskVector.hpp>
+#include <Hadrons/TimerArray.hpp>
+
+using namespace Grid;
+using namespace QCD;
+using namespace Hadrons;
+
+// Wrap a time index into [0, par.global.nt). nt is added before the modulo,
+// presumably so that shifted indices such as (tLast - dt) still land in range.
+// NOTE: expands to an expression reading a variable named `par`, which must be
+// in scope wherever the macro is used.
+#define TIME_MOD(t) (((t) + par.global.nt) % par.global.nt)
+
+// Serializable parameter and result records of the contractor program.
+namespace Contractor
+{
+    // Trajectory loop bounds: main() iterates traj = start; traj < end;
+    // traj += step (end is exclusive).
+    class TrajRange: Serializable
+    {
+    public:
+        GRID_SERIALIZABLE_CLASS_MEMBERS(TrajRange,
+                                        unsigned int, start,
+                                        unsigned int, end,
+                                        unsigned int, step);
+    };
+    
+    // Global settings, read from the "global" node of the parameter file.
+    //   trajCounter   : trajectories to process
+    //   nt            : number of time slices (size of every correlator)
+    //   diskVectorDir : directory under which one disk vector per matrix
+    //                   name is created
+    //   output        : directory the correlators are written to
+    class GlobalPar: Serializable
+    {
+    public:
+        GRID_SERIALIZABLE_CLASS_MEMBERS(GlobalPar,
+                                        TrajRange, trajCounter,
+                                        unsigned int, nt,
+                                        std::string, diskVectorDir,
+                                        std::string, output);
+    };
+
+    // Description of one A2A matrix to load ("a2aMatrix" nodes).
+    //   file      : file name; the "@traj@" token is replaced by the
+    //               trajectory number before loading
+    //   dataset   : dataset name handed to A2AMatrixIo
+    //   cacheSize : cache size handed to the disk vector holding the matrix
+    //   name      : key under which the matrix is referred to in products
+    class A2AMatrixPar: Serializable
+    {
+    public:
+        GRID_SERIALIZABLE_CLASS_MEMBERS(A2AMatrixPar,
+                                        std::string, file,
+                                        std::string, dataset,
+                                        unsigned int, cacheSize,
+                                        std::string, name);
+    };
+
+    // Description of one contraction tr(A*B*...) ("product" nodes).
+    //   terms              : list of matrix names, split with strToVec
+    //   times              : one time-range expression per term except the
+    //                        last one (main() requires #terms == #times + 1)
+    //   translations       : time-range expression of the time shifts dt
+    //   translationAverage : if true, average over all translations and save
+    //                        one correlator; otherwise save one per dt
+    class ProductPar: Serializable
+    {
+    public:
+        GRID_SERIALIZABLE_CLASS_MEMBERS(ProductPar,
+                                        std::string, terms,
+                                        std::vector<std::string>, times,
+                                        std::string, translations,
+                                        bool, translationAverage);
+    };
+
+    // Record written to disk for each saved correlator.
+    //   a2aMatrix   : matrix descriptions, with the trajectory token resolved
+    //   contraction : the product description the correlator was built from
+    //   times       : time positions of all terms but the last
+    //   correlator  : nt complex values
+    class CorrelatorResult: Serializable
+    {
+    public:
+        GRID_SERIALIZABLE_CLASS_MEMBERS(CorrelatorResult,
+                                        std::vector<Contractor::A2AMatrixPar>,  a2aMatrix,
+                                        ProductPar, contraction,
+                                        std::vector<unsigned int>, times,
+                                        std::vector<ComplexD>, correlator);
+    };
+}
+
+// Full content of the parameter file: the "global" node, the list of
+// "a2aMatrix" nodes and the list of "product" nodes (filled in main()).
+struct ContractorPar
+{
+    Contractor::GlobalPar                  global;
+    std::vector<Contractor::A2AMatrixPar>  a2aMatrix;
+    std::vector<Contractor::ProductPar>    product;
+};
+
+// Recursive step of the Cartesian product of the time sets.
+// `depth` is the number of trailing sets still to be expanded; `current` holds
+// the partial sequence being built and must have times.size() entries. Every
+// completed sequence is appended to timeSeq, in the iteration order of the sets.
+void makeTimeSeq(std::vector<std::vector<unsigned int>> &timeSeq, 
+                 const std::vector<std::set<unsigned int>> &times,
+                 std::vector<unsigned int> &current,
+                 const unsigned int depth)
+{
+    // all positions assigned: record the sequence
+    if (depth == 0)
+    {
+        timeSeq.push_back(current);
+        return;
+    }
+
+    // position being filled at this recursion level
+    const auto level = times.size() - depth;
+
+    for (const unsigned int tVal: times[level])
+    {
+        current[level] = tVal;
+        makeTimeSeq(timeSeq, times, current, depth - 1);
+    }
+}
+
+// Append to timeSeq every combination made of one time from each set of
+// `times` (Cartesian product), delegating to the recursive overload.
+void makeTimeSeq(std::vector<std::vector<unsigned int>> &timeSeq, 
+                 const std::vector<std::set<unsigned int>> &times)
+{
+    // scratch sequence, one slot per time set
+    std::vector<unsigned int> scratch(times.size());
+    const unsigned int        nLevel = times.size();
+
+    makeTimeSeq(timeSeq, times, scratch, nLevel);
+}
+
+// Write one correlator to <dir>/<stem>.<traj>.<ext>.
+// The stem is built as term0_t0_term1_t1_..._termN, followed by "_dt_<dt>"
+// when the result is not averaged over translations.
+//   result : correlator and its metadata (terms, times, ...)
+//   dir    : output directory
+//   dt     : time translation, only used in the file name
+//   traj   : trajectory number, used in the file name
+// Throws a Size error when the contraction has no term or when fewer times
+// than (number of terms - 1) are stored in the result.
+void saveCorrelator(const Contractor::CorrelatorResult &result, const std::string dir, 
+                    const unsigned int dt, const unsigned int traj)
+{
+    std::string              fileStem = "", filename;
+    std::vector<std::string> terms = strToVec<std::string>(result.contraction.terms);
+
+    // terms.size() - 1 below is unsigned: an empty list would wrap around
+    if (terms.empty())
+    {
+        HADRONS_ERROR(Size, "no term in contraction '" + result.contraction.terms + "'");
+    }
+    if (result.times.size() + 1 < terms.size())
+    {
+        HADRONS_ERROR(Size, "number of times (" + std::to_string(result.times.size())
+                      + ") too small for " + std::to_string(terms.size()) + " terms");
+    }
+    for (unsigned int i = 0; i + 1 < terms.size(); i++)
+    {
+        fileStem += terms[i] + "_" + std::to_string(result.times[i]) + "_";
+    }
+    fileStem += terms.back();
+    if (!result.contraction.translationAverage)
+    {
+        fileStem += "_dt_" + std::to_string(dt);
+    }
+    filename = dir + "/" + RESULT_FILE_NAME(fileStem, traj);
+    std::cout << "Saving correlator to '" << filename << "'" << std::endl;
+    // makeFileDir is given a file path (its parameter is named `filename` and
+    // the result-saving macro passes the file stem); passing `dir` would
+    // presumably only create the parent of the output directory
+    makeFileDir(filename);
+    ResultWriter writer(filename);
+    write(writer, fileStem, result);
+}
+
+// Parse a time-range expression into the set of times it denotes.
+// The expression is split into tokens by strToVec; each token is either a
+// single time "t" or an inclusive range "a..b". A range with a > b contributes
+// no time.
+//   str : expression to parse
+//   nt  : number of time slices; every time must be < nt
+// Throws a Range error when a time is >= nt or when a token is neither of the
+// two accepted forms.
+std::set<unsigned int> parseTimeRange(const std::string str, const unsigned int nt)
+{
+    std::regex               rex("([0-9]+)|(([0-9]+)\\.\\.([0-9]+))");
+    std::smatch              sm;
+    std::vector<std::string> rstr = strToVec<std::string>(str);
+    std::set<unsigned int>   tSet;
+
+    for (auto &s: rstr)
+    {
+        // a token outside the grammar must not be dropped silently: the
+        // resulting time set would differ from what the user asked for
+        if (!std::regex_match(s, sm, rex))
+        {
+            HADRONS_ERROR(Range, "cannot parse time range '" + s 
+                          + "' (from expression '" + str + "')");
+        }
+        if (sm[1].matched)
+        {
+            unsigned int t;
+            
+            t = std::stoi(sm[1].str());
+            if (t >= nt)
+            {
+                HADRONS_ERROR(Range, "time out of range (from expression '" + str + "')");
+            }
+            tSet.insert(t);
+        }
+        else if (sm[2].matched)
+        {
+            unsigned int ta, tb;
+
+            ta = std::stoi(sm[3].str());
+            tb = std::stoi(sm[4].str());
+            if ((ta >= nt) or (tb >= nt))
+            {
+                HADRONS_ERROR(Range, "time out of range (from expression '" + str + "')");
+            }
+            for (unsigned int ti = ta; ti <= tb; ++ti)
+            {
+                tSet.insert(ti);
+            }
+        }
+    }
+
+    return tSet;
+}
+
+// Duration wrapper for printing: built from microseconds, holds seconds.
+struct Sec
+{
+    Sec(const double usec)
+    : seconds(usec/1.0e6)
+    {}
+    
+    double seconds;
+};
+
+// Print a duration as a 10-character wide number followed by " sec".
+// The argument is a const lvalue reference so that both temporaries and named
+// objects can be streamed.
+inline std::ostream & operator<< (std::ostream& s, const Sec &sec)
+{
+    s << std::setw(10) << sec.seconds << " sec";
+
+    return s;
+}
+
+// Floating-point rate wrapper for printing: built from a flop count and a
+// duration in microseconds, holds GFlop/s.
+struct Flops
+{
+    Flops(const double flops, const double fusec)
+    : gFlopsPerSec(flops/fusec/1.0e3)
+    {}
+    
+    double gFlopsPerSec;
+};
+
+// Print a rate as a 10-character wide number followed by " GFlop/s".
+// The argument is a const lvalue reference so that both temporaries and named
+// objects can be streamed.
+inline std::ostream & operator<< (std::ostream& s, const Flops &f)
+{
+    s << std::setw(10) << f.gFlopsPerSec << " GFlop/s";
+
+    return s;
+}
+
+// Bandwidth wrapper for printing: built from a byte count and a duration in
+// microseconds, holds GB/s (1 GB = 1024^3 bytes).
+struct Bytes
+{
+    Bytes(const double bytes, const double busec)
+    : gBytesPerSec(bytes/busec*1.0e6/1024/1024/1024)
+    {}
+    
+    double gBytesPerSec;
+};
+
+// Print a bandwidth as a 10-character wide number followed by " GB/s".
+// The argument is a const lvalue reference so that both temporaries and named
+// objects can be streamed.
+inline std::ostream & operator<< (std::ostream& s, const Bytes &b)
+{
+    s << std::setw(10) << b.gBytesPerSec << " GB/s";
+
+    return s;
+}
+
+// Contractor entry point.
+// Usage: <program> <parameter file>
+// For every trajectory of the configured range: loads the A2A matrices into
+// disk vectors, then for every product computes tr(A_0(t_0+dt)*...*A_n(t)) for
+// all time sequences and translations dt, and saves the correlators.
+// Returns EXIT_FAILURE on a wrong command line, EXIT_SUCCESS otherwise;
+// parameter errors are reported through HADRONS_ERROR.
+int main(int argc, char* argv[])
+{
+    // parse command line
+    std::string   parFilename;
+
+    if (argc != 2)
+    {
+        std::cerr << "usage: " << argv[0] << " <parameter file>";
+        std::cerr << std::endl;
+        
+        return EXIT_FAILURE;
+    }
+    parFilename = argv[1];
+
+    // parse parameter file
+    ContractorPar par;
+    XmlReader     reader(parFilename);
+
+    read(reader, "global",    par.global);
+    read(reader, "a2aMatrix", par.a2aMatrix);
+    read(reader, "product",   par.product);
+    // with a zero step the trajectory loop below would never advance
+    if ((par.global.trajCounter.step == 0) 
+        and (par.global.trajCounter.start < par.global.trajCounter.end))
+    {
+        HADRONS_ERROR(Range, "trajectory step must be strictly positive");
+    }
+
+    // create diskvectors
+    std::map<std::string, EigenDiskVector<ComplexD>> a2aMat;
+
+    for (auto &p: par.a2aMatrix)
+    {
+        std::string dirName = par.global.diskVectorDir + "/" + p.name;
+
+        a2aMat.emplace(p.name, EigenDiskVector<ComplexD>(dirName, par.global.nt, p.cacheSize));
+    }
+
+    // trajectory loop
+    for (unsigned int traj = par.global.trajCounter.start; 
+         traj < par.global.trajCounter.end; traj += par.global.trajCounter.step)
+    {
+        std::cout << ":::::::: Trajectory " << traj << std::endl;
+
+        // load data
+        for (auto &p: par.a2aMatrix)
+        {
+            std::string filename = p.file;
+            double      t, size;
+
+            tokenReplace(filename, "traj", traj);
+            std::cout << "======== Loading '" << filename << "'" << std::endl;
+
+            A2AMatrixIo<HADRONS_A2AM_IO_TYPE> a2aIo(filename, p.dataset, par.global.nt);
+
+            a2aIo.load(a2aMat.at(p.name), &t);
+            std::cout << "Read " << a2aIo.getSize() << " bytes in " << t/1.0e6 
+                    << " sec, " << a2aIo.getSize()/t*1.0e6/1024/1024 << " MB/s" << std::endl;
+        }
+
+        // contract
+        for (auto &p: par.product)
+        {
+            std::vector<std::string>               term = strToVec<std::string>(p.terms);
+            std::vector<std::set<unsigned int>>    times;
+            std::vector<std::vector<unsigned int>> timeSeq;
+            std::set<unsigned int>                 translations;
+            std::vector<A2AMatrixTr<ComplexD>>     lastTerm(par.global.nt);
+            A2AMatrix<ComplexD>                    prod, tmp;
+            TimerArray                             tAr;
+            double                                 fusec, busec, flops, bytes;
+            Contractor::CorrelatorResult           result;             
+
+            tAr.startTimer("Total");
+            std::cout << "======== Contraction tr(";
+            for (unsigned int g = 0; g < term.size(); ++g)
+            {
+                std::cout << term[g] << ((g == term.size() - 1) ? ')' : '*');
+            }
+            std::cout << std::endl;
+            if (term.size() != p.times.size() + 1)
+            {
+                HADRONS_ERROR(Size, "number of terms (" + std::to_string(term.size()) 
+                            + ") different from number of times (" 
+                            + std::to_string(p.times.size()) + ") plus one");
+            }
+            // the first term is indexed with the first time and the last term
+            // is traced against: at least two terms are needed
+            if (term.size() < 2)
+            {
+                HADRONS_ERROR(Size, "a contraction needs at least two terms (got " 
+                            + std::to_string(term.size()) + ")");
+            }
+            for (auto &s: p.times)
+            {
+                times.push_back(parseTimeRange(s, par.global.nt));
+            }
+            for (auto &m: par.a2aMatrix)
+            {
+                if (std::find(result.a2aMatrix.begin(), result.a2aMatrix.end(), m) == result.a2aMatrix.end())
+                {
+                    result.a2aMatrix.push_back(m);
+                    tokenReplace(result.a2aMatrix.back().file, "traj", traj);
+                }
+            }
+            result.contraction = p;
+            result.correlator.resize(par.global.nt, 0.);
+
+            translations = parseTimeRange(p.translations, par.global.nt);
+            makeTimeSeq(timeSeq, times);
+            std::cout << timeSeq.size()*translations.size()*(term.size() - 2) << " A*B, "
+                    << timeSeq.size()*translations.size()*par.global.nt << " tr(A*B)"
+                    << std::endl;
+
+            // the last term is copied once per time slice into A2AMatrixTr
+            // storage, reused by every trace below
+            std::cout << "* Caching transposed last term" << std::endl;
+            for (unsigned int t = 0; t < par.global.nt; ++t)
+            {
+                tAr.startTimer("Disk vector overhead");
+                const A2AMatrix<ComplexD> &ref = a2aMat.at(term.back())[t];
+                tAr.stopTimer("Disk vector overhead");
+
+                tAr.startTimer("Transpose caching");
+                lastTerm[t].resize(ref.rows(), ref.cols());
+                parallel_for (unsigned int j = 0; j < ref.cols(); ++j)
+                for (unsigned int i = 0; i < ref.rows(); ++i)
+                {
+                    lastTerm[t](i, j) = ref(i, j);
+                }
+                tAr.stopTimer("Transpose caching");
+            }
+            bytes = par.global.nt*lastTerm[0].rows()*lastTerm[0].cols()*sizeof(ComplexD);
+            std::cout << Sec(tAr.getDTimer("Transpose caching")) << " " 
+                      << Bytes(bytes, tAr.getDTimer("Transpose caching")) << std::endl;
+            for (unsigned int i = 0; i < timeSeq.size(); ++i)
+            {
+                unsigned int dti = 0;
+                auto         &t = timeSeq[i];
+
+                result.times = t;
+                for (unsigned int tLast = 0; tLast < par.global.nt; ++tLast)
+                {
+                    result.correlator[tLast] = 0.;
+                }
+                for (auto &dt: translations)
+                {
+                    std::cout << "* Step " << i*translations.size() + dti + 1
+                            << "/" << timeSeq.size()*translations.size()
+                            << " -- positions= " << t << ", dt= " << dt << std::endl;
+                    if (term.size() > 2)
+                    {
+                        std::cout << std::setw(8) << "products";
+                    }
+                    flops  = 0.;
+                    bytes  = 0.;
+                    // timer values before this step, to report per-step deltas
+                    fusec  = tAr.getDTimer("A*B algebra");
+                    busec  = tAr.getDTimer("A*B total");
+                    tAr.startTimer("Linear algebra");
+                    tAr.startTimer("Disk vector overhead");
+                    prod = a2aMat.at(term[0])[TIME_MOD(t[0] + dt)];
+                    tAr.stopTimer("Disk vector overhead");
+                    // multiply the intermediate terms from left to right
+                    for (unsigned int j = 1; j < term.size() - 1; ++j)
+                    {
+                        tAr.startTimer("Disk vector overhead");
+                        const A2AMatrix<ComplexD> &ref = a2aMat.at(term[j])[TIME_MOD(t[j] + dt)];
+                        tAr.stopTimer("Disk vector overhead");
+                        
+                        tAr.startTimer("A*B total");
+                        tAr.startTimer("A*B algebra");
+                        A2AContraction::mul(tmp, prod, ref);
+                        tAr.stopTimer("A*B algebra");
+                        flops += A2AContraction::mulFlops(prod, ref);
+                        prod   = tmp;
+                        tAr.stopTimer("A*B total");
+                        bytes += 3.*tmp.rows()*tmp.cols()*sizeof(ComplexD);
+                    }
+                    if (term.size() > 2)
+                    {
+                        std::cout << Sec(tAr.getDTimer("A*B total") - busec) << " "
+                                << Flops(flops, tAr.getDTimer("A*B algebra") - fusec) << " " 
+                                << Bytes(bytes, tAr.getDTimer("A*B total") - busec) << std::endl;
+                    }
+                    std::cout << std::setw(8) << "traces";
+                    flops  = 0.;
+                    bytes  = 0.;
+                    fusec  = tAr.getDTimer("tr(A*B)");
+                    busec  = tAr.getDTimer("tr(A*B)");
+                    // accumulate the trace with the last term at every time,
+                    // undoing the translation in the correlator index
+                    for (unsigned int tLast = 0; tLast < par.global.nt; ++tLast)
+                    {
+                        tAr.startTimer("tr(A*B)");
+                        A2AContraction::accTrMul(result.correlator[TIME_MOD(tLast - dt)], prod, lastTerm[tLast]);
+                        tAr.stopTimer("tr(A*B)");
+                        flops += A2AContraction::accTrMulFlops(prod, lastTerm[tLast]);
+                        bytes += 2.*prod.rows()*prod.cols()*sizeof(ComplexD);
+                    }
+                    tAr.stopTimer("Linear algebra");
+                    std::cout << Sec(tAr.getDTimer("tr(A*B)") - busec) << " "
+                            << Flops(flops, tAr.getDTimer("tr(A*B)") - fusec) << " " 
+                            << Bytes(bytes, tAr.getDTimer("tr(A*B)") - busec) << std::endl;
+                    // without averaging, each translation gets its own file
+                    // and the accumulator restarts from zero
+                    if (!p.translationAverage)
+                    {
+                        saveCorrelator(result, par.global.output, dt, traj);
+                        for (unsigned int tLast = 0; tLast < par.global.nt; ++tLast)
+                        {
+                            result.correlator[tLast] = 0.;
+                        }
+                    }
+                    dti++;
+                }
+                if (p.translationAverage)
+                {
+                    for (unsigned int tLast = 0; tLast < par.global.nt; ++tLast)
+                    {
+                        result.correlator[tLast] /= translations.size();
+                    }
+                    saveCorrelator(result, par.global.output, 0, traj);
+                }
+            }
+            tAr.stopTimer("Total");
+            printTimeProfile(tAr.getTimings(), tAr.getTimer("Total"));
+        }
+    }
+    
+    return EXIT_SUCCESS;
+}
@@ -0,0 +1,12 @@
+#ifndef  Hadrons_Contractor_hpp_
+#define Hadrons_Contractor_hpp_
+
+#include <Hadrons/Global.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_Contractor_hpp_
@@ -0,0 +1,434 @@
+#include <Hadrons/Global.hpp>
+#include <Hadrons/A2AMatrix.hpp>
+#ifdef USE_MKL
+#include "mkl.h"
+#include "mkl_cblas.h"
+#endif
+
+using namespace Grid;
+using namespace Hadrons;
+
+#ifdef GRID_COMMS_MPI3
+#define GET_RANK(rank, nMpi) \
+MPI_Comm_size(MPI_COMM_WORLD, &(nMpi));\
+MPI_Comm_rank(MPI_COMM_WORLD, &(rank))
+#define BARRIER() MPI_Barrier(MPI_COMM_WORLD)
+#define INIT() MPI_Init(NULL, NULL)
+#define FINALIZE() MPI_Finalize()
+#else
+#define GET_RANK(rank, nMpi) (nMpi) = 1; (rank) = 0
+#define BARRIER()
+#define INIT()
+#define FINALIZE()
+#endif
+
+template <typename Function, typename MatLeft, typename MatRight>
+inline void trBenchmark(const std::string name, const MatLeft &left,
+                        const MatRight &right, const ComplexD ref, Function fn)
+{
+    // Times fn(buf, left[i], right[i]) over this rank's slice of the couples;
+    // rank 0 then prints |buf - ref|^2, the wall time, GFlop/s and GB/s.
+    if (left.empty()) return; // nothing to time, and left[0] would be invalid
+    const double       n    = left[0].rows()*left[0].cols();
+    const unsigned int nMat = left.size();
+    int                nMpi, rank;
+    ComplexD           buf;
+    GET_RANK(rank, nMpi);
+    BARRIER(); // synchronise first so the wait is not part of the timing
+    double t = -usecond();
+    for (unsigned int i = rank*nMat/nMpi; i < (rank+1)*nMat/nMpi; ++i)
+    {
+        fn(buf, left[i], right[i]);
+    }
+    BARRIER(); // stop the clock only once every rank is done
+    t += usecond();
+    const double flops = nMat*(6.*n + 2.*(n - 1.)); // 6 flops per complex mul, 2 per add
+    const double bytes = nMat*(2.*n*sizeof(ComplexD));
+    if (rank == 0)
+    {
+        std::cout << std::setw(34) << name << ": diff= "
+                  << std::setw(12) << std::norm(buf-ref)
+                  << std::setw(10) << t/1.0e6 << " sec "
+                  << std::setw(10) << flops/t/1.0e3 << " GFlop/s " 
+                  << std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s "
+                  << std::endl;
+    }
+    ::sleep(1);
+}
+
+template <typename Function, typename MatV, typename Mat>
+inline void mulBenchmark(const std::string name, const MatV &left,
+                         const MatV &right, const Mat &ref, Function fn)
+{
+    // Times fn(buf, left[i], right[i]) over this rank's slice of the couples;
+    // rank 0 then prints |buf - ref|^2, the wall time, GFlop/s and GB/s.
+    if (left.empty()) return; // nothing to time, and left[0] would be invalid
+    const double       nr = left[0].rows(), nc = left[0].cols();
+    const unsigned int nMat = left.size();
+    int                nMpi, rank;
+    Mat                buf(left[0].rows(), left[0].rows()); // pre-sized result
+
+    GET_RANK(rank, nMpi);
+    BARRIER(); // synchronise first so the wait is not part of the timing
+    double t = -usecond();
+    for (unsigned int i = rank*nMat/nMpi; i < (rank+1)*nMat/nMpi; ++i)
+    {
+        fn(buf, left[i], right[i]);
+    }
+    BARRIER(); // stop the clock only once every rank is done
+    t += usecond();
+    const double flops = nMat*(nr*nr*(6.*nc + 2.*(nc - 1.)));
+    const double bytes = nMat*(2*nc*nr*sizeof(ComplexD));
+    if (rank == 0)
+    {
+        std::cout << std::setw(34) << name << ": diff= "
+                  << std::setw(12) << (buf-ref).squaredNorm()
+                  << std::setw(10) << t/1.0e6 << " sec "
+                  << std::setw(10) << flops/t/1.0e3 << " GFlop/s " 
+                  << std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s "
+                  << std::endl;
+    }
+    ::sleep(1);
+}
+
+#ifdef USE_MKL
+template <typename MatLeft, typename MatRight>
+static inline void zdotuRow(ComplexD &res, const unsigned int aRow,
+                            const MatLeft &a, const MatRight &b)
+{
+    // Unconjugated dot product of row aRow of a with column aRow of b,
+    // res = sum_j a(aRow, j)*b(j, aRow), computed by cblas_zdotu_sub directly
+    // on the storage returned by data(), so a and b must be plain matrices.
+    //
+    // The storage order comes from Eigen's IsRowMajor flag. Comparing the whole
+    // Options bit-field with RowMajor/ColMajor matches neither value as soon as
+    // another option bit is set, leaving pointers and strides uninitialised.
+    const bool     aRowMajor = MatLeft::IsRowMajor;
+    const bool     bRowMajor = MatRight::IsRowMajor;
+    const ComplexD *aPt, *bPt;
+    unsigned int   aInc, bInc;
+
+    // row aRow of a: contiguous in row-major storage, one element every
+    // a.rows() entries otherwise
+    aPt  = a.data() + (aRowMajor ? aRow*a.cols() : aRow);
+    aInc = aRowMajor ? 1 : a.rows();
+
+    // column aRow of b: one element every b.cols() entries in row-major
+    // storage, contiguous otherwise
+    bPt  = b.data() + (bRowMajor ? aRow : aRow*b.rows());
+    bInc = bRowMajor ? b.cols() : 1;
+
+    // a.cols() terms: one per entry of the row
+    cblas_zdotu_sub(a.cols(), aPt, aInc, bPt, bInc, &res);
+}
+
+template <typename MatLeft, typename MatRight>
+static inline void zdotuCol(ComplexD &res, const unsigned int aCol,
+                            const MatLeft &a, const MatRight &b)
+{
+    // Unconjugated dot product of column aCol of a with row aCol of b,
+    // res = sum_i a(i, aCol)*b(aCol, i), computed by cblas_zdotu_sub directly
+    // on the storage returned by data(), so a and b must be plain matrices.
+    //
+    // The storage order comes from Eigen's IsRowMajor flag. Comparing the whole
+    // Options bit-field with RowMajor/ColMajor matches neither value as soon as
+    // another option bit is set, leaving pointers and strides uninitialised.
+    const bool     aRowMajor = MatLeft::IsRowMajor;
+    const bool     bRowMajor = MatRight::IsRowMajor;
+    const ComplexD *aPt, *bPt;
+    unsigned int   aInc, bInc;
+
+    // column aCol of a: one element every a.cols() entries in row-major
+    // storage, contiguous otherwise
+    aPt  = a.data() + (aRowMajor ? aCol : aCol*a.rows());
+    aInc = aRowMajor ? a.cols() : 1;
+
+    // row aCol of b: contiguous in row-major storage, one element every
+    // b.rows() entries otherwise
+    bPt  = b.data() + (bRowMajor ? aCol*b.cols() : aCol);
+    bInc = bRowMajor ? 1 : b.rows();
+
+    // a.rows() terms: one per entry of the column
+    cblas_zdotu_sub(a.rows(), aPt, aInc, bPt, bInc, &res);
+}
+#endif
+
+template <typename MatLeft, typename MatRight>
+void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigned int nMat)
+{ // Runs every tr(A*B) kernel below through trBenchmark on nMat random couples; rank 0 prints the layouts.
+    std::vector<MatLeft>  left;
+    std::vector<MatRight> right;
+    MatRight              buf; // NOTE(review): not referenced anywhere else in this function
+    ComplexD              ref;
+    int                   rank, nMpi;
+    // resize(n, value) copies value: all couples hold the same ni x nj and nj x ni matrices
+    left.resize(nMat, MatLeft::Random(ni, nj));
+    right.resize(nMat, MatRight::Random(nj, ni));
+    GET_RANK(rank, nMpi);
+    if (rank == 0)
+    {
+        std::cout << "==== tr(A*B) benchmarks" << std::endl;
+        std::cout << "A matrices use ";
+        if (MatLeft::Options == Eigen::RowMajor)
+        {
+            std::cout << "row-major ordering" << std::endl;
+        }
+        else if (MatLeft::Options == Eigen::ColMajor)
+        {
+            std::cout << "col-major ordering" << std::endl;
+        }
+        std::cout << "B matrices use ";
+        if (MatRight::Options == Eigen::RowMajor)
+        {
+            std::cout << "row-major ordering" << std::endl;
+        }
+        else if (MatRight::Options == Eigen::ColMajor)
+        {
+            std::cout << "col-major ordering" << std::endl;
+        }
+        std::cout << std::endl;
+    }
+    BARRIER();
+    ref = (left.back()*right.back()).trace(); // reference value every kernel is compared against
+    trBenchmark("Hadrons A2AContraction::accTrMul", left, right, ref,
+    [](ComplexD &res, const MatLeft &a, const MatRight &b)
+    { 
+        res = 0.;
+        A2AContraction::accTrMul(res, a, b);
+    });
+    trBenchmark("Naive loop rows first", left, right, ref,
+    [](ComplexD &res, const MatLeft &a, const MatRight &b)
+    { 
+        auto nr = a.rows(), nc = a.cols();
+        // each outer iteration contracts row i of a with column i of b
+        res = 0.;
+        parallel_for (unsigned int i = 0; i < nr; ++i)
+        {
+            ComplexD tmp = 0.;
+
+            for (unsigned int j = 0; j < nc; ++j)
+            {
+                tmp += a(i, j)*b(j, i);
+            }
+            parallel_critical // presumably an OpenMP critical section guarding the shared sum (TODO confirm)
+            {
+                res += tmp;
+            }
+        }
+    });
+    trBenchmark("Naive loop cols first", left, right, ref,
+    [](ComplexD &res, const MatLeft &a, const MatRight &b)
+    {
+        auto nr = a.rows(), nc = a.cols();
+        // each outer iteration contracts column j of a with row j of b
+        res = 0.;
+        parallel_for (unsigned int j = 0; j < nc; ++j)
+        {
+            ComplexD tmp = 0.;
+
+            for (unsigned int i = 0; i < nr; ++i)
+            {
+                tmp += a(i, j)*b(j, i);
+            }        
+            parallel_critical
+            {
+                res += tmp;
+            }
+        }
+    });
+    trBenchmark("Eigen tr(A*B)", left, right, ref,
+    [](ComplexD &res, const MatLeft &a, const MatRight &b)
+    { 
+        res = (a*b).trace();
+    });
+    trBenchmark("Eigen row-wise dot", left, right, ref,
+    [](ComplexD &res, const MatLeft &a, const MatRight &b)
+    {
+        res = 0.;
+        parallel_for (unsigned int r = 0; r < a.rows(); ++r)
+        {
+            ComplexD tmp;
+            // Eigen's dot() conjugates its left operand for complex scalars; conjugate() cancels that
+            tmp = a.row(r).conjugate().dot(b.col(r));
+            parallel_critical
+            {
+                res += tmp;
+            }
+        }
+    });
+    trBenchmark("Eigen col-wise dot", left, right, ref,
+    [](ComplexD &res, const MatLeft &a, const MatRight &b)
+    {
+        res = 0.;
+        parallel_for (unsigned int c = 0; c < a.cols(); ++c)
+        {
+            ComplexD tmp;
+            // Eigen's dot() conjugates its left operand for complex scalars; conjugate() cancels that
+            tmp = a.col(c).conjugate().dot(b.row(c));
+            parallel_critical
+            {
+                res += tmp;
+            }
+        }
+    });
+    trBenchmark("Eigen Hadamard", left, right, ref,
+    [](ComplexD &res, const MatLeft &a, const MatRight &b)
+    { 
+        res = a.cwiseProduct(b.transpose()).sum(); // sum over i,j of a(i,j)*b(j,i)
+    });
+#ifdef USE_MKL
+    trBenchmark("MKL row-wise zdotu", left, right, ref,
+    [](ComplexD &res, const MatLeft &a, const MatRight &b)
+    {
+        res = 0.;
+        parallel_for (unsigned int r = 0; r < a.rows(); ++r)
+        {
+            ComplexD tmp;
+
+            zdotuRow(tmp, r, a, b);
+            parallel_critical
+            {
+                res += tmp;
+            }
+        }
+    });
+    trBenchmark("MKL col-wise zdotu", left, right, ref,
+    [](ComplexD &res, const MatLeft &a, const MatRight &b)
+    {
+        res = 0.;
+        parallel_for (unsigned int c = 0; c < a.cols(); ++c)
+        {
+            ComplexD tmp;
+
+            zdotuCol(tmp, c, a, b);
+            parallel_critical
+            {
+                res += tmp;
+            }
+        }
+    });
+#endif
+    BARRIER();
+    if (rank == 0)
+    {
+        std::cout << std::endl;
+    }
+}
+
+template <typename Mat>
+void fullMulBenchmark(const unsigned int ni, const unsigned int nj, const unsigned int nMat)
+{ // Runs every A*B kernel below through mulBenchmark on nMat random couples; rank 0 prints the layout.
+    std::vector<Mat> left, right;
+    Mat              ref;
+    int              rank, nMpi;
+    // resize(n, value) copies value: all couples hold the same ni x nj and nj x ni matrices
+    left.resize(nMat, Mat::Random(ni, nj));
+    right.resize(nMat, Mat::Random(nj, ni));
+    GET_RANK(rank, nMpi);
+    if (rank == 0)
+    {
+        std::cout << "==== A*B benchmarks" << std::endl;
+        std::cout << "all matrices use ";
+        if (Mat::Options == Eigen::RowMajor)
+        {
+            std::cout << "row-major ordering" << std::endl;
+        }
+        else if (Mat::Options == Eigen::ColMajor)
+        {
+            std::cout << "col-major ordering" << std::endl;
+        }
+        std::cout << std::endl;
+    }
+    BARRIER();
+    ref = left.back()*right.back(); // reference product every kernel is compared against
+    mulBenchmark("Hadrons A2AContraction::mul", left, right, ref,
+    [](Mat &res, const Mat &a, const Mat &b)
+    { 
+        A2AContraction::mul(res, a, b);
+    });
+    mulBenchmark("Eigen A*B", left, right, ref,
+    [](Mat &res, const Mat &a, const Mat &b)
+    { 
+        res = a*b;
+    });
+#ifdef USE_MKL
+    mulBenchmark("MKL A*B", left, right, ref,
+    [](Mat &res, const Mat &a, const Mat &b)
+    {
+        const ComplexD one(1., 0.), zero(0., 0.); // zgemm scalars: res = one*a*b + zero*res
+        if (Mat::Options == Eigen::RowMajor)
+        {
+            cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
+                        a.cols(), &one, a.data(), a.cols(), b.data(), b.cols(), &zero,
+                        res.data(), res.cols()); // row-major: leading dimensions are column counts
+        }
+        else if (Mat::Options == Eigen::ColMajor)
+        {
+            cblas_zgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
+                        a.cols(), &one, a.data(), a.rows(), b.data(), b.rows(), &zero,
+                        res.data(), res.rows()); // col-major: leading dimensions are row counts
+        }
+    });
+#endif
+    BARRIER();
+    if (rank == 0)
+    {
+        std::cout << std::endl;
+    }
+}
+
+int main(int argc, char *argv[])
+{
+    // parse command line: exactly three strictly positive integers are needed
+    Eigen::Index ni = 0, nj = 0, nMat = 0;
+    int          nMpi, rank;
+    bool         argOk = (argc == 4);
+    try
+    {
+        ni   = argOk ? std::stoi(argv[1]) : 0;
+        nj   = argOk ? std::stoi(argv[2]) : 0;
+        nMat = argOk ? std::stoi(argv[3]) : 0;
+    }
+    catch (...) { argOk = false; } // std::stoi throws on malformed or out-of-range input
+    if (!argOk || (ni <= 0) || (nj <= 0) || (nMat <= 0))
+    {
+        std::cerr << "usage: " << argv[0] << " <Ni> <Nj> <#matrices>" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    INIT();
+    GET_RANK(rank, nMpi);
+    if (rank == 0)
+    {
+        std::cout << "\n*** ALL-TO-ALL MATRIX CONTRACTION BENCHMARK ***\n" << std::endl;
+        std::cout << nMat << " couples of " << ni << "x" << nj << " matrices\n" << std::endl;
+        std::cout << nMpi << " MPI processes" << std::endl;
+#ifdef GRID_OMP
+        #pragma omp parallel
+        {
+            #pragma omp single
+            std::cout << omp_get_num_threads() << " threads\n" << std::endl; 
+        }
+#else
+        std::cout << "Single-threaded\n" << std::endl; 
+#endif
+#ifdef EIGEN_USE_MKL_ALL
+        std::cout << "Eigen uses the MKL" << std::endl;
+#endif
+        std::cout << "Eigen uses " << Eigen::nbThreads() << " threads" << std::endl;
+#ifdef USE_MKL
+        std::cout << "MKL   uses " << mkl_get_max_threads() << " threads" << std::endl;
+#endif
+        std::cout << std::endl;
+    }
+
+    fullTrBenchmark<A2AMatrix<ComplexD>, A2AMatrix<ComplexD>>(ni, nj, nMat);
+    fullTrBenchmark<A2AMatrix<ComplexD>, A2AMatrixTr<ComplexD>>(ni, nj, nMat);
+    fullTrBenchmark<A2AMatrixTr<ComplexD>, A2AMatrix<ComplexD>>(ni, nj, nMat);
+    fullTrBenchmark<A2AMatrixTr<ComplexD>, A2AMatrixTr<ComplexD>>(ni, nj, nMat);
+    fullMulBenchmark<A2AMatrix<ComplexD>>(ni, nj, nMat);
+    fullMulBenchmark<A2AMatrixTr<ComplexD>>(ni, nj, nMat);
+    FINALIZE();
+
+    return EXIT_SUCCESS;
+}
@@ -1,4 +1,4 @@
-bin_PROGRAMS = HadronsXmlRun HadronsFermionEP64To32
+bin_PROGRAMS = HadronsXmlRun HadronsFermionEP64To32 HadronsContractor HadronsContractorBenchmark

 HadronsXmlRun_SOURCES = HadronsXmlRun.cc
 HadronsXmlRun_LDADD   = ../libHadrons.a ../../Grid/libGrid.a
@@ -6,3 +6,9 @@ HadronsXmlRun_LDADD   = ../libHadrons.a ../../Grid/libGrid.a
 HadronsFermionEP64To32_SOURCES  = EigenPackCast.cc
 HadronsFermionEP64To32_CXXFLAGS = $(AM_CXXFLAGS) -DFIN=WilsonImplD::FermionField -DFOUT=WilsonImplF::FermionField
 HadronsFermionEP64To32_LDADD    = ../libHadrons.a ../../Grid/libGrid.a
+
+HadronsContractor_SOURCES = Contractor.cc
+HadronsContractor_LDADD   = ../libHadrons.a ../../Grid/libGrid.a
+
+HadronsContractorBenchmark_SOURCES = ContractorBenchmark.cc
+HadronsContractorBenchmark_LDADD   = ../libHadrons.a ../../Grid/libGrid.a
@@ -20,7 +20,7 @@ modules_cc =\
  Modules/MSink/Point.cc \
  Modules/MSink/Smear.cc \
  Modules/MSolver/A2AVectors.cc \
-  Modules/MSolver/A2AAslashVector.cc \
+  Modules/MSolver/A2AAslashVectors.cc \
  Modules/MSolver/RBPrecCG.cc \
  Modules/MSolver/MixedPrecisionRBPrecCG.cc \
  Modules/MSolver/LocalCoherenceLanczos.cc \
@@ -97,7 +97,7 @@ modules_hpp =\
  Modules/MSolver/Guesser.hpp \
  Modules/MSolver/RBPrecCG.hpp \
  Modules/MSolver/A2AVectors.hpp \
-  Modules/MSolver/A2AAslashVector.hpp \
+  Modules/MSolver/A2AAslashVectors.hpp \
  Modules/MGauge/UnitEm.hpp \
  Modules/MGauge/StoutSmearing.hpp \
  Modules/MGauge/Unit.hpp \
@@ -123,10 +123,13 @@ case ${ac_SFW_FP16} in
      AC_MSG_ERROR(["SFW FP16 option not supported ${ac_SFW_FP16}"]);;
 esac

-############### MKL
+############### Intel libraries
 AC_ARG_ENABLE([mkl],
    [AC_HELP_STRING([--enable-mkl=yes|no|prefix], [enable Intel MKL for LAPACK & FFTW])],
    [ac_MKL=${enable_mkl}], [ac_MKL=no])
+AC_ARG_ENABLE([ipp],
+    [AC_HELP_STRING([--enable-ipp=yes|no|prefix], [enable Intel IPP for fast CRC32C])],
+    [ac_IPP=${enable_ipp}], [ac_IPP=no])

 case ${ac_MKL} in
    no)
@@ -139,6 +142,17 @@ case ${ac_MKL} in
        AC_DEFINE([USE_MKL], [1], [Define to 1 if you use the Intel MKL]);;
 esac

+case ${ac_IPP} in
+    no)
+        ;;
+    yes)
+        AC_DEFINE([USE_IPP], [1], [Define to 1 if you use the Intel IPP]);;
+    *)
+        AM_CXXFLAGS="-I$ac_IPP/include $AM_CXXFLAGS"
+        AM_LDFLAGS="-L$ac_IPP/lib $AM_LDFLAGS"
+        AC_DEFINE([USE_IPP], [1], [Define to 1 if you use the Intel IPP]);;
+esac
+
 ############### HDF5
 AC_ARG_WITH([hdf5],
    [AS_HELP_STRING([--with-hdf5=prefix],
@@ -170,7 +184,13 @@ AC_CHECK_FUNCS([gettimeofday])

 if test "${ac_MKL}x" != "nox"; then
    AC_SEARCH_LIBS([mkl_set_interface_layer], [mkl_rt], [],
-                   [AC_MSG_ERROR("MKL enabled but library not found")])
+                   [AC_MSG_ERROR("Intel MKL enabled but library not found")])
+fi
+
+if test "${ac_IPP}x" != "nox"; then
+     AC_SEARCH_LIBS([ippsCRC32C_8u], [ippdc], 
+                   [LIBS="${LIBS} -lippdc -lippvm -lipps -lippcore"],
+                   [AC_MSG_ERROR("Intel IPP enabled but library not found")])
 fi

 AC_SEARCH_LIBS([__gmpf_init], [gmp],