Merge branch 'develop' into feature/gpu-port

2026-07-18 16:13:28 +01:00 · 2018-12-13 05:11:34 +00:00
parent adbdc4e65b c509bd3fe2
commit b57a4d32aa
647 changed files with 49155 additions and 11160 deletions
@@ -0,0 +1,454 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Utilities/Contractor.cc
+
+Copyright (C) 2015-2018
+
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Global.hpp>
+#include <Hadrons/A2AMatrix.hpp>
+#include <Hadrons/DiskVector.hpp>
+#include <Hadrons/TimerArray.hpp>
+
+using namespace Grid;
+using namespace QCD;
+using namespace Hadrons;
+
+// Wrap a shifted time index into [0, par.global.nt). nt is added before taking the
+// modulo; the macro expects a variable named `par` to be in scope where it expands.
+#define TIME_MOD(t) (((t) + par.global.nt) % par.global.nt)
+
+// Parameter and result types of the contractor. Every class declares its members
+// through GRID_SERIALIZABLE_CLASS_MEMBERS; main() fills them with read(reader, ...)
+// and saveCorrelator() writes CorrelatorResult with write(writer, ...).
+namespace Contractor
+{
+    // Trajectory range; main() iterates traj = start; traj < end; traj += step.
+    class TrajRange: Serializable
+    {
+    public:
+        GRID_SERIALIZABLE_CLASS_MEMBERS(TrajRange,
+                                        unsigned int, start,
+                                        unsigned int, end,
+                                        unsigned int, step);
+    };
+    
+    // Global settings: trajectory range, number of time slices (nt), base directory
+    // of the disk vectors and output directory passed to saveCorrelator().
+    class GlobalPar: Serializable
+    {
+    public:
+        GRID_SERIALIZABLE_CLASS_MEMBERS(GlobalPar,
+                                        TrajRange, trajCounter,
+                                        unsigned int, nt,
+                                        std::string, diskVectorDir,
+                                        std::string, output);
+    };
+
+    // One input matrix set: file name (its "traj" token is replaced per trajectory),
+    // dataset name handed to A2AMatrixIo, disk vector cache size, and the name used
+    // to refer to the matrix in contraction terms.
+    class A2AMatrixPar: Serializable
+    {
+    public:
+        GRID_SERIALIZABLE_CLASS_MEMBERS(A2AMatrixPar,
+                                        std::string, file,
+                                        std::string, dataset,
+                                        unsigned int, cacheSize,
+                                        std::string, name);
+    };
+
+    // One contraction: list of matrix names (terms), one time-range expression per
+    // term except the last (times), the time translations to apply, and whether the
+    // result is averaged over those translations.
+    class ProductPar: Serializable
+    {
+    public:
+        GRID_SERIALIZABLE_CLASS_MEMBERS(ProductPar,
+                                        std::string, terms,
+                                        std::vector<std::string>, times,
+                                        std::string, translations,
+                                        bool, translationAverage);
+    };
+
+    // Record written to disk for each correlator: the matrices available, the
+    // contraction description, the time positions used and the correlator values.
+    class CorrelatorResult: Serializable
+    {
+    public:
+        GRID_SERIALIZABLE_CLASS_MEMBERS(CorrelatorResult,
+                                        std::vector<Contractor::A2AMatrixPar>,  a2aMatrix,
+                                        ProductPar, contraction,
+                                        std::vector<unsigned int>, times,
+                                        std::vector<ComplexD>, correlator);
+    };
+}
+
+// Full content of the parameter file: main() reads the "global", "a2aMatrix" and
+// "product" entries into these three members.
+struct ContractorPar
+{
+    Contractor::GlobalPar                  global;
+    std::vector<Contractor::A2AMatrixPar>  a2aMatrix;
+    std::vector<Contractor::ProductPar>    product;
+};
+
+// Recursive worker: fills positions [times.size() - depth, times.size()) of
+// `current` with every combination of the corresponding time sets and appends
+// each completed combination to `timeSeq`.
+void makeTimeSeq(std::vector<std::vector<unsigned int>> &timeSeq,
+                 const std::vector<std::set<unsigned int>> &times,
+                 std::vector<unsigned int> &current,
+                 const unsigned int depth)
+{
+    // nothing left to choose: the current combination is complete
+    if (depth == 0)
+    {
+        timeSeq.push_back(current);
+
+        return;
+    }
+
+    const auto slot = times.size() - depth;
+
+    for (const unsigned int value: times[slot])
+    {
+        current[slot] = value;
+        makeTimeSeq(timeSeq, times, current, depth - 1);
+    }
+}
+
+// Appends to `timeSeq` the Cartesian product of the sets in `times`, the first
+// set varying slowest. An empty `times` appends a single empty sequence.
+void makeTimeSeq(std::vector<std::vector<unsigned int>> &timeSeq,
+                 const std::vector<std::set<unsigned int>> &times)
+{
+    const auto                nSlot = times.size();
+    std::vector<unsigned int> scratch(nSlot);
+
+    makeTimeSeq(timeSeq, times, scratch, nSlot);
+}
+
+// Write one correlator result to disk.
+//
+// The file stem is "<term0>_<t0>_<term1>_<t1>_..._<lastTerm>", built from the terms
+// of result.contraction and from result.times, with "_dt_<dt>" appended when the
+// contraction is not translation-averaged. The file goes to
+// dir + "/" + RESULT_FILE_NAME(fileStem, traj).
+//
+// Raises a Size error when the term list is empty or when result.times holds fewer
+// entries than there are terms before the last one (the original code indexed out
+// of bounds in both cases).
+void saveCorrelator(const Contractor::CorrelatorResult &result, const std::string dir, 
+                    const unsigned int dt, const unsigned int traj)
+{
+    std::string              fileStem = "", filename;
+    std::vector<std::string> terms = strToVec<std::string>(result.contraction.terms);
+
+    if (terms.empty())
+    {
+        HADRONS_ERROR(Size, "cannot save a correlator without any term");
+    }
+    if (result.times.size() + 1 < terms.size())
+    {
+        HADRONS_ERROR(Size, "correlator has " + std::to_string(result.times.size())
+                      + " time(s) for " + std::to_string(terms.size()) + " term(s)");
+    }
+    // "i + 1 < size" rather than "i < size - 1": no unsigned underflow
+    for (std::size_t i = 0; i + 1 < terms.size(); i++)
+    {
+        fileStem += terms[i] + "_" + std::to_string(result.times[i]) + "_";
+    }
+    fileStem += terms.back();
+    if (!result.contraction.translationAverage)
+    {
+        fileStem += "_dt_" + std::to_string(dt);
+    }
+    filename = dir + "/" + RESULT_FILE_NAME(fileStem, traj);
+    std::cout << "Saving correlator to '" << filename << "'" << std::endl;
+    // NOTE(review): makeFileDir receives dir, not filename -- confirm against its
+    // definition that this creates the directory the file is written to.
+    makeFileDir(dir);
+    ResultWriter writer(filename);
+    write(writer, fileStem, result);
+}
+
+// Parse a list of time specifications into a set of time slices.
+//
+// `str` is split with strToVec; each token must be either a single non-negative
+// integer "t" or an inclusive range "a..b". Every time must be smaller than nt.
+//
+// Raises a Range error when a token matches neither form, when a time is >= nt,
+// or when a range is inverted (a > b). The original code silently ignored
+// malformed tokens and inverted ranges. std::stoul may still throw
+// std::out_of_range for numbers too large for unsigned long.
+std::set<unsigned int> parseTimeRange(const std::string str, const unsigned int nt)
+{
+    // compiled once: the pattern never changes between calls
+    static const std::regex  rex("([0-9]+)|(([0-9]+)\\.\\.([0-9]+))");
+    std::smatch              sm;
+    std::vector<std::string> rstr = strToVec<std::string>(str);
+    std::set<unsigned int>   tSet;
+
+    for (auto &s: rstr)
+    {
+        if (!std::regex_match(s, sm, rex))
+        {
+            HADRONS_ERROR(Range, "invalid time specification '" + s 
+                          + "' (from expression '" + str + "')");
+        }
+        if (sm[1].matched)
+        {
+            // single time "t"
+            const unsigned long t = std::stoul(sm[1].str());
+
+            if (t >= nt)
+            {
+                HADRONS_ERROR(Range, "time out of range (from expression '" + str + "')");
+            }
+            tSet.insert(static_cast<unsigned int>(t));
+        }
+        else if (sm[2].matched)
+        {
+            // inclusive range "a..b"
+            const unsigned long ta = std::stoul(sm[3].str());
+            const unsigned long tb = std::stoul(sm[4].str());
+
+            if ((ta >= nt) or (tb >= nt))
+            {
+                HADRONS_ERROR(Range, "time out of range (from expression '" + str + "')");
+            }
+            if (ta > tb)
+            {
+                HADRONS_ERROR(Range, "inverted time range '" + s 
+                              + "' (from expression '" + str + "')");
+            }
+            for (unsigned long ti = ta; ti <= tb; ++ti)
+            {
+                tSet.insert(static_cast<unsigned int>(ti));
+            }
+        }
+    }
+
+    return tSet;
+}
+
+// Duration given in microseconds, stored in seconds for printing.
+struct Sec
+{
+    Sec(const double usec)
+    : seconds(usec/1.0e6)
+    {}
+    
+    double seconds;
+};
+
+// Prints the duration right-aligned in a 10-character field followed by " sec".
+// Takes a const lvalue reference so that both temporaries and named objects can
+// be streamed (the former `const Sec &&` only accepted temporaries).
+inline std::ostream & operator<< (std::ostream& s, const Sec &sec)
+{
+    s << std::setw(10) << sec.seconds << " sec";
+
+    return s;
+}
+
+// Floating-point rate: `flops` operations performed in `fusec` microseconds,
+// stored in GFlop/s (flops per microsecond divided by 1000).
+struct Flops
+{
+    Flops(const double flops, const double fusec)
+    : gFlopsPerSec(flops/fusec/1.0e3)
+    {}
+    
+    double gFlopsPerSec;
+};
+
+// Prints the rate right-aligned in a 10-character field followed by " GFlop/s".
+// Takes a const lvalue reference so that both temporaries and named objects can
+// be streamed (the former `const Flops &&` only accepted temporaries).
+inline std::ostream & operator<< (std::ostream& s, const Flops &f)
+{
+    s << std::setw(10) << f.gFlopsPerSec << " GFlop/s";
+
+    return s;
+}
+
+// Data rate: `bytes` moved in `busec` microseconds, converted to bytes per second
+// and divided by 1024^3 (so strictly GiB/s, although the label printed is "GB/s").
+struct Bytes
+{
+    Bytes(const double bytes, const double busec)
+    : gBytesPerSec(bytes/busec*1.0e6/1024/1024/1024)
+    {}
+    
+    double gBytesPerSec;
+};
+
+// Prints the rate right-aligned in a 10-character field followed by " GB/s".
+// Takes a const lvalue reference so that both temporaries and named objects can
+// be streamed (the former `const Bytes &&` only accepted temporaries).
+inline std::ostream & operator<< (std::ostream& s, const Bytes &b)
+{
+    s << std::setw(10) << b.gBytesPerSec << " GB/s";
+
+    return s;
+}
+
+// Contractor entry point.
+//
+// Usage: <program> <parameter file>
+//
+// Reads the "global", "a2aMatrix" and "product" sections of the XML parameter file,
+// creates one disk vector per input matrix set, then for every trajectory:
+//   1. loads every matrix set from its file into its disk vector;
+//   2. for every product, every combination of time positions and every time
+//      translation dt, multiplies the terms but the last one and accumulates the
+//      trace with the last term at each time slice;
+//   3. saves the correlator, per translation or averaged over translations.
+// Timings and throughput figures are printed along the way.
+//
+// Returns EXIT_FAILURE on a wrong command line, EXIT_SUCCESS otherwise; invalid
+// parameters are reported through HADRONS_ERROR.
+int main(int argc, char* argv[])
+{
+    // parse command line
+    std::string   parFilename;
+
+    if (argc != 2)
+    {
+        std::cerr << "usage: " << argv[0] << " <parameter file>";
+        std::cerr << std::endl;
+        
+        return EXIT_FAILURE;
+    }
+    parFilename = argv[1];
+
+    // parse parameter file
+    ContractorPar par;
+    XmlReader     reader(parFilename);
+
+    read(reader, "global",    par.global);
+    read(reader, "a2aMatrix", par.a2aMatrix);
+    read(reader, "product",   par.product);
+
+    // sanity checks: nt == 0 would make TIME_MOD divide by zero and lastTerm[0]
+    // invalid, step == 0 would make the trajectory loop endless
+    if (par.global.nt == 0)
+    {
+        HADRONS_ERROR(Size, "the number of time slices 'nt' must be strictly positive");
+    }
+    if (par.global.trajCounter.step == 0)
+    {
+        HADRONS_ERROR(Range, "the trajectory step must be strictly positive");
+    }
+
+    // create diskvectors
+    std::map<std::string, EigenDiskVector<ComplexD>> a2aMat;
+
+    for (auto &p: par.a2aMatrix)
+    {
+        std::string dirName = par.global.diskVectorDir + "/" + p.name;
+
+        a2aMat.emplace(p.name, EigenDiskVector<ComplexD>(dirName, par.global.nt, p.cacheSize));
+    }
+
+    // trajectory loop
+    for (unsigned int traj = par.global.trajCounter.start; 
+         traj < par.global.trajCounter.end; traj += par.global.trajCounter.step)
+    {
+        std::cout << ":::::::: Trajectory " << traj << std::endl;
+
+        // load data
+        for (auto &p: par.a2aMatrix)
+        {
+            std::string filename = p.file;
+            double      t;
+
+            tokenReplace(filename, "traj", traj);
+            std::cout << "======== Loading '" << filename << "'" << std::endl;
+
+            A2AMatrixIo<HADRONS_A2AM_IO_TYPE> a2aIo(filename, p.dataset, par.global.nt);
+
+            // t receives the load time, used below as microseconds
+            a2aIo.load(a2aMat.at(p.name), &t);
+            std::cout << "Read " << a2aIo.getSize() << " bytes in " << t/1.0e6 
+                    << " sec, " << a2aIo.getSize()/t*1.0e6/1024/1024 << " MB/s" << std::endl;
+        }
+
+        // contract
+        for (auto &p: par.product)
+        {
+            std::vector<std::string>               term = strToVec<std::string>(p.terms);
+            std::vector<std::set<unsigned int>>    times;
+            std::vector<std::vector<unsigned int>> timeSeq;
+            std::set<unsigned int>                 translations;
+            std::vector<A2AMatrixTr<ComplexD>>     lastTerm(par.global.nt);
+            A2AMatrix<ComplexD>                    prod, tmp;
+            TimerArray                             tAr;
+            double                                 fusec, busec, flops, bytes;
+            Contractor::CorrelatorResult           result;             
+
+            tAr.startTimer("Total");
+            std::cout << "======== Contraction tr(";
+            for (unsigned int g = 0; g < term.size(); ++g)
+            {
+                std::cout << term[g] << ((g == term.size() - 1) ? ')' : '*');
+            }
+            std::cout << std::endl;
+            if (term.size() != p.times.size() + 1)
+            {
+                HADRONS_ERROR(Size, "number of terms (" + std::to_string(term.size()) 
+                            + ") different from number of times (" 
+                            + std::to_string(p.times.size() + 1) + ")");
+            }
+            // with a single term there is no time position to read (t[0] below)
+            // and term.size() - 2 would underflow
+            if (term.size() < 2)
+            {
+                HADRONS_ERROR(Size, "a contraction needs at least 2 terms (got " 
+                            + std::to_string(term.size()) + ")");
+            }
+            for (auto &s: p.times)
+            {
+                times.push_back(parseTimeRange(s, par.global.nt));
+            }
+            // record the input matrices in the result, with the file name resolved
+            // for the current trajectory
+            for (auto &m: par.a2aMatrix)
+            {
+                if (std::find(result.a2aMatrix.begin(), result.a2aMatrix.end(), m) == result.a2aMatrix.end())
+                {
+                    result.a2aMatrix.push_back(m);
+                    tokenReplace(result.a2aMatrix.back().file, "traj", traj);
+                }
+            }
+            result.contraction = p;
+            result.correlator.resize(par.global.nt, 0.);
+
+            translations = parseTimeRange(p.translations, par.global.nt);
+            makeTimeSeq(timeSeq, times);
+            std::cout << timeSeq.size()*translations.size()*(term.size() - 2) << " A*B, "
+                    << timeSeq.size()*translations.size()*par.global.nt << " tr(A*B)"
+                    << std::endl;
+
+            // copy the last term, time slice by time slice, into A2AMatrixTr objects
+            // (type declared outside this file) reused by all traces below
+            std::cout << "* Caching transposed last term" << std::endl;
+            for (unsigned int t = 0; t < par.global.nt; ++t)
+            {
+                tAr.startTimer("Disk vector overhead");
+                const A2AMatrix<ComplexD> &ref = a2aMat.at(term.back())[t];
+                tAr.stopTimer("Disk vector overhead");
+
+                tAr.startTimer("Transpose caching");
+                lastTerm[t].resize(ref.rows(), ref.cols());
+                parallel_for (unsigned int j = 0; j < ref.cols(); ++j)
+                for (unsigned int i = 0; i < ref.rows(); ++i)
+                {
+                    lastTerm[t](i, j) = ref(i, j);
+                }
+                tAr.stopTimer("Transpose caching");
+            }
+            bytes = par.global.nt*lastTerm[0].rows()*lastTerm[0].cols()*sizeof(ComplexD);
+            std::cout << Sec(tAr.getDTimer("Transpose caching")) << " " 
+                      << Bytes(bytes, tAr.getDTimer("Transpose caching")) << std::endl;
+            // loop over all combinations of time positions
+            for (unsigned int i = 0; i < timeSeq.size(); ++i)
+            {
+                unsigned int dti = 0;
+                auto         &t = timeSeq[i];
+
+                result.times = t;
+                for (unsigned int tLast = 0; tLast < par.global.nt; ++tLast)
+                {
+                    result.correlator[tLast] = 0.;
+                }
+                // loop over time translations
+                for (auto &dt: translations)
+                {
+                    std::cout << "* Step " << i*translations.size() + dti + 1
+                            << "/" << timeSeq.size()*translations.size()
+                            << " -- positions= " << t << ", dt= " << dt << std::endl;
+                    if (term.size() > 2)
+                    {
+                        std::cout << std::setw(8) << "products";
+                    }
+                    flops  = 0.;
+                    bytes  = 0.;
+                    // timer values before this step, subtracted when reporting
+                    fusec  = tAr.getDTimer("A*B algebra");
+                    busec  = tAr.getDTimer("A*B total");
+                    tAr.startTimer("Linear algebra");
+                    tAr.startTimer("Disk vector overhead");
+                    prod = a2aMat.at(term[0])[TIME_MOD(t[0] + dt)];
+                    tAr.stopTimer("Disk vector overhead");
+                    // multiply the intermediate terms into prod, left to right
+                    for (unsigned int j = 1; j < term.size() - 1; ++j)
+                    {
+                        tAr.startTimer("Disk vector overhead");
+                        const A2AMatrix<ComplexD> &ref = a2aMat.at(term[j])[TIME_MOD(t[j] + dt)];
+                        tAr.stopTimer("Disk vector overhead");
+                        
+                        tAr.startTimer("A*B total");
+                        tAr.startTimer("A*B algebra");
+                        A2AContraction::mul(tmp, prod, ref);
+                        tAr.stopTimer("A*B algebra");
+                        flops += A2AContraction::mulFlops(prod, ref);
+                        prod   = tmp;
+                        tAr.stopTimer("A*B total");
+                        bytes += 3.*tmp.rows()*tmp.cols()*sizeof(ComplexD);
+                    }
+                    if (term.size() > 2)
+                    {
+                        std::cout << Sec(tAr.getDTimer("A*B total") - busec) << " "
+                                << Flops(flops, tAr.getDTimer("A*B algebra") - fusec) << " " 
+                                << Bytes(bytes, tAr.getDTimer("A*B total") - busec) << std::endl;
+                    }
+                    std::cout << std::setw(8) << "traces";
+                    flops  = 0.;
+                    bytes  = 0.;
+                    fusec  = tAr.getDTimer("tr(A*B)");
+                    busec  = tAr.getDTimer("tr(A*B)");
+                    // trace with the last term at every time slice; the result is
+                    // stored at the time shifted back by dt
+                    for (unsigned int tLast = 0; tLast < par.global.nt; ++tLast)
+                    {
+                        tAr.startTimer("tr(A*B)");
+                        A2AContraction::accTrMul(result.correlator[TIME_MOD(tLast - dt)], prod, lastTerm[tLast]);
+                        tAr.stopTimer("tr(A*B)");
+                        flops += A2AContraction::accTrMulFlops(prod, lastTerm[tLast]);
+                        bytes += 2.*prod.rows()*prod.cols()*sizeof(ComplexD);
+                    }
+                    tAr.stopTimer("Linear algebra");
+                    std::cout << Sec(tAr.getDTimer("tr(A*B)") - busec) << " "
+                            << Flops(flops, tAr.getDTimer("tr(A*B)") - fusec) << " " 
+                            << Bytes(bytes, tAr.getDTimer("tr(A*B)") - busec) << std::endl;
+                    // without averaging, each translation is saved on its own and
+                    // the accumulator is reset for the next one
+                    if (!p.translationAverage)
+                    {
+                        saveCorrelator(result, par.global.output, dt, traj);
+                        for (unsigned int tLast = 0; tLast < par.global.nt; ++tLast)
+                        {
+                            result.correlator[tLast] = 0.;
+                        }
+                    }
+                    dti++;
+                }
+                // with averaging, normalise the sum over translations and save once
+                if (p.translationAverage)
+                {
+                    for (unsigned int tLast = 0; tLast < par.global.nt; ++tLast)
+                    {
+                        result.correlator[tLast] /= translations.size();
+                    }
+                    saveCorrelator(result, par.global.output, 0, traj);
+                }
+            }
+            tAr.stopTimer("Total");
+            printTimeProfile(tAr.getTimings(), tAr.getTimer("Total"));
+        }
+    }
+    
+    return EXIT_SUCCESS;
+}
@@ -0,0 +1,12 @@
+#ifndef  Hadrons_Contractor_hpp_
+#define Hadrons_Contractor_hpp_
+
+#include <Hadrons/Global.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_Contractor_hpp_
@@ -0,0 +1,434 @@
+#include <Hadrons/Global.hpp>
+#include <Hadrons/A2AMatrix.hpp>
+#ifdef USE_MKL
+#include "mkl.h"
+#include "mkl_cblas.h"
+#endif
+
+using namespace Grid;
+using namespace Hadrons;
+
+#ifdef GRID_COMMS_MPI3
+#define GET_RANK(rank, nMpi) \
+MPI_Comm_size(MPI_COMM_WORLD, &(nMpi));\
+MPI_Comm_rank(MPI_COMM_WORLD, &(rank))
+#define BARRIER() MPI_Barrier(MPI_COMM_WORLD)
+#define INIT() MPI_Init(NULL, NULL)
+#define FINALIZE() MPI_Finalize()
+#else
+#define GET_RANK(rank, nMpi) (nMpi) = 1; (rank) = 0
+#define BARRIER()
+#define INIT()
+#define FINALIZE()
+#endif
+
+// Time one tr(A*B) implementation.
+//
+// Each rank applies fn(buf, left[i], right[i]) to its share of the nMat pairs,
+// i in [rank*nMat/nMpi, (rank+1)*nMat/nMpi). Rank 0 then prints the squared
+// distance between the last result and `ref`, the elapsed time, and the GFlop/s
+// and GB/s rates computed for all nMat pairs. The function sleeps one second
+// before returning.
+//
+// name : label printed in front of the measurements
+// left, right : containers of matrices; left[0] gives the dimensions
+// ref : expected trace
+// fn : callable (ComplexD &res, const A &a, const B &b)
+template <typename Function, typename MatLeft, typename MatRight>
+inline void trBenchmark(const std::string name, const MatLeft &left,
+                        const MatRight &right, const ComplexD ref, Function fn)
+{
+    double       t, flops, bytes, n = left[0].rows()*left[0].cols();
+    unsigned int nMat = left.size();
+    int          nMpi, rank;
+    ComplexD     buf;
+
+    GET_RANK(rank, nMpi);
+    // synchronise before starting the clock, so that the time spent waiting for
+    // the other ranks to arrive is not counted as compute time
+    BARRIER();
+    t = -usecond();
+    for (unsigned int i = rank*nMat/nMpi; i < (rank+1)*nMat/nMpi; ++i)
+    {
+        fn(buf, left[i], right[i]);      
+    }
+    // the measurement ends when the slowest rank is done
+    BARRIER();
+    t += usecond();
+    // per pair: n complex multiplications (6 flops) and n - 1 complex additions
+    // (2 flops); 2n complex numbers read
+    flops = nMat*(6.*n + 2.*(n - 1.));
+    bytes = nMat*(2.*n*sizeof(ComplexD));
+
+    if (rank == 0)
+    {
+        std::cout << std::setw(34) << name << ": diff= "
+                  << std::setw(12) << std::norm(buf-ref)
+                  << std::setw(10) << t/1.0e6 << " sec "
+                  << std::setw(10) << flops/t/1.0e3 << " GFlop/s " 
+                  << std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s "
+                  << std::endl;
+    }
+    ::sleep(1);
+}
+
+// Time one A*B implementation.
+//
+// Each rank applies fn(buf, left[i], right[i]) to its share of the nMat pairs,
+// i in [rank*nMat/nMpi, (rank+1)*nMat/nMpi). Rank 0 then prints the squared norm
+// of (last result - ref), the elapsed time, and the GFlop/s and GB/s rates
+// computed for all nMat pairs. The function sleeps one second before returning.
+//
+// name : label printed in front of the measurements
+// left, right : containers of matrices; left[0] gives the dimensions
+// ref : expected product
+// fn : callable (Mat &res, const Mat &a, const Mat &b)
+template <typename Function, typename MatV, typename Mat>
+inline void mulBenchmark(const std::string name, const MatV &left,
+                         const MatV &right, const Mat &ref, Function fn)
+{
+    double       t, flops, bytes;
+    double       nr = left[0].rows(), nc = left[0].cols(), n = nr*nc;
+    unsigned int nMat = left.size();
+    int          nMpi, rank;
+    // result buffer, sized rows x rows
+    Mat          buf(left[0].rows(), left[0].rows());
+
+    GET_RANK(rank, nMpi);
+    // synchronise before starting the clock, so that the time spent waiting for
+    // the other ranks to arrive is not counted as compute time
+    BARRIER();
+    t = -usecond();
+    for (unsigned int i = rank*nMat/nMpi; i < (rank+1)*nMat/nMpi; ++i)
+    {
+        fn(buf, left[i], right[i]);
+    }
+    // the measurement ends when the slowest rank is done
+    BARRIER();
+    t += usecond();
+    // per output element: nc complex multiplications (6 flops) and nc - 1 complex
+    // additions (2 flops), for nr*nr output elements
+    flops = nMat*(nr*nr*(6.*nc + 2.*(nc - 1.)));
+    bytes = nMat*(2*nc*nr*sizeof(ComplexD));
+
+    if (rank == 0)
+    {
+        std::cout << std::setw(34) << name << ": diff= "
+                  << std::setw(12) << (buf-ref).squaredNorm()
+                  << std::setw(10) << t/1.0e6 << " sec "
+                  << std::setw(10) << flops/t/1.0e3 << " GFlop/s " 
+                  << std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s "
+                  << std::endl;
+    }
+    ::sleep(1);
+}
+
+#ifdef USE_MKL
+// Calls cblas_zdotu_sub on a.cols() elements: those starting at (aRow, 0) in `a`
+// with the stride that walks along a row, and those starting at (0, aRow) in `b`
+// with the stride that walks down a column, for the storage order given by each
+// matrix type's Options. The result is written to `res`.
+template <typename MatLeft, typename MatRight>
+static inline void zdotuRow(ComplexD &res, const unsigned int aRow,
+                            const MatLeft &a, const MatRight &b)
+{
+    const ComplexD *lhs, *rhs;
+    unsigned int   lhsStride, rhsStride;
+
+    // right operand: column aRow of b
+    if (MatRight::Options == Eigen::RowMajor)
+    {
+        rhsStride = b.cols();
+        rhs       = b.data() + aRow;
+    }
+    else if (MatRight::Options == Eigen::ColMajor)
+    {
+        rhsStride = 1;
+        rhs       = b.data() + aRow*b.rows();
+    }
+    // left operand: row aRow of a
+    if (MatLeft::Options == Eigen::RowMajor)
+    {
+        lhsStride = 1;
+        lhs       = a.data() + aRow*a.cols();
+    }
+    else if (MatLeft::Options == Eigen::ColMajor)
+    {
+        lhsStride = a.rows();
+        lhs       = a.data() + aRow;
+    }
+    cblas_zdotu_sub(a.cols(), lhs, lhsStride, rhs, rhsStride, &res);
+}
+
+// Calls cblas_zdotu_sub on a.rows() elements: those starting at (0, aCol) in `a`
+// with the stride that walks down a column, and those starting at (aCol, 0) in `b`
+// with the stride that walks along a row, for the storage order given by each
+// matrix type's Options. The result is written to `res`.
+template <typename MatLeft, typename MatRight>
+static inline void zdotuCol(ComplexD &res, const unsigned int aCol,
+                            const MatLeft &a, const MatRight &b)
+{
+    const ComplexD *lhs, *rhs;
+    unsigned int   lhsStride, rhsStride;
+
+    // right operand: row aCol of b
+    if (MatRight::Options == Eigen::RowMajor)
+    {
+        rhsStride = 1;
+        rhs       = b.data() + aCol*b.cols();
+    }
+    else if (MatRight::Options == Eigen::ColMajor)
+    {
+        rhsStride = b.rows();
+        rhs       = b.data() + aCol;
+    }
+    // left operand: column aCol of a
+    if (MatLeft::Options == Eigen::RowMajor)
+    {
+        lhsStride = a.cols();
+        lhs       = a.data() + aCol;
+    }
+    else if (MatLeft::Options == Eigen::ColMajor)
+    {
+        lhsStride = 1;
+        lhs       = a.data() + aCol*a.rows();
+    }
+    cblas_zdotu_sub(a.rows(), lhs, lhsStride, rhs, rhsStride, &res);
+}
+#endif
+
+// Run every tr(A*B) implementation on nMat pairs of matrices.
+//
+// `left` holds nMat copies of one random ni x nj matrix and `right` nMat copies of
+// one random nj x ni matrix, so the trace of the last pair is the reference for all
+// pairs. Rank 0 prints a header stating the storage order of both matrix types,
+// then each implementation is timed with trBenchmark. The MKL variants are only
+// compiled when USE_MKL is defined.
+template <typename MatLeft, typename MatRight>
+void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigned int nMat)
+{
+    std::vector<MatLeft>  left;
+    std::vector<MatRight> right;
+    ComplexD              ref;
+    int                   rank, nMpi;
+
+    left.resize(nMat, MatLeft::Random(ni, nj));
+    right.resize(nMat, MatRight::Random(nj, ni));
+    GET_RANK(rank, nMpi);
+    if (rank == 0)
+    {
+        std::cout << "==== tr(A*B) benchmarks" << std::endl;
+        std::cout << "A matrices use ";
+        if (MatLeft::Options == Eigen::RowMajor)
+        {
+            std::cout << "row-major ordering" << std::endl;
+        }
+        else if (MatLeft::Options == Eigen::ColMajor)
+        {
+            std::cout << "col-major ordering" << std::endl;
+        }
+        std::cout << "B matrices use ";
+        if (MatRight::Options == Eigen::RowMajor)
+        {
+            std::cout << "row-major ordering" << std::endl;
+        }
+        else if (MatRight::Options == Eigen::ColMajor)
+        {
+            std::cout << "col-major ordering" << std::endl;
+        }
+        std::cout << std::endl;
+    }
+    BARRIER();
+    // reference value every implementation is compared with
+    ref = (left.back()*right.back()).trace();
+    trBenchmark("Hadrons A2AContraction::accTrMul", left, right, ref,
+    [](ComplexD &res, const MatLeft &a, const MatRight &b)
+    { 
+        res = 0.;
+        A2AContraction::accTrMul(res, a, b);
+    });
+    // explicit double loop, parallel over the rows of a
+    trBenchmark("Naive loop rows first", left, right, ref,
+    [](ComplexD &res, const MatLeft &a, const MatRight &b)
+    { 
+        auto nr = a.rows(), nc = a.cols();
+        
+        res = 0.;
+        parallel_for (unsigned int i = 0; i < nr; ++i)
+        {
+            ComplexD tmp = 0.;
+
+            for (unsigned int j = 0; j < nc; ++j)
+            {
+                tmp += a(i, j)*b(j, i);
+            }
+            parallel_critical
+            {
+                res += tmp;
+            }
+        }
+    });
+    // explicit double loop, parallel over the columns of a
+    trBenchmark("Naive loop cols first", left, right, ref,
+    [](ComplexD &res, const MatLeft &a, const MatRight &b)
+    {
+        auto nr = a.rows(), nc = a.cols();
+        
+        res = 0.;
+        parallel_for (unsigned int j = 0; j < nc; ++j)
+        {
+            ComplexD tmp = 0.;
+
+            for (unsigned int i = 0; i < nr; ++i)
+            {
+                tmp += a(i, j)*b(j, i);
+            }        
+            parallel_critical
+            {
+                res += tmp;
+            }
+        }
+    });
+    trBenchmark("Eigen tr(A*B)", left, right, ref,
+    [](ComplexD &res, const MatLeft &a, const MatRight &b)
+    { 
+        res = (a*b).trace();
+    });
+    // Eigen's dot() conjugates its first operand; the explicit conjugate() undoes
+    // that so the plain product is summed
+    trBenchmark("Eigen row-wise dot", left, right, ref,
+    [](ComplexD &res, const MatLeft &a, const MatRight &b)
+    {
+        res = 0.;
+        parallel_for (unsigned int r = 0; r < a.rows(); ++r)
+        {
+            ComplexD tmp;
+
+            tmp = a.row(r).conjugate().dot(b.col(r));
+            parallel_critical
+            {
+                res += tmp;
+            }
+        }
+    });
+    trBenchmark("Eigen col-wise dot", left, right, ref,
+    [](ComplexD &res, const MatLeft &a, const MatRight &b)
+    {
+        res = 0.;
+        parallel_for (unsigned int c = 0; c < a.cols(); ++c)
+        {
+            ComplexD tmp;
+
+            tmp = a.col(c).conjugate().dot(b.row(c));
+            parallel_critical
+            {
+                res += tmp;
+            }
+        }
+    });
+    // element-wise product of a with the transpose of b, summed
+    trBenchmark("Eigen Hadamard", left, right, ref,
+    [](ComplexD &res, const MatLeft &a, const MatRight &b)
+    { 
+        res = a.cwiseProduct(b.transpose()).sum();
+    });
+#ifdef USE_MKL
+    trBenchmark("MKL row-wise zdotu", left, right, ref,
+    [](ComplexD &res, const MatLeft &a, const MatRight &b)
+    {
+        res = 0.;
+        parallel_for (unsigned int r = 0; r < a.rows(); ++r)
+        {
+            ComplexD tmp;
+
+            zdotuRow(tmp, r, a, b);
+            parallel_critical
+            {
+                res += tmp;
+            }
+        }
+    });
+    trBenchmark("MKL col-wise zdotu", left, right, ref,
+    [](ComplexD &res, const MatLeft &a, const MatRight &b)
+    {
+        res = 0.;
+        parallel_for (unsigned int c = 0; c < a.cols(); ++c)
+        {
+            ComplexD tmp;
+
+            zdotuCol(tmp, c, a, b);
+            parallel_critical
+            {
+                res += tmp;
+            }
+        }
+    });
+#endif
+    BARRIER();
+    if (rank == 0)
+    {
+        std::cout << std::endl;
+    }
+}
+
+// Run every A*B implementation on nMat pairs of matrices of type Mat: nMat copies
+// of one random ni x nj matrix times nMat copies of one random nj x ni matrix.
+// Rank 0 prints a header stating the storage order, then each implementation is
+// timed with mulBenchmark against the product of the last pair. The MKL variant is
+// only compiled when USE_MKL is defined.
+template <typename Mat>
+void fullMulBenchmark(const unsigned int ni, const unsigned int nj, const unsigned int nMat)
+{
+    int              nMpi, rank;
+    Mat              ref;
+    std::vector<Mat> left, right;
+
+    // implementations under test
+    auto hadronsMul = [](Mat &res, const Mat &a, const Mat &b)
+    {
+        A2AContraction::mul(res, a, b);
+    };
+    auto eigenMul = [](Mat &res, const Mat &a, const Mat &b)
+    {
+        res = a*b;
+    };
+#ifdef USE_MKL
+    auto mklMul = [](Mat &res, const Mat &a, const Mat &b)
+    {
+        const ComplexD one(1., 0.), zero(0., 0.);
+
+        // the leading dimensions follow the storage order of Mat
+        if (Mat::Options == Eigen::RowMajor)
+        {
+            cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
+                        a.cols(), &one, a.data(), a.cols(), b.data(), b.cols(), &zero,
+                        res.data(), res.cols());
+        }
+        else if (Mat::Options == Eigen::ColMajor)
+        {
+            cblas_zgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
+                        a.cols(), &one, a.data(), a.rows(), b.data(), b.rows(), &zero,
+                        res.data(), res.rows());
+        }
+    };
+#endif
+
+    left.resize(nMat, Mat::Random(ni, nj));
+    right.resize(nMat, Mat::Random(nj, ni));
+    GET_RANK(rank, nMpi);
+    if (rank == 0)
+    {
+        std::cout << "==== A*B benchmarks" << std::endl
+                  << "all matrices use ";
+        if (Mat::Options == Eigen::RowMajor)
+        {
+            std::cout << "row-major ordering" << std::endl;
+        }
+        else if (Mat::Options == Eigen::ColMajor)
+        {
+            std::cout << "col-major ordering" << std::endl;
+        }
+        std::cout << std::endl;
+    }
+    BARRIER();
+    // reference product every implementation is compared with
+    ref = left.back()*right.back();
+    mulBenchmark("Hadrons A2AContraction::mul", left, right, ref, hadronsMul);
+    mulBenchmark("Eigen A*B", left, right, ref, eigenMul);
+#ifdef USE_MKL
+    mulBenchmark("MKL A*B", left, right, ref, mklMul);
+#endif
+    BARRIER();
+    if (rank == 0)
+    {
+        std::cout << std::endl;
+    }
+}
+
+// Benchmark entry point.
+//
+// Usage: <program> <Ni> <Nj> <#matrices>
+//
+// Prints the run configuration on rank 0 (matrix sizes, number of MPI processes
+// and threads), then runs the tr(A*B) benchmarks for the four combinations of
+// A2AMatrix / A2AMatrixTr operands and the A*B benchmarks for both matrix types.
+//
+// Returns EXIT_FAILURE when the number of arguments is wrong or when a dimension
+// or the matrix count is not strictly positive (the benchmarks read left[0] and
+// convert these values to unsigned). Non-numeric arguments still make std::stoi
+// throw.
+int main(int argc, char *argv[])
+{
+    // parse command line
+    Eigen::Index ni, nj, nMat;
+    int          nMpi, rank;
+
+    if (argc != 4)
+    {
+        std::cerr << "usage: " << argv[0] << " <Ni> <Nj> <#matrices>";
+        std::cerr << std::endl;
+        
+        return EXIT_FAILURE;
+    }
+    ni   = std::stoi(argv[1]);
+    nj   = std::stoi(argv[2]);
+    nMat = std::stoi(argv[3]);
+    // checked before INIT() so that no MPI clean-up is needed on failure
+    if ((ni <= 0) || (nj <= 0) || (nMat <= 0))
+    {
+        std::cerr << "error: <Ni>, <Nj> and <#matrices> must be strictly positive";
+        std::cerr << std::endl;
+
+        return EXIT_FAILURE;
+    }
+
+    INIT();
+    GET_RANK(rank, nMpi);
+    if (rank == 0)
+    {
+        std::cout << "\n*** ALL-TO-ALL MATRIX CONTRACTION BENCHMARK ***\n" << std::endl;
+        std::cout << nMat << " couples of " << ni << "x" << nj << " matrices\n" << std::endl;
+
+        std::cout << nMpi << " MPI processes" << std::endl;
+#ifdef GRID_OMP
+        #pragma omp parallel
+        {
+            #pragma omp single
+            std::cout << omp_get_num_threads() << " threads\n" << std::endl; 
+        }
+#else
+        std::cout << "Single-threaded\n" << std::endl; 
+#endif
+
+#ifdef EIGEN_USE_MKL_ALL
+        std::cout << "Eigen uses the MKL" << std::endl;
+#endif
+        std::cout << "Eigen uses " << Eigen::nbThreads() << " threads" << std::endl;
+#ifdef USE_MKL
+        std::cout << "MKL   uses " << mkl_get_max_threads() << " threads" << std::endl;
+#endif
+        std::cout << std::endl;
+    }
+
+    fullTrBenchmark<A2AMatrix<ComplexD>, A2AMatrix<ComplexD>>(ni, nj, nMat);
+    fullTrBenchmark<A2AMatrix<ComplexD>, A2AMatrixTr<ComplexD>>(ni, nj, nMat);
+    fullTrBenchmark<A2AMatrixTr<ComplexD>, A2AMatrix<ComplexD>>(ni, nj, nMat);
+    fullTrBenchmark<A2AMatrixTr<ComplexD>, A2AMatrixTr<ComplexD>>(ni, nj, nMat);
+    fullMulBenchmark<A2AMatrix<ComplexD>>(ni, nj, nMat);
+    fullMulBenchmark<A2AMatrixTr<ComplexD>>(ni, nj, nMat);
+    FINALIZE();
+
+    return EXIT_SUCCESS;
+}
@@ -0,0 +1,217 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Utilities/EigenPackCast.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/EigenPack.hpp>
+#include <Hadrons/Environment.hpp>
+
+using namespace Grid;
+using namespace QCD;
+using namespace Hadrons;
+
+// Convert an eigenpack stored with field type FIn into one stored with field
+// type FOut.
+//
+// Template parameters:
+//   FOut - field type of the vectors written to disk
+//   FIn  - field type of the vectors read from disk
+// Parameters:
+//   outFilename - output path: a directory receiving v<k>.bin files when
+//                 multiFile is set, a single file otherwise; must differ
+//                 from inFilename
+//   inFilename  - input path, same layout convention as outFilename
+//   Ls          - a five-dimensional grid of extent Ls is built when Ls > 1,
+//                 otherwise the four-dimensional grid is used
+//   rb          - use red-black (checkerboarded) grids
+//   size        - number of vectors to convert
+//   multiFile   - one file per vector instead of one file for the whole pack
+//   testRead    - read the freshly written output back after the conversion
+template <typename FOut, typename FIn>
+void convert(const std::string outFilename, const std::string inFilename, 
+             const unsigned int Ls, const bool rb, const unsigned int size, 
+             const bool multiFile, const bool testRead)
+{
+    // guard against overwriting the input (only active when NDEBUG is not defined)
+    assert(outFilename != inFilename);
+    
+    typedef EigenPack<FOut>            EPOut;
+    typedef EigenPack<FIn>             EPIn;
+    typedef typename FOut::vector_type VTypeOut;
+    typedef typename FIn::vector_type  VTypeIn;
+
+    // the shared pointers own the grids created below; gIn/gOut are non-owning
+    // views on whichever grid is selected by (rb, Ls)
+    std::shared_ptr<GridCartesian>         gInBase, gOutBase, gIn5, gOut5;
+    std::shared_ptr<GridRedBlackCartesian> rbgIn, rbgOut;
+    GridBase                               *gIn, *gOut;
+
+    // input and output share the lattice and MPI layout, but each gets the
+    // SIMD layout matching its own vector type
+    auto         dim     = GridDefaultLatt();
+    unsigned int nd      = dim.size();
+    auto         simdOut = GridDefaultSimd(nd, VTypeOut::Nsimd());
+    auto         simdIn  = GridDefaultSimd(nd, VTypeIn::Nsimd());
+
+    gOutBase.reset(SpaceTimeGrid::makeFourDimGrid(dim, simdOut, GridDefaultMpi()));
+    gInBase.reset(SpaceTimeGrid::makeFourDimGrid(dim, simdIn, GridDefaultMpi()));
+    if (rb)
+    {
+        // checkerboarded grids, 5D when Ls > 1 and 4D otherwise
+        if (Ls > 1)
+        {
+            rbgOut.reset(SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, gOutBase.get()));
+            rbgIn.reset(SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, gInBase.get()));
+        }
+        else
+        {
+            rbgOut.reset(SpaceTimeGrid::makeFourDimRedBlackGrid(gOutBase.get()));
+            rbgIn.reset(SpaceTimeGrid::makeFourDimRedBlackGrid(gInBase.get()));
+        }
+        gOut = rbgOut.get();
+        gIn  = rbgIn.get();
+    }
+    else
+    {
+        // full grids, 5D when Ls > 1 and the base 4D grids otherwise
+        if (Ls > 1)
+        {
+            gOut5.reset(SpaceTimeGrid::makeFiveDimGrid(Ls, gOutBase.get()));
+            gIn5.reset(SpaceTimeGrid::makeFiveDimGrid(Ls, gInBase.get()));
+            gOut = gOut5.get();
+            gIn  = gIn5.get();
+        }
+        else
+        {
+            gOut = gOutBase.get();
+            gIn  = gInBase.get();
+        }
+    }
+
+    // work buffers on the selected grids, plus the reader/writer pair
+    FOut         bufOut(gOut);
+    FIn          bufIn(gIn), testIn(gIn);
+    ScidacWriter binWriter(gOut->IsBoss());
+    ScidacReader binReader;
+    PackRecord   record;
+    RealD        eval;
+
+    // summary of the requested conversion
+    LOG(Message) << "==== EIGENPACK CONVERSION" << std::endl;
+    LOG(Message) << "Lattice       : " << gIn->GlobalDimensions() << std::endl;
+    LOG(Message) << "Checkerboarded: " << (rb ? "yes" : "no") << std::endl;
+    LOG(Message) << "In path       : " << inFilename  << std::endl;
+    LOG(Message) << "In type       : " << typeName<FIn>() << std::endl;
+    LOG(Message) << "Out path      : " << outFilename << std::endl;
+    LOG(Message) << "Out type      : " << typeName<FOut>() << std::endl;
+    LOG(Message) << "#vectors      : " << size << std::endl;
+    LOG(Message) << "Multifile     : " << (multiFile ? "yes" : "no") << std::endl;
+    LOG(Message) << "Test read     : " << (testRead ? "yes" : "no") << std::endl;
+    if (multiFile)
+    {
+        // one file per vector: <path>/v<k>.bin, each with its own header
+        for(unsigned int k = 0; k < size; ++k)
+        {
+            std::string  outV = outFilename + "/v" + std::to_string(k) + ".bin";
+            std::string  inV  = inFilename + "/v" + std::to_string(k) + ".bin";
+
+            LOG(Message) << "==== Converting vector " << k << std::endl;
+            LOG(Message) << "In : " << inV  << std::endl;
+            LOG(Message) << "Out: " << outV << std::endl;
+            // conversion
+            LOG(Message) << "-- Doing conversion" << std::endl;
+            makeFileDir(outV, gOut);
+            binWriter.open(outV);
+            binReader.open(inV);
+            EigenPackIo::readHeader(record, binReader);
+            EigenPackIo::writeHeader(binWriter, record);
+            EigenPackIo::readElement<FIn>(bufIn, eval, k, binReader);
+            // presumably bufOut receives the FOut-typed copy that is written and
+            // testIn is scratch space used to check the cast - TODO confirm
+            // against EigenPackIo::writeElement
+            EigenPackIo::writeElement<FIn, FOut>(binWriter, bufIn, eval, k, &bufOut, &testIn);
+            binWriter.close();
+            binReader.close();
+            // read test
+            // NOTE(review): unlike the single-file test read further down, no
+            // readHeader precedes readElement here although a header was written
+            // to this file - confirm the reader tolerates this
+            if (testRead)
+            {
+                LOG(Message) << "-- Test read" << std::endl;
+                binReader.open(outV);
+                EigenPackIo::readElement<FOut>(bufOut, eval, k, binReader);
+                binReader.close();
+            }
+        }
+    }
+    else
+    {
+        // single file: one header followed by all the vectors
+        // conversion
+        LOG(Message) << "-- Doing conversion" << std::endl;
+        makeFileDir(outFilename, gOut);
+        binWriter.open(outFilename);
+        binReader.open(inFilename);
+        EigenPackIo::readHeader(record, binReader);
+        EigenPackIo::writeHeader(binWriter, record);
+        for(unsigned int k = 0; k < size; ++k)
+        {
+            EigenPackIo::readElement<FIn>(bufIn, eval, k, binReader);
+            EigenPackIo::writeElement<FIn, FOut>(binWriter, bufIn, eval, k, &bufOut, &testIn);
+        }
+        binWriter.close();
+        binReader.close();
+        // read test
+        if (testRead)
+        {
+            LOG(Message) << "-- Test read" << std::endl;
+            binReader.open(outFilename);
+            EigenPackIo::readHeader(record, binReader);
+            for(unsigned int k = 0; k < size; ++k)
+            {
+                EigenPackIo::readElement<FOut>(bufOut, eval, k, binReader);
+            }
+            binReader.close();
+        }
+    }
+}
+
+// Field types used to instantiate convert<FOUT, FIN>() in main(). They can be
+// supplied as preprocessor definitions when compiling this file; when one is
+// missing, a compile-time warning is emitted and the default below is used.
+#ifndef FOUT
+#warning "FOUT undefined (set to WilsonImplF::FermionField by default)"
+#define FOUT WilsonImplF::FermionField
+#endif
+#ifndef FIN
+#warning "FIN undefined (set to WilsonImplD::FermionField by default)"
+#define FIN WilsonImplD::FermionField
+#endif
+
+// Entry point of the eigenpack conversion tool.
+//
+// usage: <program> <out eigenpack> <in eigenpack> <Ls> <red-black {0|1}>
+//        <#vector> <multifile {0|1}> <test read {0|1}> [Grid options]
+//
+// The seven positional arguments are parsed before Grid_init() is called; the
+// flag arguments are true for anything other than the string "0". The
+// conversion itself is delegated to convert<FOUT, FIN>(), and any
+// std::exception it throws is handed to Exceptions::abort().
+int main(int argc, char *argv[])
+{
+    // parse command line
+    std::string  outFilename, inFilename;
+    unsigned int size, Ls;
+    int          LsArg, sizeArg;
+    bool         rb, multiFile, testRead;
+
+    if (argc < 8)
+    {
+        std::cerr << "usage: " << argv[0] << " <out eigenpack> <in eigenpack> <Ls> <red-black {0|1}> <#vector> <multifile {0|1}> <test read {0|1}> [Grid options]";
+        std::cerr << std::endl;
+        std::exit(EXIT_FAILURE);
+    }
+    outFilename = argv[1];
+    inFilename  = argv[2];
+    // NOTE: std::stoi throws on non-numeric or out-of-range input
+    LsArg       = std::stoi(std::string(argv[3]));
+    rb          = (std::string(argv[4]) != "0");
+    sizeArg     = std::stoi(std::string(argv[5]));
+    multiFile   = (std::string(argv[6]) != "0");
+    testRead    = (std::string(argv[7]) != "0");
+    // a negative value would silently wrap to a huge number once stored in an
+    // unsigned int, so reject it explicitly
+    if ((LsArg < 0) || (sizeArg < 0))
+    {
+        std::cerr << "error: <Ls> and <#vector> must be non-negative";
+        std::cerr << std::endl;
+        std::exit(EXIT_FAILURE);
+    }
+    Ls   = static_cast<unsigned int>(LsArg);
+    size = static_cast<unsigned int>(sizeArg);
+
+    // initialization
+    Grid_init(&argc, &argv);
+    initLogger();
+
+    // execution
+    try
+    {
+        convert<FOUT, FIN>(outFilename, inFilename, Ls, rb, size, multiFile, testRead);
+    }
+    catch (const std::exception& e)
+    {
+        Exceptions::abort(e);
+    }
+
+    // epilogue
+    LOG(Message) << "Grid is finalizing now" << std::endl;
+    Grid_finalize();
+
+    return EXIT_SUCCESS;
+}
@@ -0,0 +1,80 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Utilities/HadronsXmlRun.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#include <Hadrons/Application.hpp>
+
+using namespace Grid;
+ 
+using namespace Hadrons;
+
+// Run a Hadrons Application from a parameter file.
+//
+// usage: <program> <parameter file> [<precomputed schedule>] [Grid options]
+//
+// The second argument is used as a schedule file unless it starts with '-'.
+// Any std::exception thrown while setting up or running the application is
+// handed to Exceptions::abort().
+int main(int argc, char *argv[])
+{
+    // command line: the parameter file is mandatory
+    if (argc < 2)
+    {
+        std::cerr << "usage: " << argv[0] << " <parameter file> [<precomputed schedule>] [Grid options]";
+        std::cerr << std::endl;
+        std::exit(EXIT_FAILURE);
+    }
+
+    std::string parameterFileName = argv[1];
+    std::string scheduleFileName;
+
+    // optional schedule: skipped when the second argument looks like an option
+    if ((argc > 2) && (argv[2][0] != '-'))
+    {
+        scheduleFileName = argv[2];
+    }
+
+    // initialization
+    Grid_init(&argc, &argv);
+
+    // execution
+    try
+    {
+        Application application(parameterFileName);
+
+        application.parseParameterFile(parameterFileName);
+        if (!scheduleFileName.empty())
+        {
+            application.loadSchedule(scheduleFileName);
+        }
+        application.run();
+    }
+    catch (const std::exception& e)
+    {
+        Exceptions::abort(e);
+    }
+
+    // epilogue
+    LOG(Message) << "Grid is finalizing now" << std::endl;
+    Grid_finalize();
+
+    return EXIT_SUCCESS;
+}
@@ -0,0 +1,14 @@
+# Hadrons command-line utilities; every program is linked against the Hadrons
+# and Grid static libraries.
+bin_PROGRAMS = HadronsXmlRun HadronsFermionEP64To32 HadronsContractor HadronsContractorBenchmark
+
+# Application runner built from HadronsXmlRun.cc
+HadronsXmlRun_SOURCES = HadronsXmlRun.cc
+HadronsXmlRun_LDADD   = ../libHadrons.a ../../Grid/libGrid.a
+
+# Eigenpack converter built from EigenPackCast.cc; the FIN/FOUT definitions
+# select WilsonImplD::FermionField input and WilsonImplF::FermionField output
+HadronsFermionEP64To32_SOURCES  = EigenPackCast.cc
+HadronsFermionEP64To32_CXXFLAGS = $(AM_CXXFLAGS) -DFIN=WilsonImplD::FermionField -DFOUT=WilsonImplF::FermionField
+HadronsFermionEP64To32_LDADD    = ../libHadrons.a ../../Grid/libGrid.a
+
+# Contraction tool built from Contractor.cc
+HadronsContractor_SOURCES = Contractor.cc
+HadronsContractor_LDADD   = ../libHadrons.a ../../Grid/libGrid.a
+
+# Contraction benchmark built from ContractorBenchmark.cc
+HadronsContractorBenchmark_SOURCES = ContractorBenchmark.cc
+HadronsContractorBenchmark_LDADD   = ../libHadrons.a ../../Grid/libGrid.a