Hadrons: faster A2A matrix load

Hadrons: contractor fixes
Hadrons: first stab at MPI contractor
2026-05-02 16:34:12 +01:00 · 2019-01-11 16:12:49 +00:00 · 2019-01-11 16:12:16 +00:00 · 2019-01-10 16:29:57 +00:00 · 2019-01-02 14:40:31 +00:00 · 2019-01-02 14:39:59 +00:00
86 changed files with 6516 additions and 432 deletions
@@ -48,6 +48,12 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
 #include <Grid/algorithms/iterative/BlockConjugateGradient.h>
 #include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h>
 #include <Grid/algorithms/iterative/MinimalResidual.h>
 #include <Grid/algorithms/iterative/GeneralisedMinimalResidual.h>
 #include <Grid/algorithms/iterative/CommunicationAvoidingGeneralisedMinimalResidual.h>
 #include <Grid/algorithms/iterative/FlexibleGeneralisedMinimalResidual.h>
 #include <Grid/algorithms/iterative/FlexibleCommunicationAvoidingGeneralisedMinimalResidual.h>
 #include <Grid/algorithms/iterative/MixedPrecisionFlexibleGeneralisedMinimalResidual.h>
 #include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
 #include <Grid/algorithms/CoarsenedMatrix.h>
 #include <Grid/algorithms/FFT.h>
@@ -211,6 +211,7 @@ namespace Grid {
      for(int b=0;b<nn;b++){
 	subspace[b] = zero;
 	gaussian(RNG,noise);
 	scale = std::pow(norm2(noise),-0.5); 
 	noise=noise*scale;
@@ -296,12 +297,57 @@ namespace Grid {
    };
    // Adjoint-style application of the coarse operator: copies G5C(in) into a
    // temporary, applies M to it, applies G5C to the result in place and
    // returns norm2(out).
    // NOTE(review): presumably relies on gamma5-hermiticity so that
    // G5C * M * G5C acts as M^dagger -- confirm for the operator being coarsened.
    RealD Mdag (const CoarseVector &in, CoarseVector &out){
-      return M(in,out);
+      // // corresponds to Petrov-Galerkin coarsening
      // return M(in,out);
      // corresponds to Galerkin coarsening
      CoarseVector tmp(Grid());
      G5C(tmp, in);
      M(tmp, out);
      G5C(out, out);
      return norm2(out);
    };
-    // Defer support for further coarsening for now
+    void Mdir(const CoarseVector &in, CoarseVector &out, int dir, int disp){
-    void Mdiag    (const CoarseVector &in,  CoarseVector &out){};
+
-    void Mdir     (const CoarseVector &in,  CoarseVector &out,int dir, int disp){};
+      conformable(_grid,in._grid);
      conformable(in._grid,out._grid);
      // Mdir applies a single stencil point to `in`:
      //   out[ss] = A[point][ss] * neighbour(ss)
      // The halo exchange runs first so that off-node neighbours can be read
      // from the stencil's communication buffer below.
      SimpleCompressor<siteVector> compressor;
      Stencil.HaloExchange(in,compressor);
      // Map (dir, disp) to a stencil point index: (0, 0) selects point 8,
      // otherwise disp = +1 yields 2*dir and disp = -1 yields 2*dir + 1.
      // NOTE(review): assumes disp is 0 or +/-1 and that index 8 is the
      // self-coupling entry of the stencil -- confirm against the stencil setup.
      auto point = [dir, disp](){
        if(dir == 0 and disp == 0)
          return 8;
        else
          return (4 * dir + 1 - disp) / 2;
      }();
      parallel_for(int ss=0;ss<Grid()->oSites();ss++){
        siteVector res = zero;
        siteVector nbr;
        int ptype;
        StencilEntry *SE;
        SE=Stencil.GetEntry(ptype,point,ss);
        // Fetch the neighbour: local and permuted, local as-is, or from the
        // communication buffer when the entry is not local.
        if(SE->_is_local&&SE->_permute) {
          permute(nbr,in._odata[SE->_offset],ptype);
        } else if(SE->_is_local) {
          nbr = in._odata[SE->_offset];
        } else {
          nbr = Stencil.CommBuf()[SE->_offset];
        }
        res = res + A[point]._odata[ss]*nbr;
        vstream(out._odata[ss],res);
      }
    };
    // Applies only the self-coupling term by delegating to Mdir with
    // dir = 0 and disp = 0.
    void Mdiag(const CoarseVector &in, CoarseVector &out){
      Mdir(in, out, 0, 0); // use the self coupling (= last) point of the stencil
    };
    CoarsenedMatrix(GridCartesian &CoarseGrid) 	: 
@@ -417,7 +463,7 @@ namespace Grid {
      std::cout<<GridLogMessage<<"Computed Coarse Operator"<<std::endl;
 #endif
      //      ForceHermitian();
-      AssertHermitian();
+      // AssertHermitian();
      // ForceDiagonal();
    }
    void ForceDiagonal(void) {
@@ -0,0 +1,244 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/algorithms/iterative/CommunicationAvoidingGeneralisedMinimalResidual.h
 Copyright (C) 2015
 Author: Daniel Richtmann <daniel.richtmann@ur.de>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_COMMUNICATION_AVOIDING_GENERALISED_MINIMAL_RESIDUAL_H
 #define GRID_COMMUNICATION_AVOIDING_GENERALISED_MINIMAL_RESIDUAL_H
 namespace Grid {
 // Restarted GMRES solver. As the warning printed by operator() states, this
 // variant currently does not differ from regular GMRES.
 //
 // Each restart cycle (outerLoopBody) extends a Krylov basis with arnoldiStep,
 // keeps the Hessenberg matrix triangular with Givens rotations (qrUpdate) and
 // finally adds the correction to the solution by back substitution
 // (computeSolution).
 template<class Field>
 class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<Field> {
 public:
  bool ErrorOnNoConverge; // Throw an assert when CAGMRES fails to converge,
                          // defaults to true
  RealD   Tolerance;           // relative tolerance; the target is Tolerance^2 * norm2(src)
  Integer MaxIterations;       // upper bound on the total number of Arnoldi steps
  Integer RestartLength;       // number of Arnoldi steps per restart cycle
  Integer MaxNumberOfRestarts; // MaxIterations / RestartLength, rounded up
  Integer IterationCount; // Number of iterations the CAGMRES took to finish,
                          // filled in upon completion
  GridStopWatch MatrixTimer;       // time spent applying LinOp.Op
  GridStopWatch LinalgTimer;       // time spent in orthogonalisation / vector algebra
  GridStopWatch QrTimer;           // time spent in qrUpdate
  GridStopWatch CompSolutionTimer; // time spent in computeSolution
  // H(j, i) is the coefficient of v[i] obtained in Arnoldi step j, i.e. the
  // row index is the step and the column index is the basis vector.
  Eigen::MatrixXcd H;
  std::vector<std::complex<double>> y;     // solution of the triangular system
  std::vector<std::complex<double>> gamma; // rotated right-hand side; |gamma[i+1]|^2 is the residual estimate
  std::vector<std::complex<double>> c;     // Givens rotation coefficients
  std::vector<std::complex<double>> s;     // Givens rotation coefficients
  // tol            : relative tolerance
  // maxit          : maximum total number of iterations
  // restart_length : iterations per restart cycle (must be > 0, it is used as a divisor)
  // err_on_no_conv : assert when the solver does not converge
  CommunicationAvoidingGeneralisedMinimalResidual(RealD   tol,
                                                  Integer maxit,
                                                  Integer restart_length,
                                                  bool    err_on_no_conv = true)
      : ErrorOnNoConverge(err_on_no_conv) // initialisers are listed in declaration order
      , Tolerance(tol)
      , MaxIterations(maxit)
      , RestartLength(restart_length)
      , MaxNumberOfRestarts(MaxIterations/RestartLength + ((MaxIterations%RestartLength == 0) ? 0 : 1))
      , IterationCount(0)
      , H(Eigen::MatrixXcd::Zero(RestartLength, RestartLength + 1)) // sizes taken from DD-αAMG code base
      , y(RestartLength + 1, 0.)
      , gamma(RestartLength + 1, 0.)
      , c(RestartLength + 1, 0.)
      , s(RestartLength + 1, 0.) {}
  // Solves LinOp.Op(psi) = src, using the incoming psi as the initial guess.
  // Runs up to MaxNumberOfRestarts restart cycles and returns as soon as the
  // squared residual estimate is at most Tolerance^2 * norm2(src). If no cycle
  // reaches the target a message is logged and, when ErrorOnNoConverge is set,
  // the solver asserts.
  void operator()(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi) {
    std::cout << GridLogWarning << "This algorithm currently doesn't differ from regular GMRES" << std::endl;
    psi.checkerboard = src.checkerboard;
    conformable(psi, src);
    RealD guess = norm2(psi);
    assert(std::isnan(guess) == 0);
    RealD cp;
    RealD ssq = norm2(src);
    RealD rsq = Tolerance * Tolerance * ssq;
    Field r(src._grid);
    std::cout << std::setprecision(4) << std::scientific;
    std::cout << GridLogIterative << "CommunicationAvoidingGeneralisedMinimalResidual: guess " << guess << std::endl;
    std::cout << GridLogIterative << "CommunicationAvoidingGeneralisedMinimalResidual:   src " << ssq   << std::endl;
    MatrixTimer.Reset();
    LinalgTimer.Reset();
    QrTimer.Reset();
    CompSolutionTimer.Reset();
    GridStopWatch SolverTimer;
    SolverTimer.Start();
    IterationCount = 0;
    for (int k=0; k<MaxNumberOfRestarts; k++) {
      cp = outerLoopBody(LinOp, src, psi, rsq);
      // Stopping condition
      if (cp <= rsq) {
        SolverTimer.Stop();
        // Recompute the residual explicitly so the log shows the true value
        // next to the estimate carried by the Givens recurrence.
        LinOp.Op(psi,r);
        axpy(r,-1.0,src,r);
        RealD srcnorm       = sqrt(ssq);
        RealD resnorm       = sqrt(norm2(r));
        RealD true_residual = resnorm / srcnorm;
        std::cout << GridLogMessage        << "CommunicationAvoidingGeneralisedMinimalResidual: Converged on iteration " << IterationCount
                  << " computed residual " << sqrt(cp / ssq)
                  << " true residual "     << true_residual
                  << " target "            << Tolerance << std::endl;
        std::cout << GridLogMessage << "CAGMRES Time elapsed: Total   " <<       SolverTimer.Elapsed() << std::endl;
        std::cout << GridLogMessage << "CAGMRES Time elapsed: Matrix  " <<       MatrixTimer.Elapsed() << std::endl;
        std::cout << GridLogMessage << "CAGMRES Time elapsed: Linalg  " <<       LinalgTimer.Elapsed() << std::endl;
        std::cout << GridLogMessage << "CAGMRES Time elapsed: QR      " <<           QrTimer.Elapsed() << std::endl;
        std::cout << GridLogMessage << "CAGMRES Time elapsed: CompSol " << CompSolutionTimer.Elapsed() << std::endl;
        return;
      }
    }
    std::cout << GridLogMessage << "CommunicationAvoidingGeneralisedMinimalResidual did NOT converge" << std::endl;
    if (ErrorOnNoConverge)
      assert(0);
  }
  // One restart cycle: computes the residual of the current psi, performs up
  // to RestartLength Arnoldi steps and updates psi. Returns the squared
  // residual estimate, or 0 without touching psi when the residual of the
  // incoming psi is exactly zero.
  RealD outerLoopBody(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi, RealD rsq) {
    RealD cp = 0;
    Field w(src._grid);
    Field r(src._grid);
    // this should probably be made a class member so that it is only allocated once, not in every restart
    std::vector<Field> v(RestartLength + 1, src._grid); for (auto &elem : v) elem = zero;
    MatrixTimer.Start();
    LinOp.Op(psi, w);
    MatrixTimer.Stop();
    LinalgTimer.Start();
    r = src - w;
    const RealD rnorm = sqrt(norm2(r));
    gamma[0] = rnorm;
    if (rnorm == 0.) {
      // psi already solves the system exactly. Normalising r would divide by
      // zero and fill the basis with NaNs, so report a zero residual instead.
      LinalgTimer.Stop();
      return cp;
    }
    v[0] = (1. / gamma[0]) * r;
    LinalgTimer.Stop();
    for (int i=0; i<RestartLength; i++) {
      IterationCount++;
      arnoldiStep(LinOp, v, w, i);
      qrUpdate(i);
      cp = std::norm(gamma[i+1]);
      std::cout << GridLogIterative << "CommunicationAvoidingGeneralisedMinimalResidual: Iteration " << IterationCount
                << " residual " << cp << " target " << rsq << std::endl;
      // Leave the cycle at its end, when the iteration budget is used up, or
      // when the estimate has reached the target.
      if ((i == RestartLength - 1) || (IterationCount == MaxIterations) || (cp <= rsq)) {
        computeSolution(v, psi, i);
        return cp;
      }
    }
    assert(0); // Never reached
    return cp;
  }
  // Arnoldi step `iter`: w = LinOp.Op(v[iter]) is orthogonalised against
  // v[0..iter], subtracting each projection immediately; the coefficients go
  // to row `iter` of H and the normalised remainder becomes v[iter + 1].
  void arnoldiStep(LinearOperatorBase<Field> &LinOp, std::vector<Field> &v, Field &w, int iter) {
    MatrixTimer.Start();
    LinOp.Op(v[iter], w);
    MatrixTimer.Stop();
    LinalgTimer.Start();
    for (int i = 0; i <= iter; ++i) {
      H(iter, i) = innerProduct(v[i], w);
      w = w - H(iter, i) * v[i];
    }
    H(iter, iter + 1) = sqrt(norm2(w));
    v[iter + 1] = (1. / H(iter, iter + 1)) * w;
    LinalgTimer.Stop();
  }
  // Applies the rotations of the previous steps to row `iter` of H, then
  // computes the rotation that zeroes H(iter, iter + 1) and applies it to H
  // and to gamma.
  void qrUpdate(int iter) {
    QrTimer.Start();
    for (int i = 0; i < iter ; ++i) {
      auto tmp       = -s[i] * H(iter, i) + c[i] * H(iter, i + 1);
      H(iter, i)     = std::conj(c[i]) * H(iter, i) + std::conj(s[i]) * H(iter, i + 1);
      H(iter, i + 1) = tmp;
    }
    // Compute new Givens Rotation
    ComplexD nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1)));
    c[iter]     = H(iter, iter) / nu;
    s[iter]     = H(iter, iter + 1) / nu;
    // Apply new Givens rotation
    H(iter, iter)     = nu;
    H(iter, iter + 1) = 0.;
    gamma[iter + 1] = -s[iter] * gamma[iter];
    gamma[iter]     = std::conj(c[iter]) * gamma[iter];
    QrTimer.Stop();
  }
  // Solves the triangular system for y[0..iter] by back substitution and adds
  // the combination sum_i v[i] * y[i] to psi.
  void computeSolution(std::vector<Field> const &v, Field &psi, int iter) {
    CompSolutionTimer.Start();
    for (int i = iter; i >= 0; i--) {
      y[i] = gamma[i];
      for (int k = i + 1; k <= iter; k++)
        y[i] = y[i] - H(k, i) * y[k];
      y[i] = y[i] / H(i, i);
    }
    for (int i = 0; i <= iter; i++)
      psi = psi + v[i] * y[i];
    CompSolutionTimer.Stop();
  }
 };
 }
 #endif
@@ -0,0 +1,256 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/algorithms/iterative/FlexibleCommunicationAvoidingGeneralisedMinimalResidual.h
 Copyright (C) 2015
 Author: Daniel Richtmann <daniel.richtmann@ur.de>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_FLEXIBLE_COMMUNICATION_AVOIDING_GENERALISED_MINIMAL_RESIDUAL_H
 #define GRID_FLEXIBLE_COMMUNICATION_AVOIDING_GENERALISED_MINIMAL_RESIDUAL_H
 namespace Grid {
 // Restarted flexible (right-preconditioned) GMRES solver. As the warning
 // printed by operator() states, this variant currently does not differ from
 // regular FGMRES.
 //
 // Each restart cycle (outerLoopBody) extends a Krylov basis with arnoldiStep,
 // storing the preconditioned vectors z[i] alongside the basis v[i], keeps the
 // Hessenberg matrix triangular with Givens rotations (qrUpdate) and finally
 // adds the correction built from the z[i] to the solution (computeSolution).
 template<class Field>
 class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<Field> {
 public:
  bool ErrorOnNoConverge; // Throw an assert when FCAGMRES fails to converge,
                          // defaults to true
  RealD   Tolerance;           // relative tolerance; the target is Tolerance^2 * norm2(src)
  Integer MaxIterations;       // upper bound on the total number of Arnoldi steps
  Integer RestartLength;       // number of Arnoldi steps per restart cycle
  Integer MaxNumberOfRestarts; // MaxIterations / RestartLength, rounded up
  Integer IterationCount; // Number of iterations the FCAGMRES took to finish,
                          // filled in upon completion
  GridStopWatch MatrixTimer;       // time spent applying LinOp.Op
  GridStopWatch PrecTimer;         // time spent in the preconditioner
  GridStopWatch LinalgTimer;       // time spent in orthogonalisation / vector algebra
  GridStopWatch QrTimer;           // time spent in qrUpdate
  GridStopWatch CompSolutionTimer; // time spent in computeSolution
  // H(j, i) is the coefficient of v[i] obtained in Arnoldi step j, i.e. the
  // row index is the step and the column index is the basis vector.
  Eigen::MatrixXcd H;
  std::vector<std::complex<double>> y;     // solution of the triangular system
  std::vector<std::complex<double>> gamma; // rotated right-hand side; |gamma[i+1]|^2 is the residual estimate
  std::vector<std::complex<double>> c;     // Givens rotation coefficients
  std::vector<std::complex<double>> s;     // Givens rotation coefficients
  // Held by reference: the caller must keep the preconditioner alive for as
  // long as this solver is used.
  LinearFunction<Field> &Preconditioner;
  // tol            : relative tolerance
  // maxit          : maximum total number of iterations
  // Prec           : preconditioner applied to every basis vector
  // restart_length : iterations per restart cycle (must be > 0, it is used as a divisor)
  // err_on_no_conv : assert when the solver does not converge
  FlexibleCommunicationAvoidingGeneralisedMinimalResidual(RealD   tol,
                                                          Integer maxit,
                                                          LinearFunction<Field> &Prec,
                                                          Integer restart_length,
                                                          bool    err_on_no_conv = true)
      : ErrorOnNoConverge(err_on_no_conv) // initialisers are listed in declaration order
      , Tolerance(tol)
      , MaxIterations(maxit)
      , RestartLength(restart_length)
      , MaxNumberOfRestarts(MaxIterations/RestartLength + ((MaxIterations%RestartLength == 0) ? 0 : 1))
      , IterationCount(0)
      , H(Eigen::MatrixXcd::Zero(RestartLength, RestartLength + 1)) // sizes taken from DD-αAMG code base
      , y(RestartLength + 1, 0.)
      , gamma(RestartLength + 1, 0.)
      , c(RestartLength + 1, 0.)
      , s(RestartLength + 1, 0.)
      , Preconditioner(Prec) {}
  // Solves LinOp.Op(psi) = src, using the incoming psi as the initial guess.
  // Runs up to MaxNumberOfRestarts restart cycles and returns as soon as the
  // squared residual estimate is at most Tolerance^2 * norm2(src). If no cycle
  // reaches the target a message is logged and, when ErrorOnNoConverge is set,
  // the solver asserts.
  void operator()(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi) {
    std::cout << GridLogWarning << "This algorithm currently doesn't differ from regular FGMRES" << std::endl;
    psi.checkerboard = src.checkerboard;
    conformable(psi, src);
    RealD guess = norm2(psi);
    assert(std::isnan(guess) == 0);
    RealD cp;
    RealD ssq = norm2(src);
    RealD rsq = Tolerance * Tolerance * ssq;
    Field r(src._grid);
    std::cout << std::setprecision(4) << std::scientific;
    std::cout << GridLogIterative << "FlexibleCommunicationAvoidingGeneralisedMinimalResidual: guess " << guess << std::endl;
    std::cout << GridLogIterative << "FlexibleCommunicationAvoidingGeneralisedMinimalResidual:   src " << ssq   << std::endl;
    PrecTimer.Reset();
    MatrixTimer.Reset();
    LinalgTimer.Reset();
    QrTimer.Reset();
    CompSolutionTimer.Reset();
    GridStopWatch SolverTimer;
    SolverTimer.Start();
    IterationCount = 0;
    for (int k=0; k<MaxNumberOfRestarts; k++) {
      cp = outerLoopBody(LinOp, src, psi, rsq);
      // Stopping condition
      if (cp <= rsq) {
        SolverTimer.Stop();
        // Recompute the residual explicitly so the log shows the true value
        // next to the estimate carried by the Givens recurrence.
        LinOp.Op(psi,r);
        axpy(r,-1.0,src,r);
        RealD srcnorm       = sqrt(ssq);
        RealD resnorm       = sqrt(norm2(r));
        RealD true_residual = resnorm / srcnorm;
        std::cout << GridLogMessage        << "FlexibleCommunicationAvoidingGeneralisedMinimalResidual: Converged on iteration " << IterationCount
                  << " computed residual " << sqrt(cp / ssq)
                  << " true residual "     << true_residual
                  << " target "            << Tolerance << std::endl;
        std::cout << GridLogMessage << "FCAGMRES Time elapsed: Total   " <<       SolverTimer.Elapsed() << std::endl;
        std::cout << GridLogMessage << "FCAGMRES Time elapsed: Precon  " <<         PrecTimer.Elapsed() << std::endl;
        std::cout << GridLogMessage << "FCAGMRES Time elapsed: Matrix  " <<       MatrixTimer.Elapsed() << std::endl;
        std::cout << GridLogMessage << "FCAGMRES Time elapsed: Linalg  " <<       LinalgTimer.Elapsed() << std::endl;
        std::cout << GridLogMessage << "FCAGMRES Time elapsed: QR      " <<           QrTimer.Elapsed() << std::endl;
        std::cout << GridLogMessage << "FCAGMRES Time elapsed: CompSol " << CompSolutionTimer.Elapsed() << std::endl;
        return;
      }
    }
    std::cout << GridLogMessage << "FlexibleCommunicationAvoidingGeneralisedMinimalResidual did NOT converge" << std::endl;
    if (ErrorOnNoConverge)
      assert(0);
  }
  // One restart cycle: computes the residual of the current psi, performs up
  // to RestartLength preconditioned Arnoldi steps and updates psi. Returns the
  // squared residual estimate, or 0 without touching psi when the residual of
  // the incoming psi is exactly zero.
  RealD outerLoopBody(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi, RealD rsq) {
    RealD cp = 0;
    Field w(src._grid);
    Field r(src._grid);
    // these should probably be made class members so that they are only allocated once, not in every restart
    std::vector<Field> v(RestartLength + 1, src._grid); for (auto &elem : v) elem = zero;
    std::vector<Field> z(RestartLength + 1, src._grid); for (auto &elem : z) elem = zero;
    MatrixTimer.Start();
    LinOp.Op(psi, w);
    MatrixTimer.Stop();
    LinalgTimer.Start();
    r = src - w;
    const RealD rnorm = sqrt(norm2(r));
    gamma[0] = rnorm;
    if (rnorm == 0.) {
      // psi already solves the system exactly. Normalising r would divide by
      // zero and fill the basis with NaNs, so report a zero residual instead.
      LinalgTimer.Stop();
      return cp;
    }
    v[0] = (1. / gamma[0]) * r;
    LinalgTimer.Stop();
    for (int i=0; i<RestartLength; i++) {
      IterationCount++;
      arnoldiStep(LinOp, v, z, w, i);
      qrUpdate(i);
      cp = std::norm(gamma[i+1]);
      std::cout << GridLogIterative << "FlexibleCommunicationAvoidingGeneralisedMinimalResidual: Iteration " << IterationCount
                << " residual " << cp << " target " << rsq << std::endl;
      // Leave the cycle at its end, when the iteration budget is used up, or
      // when the estimate has reached the target.
      if ((i == RestartLength - 1) || (IterationCount == MaxIterations) || (cp <= rsq)) {
        computeSolution(z, psi, i);
        return cp;
      }
    }
    assert(0); // Never reached
    return cp;
  }
  // Arnoldi step `iter`: z[iter] = Preconditioner(v[iter]), w = LinOp.Op(z[iter]).
  // w is then orthogonalised against v[0..iter], subtracting each projection
  // immediately; the coefficients go to row `iter` of H and the normalised
  // remainder becomes v[iter + 1].
  void arnoldiStep(LinearOperatorBase<Field> &LinOp, std::vector<Field> &v, std::vector<Field> &z, Field &w, int iter) {
    PrecTimer.Start();
    Preconditioner(v[iter], z[iter]);
    PrecTimer.Stop();
    MatrixTimer.Start();
    LinOp.Op(z[iter], w);
    MatrixTimer.Stop();
    LinalgTimer.Start();
    for (int i = 0; i <= iter; ++i) {
      H(iter, i) = innerProduct(v[i], w);
      w = w - H(iter, i) * v[i];
    }
    H(iter, iter + 1) = sqrt(norm2(w));
    v[iter + 1] = (1. / H(iter, iter + 1)) * w;
    LinalgTimer.Stop();
  }
  // Applies the rotations of the previous steps to row `iter` of H, then
  // computes the rotation that zeroes H(iter, iter + 1) and applies it to H
  // and to gamma.
  void qrUpdate(int iter) {
    QrTimer.Start();
    for (int i = 0; i < iter ; ++i) {
      auto tmp       = -s[i] * H(iter, i) + c[i] * H(iter, i + 1);
      H(iter, i)     = std::conj(c[i]) * H(iter, i) + std::conj(s[i]) * H(iter, i + 1);
      H(iter, i + 1) = tmp;
    }
    // Compute new Givens Rotation
    ComplexD nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1)));
    c[iter]     = H(iter, iter) / nu;
    s[iter]     = H(iter, iter + 1) / nu;
    // Apply new Givens rotation
    H(iter, iter)     = nu;
    H(iter, iter + 1) = 0.;
    gamma[iter + 1] = -s[iter] * gamma[iter];
    gamma[iter]     = std::conj(c[iter]) * gamma[iter];
    QrTimer.Stop();
  }
  // Solves the triangular system for y[0..iter] by back substitution and adds
  // the combination sum_i z[i] * y[i] of the preconditioned vectors to psi.
  void computeSolution(std::vector<Field> const &z, Field &psi, int iter) {
    CompSolutionTimer.Start();
    for (int i = iter; i >= 0; i--) {
      y[i] = gamma[i];
      for (int k = i + 1; k <= iter; k++)
        y[i] = y[i] - H(k, i) * y[k];
      y[i] = y[i] / H(i, i);
    }
    for (int i = 0; i <= iter; i++)
      psi = psi + z[i] * y[i];
    CompSolutionTimer.Stop();
  }
 };
 }
 #endif
@@ -0,0 +1,254 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/algorithms/iterative/FlexibleGeneralisedMinimalResidual.h
 Copyright (C) 2015
 Author: Daniel Richtmann <daniel.richtmann@ur.de>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_FLEXIBLE_GENERALISED_MINIMAL_RESIDUAL_H
 #define GRID_FLEXIBLE_GENERALISED_MINIMAL_RESIDUAL_H
 namespace Grid {
 // Restarted flexible (right-preconditioned) GMRES solver.
 //
 // Each restart cycle (outerLoopBody) extends a Krylov basis with arnoldiStep,
 // storing the preconditioned vectors z[i] alongside the basis v[i], keeps the
 // Hessenberg matrix triangular with Givens rotations (qrUpdate) and finally
 // adds the correction built from the z[i] to the solution (computeSolution).
 template<class Field>
 class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> {
 public:
  bool ErrorOnNoConverge; // Throw an assert when FGMRES fails to converge,
                          // defaults to true
  RealD   Tolerance;           // relative tolerance; the target is Tolerance^2 * norm2(src)
  Integer MaxIterations;       // upper bound on the total number of Arnoldi steps
  Integer RestartLength;       // number of Arnoldi steps per restart cycle
  Integer MaxNumberOfRestarts; // MaxIterations / RestartLength, rounded up
  Integer IterationCount; // Number of iterations the FGMRES took to finish,
                          // filled in upon completion
  GridStopWatch MatrixTimer;       // time spent applying LinOp.Op
  GridStopWatch PrecTimer;         // time spent in the preconditioner
  GridStopWatch LinalgTimer;       // time spent in orthogonalisation / vector algebra
  GridStopWatch QrTimer;           // time spent in qrUpdate
  GridStopWatch CompSolutionTimer; // time spent in computeSolution
  // H(j, i) is the coefficient of v[i] obtained in Arnoldi step j, i.e. the
  // row index is the step and the column index is the basis vector.
  Eigen::MatrixXcd H;
  std::vector<std::complex<double>> y;     // solution of the triangular system
  std::vector<std::complex<double>> gamma; // rotated right-hand side; |gamma[i+1]|^2 is the residual estimate
  std::vector<std::complex<double>> c;     // Givens rotation coefficients
  std::vector<std::complex<double>> s;     // Givens rotation coefficients
  // Held by reference: the caller must keep the preconditioner alive for as
  // long as this solver is used.
  LinearFunction<Field> &Preconditioner;
  // tol            : relative tolerance
  // maxit          : maximum total number of iterations
  // Prec           : preconditioner applied to every basis vector
  // restart_length : iterations per restart cycle (must be > 0, it is used as a divisor)
  // err_on_no_conv : assert when the solver does not converge
  FlexibleGeneralisedMinimalResidual(RealD   tol,
                                     Integer maxit,
                                     LinearFunction<Field> &Prec,
                                     Integer restart_length,
                                     bool    err_on_no_conv = true)
      : ErrorOnNoConverge(err_on_no_conv) // initialisers are listed in declaration order
      , Tolerance(tol)
      , MaxIterations(maxit)
      , RestartLength(restart_length)
      , MaxNumberOfRestarts(MaxIterations/RestartLength + ((MaxIterations%RestartLength == 0) ? 0 : 1))
      , IterationCount(0)
      , H(Eigen::MatrixXcd::Zero(RestartLength, RestartLength + 1)) // sizes taken from DD-αAMG code base
      , y(RestartLength + 1, 0.)
      , gamma(RestartLength + 1, 0.)
      , c(RestartLength + 1, 0.)
      , s(RestartLength + 1, 0.)
      , Preconditioner(Prec) {}
  // Solves LinOp.Op(psi) = src, using the incoming psi as the initial guess.
  // Runs up to MaxNumberOfRestarts restart cycles and returns as soon as the
  // squared residual estimate is at most Tolerance^2 * norm2(src). If no cycle
  // reaches the target a message is logged and, when ErrorOnNoConverge is set,
  // the solver asserts.
  void operator()(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi) {
    psi.checkerboard = src.checkerboard;
    conformable(psi, src);
    RealD guess = norm2(psi);
    assert(std::isnan(guess) == 0);
    RealD cp;
    RealD ssq = norm2(src);
    RealD rsq = Tolerance * Tolerance * ssq;
    Field r(src._grid);
    std::cout << std::setprecision(4) << std::scientific;
    std::cout << GridLogIterative << "FlexibleGeneralisedMinimalResidual: guess " << guess << std::endl;
    std::cout << GridLogIterative << "FlexibleGeneralisedMinimalResidual:   src " << ssq   << std::endl;
    PrecTimer.Reset();
    MatrixTimer.Reset();
    LinalgTimer.Reset();
    QrTimer.Reset();
    CompSolutionTimer.Reset();
    GridStopWatch SolverTimer;
    SolverTimer.Start();
    IterationCount = 0;
    for (int k=0; k<MaxNumberOfRestarts; k++) {
      cp = outerLoopBody(LinOp, src, psi, rsq);
      // Stopping condition
      if (cp <= rsq) {
        SolverTimer.Stop();
        // Recompute the residual explicitly so the log shows the true value
        // next to the estimate carried by the Givens recurrence.
        LinOp.Op(psi,r);
        axpy(r,-1.0,src,r);
        RealD srcnorm       = sqrt(ssq);
        RealD resnorm       = sqrt(norm2(r));
        RealD true_residual = resnorm / srcnorm;
        std::cout << GridLogMessage        << "FlexibleGeneralisedMinimalResidual: Converged on iteration " << IterationCount
                  << " computed residual " << sqrt(cp / ssq)
                  << " true residual "     << true_residual
                  << " target "            << Tolerance << std::endl;
        std::cout << GridLogMessage << "FGMRES Time elapsed: Total   " <<       SolverTimer.Elapsed() << std::endl;
        std::cout << GridLogMessage << "FGMRES Time elapsed: Precon  " <<         PrecTimer.Elapsed() << std::endl;
        std::cout << GridLogMessage << "FGMRES Time elapsed: Matrix  " <<       MatrixTimer.Elapsed() << std::endl;
        std::cout << GridLogMessage << "FGMRES Time elapsed: Linalg  " <<       LinalgTimer.Elapsed() << std::endl;
        std::cout << GridLogMessage << "FGMRES Time elapsed: QR      " <<           QrTimer.Elapsed() << std::endl;
        std::cout << GridLogMessage << "FGMRES Time elapsed: CompSol " << CompSolutionTimer.Elapsed() << std::endl;
        return;
      }
    }
    std::cout << GridLogMessage << "FlexibleGeneralisedMinimalResidual did NOT converge" << std::endl;
    if (ErrorOnNoConverge)
      assert(0);
  }
  // One restart cycle: computes the residual of the current psi, performs up
  // to RestartLength preconditioned Arnoldi steps and updates psi. Returns the
  // squared residual estimate, or 0 without touching psi when the residual of
  // the incoming psi is exactly zero.
  RealD outerLoopBody(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi, RealD rsq) {
    RealD cp = 0;
    Field w(src._grid);
    Field r(src._grid);
    // these should probably be made class members so that they are only allocated once, not in every restart
    std::vector<Field> v(RestartLength + 1, src._grid); for (auto &elem : v) elem = zero;
    std::vector<Field> z(RestartLength + 1, src._grid); for (auto &elem : z) elem = zero;
    MatrixTimer.Start();
    LinOp.Op(psi, w);
    MatrixTimer.Stop();
    LinalgTimer.Start();
    r = src - w;
    const RealD rnorm = sqrt(norm2(r));
    gamma[0] = rnorm;
    if (rnorm == 0.) {
      // psi already solves the system exactly. Normalising r would divide by
      // zero and fill the basis with NaNs, so report a zero residual instead.
      LinalgTimer.Stop();
      return cp;
    }
    v[0] = (1. / gamma[0]) * r;
    LinalgTimer.Stop();
    for (int i=0; i<RestartLength; i++) {
      IterationCount++;
      arnoldiStep(LinOp, v, z, w, i);
      qrUpdate(i);
      cp = std::norm(gamma[i+1]);
      std::cout << GridLogIterative << "FlexibleGeneralisedMinimalResidual: Iteration " << IterationCount
                << " residual " << cp << " target " << rsq << std::endl;
      // Leave the cycle at its end, when the iteration budget is used up, or
      // when the estimate has reached the target.
      if ((i == RestartLength - 1) || (IterationCount == MaxIterations) || (cp <= rsq)) {
        computeSolution(z, psi, i);
        return cp;
      }
    }
    assert(0); // Never reached
    return cp;
  }
  // Arnoldi step `iter`: z[iter] = Preconditioner(v[iter]), w = LinOp.Op(z[iter]).
  // w is then orthogonalised against v[0..iter], subtracting each projection
  // immediately; the coefficients go to row `iter` of H and the normalised
  // remainder becomes v[iter + 1].
  void arnoldiStep(LinearOperatorBase<Field> &LinOp, std::vector<Field> &v, std::vector<Field> &z, Field &w, int iter) {
    PrecTimer.Start();
    Preconditioner(v[iter], z[iter]);
    PrecTimer.Stop();
    MatrixTimer.Start();
    LinOp.Op(z[iter], w);
    MatrixTimer.Stop();
    LinalgTimer.Start();
    for (int i = 0; i <= iter; ++i) {
      H(iter, i) = innerProduct(v[i], w);
      w = w - H(iter, i) * v[i];
    }
    H(iter, iter + 1) = sqrt(norm2(w));
    v[iter + 1] = (1. / H(iter, iter + 1)) * w;
    LinalgTimer.Stop();
  }
  // Applies the rotations of the previous steps to row `iter` of H, then
  // computes the rotation that zeroes H(iter, iter + 1) and applies it to H
  // and to gamma.
  void qrUpdate(int iter) {
    QrTimer.Start();
    for (int i = 0; i < iter ; ++i) {
      auto tmp       = -s[i] * H(iter, i) + c[i] * H(iter, i + 1);
      H(iter, i)     = std::conj(c[i]) * H(iter, i) + std::conj(s[i]) * H(iter, i + 1);
      H(iter, i + 1) = tmp;
    }
    // Compute new Givens Rotation
    ComplexD nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1)));
    c[iter]     = H(iter, iter) / nu;
    s[iter]     = H(iter, iter + 1) / nu;
    // Apply new Givens rotation
    H(iter, iter)     = nu;
    H(iter, iter + 1) = 0.;
    gamma[iter + 1] = -s[iter] * gamma[iter];
    gamma[iter]     = std::conj(c[iter]) * gamma[iter];
    QrTimer.Stop();
  }
  // Solves the triangular system for y[0..iter] by back substitution and adds
  // the combination sum_i z[i] * y[i] of the preconditioned vectors to psi.
  void computeSolution(std::vector<Field> const &z, Field &psi, int iter) {
    CompSolutionTimer.Start();
    for (int i = iter; i >= 0; i--) {
      y[i] = gamma[i];
      for (int k = i + 1; k <= iter; k++)
        y[i] = y[i] - H(k, i) * y[k];
      y[i] = y[i] / H(i, i);
    }
    for (int i = 0; i <= iter; i++)
      psi = psi + z[i] * y[i];
    CompSolutionTimer.Stop();
  }
 };
 }
 #endif
@@ -0,0 +1,242 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/algorithms/iterative/GeneralisedMinimalResidual.h
 Copyright (C) 2015
 Author: Daniel Richtmann <daniel.richtmann@ur.de>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_GENERALISED_MINIMAL_RESIDUAL_H
 #define GRID_GENERALISED_MINIMAL_RESIDUAL_H
 namespace Grid {
 // Restarted GMRES (generalised minimal residual) solver for LinOp * psi = src.
 //
 // Every restart cycle builds at most RestartLength Arnoldi vectors, keeps the
 // Hessenberg matrix triangular by means of Givens rotations and finally
 // updates psi through a back substitution. H(iter, i) holds the coefficient
 // of basis vector i that was generated in Arnoldi step iter.
 template<class Field>
 class GeneralisedMinimalResidual : public OperatorFunction<Field> {
 public:
  bool ErrorOnNoConverge; // Throw an assert when GMRES fails to converge,
                          // defaults to true
  RealD   Tolerance;           // Target for |residual| / |src|
  Integer MaxIterations;       // Upper bound on the total number of Arnoldi steps
  Integer RestartLength;       // Number of Arnoldi steps per restart cycle
  Integer MaxNumberOfRestarts; // ceil(MaxIterations / RestartLength)
  Integer IterationCount; // Number of iterations the GMRES took to finish,
                          // filled in upon completion
  GridStopWatch MatrixTimer;
  GridStopWatch LinalgTimer;
  GridStopWatch QrTimer;
  GridStopWatch CompSolutionTimer;
  Eigen::MatrixXcd H;                      // Hessenberg coefficients, see class comment
  std::vector<std::complex<double>> y;     // Solution of the small triangular system
  std::vector<std::complex<double>> gamma; // Rotated right-hand side; |gamma[i+1]|^2 estimates the residual
  std::vector<std::complex<double>> c;     // Givens rotation coefficients
  std::vector<std::complex<double>> s;     // Givens rotation coefficients
  // tol            : relative residual target
  // maxit          : maximum total number of iterations
  // restart_length : iterations per restart cycle (must be non-zero, it is used as a divisor)
  // err_on_no_conv : assert when the iteration budget is exhausted without convergence
  // The initialisers are listed in declaration order, which is the order in
  // which they are executed.
  GeneralisedMinimalResidual(RealD   tol,
                             Integer maxit,
                             Integer restart_length,
                             bool    err_on_no_conv = true)
      : ErrorOnNoConverge(err_on_no_conv)
      , Tolerance(tol)
      , MaxIterations(maxit)
      , RestartLength(restart_length)
      , MaxNumberOfRestarts(MaxIterations/RestartLength + ((MaxIterations%RestartLength == 0) ? 0 : 1))
      , H(Eigen::MatrixXcd::Zero(RestartLength, RestartLength + 1)) // sizes taken from DD-αAMG code base
      , y(RestartLength + 1, 0.)
      , gamma(RestartLength + 1, 0.)
      , c(RestartLength + 1, 0.)
      , s(RestartLength + 1, 0.) {}
  // Solves LinOp * psi = src, using the incoming psi as initial guess.
  // Runs restart cycles until the squared residual estimate drops to
  // Tolerance^2 * |src|^2 or MaxNumberOfRestarts cycles have been spent.
  void operator()(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi) {
    psi.checkerboard = src.checkerboard;
    conformable(psi, src);
    RealD guess = norm2(psi);
    assert(std::isnan(guess) == 0);
    RealD cp;
    RealD ssq = norm2(src);
    RealD rsq = Tolerance * Tolerance * ssq;
    Field r(src._grid);
    std::cout << std::setprecision(4) << std::scientific;
    std::cout << GridLogIterative << "GeneralisedMinimalResidual: guess " << guess << std::endl;
    std::cout << GridLogIterative << "GeneralisedMinimalResidual:   src " << ssq   << std::endl;
    MatrixTimer.Reset();
    LinalgTimer.Reset();
    QrTimer.Reset();
    CompSolutionTimer.Reset();
    GridStopWatch SolverTimer;
    SolverTimer.Start();
    IterationCount = 0;
    for (int k=0; k<MaxNumberOfRestarts; k++) {
      cp = outerLoopBody(LinOp, src, psi, rsq);
      // Stopping condition
      if (cp <= rsq) {
        SolverTimer.Stop();
        // Recompute the residual explicitly for the report
        LinOp.Op(psi,r);
        axpy(r,-1.0,src,r);
        RealD srcnorm       = sqrt(ssq);
        RealD resnorm       = sqrt(norm2(r));
        RealD true_residual = resnorm / srcnorm;
        std::cout << GridLogMessage        << "GeneralisedMinimalResidual: Converged on iteration " << IterationCount
                  << " computed residual " << sqrt(cp / ssq)
                  << " true residual "     << true_residual
                  << " target "            << Tolerance << std::endl;
        std::cout << GridLogMessage << "GMRES Time elapsed: Total   " <<       SolverTimer.Elapsed() << std::endl;
        std::cout << GridLogMessage << "GMRES Time elapsed: Matrix  " <<       MatrixTimer.Elapsed() << std::endl;
        std::cout << GridLogMessage << "GMRES Time elapsed: Linalg  " <<       LinalgTimer.Elapsed() << std::endl;
        std::cout << GridLogMessage << "GMRES Time elapsed: QR      " <<           QrTimer.Elapsed() << std::endl;
        std::cout << GridLogMessage << "GMRES Time elapsed: CompSol " << CompSolutionTimer.Elapsed() << std::endl;
        return;
      }
    }
    std::cout << GridLogMessage << "GeneralisedMinimalResidual did NOT converge" << std::endl;
    if (ErrorOnNoConverge)
      assert(0);
  }
  // One restart cycle. Returns the squared residual: the estimate
  // |gamma[i+1]|^2 after the last step taken, or the explicitly computed
  // starting residual when that one already satisfies the target.
  RealD outerLoopBody(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi, RealD rsq) {
    RealD cp = 0;
    Field w(src._grid);
    Field r(src._grid);
    MatrixTimer.Start();
    LinOp.Op(psi, w);
    MatrixTimer.Stop();
    LinalgTimer.Start();
    r = src - w;
    RealD r2 = norm2(r);
    // A starting residual that already meets the target needs no Krylov
    // space. This also covers r == 0, for which the normalisation below
    // would divide by zero and fill the basis with NaNs.
    if (r2 <= rsq) {
      LinalgTimer.Stop();
      return r2;
    }
    // this should probably be made a class member so that it is only allocated once, not in every restart
    std::vector<Field> v(RestartLength + 1, src._grid); for (auto &elem : v) elem = zero;
    gamma[0] = sqrt(r2);
    v[0] = (1. / gamma[0]) * r;
    LinalgTimer.Stop();
    for (int i=0; i<RestartLength; i++) {
      IterationCount++;
      arnoldiStep(LinOp, v, w, i);
      qrUpdate(i);
      cp = std::norm(gamma[i+1]);
      std::cout << GridLogIterative << "GeneralisedMinimalResidual: Iteration " << IterationCount
                << " residual " << cp << " target " << rsq << std::endl;
      // Leave the cycle at its end, when the budget is used up, or on convergence
      if ((i == RestartLength - 1) || (IterationCount == MaxIterations) || (cp <= rsq)) {
        computeSolution(v, psi, i);
        return cp;
      }
    }
    assert(0); // Never reached
    return cp;
  }
  // Arnoldi step `iter`: w = LinOp v[iter] is orthogonalised against
  // v[0..iter] (coefficients go to row `iter` of H) and the normalised
  // remainder is stored as v[iter + 1].
  void arnoldiStep(LinearOperatorBase<Field> &LinOp, std::vector<Field> &v, Field &w, int iter) {
    MatrixTimer.Start();
    LinOp.Op(v[iter], w);
    MatrixTimer.Stop();
    LinalgTimer.Start();
    for (int i = 0; i <= iter; ++i) {
      H(iter, i) = innerProduct(v[i], w);
      w = w - H(iter, i) * v[i];
    }
    H(iter, iter + 1) = sqrt(norm2(w));
    v[iter + 1] = (1. / H(iter, iter + 1)) * w;
    LinalgTimer.Stop();
  }
  // Applies the stored Givens rotations to row `iter` of H, computes the
  // rotation eliminating H(iter, iter + 1) and rotates gamma accordingly.
  void qrUpdate(int iter) {
    QrTimer.Start();
    for (int i = 0; i < iter ; ++i) {
      auto tmp       = -s[i] * H(iter, i) + c[i] * H(iter, i + 1);
      H(iter, i)     = std::conj(c[i]) * H(iter, i) + std::conj(s[i]) * H(iter, i + 1);
      H(iter, i + 1) = tmp;
    }
    // Compute new Givens Rotation
    ComplexD nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1)));
    c[iter]     = H(iter, iter) / nu;
    s[iter]     = H(iter, iter + 1) / nu;
    // Apply new Givens rotation
    H(iter, iter)     = nu;
    H(iter, iter + 1) = 0.;
    gamma[iter + 1] = -s[iter] * gamma[iter];
    gamma[iter]     = std::conj(c[iter]) * gamma[iter];
    QrTimer.Stop();
  }
  // Back substitution for y[0..iter], followed by psi += sum_i y[i] v[i].
  void computeSolution(std::vector<Field> const &v, Field &psi, int iter) {
    CompSolutionTimer.Start();
    for (int i = iter; i >= 0; i--) {
      y[i] = gamma[i];
      for (int k = i + 1; k <= iter; k++)
        y[i] = y[i] - H(k, i) * y[k];
      y[i] = y[i] / H(i, i);
    }
    for (int i = 0; i <= iter; i++)
      psi = psi + v[i] * y[i];
    CompSolutionTimer.Stop();
  }
 };
 }
 #endif
@@ -0,0 +1,156 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/algorithms/iterative/MinimalResidual.h
 Copyright (C) 2015
 Author: Daniel Richtmann <daniel.richtmann@ur.de>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_MINIMAL_RESIDUAL_H
 #define GRID_MINIMAL_RESIDUAL_H
 namespace Grid {
 // Minimal residual (MR) iteration for Linop * psi = src. In every step the
 // residual r is reduced along Linop * r with step size
 // overRelaxParam * <Mr, r> / |Mr|^2.
 template<class Field> class MinimalResidual : public OperatorFunction<Field> {
 public:
  bool ErrorOnNoConverge; // throw an assert when the MR fails to converge.
                          // Defaults true.
  RealD   Tolerance;      // Target for |residual| / |src|
  Integer MaxIterations;  // Upper bound on the number of MR steps
  RealD   overRelaxParam; // Factor multiplying the step size
  Integer IterationsToComplete; // Number of iterations the MR took to finish.
                                // Filled in upon completion
  // Initialisers are listed in declaration order, which is the order in which
  // they are executed.
  MinimalResidual(RealD tol, Integer maxit, Real ovrelparam = 1.0, bool err_on_no_conv = true)
    : ErrorOnNoConverge(err_on_no_conv), Tolerance(tol), MaxIterations(maxit), overRelaxParam(ovrelparam), IterationsToComplete(0) {}
  // Solves Linop * psi = src, using the incoming psi as initial guess.
  // Returns as soon as |r|^2 <= Tolerance^2 * |src|^2; IterationsToComplete
  // then holds the number of steps taken (0 if the guess was good enough).
  void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {
    psi.checkerboard = src.checkerboard;
    conformable(psi, src);
    Complex a, c;
    Real    d;
    Field Mr(src);
    Field r(src);
    // Initial residual computation & set up
    RealD guess = norm2(psi);
    assert(std::isnan(guess) == 0);
    RealD ssq = norm2(src);
    RealD rsq = Tolerance * Tolerance * ssq;
    Linop.Op(psi, Mr);
    r = src - Mr;
    RealD cp = norm2(r);
    std::cout << std::setprecision(4) << std::scientific;
    std::cout << GridLogIterative << "MinimalResidual: guess " << guess << std::endl;
    std::cout << GridLogIterative << "MinimalResidual:   src " << ssq << std::endl;
    // d has no value before the first iteration, so it is not logged here
    std::cout << GridLogIterative << "MinimalResidual:  cp,r " << cp << std::endl;
    if (cp <= rsq) {
      // The initial guess already satisfies the target: no iteration was needed
      IterationsToComplete = 0;
      return;
    }
    std::cout << GridLogIterative << "MinimalResidual: k=0 residual " << cp << " target " << rsq << std::endl;
    GridStopWatch LinalgTimer;
    GridStopWatch MatrixTimer;
    GridStopWatch SolverTimer;
    SolverTimer.Start();
    int k;
    for (k = 1; k <= MaxIterations; k++) {
      MatrixTimer.Start();
      Linop.Op(r, Mr);
      MatrixTimer.Stop();
      LinalgTimer.Start();
      // Step size minimising |r - a Mr|, scaled by the over-relaxation factor
      c = innerProduct(Mr, r);
      d = norm2(Mr);
      a = c / d;
      a = a * overRelaxParam;
      psi = psi + r * a;
      r = r - Mr * a;
      cp = norm2(r);
      LinalgTimer.Stop();
      std::cout << GridLogIterative << "MinimalResidual: Iteration " << k
                << " residual " << cp << " target " << rsq << std::endl;
      std::cout << GridLogDebug << "a = " << a << " c = " << c << " d = " << d << std::endl;
      // Stopping condition
      if (cp <= rsq) {
        SolverTimer.Stop();
        // Recompute the residual explicitly for the report
        Linop.Op(psi, Mr);
        r = src - Mr;
        RealD srcnorm       = sqrt(ssq);
        RealD resnorm       = sqrt(norm2(r));
        RealD true_residual = resnorm / srcnorm;
        std::cout << GridLogMessage        << "MinimalResidual Converged on iteration " << k
                  << " computed residual " << sqrt(cp / ssq)
                  << " true residual "     << true_residual
                  << " target "            << Tolerance << std::endl;
        std::cout << GridLogMessage << "MR Time elapsed: Total   " << SolverTimer.Elapsed() << std::endl;
        std::cout << GridLogMessage << "MR Time elapsed: Matrix  " << MatrixTimer.Elapsed() << std::endl;
        std::cout << GridLogMessage << "MR Time elapsed: Linalg  " << LinalgTimer.Elapsed() << std::endl;
        if (ErrorOnNoConverge)
          assert(true_residual / Tolerance < 10000.0);
        IterationsToComplete = k;
        return;
      }
    }
    std::cout << GridLogMessage << "MinimalResidual did NOT converge"
              << std::endl;
    if (ErrorOnNoConverge)
      assert(0);
    // k equals MaxIterations + 1 at this point (the loop ran to completion)
    IterationsToComplete = k;
  }
 };
 } // namespace Grid
 #endif
@@ -0,0 +1,273 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/algorithms/iterative/MixedPrecisionFlexibleGeneralisedMinimalResidual.h
 Copyright (C) 2015
 Author: Daniel Richtmann <daniel.richtmann@ur.de>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_MIXED_PRECISION_FLEXIBLE_GENERALISED_MINIMAL_RESIDUAL_H
 #define GRID_MIXED_PRECISION_FLEXIBLE_GENERALISED_MINIMAL_RESIDUAL_H
 namespace Grid {
 // Restarted flexible GMRES in double precision whose preconditioner is
 // applied in single precision. Every Arnoldi step converts the current basis
 // vector to FieldF, runs the preconditioner on SinglePrecGrid and converts
 // the result back to FieldD. H(iter, i) holds the coefficient of basis vector
 // i that was generated in Arnoldi step iter.
 template<class FieldD, class FieldF, typename std::enable_if<getPrecision<FieldD>::value == 2, int>::type = 0, typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
 class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction<FieldD> {
 public:
  bool ErrorOnNoConverge; // Throw an assert when MPFGMRES fails to converge,
                          // defaults to true
  RealD   Tolerance;           // Target for |residual| / |src|
  Integer MaxIterations;       // Upper bound on the total number of Arnoldi steps
  Integer RestartLength;       // Number of Arnoldi steps per restart cycle
  Integer MaxNumberOfRestarts; // ceil(MaxIterations / RestartLength)
  Integer IterationCount; // Number of iterations the MPFGMRES took to finish,
                          // filled in upon completion
  GridStopWatch MatrixTimer;
  GridStopWatch PrecTimer;
  GridStopWatch LinalgTimer;
  GridStopWatch QrTimer;
  GridStopWatch CompSolutionTimer;
  GridStopWatch ChangePrecTimer;
  Eigen::MatrixXcd H;                      // Hessenberg coefficients, see class comment
  std::vector<std::complex<double>> y;     // Solution of the small triangular system
  std::vector<std::complex<double>> gamma; // Rotated right-hand side; |gamma[i+1]|^2 estimates the residual
  std::vector<std::complex<double>> c;     // Givens rotation coefficients
  std::vector<std::complex<double>> s;     // Givens rotation coefficients
  GridBase* SinglePrecGrid;                // Grid on which the FieldF temporaries are created
  LinearFunction<FieldF> &Preconditioner;  // Held by reference; must outlive this solver
  // tol            : relative residual target
  // maxit          : maximum total number of iterations
  // sp_grid        : grid used for the single precision temporaries
  // Prec           : single precision preconditioner
  // restart_length : iterations per restart cycle (must be non-zero, it is used as a divisor)
  // err_on_no_conv : assert when the iteration budget is exhausted without convergence
  // The initialisers are listed in declaration order, which is the order in
  // which they are executed.
  MixedPrecisionFlexibleGeneralisedMinimalResidual(RealD   tol,
                                                   Integer maxit,
                                                   GridBase * sp_grid,
                                                   LinearFunction<FieldF> &Prec,
                                                   Integer restart_length,
                                                   bool    err_on_no_conv = true)
      : ErrorOnNoConverge(err_on_no_conv)
      , Tolerance(tol)
      , MaxIterations(maxit)
      , RestartLength(restart_length)
      , MaxNumberOfRestarts(MaxIterations/RestartLength + ((MaxIterations%RestartLength == 0) ? 0 : 1))
      , H(Eigen::MatrixXcd::Zero(RestartLength, RestartLength + 1)) // sizes taken from DD-αAMG code base
      , y(RestartLength + 1, 0.)
      , gamma(RestartLength + 1, 0.)
      , c(RestartLength + 1, 0.)
      , s(RestartLength + 1, 0.)
      , SinglePrecGrid(sp_grid)
      , Preconditioner(Prec) {}
  // Solves LinOp * psi = src, using the incoming psi as initial guess.
  // Runs restart cycles until the squared residual estimate drops to
  // Tolerance^2 * |src|^2 or MaxNumberOfRestarts cycles have been spent.
  void operator()(LinearOperatorBase<FieldD> &LinOp, const FieldD &src, FieldD &psi) {
    psi.checkerboard = src.checkerboard;
    conformable(psi, src);
    RealD guess = norm2(psi);
    assert(std::isnan(guess) == 0);
    RealD cp;
    RealD ssq = norm2(src);
    RealD rsq = Tolerance * Tolerance * ssq;
    FieldD r(src._grid);
    std::cout << std::setprecision(4) << std::scientific;
    std::cout << GridLogIterative << "MPFGMRES: guess " << guess << std::endl;
    std::cout << GridLogIterative << "MPFGMRES:   src " << ssq   << std::endl;
    PrecTimer.Reset();
    MatrixTimer.Reset();
    LinalgTimer.Reset();
    QrTimer.Reset();
    CompSolutionTimer.Reset();
    ChangePrecTimer.Reset();
    GridStopWatch SolverTimer;
    SolverTimer.Start();
    IterationCount = 0;
    for (int k=0; k<MaxNumberOfRestarts; k++) {
      cp = outerLoopBody(LinOp, src, psi, rsq);
      // Stopping condition
      if (cp <= rsq) {
        SolverTimer.Stop();
        // Recompute the residual explicitly for the report
        LinOp.Op(psi,r);
        axpy(r,-1.0,src,r);
        RealD srcnorm       = sqrt(ssq);
        RealD resnorm       = sqrt(norm2(r));
        RealD true_residual = resnorm / srcnorm;
        std::cout << GridLogMessage        << "MPFGMRES: Converged on iteration " << IterationCount
                  << " computed residual " << sqrt(cp / ssq)
                  << " true residual "     << true_residual
                  << " target "            << Tolerance << std::endl;
        std::cout << GridLogMessage << "MPFGMRES Time elapsed: Total      " <<       SolverTimer.Elapsed() << std::endl;
        std::cout << GridLogMessage << "MPFGMRES Time elapsed: Precon     " <<         PrecTimer.Elapsed() << std::endl;
        std::cout << GridLogMessage << "MPFGMRES Time elapsed: Matrix     " <<       MatrixTimer.Elapsed() << std::endl;
        std::cout << GridLogMessage << "MPFGMRES Time elapsed: Linalg     " <<       LinalgTimer.Elapsed() << std::endl;
        std::cout << GridLogMessage << "MPFGMRES Time elapsed: QR         " <<           QrTimer.Elapsed() << std::endl;
        std::cout << GridLogMessage << "MPFGMRES Time elapsed: CompSol    " << CompSolutionTimer.Elapsed() << std::endl;
        std::cout << GridLogMessage << "MPFGMRES Time elapsed: PrecChange " <<   ChangePrecTimer.Elapsed() << std::endl;
        return;
      }
    }
    std::cout << GridLogMessage << "MPFGMRES did NOT converge" << std::endl;
    if (ErrorOnNoConverge)
      assert(0);
  }
  // One restart cycle. Returns the squared residual: the estimate
  // |gamma[i+1]|^2 after the last step taken, or the explicitly computed
  // starting residual when that one already satisfies the target.
  RealD outerLoopBody(LinearOperatorBase<FieldD> &LinOp, const FieldD &src, FieldD &psi, RealD rsq) {
    RealD cp = 0;
    FieldD w(src._grid);
    FieldD r(src._grid);
    MatrixTimer.Start();
    LinOp.Op(psi, w);
    MatrixTimer.Stop();
    LinalgTimer.Start();
    r = src - w;
    RealD r2 = norm2(r);
    // A starting residual that already meets the target needs no Krylov
    // space. This also covers r == 0, for which the normalisation below
    // would divide by zero and fill the basis with NaNs.
    if (r2 <= rsq) {
      LinalgTimer.Stop();
      return r2;
    }
    // these should probably be made class members so that they are only allocated once, not in every restart
    std::vector<FieldD> v(RestartLength + 1, src._grid); for (auto &elem : v) elem = zero;
    std::vector<FieldD> z(RestartLength + 1, src._grid); for (auto &elem : z) elem = zero;
    gamma[0] = sqrt(r2);
    v[0] = (1. / gamma[0]) * r;
    LinalgTimer.Stop();
    for (int i=0; i<RestartLength; i++) {
      IterationCount++;
      arnoldiStep(LinOp, v, z, w, i);
      qrUpdate(i);
      cp = std::norm(gamma[i+1]);
      std::cout << GridLogIterative << "MPFGMRES: Iteration " << IterationCount
                << " residual " << cp << " target " << rsq << std::endl;
      // Leave the cycle at its end, when the budget is used up, or on convergence
      if ((i == RestartLength - 1) || (IterationCount == MaxIterations) || (cp <= rsq)) {
        computeSolution(z, psi, i);
        return cp;
      }
    }
    assert(0); // Never reached
    return cp;
  }
  // Arnoldi step `iter`: z[iter] = Preconditioner(v[iter]) evaluated in single
  // precision, w = LinOp z[iter], then w is orthogonalised against v[0..iter]
  // (coefficients go to row `iter` of H) and normalised into v[iter + 1].
  void arnoldiStep(LinearOperatorBase<FieldD> &LinOp, std::vector<FieldD> &v, std::vector<FieldD> &z, FieldD &w, int iter) {
    FieldF v_f(SinglePrecGrid);
    FieldF z_f(SinglePrecGrid);
    // Double -> single; z_f receives the current content of z[iter]
    ChangePrecTimer.Start();
    precisionChange(v_f, v[iter]);
    precisionChange(z_f, z[iter]);
    ChangePrecTimer.Stop();
    PrecTimer.Start();
    Preconditioner(v_f, z_f);
    PrecTimer.Stop();
    // Single -> double
    ChangePrecTimer.Start();
    precisionChange(z[iter], z_f);
    ChangePrecTimer.Stop();
    MatrixTimer.Start();
    LinOp.Op(z[iter], w);
    MatrixTimer.Stop();
    LinalgTimer.Start();
    for (int i = 0; i <= iter; ++i) {
      H(iter, i) = innerProduct(v[i], w);
      w = w - H(iter, i) * v[i];
    }
    H(iter, iter + 1) = sqrt(norm2(w));
    v[iter + 1] = (1. / H(iter, iter + 1)) * w;
    LinalgTimer.Stop();
  }
  // Applies the stored Givens rotations to row `iter` of H, computes the
  // rotation eliminating H(iter, iter + 1) and rotates gamma accordingly.
  void qrUpdate(int iter) {
    QrTimer.Start();
    for (int i = 0; i < iter ; ++i) {
      auto tmp       = -s[i] * H(iter, i) + c[i] * H(iter, i + 1);
      H(iter, i)     = std::conj(c[i]) * H(iter, i) + std::conj(s[i]) * H(iter, i + 1);
      H(iter, i + 1) = tmp;
    }
    // Compute new Givens Rotation
    ComplexD nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1)));
    c[iter]     = H(iter, iter) / nu;
    s[iter]     = H(iter, iter + 1) / nu;
    // Apply new Givens rotation
    H(iter, iter)     = nu;
    H(iter, iter + 1) = 0.;
    gamma[iter + 1] = -s[iter] * gamma[iter];
    gamma[iter]     = std::conj(c[iter]) * gamma[iter];
    QrTimer.Stop();
  }
  // Back substitution for y[0..iter], followed by psi += sum_i y[i] z[i].
  void computeSolution(std::vector<FieldD> const &z, FieldD &psi, int iter) {
    CompSolutionTimer.Start();
    for (int i = iter; i >= 0; i--) {
      y[i] = gamma[i];
      for (int k = i + 1; k <= iter; k++)
        y[i] = y[i] - H(k, i) * y[k];
      y[i] = y[i] / H(i, i);
    }
    for (int i = 0; i <= iter; i++)
      psi = psi + z[i] * y[i];
    CompSolutionTimer.Stop();
  }
 };
 }
 #endif
@@ -139,7 +139,10 @@ namespace Grid {
      MatTimer.Start();
      Linop.HermOpAndNorm(psi,Az,zAz,zAAz); 
      MatTimer.Stop();
      LinalgTimer.Start();
      r=src-Az;
      LinalgTimer.Stop();
      /////////////////////
      // p = Prec(r)
@@ -152,8 +155,10 @@ namespace Grid {
      Linop.HermOp(z,tmp); 
      MatTimer.Stop();
      LinalgTimer.Start();
      ttmp=tmp;
      tmp=tmp-r;
      LinalgTimer.Stop();
      /*
      std::cout<<GridLogMessage<<r<<std::endl;
@@ -166,12 +171,14 @@ namespace Grid {
      Linop.HermOpAndNorm(z,Az,zAz,zAAz); 
      MatTimer.Stop();
      LinalgTimer.Start();
      //p[0],q[0],qq[0] 
      p[0]= z;
      q[0]= Az;
      qq[0]= zAAz;
      cp =norm2(r);
      LinalgTimer.Stop();
      for(int k=0;k<nstep;k++){
@@ -181,12 +188,14 @@ namespace Grid {
 	int peri_k = k %mmax;
 	int peri_kp= kp%mmax;
        LinalgTimer.Start();
 	rq= real(innerProduct(r,q[peri_k])); // what if rAr not real?
 	a = rq/qq[peri_k];
 	axpy(psi,a,p[peri_k],psi);         
 	cp = axpy_norm(r,-a,q[peri_k],r);
        LinalgTimer.Stop();
 	if((k==nstep-1)||(cp<rsq)){
 	  return cp;
@@ -202,6 +211,8 @@ namespace Grid {
 	Linop.HermOpAndNorm(z,Az,zAz,zAAz);
 	Linop.HermOp(z,tmp);
 	MatTimer.Stop();
        LinalgTimer.Start();
        tmp=tmp-r;
 	std::cout<<GridLogMessage<< " Preconditioner resid " <<sqrt(norm2(tmp)/norm2(r))<<std::endl; 
@@ -219,9 +230,9 @@ namespace Grid {
 	}
 	qq[peri_kp]=norm2(q[peri_kp]); // could use axpy_norm
-
+        LinalgTimer.Stop();
      }
      assert(0); // never reached
      return cp;
    }
@@ -50,8 +50,6 @@ void CartesianCommunicator::Init(int *argc, char ***argv)
      assert(0);
  }
  Grid_quiesce_nodes();
  // Never clean up as done once.
  MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
@@ -124,10 +122,8 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,
  // split the communicator
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  //  int Nparent = parent._processors ; 
  //  std::cout << " splitting from communicator "<<parent.communicator <<std::endl;
  int Nparent;
  MPI_Comm_size(parent.communicator,&Nparent);
  //  std::cout << " Parent size  "<<Nparent <<std::endl;
  int childsize=1;
  for(int d=0;d<processors.size();d++) {
@@ -136,8 +132,6 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,
  int Nchild = Nparent/childsize;
  assert (childsize * Nchild == Nparent);
  //  std::cout << " child size  "<<childsize <<std::endl;
  std::vector<int> ccoor(_ndimension); // coor within subcommunicator
  std::vector<int> scoor(_ndimension); // coor of split within parent
  std::vector<int> ssize(_ndimension); // coor of split within parent
@@ -413,7 +413,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
    assert(((uint64_t)ptr&0x3F)==0);
    close(fd);
    WorldShmCommBufs[r] =ptr;
-    std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
+    //    std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
  }
  _ShmAlloc=1;
  _ShmAllocBytes  = bytes;
@@ -455,7 +455,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
    assert(((uint64_t)ptr&0x3F)==0);
    close(fd);
    WorldShmCommBufs[r] =ptr;
-    std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
+    //    std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
  }
  _ShmAlloc=1;
  _ShmAllocBytes  = bytes;
@@ -499,7 +499,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 #endif
      void * ptr =  mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0);
-      std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< size<< "bytes)"<<std::endl;
+      //      std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< size<< "bytes)"<<std::endl;
      if ( ptr == (void * )MAP_FAILED ) {       
 	perror("failed mmap");     
 	assert(0);    
@@ -464,9 +464,11 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
  assert(orthog>=0);
  for(int d=0;d<nh;d++){
    if ( d!=orthog ) {
      assert(lg->_processors[d]  == hg->_processors[d]);
      assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
    }
  }
  // the above should guarantee that the operations are local
  parallel_for(int idx=0;idx<lg->lSites();idx++){
@@ -485,7 +487,7 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
 template<class vobj>
-void ExtractSliceLocal(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
+void ExtractSliceLocal(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
 {
  typedef typename vobj::scalar_object sobj;
@@ -499,9 +501,11 @@ void ExtractSliceLocal(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slic
  assert(orthog>=0);
  for(int d=0;d<nh;d++){
    if ( d!=orthog ) {
      assert(lg->_processors[d]  == hg->_processors[d]);
      assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
    }
  }
  // the above should guarantee that the operations are local
  parallel_for(int idx=0;idx<lg->lSites();idx++){
@@ -59,6 +59,7 @@ void GridLogTimestamp(int on){
 }
 Colours GridLogColours(0);
 GridLogger GridLogMG     (1, "MG"    , GridLogColours, "NORMAL");
 GridLogger GridLogIRL    (1, "IRL"   , GridLogColours, "NORMAL");
 GridLogger GridLogSolver (1, "Solver", GridLogColours, "NORMAL");
 GridLogger GridLogError  (1, "Error" , GridLogColours, "RED");
@@ -169,6 +169,7 @@ public:
 void GridLogConfigure(std::vector<std::string> &logstreams);
 extern GridLogger GridLogMG;
 extern GridLogger GridLogIRL;
 extern GridLogger GridLogSolver;
 extern GridLogger GridLogError;
@@ -0,0 +1,3 @@
 #include <Grid/GridCore.h>
 // Out-of-class definition of the static BinaryIO::latticeWriteMaxRetry.
 // The lattice write code reads it as follows: a negative value (this default)
 // switches the read-back checksum test off; a value n >= 0 switches it on and
 // permits up to n re-writes after a checksum mismatch.
 int Grid::BinaryIO::latticeWriteMaxRetry = -1;
@@ -81,6 +81,7 @@ inline void removeWhitespace(std::string &key)
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 class BinaryIO {
 public:
  static int latticeWriteMaxRetry;
  /////////////////////////////////////////////////////////////////////////////
  // more byte manipulation helpers
@@ -370,7 +371,7 @@ PARALLEL_CRITICAL
 #endif
      } else {
 	std::cout << GridLogMessage <<"IOobject: C++ read I/O " << file << " : "
-                  << iodata.size() * sizeof(fobj) << " bytes" << std::endl;
+                  << iodata.size() * sizeof(fobj) << " bytes and offset " << offset << std::endl;
        std::ifstream fin;
 	fin.open(file, std::ios::binary | std::ios::in);
        if (control & BINARYIO_MASTER_APPEND)
@@ -582,7 +583,9 @@ PARALLEL_CRITICAL
    typedef typename vobj::scalar_object sobj;
    typedef typename vobj::Realified::scalar_type word;    word w=0;
    GridBase *grid = Umu._grid;
-    uint64_t lsites = grid->lSites();
+    uint64_t lsites = grid->lSites(), offsetCopy = offset;
    int attemptsLeft = std::max(0, BinaryIO::latticeWriteMaxRetry);
    bool checkWrite = (BinaryIO::latticeWriteMaxRetry >= 0);
    std::vector<sobj> scalardata(lsites); 
    std::vector<fobj>     iodata(lsites); // Munge, checksum, byte order in here
@@ -597,9 +600,35 @@ PARALLEL_CRITICAL
    grid->Barrier();
    timer.Stop();
-
+    while (attemptsLeft >= 0)
    {
      grid->Barrier();
      IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC,
 	             nersc_csum,scidac_csuma,scidac_csumb);
      if (checkWrite)
      {
        std::vector<fobj> ckiodata(lsites);
        uint32_t          cknersc_csum, ckscidac_csuma, ckscidac_csumb;
        uint64_t          ckoffset = offsetCopy;
        std::cout << GridLogMessage << "writeLatticeObject: read back object" << std::endl;
        grid->Barrier();
        IOobject(w,grid,ckiodata,file,ckoffset,format,BINARYIO_READ|BINARYIO_LEXICOGRAPHIC,
 	               cknersc_csum,ckscidac_csuma,ckscidac_csumb);
        if ((cknersc_csum != nersc_csum) or (ckscidac_csuma != scidac_csuma) or (ckscidac_csumb != scidac_csumb))
        {
          std::cout << GridLogMessage << "writeLatticeObject: read test checksum failure, re-writing (" << attemptsLeft << " attempt(s) remaining)" << std::endl;
          offset = offsetCopy;
        }
        else
        {
          std::cout << GridLogMessage << "writeLatticeObject: read test checksum correct" << std::endl;
          break;
        }
      }
      attemptsLeft--;
    }
    std::cout<<GridLogMessage<<"writeLatticeObject: unvectorize overhead "<<timer.Elapsed()  <<std::endl;
  }
@@ -725,5 +754,6 @@ PARALLEL_CRITICAL
    std::cout << GridLogMessage << "RNG state overhead " << timer.Elapsed() << std::endl;
  }
 };
 }
 #endif
@@ -65,7 +65,9 @@ inline std::ostream& operator<< (std::ostream & stream, const GridMillisecs & no
  GridSecs second(1);
  auto     secs       = now/second ; 
  auto     subseconds = now%second ;
  auto     fill       = stream.fill();
  stream << secs<<"."<<std::setw(3)<<std::setfill('0')<<subseconds.count()<<" s";
  stream.fill(fill);
  return stream;
 }
 inline std::ostream& operator<< (std::ostream & stream, const GridUsecs & now)
@@ -73,7 +75,9 @@ inline std::ostream& operator<< (std::ostream & stream, const GridUsecs & now)
  GridSecs second(1);
  auto     seconds    = now/second ; 
  auto     subseconds = now%second ;
  auto     fill       = stream.fill();
  stream << seconds<<"."<<std::setw(6)<<std::setfill('0')<<subseconds.count()<<" s";
  stream.fill(fill);
  return stream;
 }
@@ -44,12 +44,15 @@ namespace QCD {
  struct WilsonImplParams {
    bool overlapCommsCompute;
    std::vector<Real> twist_n_2pi_L;
    std::vector<Complex> boundary_phases;
    WilsonImplParams() : overlapCommsCompute(false) {
      boundary_phases.resize(Nd, 1.0);
      twist_n_2pi_L.resize(Nd, 0.0);
    };
-    WilsonImplParams(const std::vector<Complex> phi)
+    WilsonImplParams(const std::vector<Complex> phi) : boundary_phases(phi), overlapCommsCompute(false) {
-      : boundary_phases(phi), overlapCommsCompute(false) {}
+      twist_n_2pi_L.resize(Nd, 0.0);
    }
  };
  struct StaggeredImplParams {
@@ -240,16 +240,30 @@ namespace QCD {
      GaugeLinkField tmp(GaugeGrid);
      Lattice<iScalar<vInteger> > coor(GaugeGrid);
      ////////////////////////////////////////////////////
      // apply any boundary phase or twists
      ////////////////////////////////////////////////////
      for (int mu = 0; mu < Nd; mu++) {
 	////////// boundary phase /////////////
 	auto pha = Params.boundary_phases[mu];
 	scalar_type phase( real(pha),imag(pha) );
-        int Lmu = GaugeGrid->GlobalDimensions()[mu] - 1;
+	int L   = GaugeGrid->GlobalDimensions()[mu];
        int Lmu = L - 1;
        LatticeCoordinate(coor, mu);
        U = PeekIndex<LorentzIndex>(Umu, mu);
 	// apply any twists
 	RealD theta = Params.twist_n_2pi_L[mu] * 2*M_PI / L;
 	if ( theta != 0.0) { 
 	  scalar_type twphase(::cos(theta),::sin(theta));
 	  U = twphase*U;
 	  std::cout << GridLogMessage << " Twist ["<<mu<<"] "<< Params.twist_n_2pi_L[mu]<< " phase"<<phase <<std::endl;
 	}
        tmp = where(coor == Lmu, phase * U, U);
        PokeIndex<LorentzIndex>(Uds, tmp, mu);
@@ -4,9 +4,11 @@
 Source file: ./lib/qcd/action/gauge/Photon.h
- Copyright (C) 2015
+Copyright (C) 2015-2018
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: James Harrison <J.Harrison@soton.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -30,8 +32,9 @@
 namespace Grid{
 namespace QCD{
  template <class S>
-  class QedGimpl
+  class QedGImpl
  {
  public:
    typedef S Simd;
@@ -43,27 +46,27 @@ namespace QCD{
    typedef iImplGaugeLink<Simd>  SiteLink;
    typedef iImplGaugeField<Simd> SiteField;
-    typedef SiteField             SiteComplex;
+    typedef SiteLink              SiteComplex;
    typedef Lattice<SiteLink>  LinkField;
    typedef Lattice<SiteField> Field;
    typedef Field              ComplexField;
  };
-  typedef QedGimpl<vComplex> QedGimplR;
+  typedef QedGImpl<vComplex> QedGImplR;
-  template<class Gimpl>
+  template <class GImpl>
  class Photon
  {
  public:
-    INHERIT_GIMPL_TYPES(Gimpl);
+    INHERIT_GIMPL_TYPES(GImpl);
    typedef typename SiteGaugeLink::scalar_object ScalarSite;
    typedef typename ScalarSite::scalar_type      ScalarComplex;
    GRID_SERIALIZABLE_ENUM(Gauge, undef, feynman, 1, coulomb, 2, landau, 3);
-    GRID_SERIALIZABLE_ENUM(ZmScheme, undef, qedL, 1, qedTL, 2, qedInf, 3);
+    GRID_SERIALIZABLE_ENUM(ZmScheme, undef, qedL, 1, qedTL, 2);
  public:
-    Photon(Gauge gauge, ZmScheme zmScheme);
+    Photon(GridBase *grid, Gauge gauge, ZmScheme zmScheme, std::vector<Real> improvement);
-    Photon(Gauge gauge, ZmScheme zmScheme, std::vector<Real> improvements);
+    Photon(GridBase *grid, Gauge gauge, ZmScheme zmScheme);
    Photon(Gauge gauge, ZmScheme zmScheme, Real G0);
    Photon(Gauge gauge, ZmScheme zmScheme, std::vector<Real> improvements, Real G0);
    virtual ~Photon(void) = default;
    void FreePropagator(const GaugeField &in, GaugeField &out);
    void MomentumSpacePropagator(const GaugeField &in, GaugeField &out);
@@ -73,345 +76,255 @@ namespace QCD{
                         const GaugeLinkField &weight);
    void UnitField(GaugeField &out);
  private:
-    void infVolPropagator(GaugeLinkField &out);
+    void makeSpatialNorm(LatticeInteger &spNrm);
-    void invKHatSquared(GaugeLinkField &out);
+    void makeKHat(std::vector<GaugeLinkField> &khat);
    void makeInvKHatSquared(GaugeLinkField &out);
    void zmSub(GaugeLinkField &out);
    void transverseProjectSpatial(GaugeField &out);
    void gaugeTransform(GaugeField &out);
  private:
    GridBase          *grid_;
    Gauge             gauge_;
    ZmScheme          zmScheme_;
    std::vector<Real> improvement_;
    Real     G0_;
  };
-  typedef Photon<QedGimplR>  PhotonR;
+  typedef Photon<QedGImplR>  PhotonR;
-  template<class Gimpl>
+  template<class GImpl>
-  Photon<Gimpl>::Photon(Gauge gauge, ZmScheme zmScheme)
+  Photon<GImpl>::Photon(GridBase *grid, Gauge gauge, ZmScheme zmScheme,
  : gauge_(gauge), zmScheme_(zmScheme), improvement_(std::vector<Real>()),
    G0_(0.15493339023106021408483720810737508876916113364521)
  {}
  template<class Gimpl>
  Photon<Gimpl>::Photon(Gauge gauge, ZmScheme zmScheme,
                        std::vector<Real> improvements)
-  : gauge_(gauge), zmScheme_(zmScheme), improvement_(improvements),
+  : grid_(grid), gauge_(gauge), zmScheme_(zmScheme), improvement_(improvements)
    G0_(0.15493339023106021408483720810737508876916113364521)
  {}
-  template<class Gimpl>
+  template<class GImpl>
-  Photon<Gimpl>::Photon(Gauge gauge, ZmScheme zmScheme, Real G0)
+  Photon<GImpl>::Photon(GridBase *grid, Gauge gauge, ZmScheme zmScheme)
-  : gauge_(gauge), zmScheme_(zmScheme), improvement_(std::vector<Real>()), G0_(G0)
+  : Photon(grid, gauge, zmScheme, std::vector<Real>())
  {}
-  template<class Gimpl>
+  template<class GImpl>
-  Photon<Gimpl>::Photon(Gauge gauge, ZmScheme zmScheme,
+  void Photon<GImpl>::FreePropagator(const GaugeField &in, GaugeField &out)
                        std::vector<Real> improvements, Real G0)
  : gauge_(gauge), zmScheme_(zmScheme), improvement_(improvements), G0_(G0)
  {}
  template<class Gimpl>
  void Photon<Gimpl>::FreePropagator (const GaugeField &in,GaugeField &out)
  {
-    FFT theFFT(in._grid);
+    FFT        theFFT(dynamic_cast<GridCartesian *>(grid_));
-    
+    GaugeField in_k(grid_);
-    GaugeField in_k(in._grid);
+    GaugeField prop_k(grid_);
    GaugeField prop_k(in._grid);
    theFFT.FFT_all_dim(in_k, in, FFT::forward);
    MomentumSpacePropagator(prop_k, in_k);
    theFFT.FFT_all_dim(out, prop_k, FFT::backward);
  }
-  template<class Gimpl>
+  template<class GImpl>
-  void Photon<Gimpl>::infVolPropagator(GaugeLinkField &out)
+  void Photon<GImpl>::makeSpatialNorm(LatticeInteger &spNrm)
  {
-    auto               *grid = dynamic_cast<GridCartesian *>(out._grid);
+    LatticeInteger   coor(grid_);
-    LatticeReal        xmu(grid);
+    std::vector<int> l = grid_->FullDimensions();
    GaugeLinkField     one(grid);
    const unsigned int nd    = grid->_ndimension;
    std::vector<int>   &l    = grid->_fdimensions;
    std::vector<int>   x0(nd,0);
    TComplex           Tone  = Complex(1.0,0.0);
    TComplex           Tzero = Complex(G0_,0.0);
    FFT                fft(grid);
-    one = Complex(1.0,0.0);
+    spNrm = zero;
-    out = zero;
+    for(int mu = 0; mu < grid_->Nd() - 1; mu++)
    for(int mu = 0; mu < nd; mu++)
    {
-      LatticeCoordinate(xmu,mu);
+      LatticeCoordinate(coor, mu);
-      Real lo2 = l[mu]/2.0;
+      coor  = where(coor < Integer(l[mu]/2), coor, coor - Integer(l[mu]));
-      xmu = where(xmu < lo2, xmu, xmu-double(l[mu]));
+      spNrm = spNrm + coor*coor;
      out = out + toComplex(4*M_PI*M_PI*xmu*xmu);
    }
    pokeSite(Tone, out, x0);
    out = one/out;
    pokeSite(Tzero, out, x0);
    fft.FFT_all_dim(out, out, FFT::forward);
  }
-  template<class Gimpl>
+  template<class GImpl>
-  void Photon<Gimpl>::invKHatSquared(GaugeLinkField &out)
+  void Photon<GImpl>::makeKHat(std::vector<GaugeLinkField> &khat)
  {
-    GridBase           *grid = out._grid;
+    const unsigned int nd = grid_->Nd();
-    GaugeLinkField     kmu(grid), one(grid);
+    std::vector<int>   l  = grid_->FullDimensions();
-    const unsigned int nd    = grid->_ndimension;
+    Complex            ci(0., 1.);
-    std::vector<int>   &l    = grid->_fdimensions;
+
    khat.resize(nd, grid_);
    for (unsigned int mu = 0; mu < nd; ++mu)
    {
      Real piL = M_PI/l[mu];
      LatticeCoordinate(khat[mu], mu);
      khat[mu] = exp(piL*ci*khat[mu])*2.*sin(piL*khat[mu]);
    }
  }
  template<class GImpl>
  void Photon<GImpl>::makeInvKHatSquared(GaugeLinkField &out)
  {
    std::vector<GaugeLinkField> khat;
    GaugeLinkField              lone(grid_);
    const unsigned int          nd = grid_->Nd();
    std::vector<int>            zm(nd, 0);
-    TComplex           Tone = Complex(1.0,0.0);
+    ScalarSite                  one = ScalarComplex(1., 0.), z = ScalarComplex(0., 0.);
    TComplex           Tzero= Complex(0.0,0.0);
    one = Complex(1.0,0.0);
    out = zero;
    makeKHat(khat);
    for(int mu = 0; mu < nd; mu++)
    {
-      Real twoPiL = M_PI*2./l[mu];
+      out = out + khat[mu]*conjugate(khat[mu]);
      LatticeCoordinate(kmu,mu);
      kmu = 2.*sin(.5*twoPiL*kmu);
      out = out + kmu*kmu;
    }
-    pokeSite(Tone, out, zm);
+    lone = ScalarComplex(1., 0.);
-    out = one/out;
+    pokeSite(one, out, zm);
-    pokeSite(Tzero, out, zm);
+    out = lone/out;
    pokeSite(z, out, zm);
  }
-  template<class Gimpl>
+  template<class GImpl>
-  void Photon<Gimpl>::zmSub(GaugeLinkField &out)
+  void Photon<GImpl>::zmSub(GaugeLinkField &out)
  {
    GridBase           *grid = out._grid;
    const unsigned int nd    = grid->_ndimension;
    std::vector<int>   &l    = grid->_fdimensions;
    switch (zmScheme_)
    {
      case ZmScheme::qedTL:
      {
-        std::vector<int> zm(nd,0);
+        std::vector<int> zm(grid_->Nd(), 0);
-        TComplex         Tzero = Complex(0.0,0.0);
+        ScalarSite       z = ScalarComplex(0., 0.);
        pokeSite(Tzero, out, zm);
        pokeSite(z, out, zm);
        break;
      }
      case ZmScheme::qedL:
      {
-        LatticeInteger spNrm(grid), coor(grid);
+        LatticeInteger spNrm(grid_);
        GaugeLinkField z(grid);
-        spNrm = zero;
+        makeSpatialNorm(spNrm);
        for(int d = 0; d < grid->_ndimension - 1; d++)
        {
          LatticeCoordinate(coor,d);
          coor = where(coor < Integer(l[d]/2), coor, coor-Integer(l[d]));
          spNrm = spNrm + coor*coor;
        }
        out = where(spNrm == Integer(0), 0.*out, out);
        // IR improvement
        for(int i = 0; i < improvement_.size(); i++)
        {
          Real f = sqrt(improvement_[i] + 1);
          out = where(spNrm == Integer(i + 1), f*out, out);
        }
        break;
      }
      default:
        assert(0);
        break;
    }
  }
-  template<class Gimpl>
+  template<class GImpl>
-  void Photon<Gimpl>::MomentumSpacePropagator(const GaugeField &in,
+  void Photon<GImpl>::transverseProjectSpatial(GaugeField &out)
  {
    const unsigned int          nd = grid_->Nd();
    GaugeLinkField              invKHat(grid_), cst(grid_), spdiv(grid_);
    LatticeInteger              spNrm(grid_);
    std::vector<GaugeLinkField> khat, a(nd, grid_), aProj(nd, grid_);
    invKHat = zero;
    makeSpatialNorm(spNrm);
    makeKHat(khat);
    for (unsigned int mu = 0; mu < nd; ++mu)
    {
      a[mu] = peekLorentz(out, mu);
      if (mu < nd - 1)
      {
        invKHat += khat[mu]*conjugate(khat[mu]);
      }
    }
    cst     = ScalarComplex(1., 0.);
    invKHat = where(spNrm == Integer(0), cst, invKHat);
    invKHat = cst/invKHat;
    cst     = zero;
    invKHat = where(spNrm == Integer(0), cst, invKHat);
    spdiv   = zero;
    for (unsigned int nu = 0; nu < nd - 1; ++nu)
    {
      spdiv += conjugate(khat[nu])*a[nu];
    }
    spdiv *= invKHat;
    for (unsigned int mu = 0; mu < nd; ++mu)
    {
      aProj[mu] = a[mu] - khat[mu]*spdiv;
      pokeLorentz(out, aProj[mu], mu);
    }
  }
  template<class GImpl>
  void Photon<GImpl>::gaugeTransform(GaugeField &out)
  {
    // Dispatch on the gauge stored in gauge_.
    switch (gauge_)
    {
      case Gauge::coulomb:
        // the only gauge that modifies `out`: delegate to
        // transverseProjectSpatial
        transverseProjectSpatial(out);
        break;
      case Gauge::feynman:
        // nothing to do, `out` is left as it is
        break;
      case Gauge::landau:
      default:
        // Landau gauge and any other value trip the assertion
        assert(0);
        break;
    }
  }
  template<class GImpl>
  void Photon<GImpl>::MomentumSpacePropagator(const GaugeField &in,
                                              GaugeField &out)
  {
-  GridBase           *grid = out._grid;
+    LatticeComplex momProp(grid_);
    LatticeComplex     momProp(grid);
-    switch (zmScheme_)
+    makeInvKHatSquared(momProp);
    {
      case ZmScheme::qedTL:
      case ZmScheme::qedL:
      {
        invKHatSquared(momProp);
    zmSub(momProp);
        break;
      }
      case ZmScheme::qedInf:
      {
        infVolPropagator(momProp);
        break;
      }
      default:
        break;
    }
    out = in*momProp;
  }
-  template<class Gimpl>
+  template<class GImpl>
-  void Photon<Gimpl>::StochasticWeight(GaugeLinkField &weight)
+  void Photon<GImpl>::StochasticWeight(GaugeLinkField &weight)
  {
    auto               *grid     = dynamic_cast<GridCartesian *>(weight._grid);
    const unsigned int nd        = grid->_ndimension;
    std::vector<int>   latt_size = grid->_fdimensions;
    switch (zmScheme_)
    {
      case ZmScheme::qedTL:
      case ZmScheme::qedL:
  {
    const unsigned int nd  = grid_->Nd();
    std::vector<int>   l   = grid_->FullDimensions();
    Integer            vol = 1;
-        for(int d = 0; d < nd; d++)
+
    for(unsigned int mu = 0; mu < nd; mu++)
    {
-          vol = vol * latt_size[d];
+      vol = vol*l[mu];
    }
-        invKHatSquared(weight);
+    makeInvKHatSquared(weight);
    weight = sqrt(vol)*sqrt(weight);
    zmSub(weight);
        break;
      }
      case ZmScheme::qedInf:
      {
        infVolPropagator(weight);
        weight = sqrt(real(weight));
        break;
      }
      default:
        break;
    }
  }
-  template<class Gimpl>
+  template<class GImpl>
-  void Photon<Gimpl>::StochasticField(GaugeField &out, GridParallelRNG &rng)
+  void Photon<GImpl>::StochasticField(GaugeField &out, GridParallelRNG &rng)
  {
-    auto           *grid = dynamic_cast<GridCartesian *>(out._grid);
+    GaugeLinkField weight(grid_);
    GaugeLinkField weight(grid);
    StochasticWeight(weight);
    StochasticField(out, rng, weight);
  }
-  template<class Gimpl>
+  template<class GImpl>
-  void Photon<Gimpl>::StochasticField(GaugeField &out, GridParallelRNG &rng,
+  void Photon<GImpl>::StochasticField(GaugeField &out, GridParallelRNG &rng,
                                      const GaugeLinkField &weight)
  {
-    auto               *grid = dynamic_cast<GridCartesian *>(out._grid);
+    const unsigned int nd = grid_->Nd();
-    const unsigned int nd = grid->_ndimension;
+    GaugeLinkField     r(grid_);
-    GaugeLinkField     r(grid);
+    GaugeField         aTilde(grid_);
-    GaugeField         aTilde(grid);
+    FFT                fft(dynamic_cast<GridCartesian *>(grid_));
    FFT                fft(grid);
-    switch (zmScheme_)
+    for(unsigned int mu = 0; mu < nd; mu++)
    {
      case ZmScheme::qedTL:
      case ZmScheme::qedL:
      {
        for(int mu = 0; mu < nd; mu++)
    {
      gaussian(rng, r);
      r = weight*r;
      pokeLorentz(aTilde, r, mu);
    }
-        break;
+    gaugeTransform(aTilde);
      }
      case ZmScheme::qedInf:
      {
        Complex                    shift(1., 1.); // This needs to be a GaugeLink element?
        for(int mu = 0; mu < nd; mu++)
        {
          bernoulli(rng, r);
          r = weight*(2.*r - shift);
          pokeLorentz(aTilde, r, mu);
        }
        break;
      }
      default:
        break;
    }
    fft.FFT_all_dim(out, aTilde, FFT::backward);
    out = real(out);
  }
-  template<class Gimpl>
+  template<class GImpl>
-  void Photon<Gimpl>::UnitField(GaugeField &out)
+  void Photon<GImpl>::UnitField(GaugeField &out)
  {
-    auto               *grid = dynamic_cast<GridCartesian *>(out._grid);
+    const unsigned int nd = grid_->Nd();
-    const unsigned int nd = grid->_ndimension;
+    GaugeLinkField     r(grid_);
    GaugeLinkField     r(grid);
-    r = Complex(1.0,0.0);
+    r = ScalarComplex(1., 0.);
-
+    for(unsigned int mu = 0; mu < nd; mu++)
    for(int mu = 0; mu < nd; mu++)
    {
      pokeLorentz(out, r, mu);
    }
    out = real(out);
  }
 //  template<class Gimpl>
 //  void Photon<Gimpl>::FeynmanGaugeMomentumSpacePropagator_L(GaugeField &out,
 //                                                            const GaugeField &in)
 //  {
 //    
 //    FeynmanGaugeMomentumSpacePropagator_TL(out,in);
 //    
 //    GridBase *grid = out._grid;
 //    LatticeInteger     coor(grid);
 //    GaugeField zz(grid); zz=zero;
 //    
 //    // xyzt
 //    for(int d = 0; d < grid->_ndimension-1;d++){
 //      LatticeCoordinate(coor,d);
 //      out = where(coor==Integer(0),zz,out);
 //    }
 //  }
 //  
 //  template<class Gimpl>
 //  void Photon<Gimpl>::FeynmanGaugeMomentumSpacePropagator_TL(GaugeField &out,
 //                                                             const GaugeField &in)
 //  {
 //    
 //    // what type LatticeComplex
 //    GridBase *grid = out._grid;
 //    int nd = grid->_ndimension;
 //    
 //    typedef typename GaugeField::vector_type vector_type;
 //    typedef typename GaugeField::scalar_type ScalComplex;
 //    typedef Lattice<iSinglet<vector_type> > LatComplex;
 //    
 //    std::vector<int> latt_size   = grid->_fdimensions;
 //    
 //    LatComplex denom(grid); denom= zero;
 //    LatComplex   one(grid); one = ScalComplex(1.0,0.0);
 //    LatComplex   kmu(grid);
 //    
 //    ScalComplex ci(0.0,1.0);
 //    // momphase = n * 2pi / L
 //    for(int mu=0;mu<Nd;mu++) {
 //      
 //      LatticeCoordinate(kmu,mu);
 //      
 //      RealD TwoPiL =  M_PI * 2.0/ latt_size[mu];
 //      
 //      kmu = TwoPiL * kmu ;
 //      
 //      denom = denom + 4.0*sin(kmu*0.5)*sin(kmu*0.5); // Wilson term
 //    }
 //    std::vector<int> zero_mode(nd,0);
 //    TComplexD Tone = ComplexD(1.0,0.0);
 //    TComplexD Tzero= ComplexD(0.0,0.0);
 //    
 //    pokeSite(Tone,denom,zero_mode);
 //    
 //    denom= one/denom;
 //    
 //    pokeSite(Tzero,denom,zero_mode);
 //    
 //    out = zero;
 //    out = in*denom;
 //  };
 }}
 #endif
@@ -173,6 +173,39 @@ void G5R5(Lattice<vobj> &z,const Lattice<vobj> &x)
    }
  }
 }
 }
-}}
+// I explicitly need these outside the QCD namespace
 // Generic version: z = G5 * x, where G5 is the QCD::Gamma object built from
 // QCD::Gamma::Algebra::Gamma5.
 //   z : output lattice, receives x's checkerboard and the product
 //   x : input lattice
 template<typename vobj>
 void G5C(Lattice<vobj> &z, const Lattice<vobj> &x)
 {
  // The checkerboard is propagated BEFORE conformable() is called; this
  // ordering is kept on purpose.
  // NOTE(review): conformable() presumably compares checkerboards as well as
  // grids, in which case swapping the two statements would break - confirm.
  z.checkerboard = x.checkerboard;
  conformable(x, z);
  QCD::Gamma G5(QCD::Gamma::Algebra::Gamma5);
  z = G5 * x;
 }
 template<class CComplex, int nbasis>
 void G5C(Lattice<iVector<CComplex, nbasis>> &z, const Lattice<iVector<CComplex, nbasis>> &x)
 {
  // iVector overload: on every outer site the first nbasis/2 components of x
  // are copied to z and the remaining ones are copied with a sign flip.
  // nbasis must be even.
  static_assert(nbasis % 2 == 0, "");
  z.checkerboard = x.checkerboard;
  conformable(x, z);
  GridBase *grid = x._grid;
  const int half = nbasis / 2;
  parallel_for(int site = 0; site < grid->oSites(); site++) {
    for(int c = 0; c < nbasis; ++c) {
      if (c < half) {
        z._odata[site](c) = x._odata[site](c);
      } else {
        z._odata[site](c) = -x._odata[site](c);
      }
    }
  }
 }
 }
 #endif 
@@ -10,6 +10,8 @@ Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    Author: neo <cossu@post.kek.jp>
    Author: paboyle <paboyle@ph.ed.ac.uk>
    Author: James Harrison <J.Harrison@soton.ac.uk>
    Author: Antonin Portelli <antonin.portelli@me.com>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -645,6 +647,184 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
      }
    }
  }
  //////////////////////////////////////////////////
  // Wilson loop of size (R1, R2), oriented in mu,nu plane
  //////////////////////////////////////////////////
  static void wilsonLoop(GaugeMat &wl, const std::vector<GaugeMat> &U,
                         const int Rmu, const int Rnu,
                         const int mu, const int nu) {
    // Builds the loop in wl leg by leg with the Gimpl covariant shifts:
    // forward along nu, forward along mu, backward along nu, backward
    // along mu.
    const GaugeMat &linkMu = U[mu];
    const GaugeMat &linkNu = U[nu];

    // first leg: the assignment supplies one nu link, the loop adds Rnu-1 more
    wl = linkNu;
    for (int step = 1; step < Rnu; ++step) {
      wl = Gimpl::CovShiftForward(linkNu, nu, wl);
    }
    // second leg: Rmu forward shifts along mu
    for (int step = 0; step < Rmu; ++step) {
      wl = Gimpl::CovShiftForward(linkMu, mu, wl);
    }
    // third leg: Rnu backward shifts along nu
    for (int step = 0; step < Rnu; ++step) {
      wl = Gimpl::CovShiftBackward(linkNu, nu, wl);
    }
    // fourth leg: Rmu backward shifts along mu
    for (int step = 0; step < Rmu; ++step) {
      wl = Gimpl::CovShiftBackward(linkMu, mu, wl);
    }
  }
  //////////////////////////////////////////////////
  // trace of Wilson Loop oriented in mu,nu plane
  //////////////////////////////////////////////////
  static void traceWilsonLoop(LatticeComplex &wl,
                              const std::vector<GaugeMat> &U,
                              const int Rmu, const int Rnu,
                              const int mu, const int nu) {
    // Build the untraced loop in a temporary living on the grid of U[0],
    // then store its trace in wl.
    GaugeMat loop(U[0]._grid);

    wilsonLoop(loop, U, Rmu, Rnu, mu, nu);
    wl = trace(loop);
  }
  //////////////////////////////////////////////////
  // sum over all planes of Wilson loop
  //////////////////////////////////////////////////
  static void siteWilsonLoop(LatticeComplex &Wl,
                             const std::vector<GaugeMat> &U,
                             const int R1, const int R2) {
    // Accumulates into Wl the traced loops of every plane (mu, nu) with
    // nu < mu, once with extents (R1, R2) and once with (R2, R1).
    const int      ndim = U[0]._grid->_ndimension;
    LatticeComplex planeWl(U[0]._grid);
    // trace one loop orientation and add it to the running total
    auto addLoop = [&](const int Ra, const int Rb, const int mu, const int nu) {
      traceWilsonLoop(planeWl, U, Ra, Rb, mu, nu);
      Wl = Wl + planeWl;
    };

    Wl = zero;
    for (int mu = 1; mu < ndim; ++mu) {
      for (int nu = 0; nu < mu; ++nu) {
        addLoop(R1, R2, mu, nu);
        addLoop(R2, R1, mu, nu);
      }
    }
  }
  //////////////////////////////////////////////////
  // sum over planes of Wilson loop with length R1
  // in the time direction
  //////////////////////////////////////////////////
  static void siteTimelikeWilsonLoop(LatticeComplex &Wl,
                                     const std::vector<GaugeMat> &U,
                                     const int R1, const int R2) {
    // The last direction (index ndim-1) is paired with every other
    // direction nu; R1 is the extent along that last direction.
    const int      tDir = U[0]._grid->_ndimension - 1;
    LatticeComplex planeWl(U[0]._grid);

    Wl = zero;
    for (int nu = 0; nu < tDir; ++nu) {
      traceWilsonLoop(planeWl, U, R1, R2, tDir, nu);
      Wl = Wl + planeWl;
    }
  }
  //////////////////////////////////////////////////
  // sum Wilson loop over all planes orthogonal to the time direction
  //////////////////////////////////////////////////
  static void siteSpatialWilsonLoop(LatticeComplex &Wl,
                                    const std::vector<GaugeMat> &U,
                                    const int R1, const int R2) {
    // Same accumulation as over all planes, except that the last direction
    // (index ndim-1) is left out: planes (mu, nu) with nu < mu < ndim-1,
    // each visited with extents (R1, R2) and then (R2, R1).
    const int      nSpatial = U[0]._grid->_ndimension - 1;
    LatticeComplex planeWl(U[0]._grid);

    Wl = zero;
    for (int mu = 1; mu < nSpatial; ++mu) {
      for (int nu = 0; nu < mu; ++nu) {
        // orientation (R1, R2)
        traceWilsonLoop(planeWl, U, R1, R2, mu, nu);
        Wl = Wl + planeWl;
        // orientation (R2, R1)
        traceWilsonLoop(planeWl, U, R2, R1, mu, nu);
        Wl = Wl + planeWl;
      }
    }
  }
  //////////////////////////////////////////////////
  // sum over all x,y,z,t and over all planes of Wilson loop
  //////////////////////////////////////////////////
  static Real sumWilsonLoop(const GaugeLorentz &Umu,
                            const int R1, const int R2) {
    // Returns the real part of sum() applied to the site-wise Wilson loop
    // field produced by siteWilsonLoop for extents R1 and R2.
    //
    // The link container holds one entry per grid dimension, so that its
    // size always matches the bound of the loop filling it (a fixed size of
    // 4 would be overrun on a grid with more than 4 dimensions).
    const int ndim = Umu._grid->_ndimension;
    std::vector<GaugeMat> U(ndim, Umu._grid);
    for (int mu = 0; mu < ndim; mu++) {
      U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
    }
    LatticeComplex Wl(Umu._grid);
    siteWilsonLoop(Wl, U, R1, R2);
    // reduce, strip the tensor wrapper and keep the real part only
    TComplex Tp = sum(Wl);
    Complex p = TensorRemove(Tp);
    return p.real();
  }
  //////////////////////////////////////////////////
  // sum over all x,y,z,t and over all planes of timelike Wilson loop
  //////////////////////////////////////////////////
  static Real sumTimelikeWilsonLoop(const GaugeLorentz &Umu,
                                    const int R1, const int R2) {
    // Returns the real part of sum() applied to the site-wise field produced
    // by siteTimelikeWilsonLoop for extents R1 and R2.
    //
    // The link container holds one entry per grid dimension, so that its
    // size always matches the bound of the loop filling it (a fixed size of
    // 4 would be overrun on a grid with more than 4 dimensions).
    const int ndim = Umu._grid->_ndimension;
    std::vector<GaugeMat> U(ndim, Umu._grid);
    for (int mu = 0; mu < ndim; mu++) {
      U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
    }
    LatticeComplex Wl(Umu._grid);
    siteTimelikeWilsonLoop(Wl, U, R1, R2);
    // reduce, strip the tensor wrapper and keep the real part only
    TComplex Tp = sum(Wl);
    Complex p = TensorRemove(Tp);
    return p.real();
  }
  //////////////////////////////////////////////////
  // sum over all x,y,z,t and over all planes of spatial Wilson loop
  //////////////////////////////////////////////////
  static Real sumSpatialWilsonLoop(const GaugeLorentz &Umu,
                                   const int R1, const int R2) {
    // Returns the real part of sum() applied to the site-wise field produced
    // by siteSpatialWilsonLoop for extents R1 and R2.
    //
    // The link container holds one entry per grid dimension, so that its
    // size always matches the bound of the loop filling it (a fixed size of
    // 4 would be overrun on a grid with more than 4 dimensions).
    const int ndim = Umu._grid->_ndimension;
    std::vector<GaugeMat> U(ndim, Umu._grid);
    for (int mu = 0; mu < ndim; mu++) {
      U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
    }
    LatticeComplex Wl(Umu._grid);
    siteSpatialWilsonLoop(Wl, U, R1, R2);
    // reduce, strip the tensor wrapper and keep the real part only
    TComplex Tp = sum(Wl);
    Complex p = TensorRemove(Tp);
    return p.real();
  }
  //////////////////////////////////////////////////
  // average over all x,y,z,t and over all planes of Wilson loop
  //////////////////////////////////////////////////
  static Real avgWilsonLoop(const GaugeLorentz &Umu,
                            const int R1, const int R2) {
    // Summed loop divided, in this order, by the site count (gSites), by
    // nd*(nd-1) and by Nc.
    const Real total  = sumWilsonLoop(Umu, R1, R2);
    const int  nd     = Umu._grid->_ndimension;
    const Real nSites = Umu._grid->gSites();
    const Real nLoops = 1.0 * nd * (nd - 1);
    return total / nSites / nLoops / Nc; // Nc dependent... FIXME
  }
  //////////////////////////////////////////////////
  // average over all x,y,z,t and over all planes of timelike Wilson loop
  //////////////////////////////////////////////////
  static Real avgTimelikeWilsonLoop(const GaugeLorentz &Umu,
                                    const int R1, const int R2) {
    // Summed timelike loop divided, in this order, by the site count
    // (gSites), by nd-1 and by Nc.
    const Real total  = sumTimelikeWilsonLoop(Umu, R1, R2);
    const int  nd     = Umu._grid->_ndimension;
    const Real nSites = Umu._grid->gSites();
    const Real nLoops = 1.0 * (nd - 1);
    return total / nSites / nLoops / Nc; // Nc dependent... FIXME
  }
  //////////////////////////////////////////////////
  // average over all x,y,z,t and over all planes of spatial Wilson loop
  //////////////////////////////////////////////////
  static Real avgSpatialWilsonLoop(const GaugeLorentz &Umu,
                                   const int R1, const int R2) {
    // Summed spatial loop divided, in this order, by the site count
    // (gSites), by (nd-1)*(nd-2) and by Nc.
    const Real total  = sumSpatialWilsonLoop(Umu, R1, R2);
    const int  nd     = Umu._grid->_ndimension;
    const Real nSites = Umu._grid->gSites();
    const Real nLoops = 1.0 * (nd - 1) * (nd - 2);
    return total / nSites / nLoops / Nc; // Nc dependent... FIXME
  }
 };
 typedef WilsonLoops<PeriodicGimplR> ColourWilsonLoops;
@@ -61,9 +61,9 @@ Group & Hdf5Writer::getGroup(void)
 }
 // Reader implementation ///////////////////////////////////////////////////////
-Hdf5Reader::Hdf5Reader(const std::string &fileName)
+Hdf5Reader::Hdf5Reader(const std::string &fileName, const bool readOnly)
 : fileName_(fileName)
-, file_(fileName.c_str(), H5F_ACC_RDWR)
+, file_(fileName.c_str(), readOnly ? H5F_ACC_RDONLY : H5F_ACC_RDWR)
 {
  group_ = file_.openGroup("/");
  readSingleAttribute(dataSetThres_, HDF5_GRID_GUARD "dataset_threshold",
@@ -54,7 +54,7 @@ namespace Grid
  class Hdf5Reader: public Reader<Hdf5Reader>
  {
  public:
-    Hdf5Reader(const std::string &fileName);
+    Hdf5Reader(const std::string &fileName, const bool readOnly = true);
    virtual ~Hdf5Reader(void) = default;
    bool push(const std::string &s);
    void pop(void);
@@ -124,8 +124,11 @@ namespace Grid
    if (flatx.size() > dataSetThres_)
    {
      H5NS::DataSet           dataSet;
      H5NS::DSetCreatPropList plist;
-      dataSet = group_.createDataSet(s, Hdf5Type<Element>::type(), dataSpace);
+      plist.setChunk(dim.size(), dim.data());
      plist.setFletcher32();
      dataSet = group_.createDataSet(s, Hdf5Type<Element>::type(), dataSpace, plist);
      dataSet.write(flatx.data(), Hdf5Type<Element>::type());
    }
    else
@@ -47,6 +47,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #else
 #define PARALLEL_FOR_LOOP
 #define PARALLEL_FOR_LOOP_INTERN
 #define PARALLEL_FOR_LOOP_REDUCE(op, var)
 #define PARALLEL_NESTED_LOOP2
 #define PARALLEL_NESTED_LOOP5
 #define PARALLEL_REGION
@@ -58,6 +59,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define parallel_for_internal PARALLEL_FOR_LOOP_INTERN for
 #define parallel_for_nest2 PARALLEL_NESTED_LOOP2 for
 #define parallel_for_nest5 PARALLEL_NESTED_LOOP5 for
 #define parallel_critical PARALLEL_CRITICAL
 namespace Grid {
@@ -28,16 +28,31 @@
 extern "C" {
 #include <openssl/sha.h>
 }
 #ifdef USE_IPP
 #include "ipp.h"
 #endif
 #pragma once
 class GridChecksum
 {
 public:
-  static inline uint32_t crc32(void *data,size_t bytes)
+  static inline uint32_t crc32(const void *data, size_t bytes)
  {
    return ::crc32(0L,(unsigned char *)data,bytes);
  }
 #ifdef USE_IPP
  static inline uint32_t crc32c(const void* data, size_t bytes)
  {
      // CRC32C through the IPP primitives: start from an all-ones value,
      // let ippsCRC32C_8u update it over the buffer, byte-swap the result
      // in place and return its bitwise complement.
      // NOTE(review): `bytes` is forwarded as is; confirm the length type
      // accepted by ippsCRC32C_8u covers the buffer sizes used by callers.
      const unsigned char *buf = reinterpret_cast<const unsigned char *>(data);
      uint32_t            crc  = ~static_cast<uint32_t>(0);

      ippsCRC32C_8u(buf, bytes, &crc);
      ippsSwapBytes_32u_I(&crc, 1);

      return ~crc;
  }
 #endif
  template <typename T>
  static inline std::string sha256_string(const std::vector<T> &hash)
  {
@@ -32,11 +32,19 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #include <Hadrons/Global.hpp>
 #include <Hadrons/TimerArray.hpp>
 #include <Grid/Eigen/unsupported/CXX11/Tensor>
 #ifdef USE_MKL
 #include "mkl.h"
 #include "mkl_cblas.h"
 #endif
 #ifndef HADRONS_A2AM_NAME 
 #define HADRONS_A2AM_NAME "a2aMatrix"
 #endif
 #ifndef HADRONS_A2AM_IO_TYPE
 #define HADRONS_A2AM_IO_TYPE ComplexF
 #endif
 #define HADRONS_A2AM_PARALLEL_IO
 BEGIN_HADRONS_NAMESPACE
@@ -51,6 +59,15 @@ BEGIN_HADRONS_NAMESPACE
 // Eigen::TensorMap over a rank-5, row-major Eigen tensor with elements of
 // type T.
 template <typename T>
 using A2AMatrixSet = Eigen::TensorMap<Eigen::Tensor<T, 5, Eigen::RowMajor>>;
 // Row-major Eigen matrix of T with both dimensions set to -1 (Eigen's
 // dynamic-size marker).
 template <typename T>
 using A2AMatrix = Eigen::Matrix<T, -1, -1, Eigen::RowMajor>;
 // Eigen::Map over an A2AMatrix<T>.
 template <typename T>
 using A2AMatrixMap = Eigen::Map<A2AMatrix<T>>;
 // Same shape as A2AMatrix<T> but with column-major storage.
 // NOTE(review): "Tr" presumably stands for "transposed" - confirm.
 template <typename T>
 using A2AMatrixTr = Eigen::Matrix<T, -1, -1, Eigen::ColMajor>;
 /******************************************************************************
 *                      Abstract class for A2A kernels                        *
 ******************************************************************************/
@@ -76,10 +93,15 @@ public:
    // constructors
    A2AMatrixIo(void) = default;
    A2AMatrixIo(std::string filename, std::string dataname, 
-                const unsigned int nt, const unsigned int ni,
+                const unsigned int nt, const unsigned int ni = 0,
-                const unsigned int nj);
+                const unsigned int nj = 0);
    // destructor
    ~A2AMatrixIo(void) = default;
    // access
    unsigned int getNi(void) const;
    unsigned int getNj(void) const;
    unsigned int getNt(void) const;
    size_t       getSize(void) const;
    // file allocation
    template <typename MetadataType>
    void initFile(const MetadataType &d, const unsigned int chunkSize);
@@ -88,9 +110,11 @@ public:
                   const unsigned int blockSizei, const unsigned int blockSizej);
    void saveBlock(const A2AMatrixSet<T> &m, const unsigned int ext, const unsigned int str,
                   const unsigned int i, const unsigned int j);
    template <template <class> class Vec, typename VecT>
    void load(Vec<VecT> &v, double *tRead = nullptr, const bool useCache = true);
 private:
-    std::string  filename_, dataname_;
+    std::string  filename_{""}, dataname_{""};
-    unsigned int nt_, ni_, nj_;
+    unsigned int nt_{0}, ni_{0}, nj_{0};
 };
 /******************************************************************************
@@ -136,6 +160,226 @@ private:
    std::vector<IoHelper> nodeIo_;
 };
 /******************************************************************************
 *                       A2A matrix contraction kernels                       *
 ******************************************************************************/
 // Collection of static kernels used to contract all-to-all matrices:
 // accumulated trace of a product, plain matrix product, and the matching
 // floating-point operation counts. When USE_MKL is defined the kernels call
 // CBLAS routines directly, otherwise they use Eigen expressions.
 class A2AContraction
 {
 public:
    // accTrMul(acc, a, b): acc += tr(a*b)
    // The trace is accumulated one row (or column) of a at a time inside a
    // parallel_for loop; every partial result is added to acc inside a
    // parallel_critical section. No dimension check is performed: a*b must
    // be a valid square product.
    // Eigen's dot() conjugates its first argument, hence the explicit
    // conjugate() in the non-MKL branches to obtain an unconjugated product.
    template <typename C, typename MatLeft, typename MatRight>
    static inline void accTrMul(C &acc, const MatLeft &a, const MatRight &b)
    {
        // row-major times column-major: rows of a and columns of b are both
        // contiguous, so iterate over rows of a
        if ((MatLeft::Options == Eigen::RowMajor) and
            (MatRight::Options == Eigen::ColMajor))
        {
            parallel_for (unsigned int r = 0; r < a.rows(); ++r)
            {
                C tmp;
 #ifdef USE_MKL
                dotuRow(tmp, r, a, b);
 #else
                tmp = a.row(r).conjugate().dot(b.col(r));
 #endif
                parallel_critical
                {
                    acc += tmp;
                }
            }
        }
        // any other storage combination: iterate over columns of a
        else
        {
            parallel_for (unsigned int c = 0; c < a.cols(); ++c)
            {
                C tmp;
 #ifdef USE_MKL
                dotuCol(tmp, c, a, b);
 #else
                tmp = a.col(c).conjugate().dot(b.row(c));
 #endif
                parallel_critical
                {
                    acc += tmp;
                }
            }
        }
    }
    // Flop count of accTrMul(acc, a, b): one complex multiplication (6 flops)
    // and one complex addition (2 flops) per element of a. b is not used.
    template <typename MatLeft, typename MatRight>
    static inline double accTrMulFlops(const MatLeft &a, const MatRight &b)
    {
        double n = a.rows()*a.cols();
        return 8.*n;
    }
    // mul(res, a, b): res = a*b
    // MKL versions: res is resized to a.rows() x b.cols() when needed, then
    // the product is delegated to cblas_zgemm / cblas_cgemm (alpha = 1,
    // beta = 0) with the layout flag matching the storage order of Mat.
 #ifdef USE_MKL
    template <template <class, int...> class Mat, int... Opts>
    static inline void mul(Mat<ComplexD, Opts...> &res, 
                           const Mat<ComplexD, Opts...> &a, 
                           const Mat<ComplexD, Opts...> &b)
    {
        static const ComplexD one(1., 0.), zero(0., 0.);
        if ((res.rows() != a.rows()) or (res.cols() != b.cols()))
        {
            res.resize(a.rows(), b.cols());
        }
        if (Mat<ComplexD, Opts...>::Options == Eigen::RowMajor)
        {
            cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
                        a.cols(), &one, a.data(), a.cols(), b.data(), b.cols(), &zero,
                        res.data(), res.cols());
        }
        else if (Mat<ComplexD, Opts...>::Options == Eigen::ColMajor)
        {
            cblas_zgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
                        a.cols(), &one, a.data(), a.rows(), b.data(), b.rows(), &zero,
                        res.data(), res.rows());
        }
    }
    // single-precision counterpart of the overload above
    template <template <class, int...> class Mat, int... Opts>
    static inline void mul(Mat<ComplexF, Opts...> &res, 
                           const Mat<ComplexF, Opts...> &a, 
                           const Mat<ComplexF, Opts...> &b)
    {
        static const ComplexF one(1., 0.), zero(0., 0.);
        if ((res.rows() != a.rows()) or (res.cols() != b.cols()))
        {
            res.resize(a.rows(), b.cols());
        }
        if (Mat<ComplexF, Opts...>::Options == Eigen::RowMajor)
        {
            cblas_cgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
                        a.cols(), &one, a.data(), a.cols(), b.data(), b.cols(), &zero,
                        res.data(), res.cols());
        }
        else if (Mat<ComplexF, Opts...>::Options == Eigen::ColMajor)
        {
            cblas_cgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
                        a.cols(), &one, a.data(), a.rows(), b.data(), b.rows(), &zero,
                        res.data(), res.rows());
        }
    }
 #else
    // generic version: plain Eigen product
    template <typename Mat>
    static inline void mul(Mat &res, const Mat &a, const Mat &b)
    {
        res = a*b;
    }
 #endif
    // Flop count of mul(res, a, b). Each of the a.rows() x b.cols() elements
    // of the result costs a.cols() complex multiplications (6 flops each)
    // and a.cols() - 1 complex additions (2 flops each). The number of result
    // columns is taken from b so that rectangular products are counted
    // correctly; for a square result this equals the former nr*nr formula.
    template <typename Mat>
    static inline double mulFlops(const Mat &a, const Mat &b)
    {
        double nr = a.rows(), nc = a.cols(), nrc = b.cols();
        return nr*nrc*(6.*nc + 2.*(nc - 1.));
    }
 private:
    // Set (aPt, aInc) to walk along row aRow of a and (bPt, bInc) to walk
    // along column aRow of b, according to the storage order of each matrix.
    // Increments are expressed in number of elements.
    template <typename C, typename MatLeft, typename MatRight>
    static inline void makeDotRowPt(C * &aPt, unsigned int &aInc, C * &bPt, 
                                    unsigned int &bInc, const unsigned int aRow, 
                                    const MatLeft &a, const MatRight &b)
    {
        if (MatLeft::Options == Eigen::RowMajor)
        {
            aPt  = a.data() + aRow*a.cols();
            aInc = 1;
        }
        else if (MatLeft::Options == Eigen::ColMajor)
        {
            aPt  = a.data() + aRow;
            aInc = a.rows();
        }
        if (MatRight::Options == Eigen::RowMajor)
        {
            bPt  = b.data() + aRow;
            bInc = b.cols();
        }
        else if (MatRight::Options == Eigen::ColMajor)
        {
            bPt  = b.data() + aRow*b.rows();
            bInc = 1;
        }
    }
 #ifdef USE_MKL
    // Set (aPt, aInc) to walk along column aCol of a and (bPt, bInc) to walk
    // along row aCol of b, according to the storage order of each matrix.
    template <typename C, typename MatLeft, typename MatRight>
    static inline void makeDotColPt(C * &aPt, unsigned int &aInc, C * &bPt, 
                                    unsigned int &bInc, const unsigned int aCol, 
                                    const MatLeft &a, const MatRight &b)
    {
        if (MatLeft::Options == Eigen::RowMajor)
        {
            aPt  = a.data() + aCol;
            aInc = a.cols();
        }
        else if (MatLeft::Options == Eigen::ColMajor)
        {
            aPt  = a.data() + aCol*a.rows();
            aInc = 1;
        }
        if (MatRight::Options == Eigen::RowMajor)
        {
            bPt  = b.data() + aCol*b.cols();
            bInc = 1;
        }
        else if (MatRight::Options == Eigen::ColMajor)
        {
            bPt  = b.data() + aCol;
            bInc = b.rows();
        }
    }
    // res = (row aRow of a) . (column aRow of b), unconjugated, single precision
    template <typename MatLeft, typename MatRight>
    static inline void dotuRow(ComplexF &res, const unsigned int aRow,
                               const MatLeft &a, const MatRight &b)
    {
        const ComplexF *aPt, *bPt;
        unsigned int   aInc, bInc;
        makeDotRowPt(aPt, aInc, bPt, bInc, aRow, a, b);
        cblas_cdotu_sub(a.cols(), aPt, aInc, bPt, bInc, &res);
    }
    // res = (column aCol of a) . (row aCol of b), unconjugated, single precision
    template <typename MatLeft, typename MatRight>
    static inline void dotuCol(ComplexF &res, const unsigned int aCol,
                               const MatLeft &a, const MatRight &b)
    {
        const ComplexF *aPt, *bPt;
        unsigned int   aInc, bInc;
        makeDotColPt(aPt, aInc, bPt, bInc, aCol, a, b);
        cblas_cdotu_sub(a.rows(), aPt, aInc, bPt, bInc, &res);
    }
    // res = (row aRow of a) . (column aRow of b), unconjugated, double precision
    template <typename MatLeft, typename MatRight>
    static inline void dotuRow(ComplexD &res, const unsigned int aRow,
                               const MatLeft &a, const MatRight &b)
    {
        const ComplexD *aPt, *bPt;
        unsigned int   aInc, bInc;
        makeDotRowPt(aPt, aInc, bPt, bInc, aRow, a, b);
        cblas_zdotu_sub(a.cols(), aPt, aInc, bPt, bInc, &res);
    }
    // res = (column aCol of a) . (row aCol of b), unconjugated, double precision
    template <typename MatLeft, typename MatRight>
    static inline void dotuCol(ComplexD &res, const unsigned int aCol,
                               const MatLeft &a, const MatRight &b)
    {
        const ComplexD *aPt, *bPt;
        unsigned int   aInc, bInc;
        makeDotColPt(aPt, aInc, bPt, bInc, aCol, a, b);
        cblas_zdotu_sub(a.rows(), aPt, aInc, bPt, bInc, &res);
    }
 #endif
 };
 /******************************************************************************
 *                     A2AMatrixIo template implementation                    *
 ******************************************************************************/
@@ -148,6 +392,31 @@ A2AMatrixIo<T>::A2AMatrixIo(std::string filename, std::string dataname,
 , nt_(nt), ni_(ni), nj_(nj)
 {}
 // access //////////////////////////////////////////////////////////////////////
 // Return nt_, the time extent of the matrix set (checked against the first
 // dimension of the dataset when loading).
 template <typename T>
 unsigned int A2AMatrixIo<T>::getNt(void) const
 {
    return nt_;
 }
 // Return ni_, the row extent of each matrix (second dimension of the
 // dataset). May be 0 until load() has read the size from the file.
 template <typename T>
 unsigned int A2AMatrixIo<T>::getNi(void) const
 {
    return ni_;
 }
 // Return nj_, the column extent of each matrix (third dimension of the
 // dataset). May be 0 until load() has read the size from the file.
 template <typename T>
 unsigned int A2AMatrixIo<T>::getNj(void) const
 {
    return nj_;
 }
 // Return the size in bytes of the full matrix set, nt_ x ni_ x nj_ elements
 // of type T.
 template <typename T>
 size_t A2AMatrixIo<T>::getSize(void) const
 {
    // widen to size_t before multiplying: the product of three unsigned int
    // would otherwise be evaluated in 32 bits and wrap for large matrices
    return static_cast<size_t>(nt_)*ni_*nj_*sizeof(T);
 }
 // file allocation /////////////////////////////////////////////////////////////
 template <typename T>
 template <typename MetadataType>
@@ -171,11 +440,12 @@ void A2AMatrixIo<T>::initFile(const MetadataType &d, const unsigned int chunkSiz
    }
    // create the dataset
-    Hdf5Reader reader(filename_);
+    Hdf5Reader reader(filename_, false);
    push(reader, dataname_);
    auto &group = reader.getGroup();
    plist.setChunk(chunk.size(), chunk.data());
    plist.setFletcher32();
    dataset = group.createDataSet(HADRONS_A2AM_NAME, Hdf5Type<T>::type(), dataspace, plist);
 #else
    HADRONS_ERROR(Implementation, "all-to-all matrix I/O needs HDF5 library");
@@ -191,7 +461,7 @@ void A2AMatrixIo<T>::saveBlock(const T *data,
                               const unsigned int blockSizej)
 {
 #ifdef HAVE_HDF5
-    Hdf5Reader           reader(filename_);
+    Hdf5Reader           reader(filename_, false);
    std::vector<hsize_t> count = {nt_, blockSizei, blockSizej},
                         offset = {0, static_cast<hsize_t>(i),
                                   static_cast<hsize_t>(j)},
@@ -226,6 +496,99 @@ void A2AMatrixIo<T>::saveBlock(const A2AMatrixSet<T> &m,
    saveBlock(m.data() + offset, i, j, blockSizei, blockSizej);
 }
 // Load the all-to-all matrix set stored in dataset HADRONS_A2AM_NAME of group
 // dataname_ in file filename_, and store time slice t into v[t], cast to VecT.
 //   v        : indexable container; v[t] must be assignable from an ni_ x nj_
 //              matrix expression for every t in [0, nt_)
 //   tRead    : optional (may be nullptr); when provided, receives the time
 //              spent in the HDF5 read calls, in the unit returned by usecond()
 //   useCache : true  -> read the whole dataset in one call, then split it
 //              false -> read one time slice per call (much slower)
 // If ni_ or nj_ is zero, both are set from the dataset dimensions. An error
 // is raised when the dataset is not rank 3 or its dimensions do not match
 // the expected ones.
 template <typename T>
 template <template <class> class Vec, typename VecT>
 void A2AMatrixIo<T>::load(Vec<VecT> &v, double *tRead, const bool useCache)
 {
 #ifdef HAVE_HDF5
    Hdf5Reader           reader(filename_);
    std::vector<hsize_t> hdim;
    H5NS::DataSet        dataset;
    H5NS::DataSpace      dataspace;
    H5NS::CompType       datatype;

    push(reader, dataname_);
    auto &group = reader.getGroup();
    dataset     = group.openDataSet(HADRONS_A2AM_NAME);
    datatype    = dataset.getCompType();
    dataspace   = dataset.getSpace();
    hdim.resize(dataspace.getSimpleExtentNdims());
    dataspace.getSimpleExtentDims(hdim.data());
    // hdim[0..2] are accessed below, make sure they exist
    if (hdim.size() != 3)
    {
        HADRONS_ERROR(Size, "all-to-all matrix dataset has rank "
            + std::to_string(hdim.size()) + ", expected 3");
    }
    // all sizes specified: the file must match them exactly
    if ((nt_ != 0) and (ni_ != 0) and (nj_ != 0) and
        ((hdim[0] != nt_) or (hdim[1] != ni_) or (hdim[2] != nj_)))
    {
        HADRONS_ERROR(Size, "all-to-all matrix size mismatch (got "
            + std::to_string(hdim[0]) + "x" + std::to_string(hdim[1]) + "x"
            + std::to_string(hdim[2]) + ", expected "
            + std::to_string(nt_) + "x" + std::to_string(ni_) + "x"
            + std::to_string(nj_) + ")");
    }
    // matrix size unspecified: check time extent only and adopt the file sizes
    else if ((ni_ == 0) or (nj_ == 0))
    {
        if (hdim[0] != nt_)
        {
            HADRONS_ERROR(Size, "all-to-all time size mismatch (got "
                + std::to_string(hdim[0]) + ", expected "
                + std::to_string(nt_) + ")");
        }
        ni_ = static_cast<unsigned int>(hdim[1]);
        nj_ = static_cast<unsigned int>(hdim[2]);
    }

    // number of elements per time slice, computed in size_t to avoid 32-bit
    // wrap-around for large matrices
    const size_t matSize = static_cast<size_t>(ni_)*nj_;

    if (useCache)
    {
        std::vector<T> buf(nt_*matSize);
        T              *pt;

        if (tRead) *tRead  = -usecond();
        dataset.read(buf.data(), datatype);
        if (tRead) *tRead += usecond();
        pt = buf.data();
        for (unsigned int t = 0; t < nt_; ++t)
        {
            A2AMatrixMap<T> bufMap(pt, ni_, nj_);

            v[t]  = bufMap.template cast<VecT>();
            pt   += matSize;
        }
    }
    // if useCache = false, do I/O timeslice per timeslice (much slower)
    else
    {
        A2AMatrix<T>         buf(ni_, nj_);
        std::vector<hsize_t> count    = {1, static_cast<hsize_t>(ni_),
                                         static_cast<hsize_t>(nj_)},
                             stride   = {1, 1, 1},
                             block    = {1, 1, 1},
                             memCount = {static_cast<hsize_t>(ni_),
                                         static_cast<hsize_t>(nj_)};
        H5NS::DataSpace      memspace(memCount.size(), memCount.data());

        std::cout << "Loading timeslice";
        std::cout.flush();
        // tRead is optional and defaults to nullptr: never dereference blindly
        if (tRead) *tRead = 0.;
        // time slices are read from the last one down to the first one
        for (unsigned int tp1 = nt_; tp1 > 0; --tp1)
        {
            unsigned int         t      = tp1 - 1;
            std::vector<hsize_t> offset = {static_cast<hsize_t>(t), 0, 0};

            if (t % 10 == 0)
            {
                std::cout << " " << t;
                std::cout.flush();
            }
            dataspace.selectHyperslab(H5S_SELECT_SET, count.data(), offset.data(),
                                      stride.data(), block.data());
            if (tRead) *tRead -= usecond();
            dataset.read(buf.data(), datatype, memspace, dataspace);
            if (tRead) *tRead += usecond();
            v[t] = buf.template cast<VecT>();
        }
        std::cout << std::endl;
    }
 #else
    HADRONS_ERROR(Implementation, "all-to-all matrix I/O needs HDF5 library");
 #endif
 }
 /******************************************************************************
 *               A2AMatrixBlockComputation template implementation            *
 ******************************************************************************/
@@ -108,6 +108,9 @@ void Application::run(void)
        HADRONS_ERROR(Definition, "run id is empty");
    }
    LOG(Message) << "RUN ID '" << getPar().runId << "'" << std::endl;
    BinaryIO::latticeWriteMaxRetry = getPar().parallelWriteMaxRetry;
    LOG(Message) << "Attempt(s) for resilient parallel I/O: " 
                 << BinaryIO::latticeWriteMaxRetry << std::endl;
    vm().setRunId(getPar().runId);
    vm().printContent();
    env().printContent();
@@ -41,14 +41,6 @@ BEGIN_HADRONS_NAMESPACE
 class Application
 {
 public:
    class TrajRange: Serializable
    {
    public:
        GRID_SERIALIZABLE_CLASS_MEMBERS(TrajRange,
                                        unsigned int, start,
                                        unsigned int, end,
                                        unsigned int, step);
    };
    class GlobalPar: Serializable
    {
    public:
@@ -56,7 +48,9 @@ public:
                                        TrajRange,                  trajCounter,
                                        VirtualMachine::GeneticPar, genetic,
                                        std::string,                runId,
-                                        std::string,                graphFile);
+                                        std::string,                graphFile,
                                        int,                        parallelWriteMaxRetry);
        GlobalPar(void): parallelWriteMaxRetry{-1} {}
    };
 public:
    // constructors
@@ -29,6 +29,7 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #define Hadrons_DiskVector_hpp_
 #include <Hadrons/Global.hpp>
 #include <Hadrons/A2AMatrix.hpp>
 #include <deque>
 #include <sys/stat.h>
 #include <ftw.h>
@@ -59,14 +60,18 @@ public:
        : master_(master), cmaster_(master), i_(i) {}
        // operator=: somebody is trying to store a vector element
-        // write to disk and cache
+        // write to cache and tag as modified
        T &operator=(const T &obj) const
        {
            auto &cache    = *master_.cachePtr_;
            auto &modified = *master_.modifiedPtr_;
            auto &index    = *master_.indexPtr_;
            DV_DEBUG_MSG(&master_, "writing to " << i_);
            master_.cacheInsert(i_, obj);
-            master_.save(master_.filename(i_), obj);
+            modified[index.at(i_)] = true;
-            return master_.cachePtr_->at(i_);
+            return cache[index.at(i_)];
        }
        // implicit cast to const object reference and redirection
@@ -83,6 +88,7 @@ public:
 public:
    DiskVectorBase(const std::string dirname, const unsigned int size = 0,
                   const unsigned int cacheSize = 1, const bool clean = true);
    DiskVectorBase(DiskVectorBase<T> &&v) = default;
    virtual ~DiskVectorBase(void);
    const T & operator[](const unsigned int i) const;
    RwAccessHelper operator[](const unsigned int i);
@@ -103,7 +109,10 @@ private:
    bool                                                  clean_;
    // using pointers to allow modifications when class is const
    // semantic: const means data unmodified, but cache modification allowed
-    std::unique_ptr<std::map<unsigned int, T>> cachePtr_;
+    std::unique_ptr<std::vector<T>>                       cachePtr_;
    std::unique_ptr<std::vector<bool>>                    modifiedPtr_;
    std::unique_ptr<std::map<unsigned int, unsigned int>> indexPtr_;
    std::unique_ptr<std::stack<unsigned int>>             freePtr_;
    std::unique_ptr<std::deque<unsigned int>>             loadsPtr_;                
 };
@@ -135,7 +144,7 @@ private:
 *                      Specialisation for Eigen matrices                     *
 ******************************************************************************/
 template <typename T>
-using EigenDiskVectorMat = Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>;
+using EigenDiskVectorMat = A2AMatrix<T>;
 template <typename T>
 class EigenDiskVector: public DiskVectorBase<EigenDiskVectorMat<T>>
@@ -153,23 +162,30 @@ private:
    virtual void load(EigenDiskVectorMat<T> &obj, const std::string filename) const
    {
        std::ifstream f(filename, std::ios::binary);
-        std::vector<unsigned char> hash(SHA256_DIGEST_LENGTH);
+        uint32_t      crc, check;
        Eigen::Index  nRow, nCol;
        size_t        matSize;
-        double                     t;
+        double        tRead, tHash;
-        f.read(reinterpret_cast<char *>(hash.data()), hash.size()*sizeof(unsigned char));
+        f.read(reinterpret_cast<char *>(&crc), sizeof(crc));
-        f.read(reinterpret_cast<char *>(&nRow), sizeof(Eigen::Index));
+        f.read(reinterpret_cast<char *>(&nRow), sizeof(nRow));
-        f.read(reinterpret_cast<char *>(&nCol), sizeof(Eigen::Index));
+        f.read(reinterpret_cast<char *>(&nCol), sizeof(nCol));
        obj.resize(nRow, nCol);
        matSize = nRow*nCol*sizeof(T);
-        t  = -usecond();
+        tRead  = -usecond();
        f.read(reinterpret_cast<char *>(obj.data()), matSize);
-        t += usecond();
+        tRead += usecond();
-        DV_DEBUG_MSG(this, "Eigen read " << matSize/t*1.0e6/1024/1024 << " MB/s");
+        tHash  = -usecond();
-        auto check = GridChecksum::sha256(obj.data(), matSize);
+#ifdef USE_IPP
-        DV_DEBUG_MSG(this, "Eigen sha256 " << GridChecksum::sha256_string(check));
+        check  = GridChecksum::crc32c(obj.data(), matSize);
-        if (hash != check)
+#else
        check  = GridChecksum::crc32(obj.data(), matSize);
 #endif
        tHash += usecond();
        DV_DEBUG_MSG(this, "Eigen read " << tRead/1.0e6 << " sec " << matSize/tRead*1.0e6/1024/1024 << " MB/s");
        DV_DEBUG_MSG(this, "Eigen crc32 " << std::hex << check << std::dec 
                     << " " << tHash/1.0e6 << " sec " << matSize/tHash*1.0e6/1024/1024 << " MB/s");
        if (crc != check)
        {
            HADRONS_ERROR(Io, "checksum failed")
        }
@@ -178,23 +194,30 @@ private:
    virtual void save(const std::string filename, const EigenDiskVectorMat<T> &obj) const
    {
        std::ofstream f(filename, std::ios::binary);
-        std::vector<unsigned char> hash(SHA256_DIGEST_LENGTH);
+        uint32_t      crc;
        Eigen::Index  nRow, nCol;
        size_t        matSize;
-        double                     t;
+        double        tWrite, tHash;
        nRow    = obj.rows();
        nCol    = obj.cols();
        matSize = nRow*nCol*sizeof(T);
-        hash    = GridChecksum::sha256(obj.data(), matSize);
+        tHash   = -usecond();
-        DV_DEBUG_MSG(this, "Eigen sha256 " << GridChecksum::sha256_string(hash));
+#ifdef USE_IPP
-        f.write(reinterpret_cast<char *>(hash.data()), hash.size()*sizeof(unsigned char));
+        crc     = GridChecksum::crc32c(obj.data(), matSize);
-        f.write(reinterpret_cast<char *>(&nRow), sizeof(Eigen::Index));
+#else
-        f.write(reinterpret_cast<char *>(&nCol), sizeof(Eigen::Index));
+        crc     = GridChecksum::crc32(obj.data(), matSize);
-        t  = -usecond();
+#endif
        tHash  += usecond();
        f.write(reinterpret_cast<char *>(&crc), sizeof(crc));
        f.write(reinterpret_cast<char *>(&nRow), sizeof(nRow));
        f.write(reinterpret_cast<char *>(&nCol), sizeof(nCol));
        tWrite = -usecond();
        f.write(reinterpret_cast<const char *>(obj.data()), matSize);
-        t += usecond();
+        tWrite += usecond();
-        DV_DEBUG_MSG(this, "Eigen write " << matSize/t*1.0e6/1024/1024 << " MB/s");
+        DV_DEBUG_MSG(this, "Eigen write " << tWrite/1.0e6 << " sec " << matSize/tWrite*1.0e6/1024/1024 << " MB/s");
        DV_DEBUG_MSG(this, "Eigen crc32 " << std::hex << crc << std::dec
                     << " " << tHash/1.0e6 << " sec " << matSize/tHash*1.0e6/1024/1024 << " MB/s");
    }
 };
@@ -207,7 +230,10 @@ DiskVectorBase<T>::DiskVectorBase(const std::string dirname,
                                  const unsigned int cacheSize,
                                  const bool clean)
 : dirname_(dirname), size_(size), cacheSize_(cacheSize), clean_(clean)
-, cachePtr_(new std::map<unsigned int, T>())
+, cachePtr_(new std::vector<T>(size))
 , modifiedPtr_(new std::vector<bool>(size, false))
 , indexPtr_(new std::map<unsigned int, unsigned int>())
 , freePtr_(new std::stack<unsigned int>)
 , loadsPtr_(new std::deque<unsigned int>())
 {
    struct stat s;
@@ -217,6 +243,10 @@ DiskVectorBase<T>::DiskVectorBase(const std::string dirname,
        HADRONS_ERROR(Io, "directory '" + dirname + "' already exists")
    }
    mkdir(dirname);
    for (unsigned int i = 0; i < cacheSize_; ++i)
    {
        freePtr_->push(i);
    }
 }
 template <typename T>
@@ -232,6 +262,8 @@ template <typename T>
 const T & DiskVectorBase<T>::operator[](const unsigned int i) const
 {
    auto &cache   = *cachePtr_;
    auto &index   = *indexPtr_;
    auto &freeInd = *freePtr_;
    auto &loads   = *loadsPtr_;
    DV_DEBUG_MSG(this, "accessing " << i << " (RO)");
@@ -241,7 +273,7 @@ const T & DiskVectorBase<T>::operator[](const unsigned int i) const
        HADRONS_ERROR(Size, "index out of range");
    }
    const_cast<double &>(access_)++;
-    if (cache.find(i) == cache.end())
+    if (index.find(i) == index.end())
    {
        // cache miss
        DV_DEBUG_MSG(this, "cache miss");
@@ -268,7 +300,7 @@ const T & DiskVectorBase<T>::operator[](const unsigned int i) const
    DV_DEBUG_MSG(this, "in cache: " << msg);
 #endif
-    return cache.at(i);
+    return cache[index.at(i)];
 }
 template <typename T>
@@ -307,12 +339,23 @@ template <typename T>
 void DiskVectorBase<T>::evict(void) const
 {
    auto &cache    = *cachePtr_;
    auto &modified = *modifiedPtr_;
    auto &index    = *indexPtr_;
    auto &freeInd  = *freePtr_;
    auto &loads    = *loadsPtr_;
-    if (cache.size() >= cacheSize_)
+    if (index.size() >= cacheSize_)
    {
-        DV_DEBUG_MSG(this, "evicting " << loads.front());
+        unsigned int i = loads.front();
-        cache.erase(loads.front());
+        
        DV_DEBUG_MSG(this, "evicting " << i);
        if (modified[index.at(i)])
        {
            DV_DEBUG_MSG(this, "element " << i << " modified, saving to disk");
            save(filename(i), cache[index.at(i)]);
        }
        freeInd.push(index.at(i));
        index.erase(i);
        loads.pop_front();
    }
 }
@@ -321,29 +364,43 @@ template <typename T>
 void DiskVectorBase<T>::fetch(const unsigned int i) const
 {
    auto &cache    = *cachePtr_;
    auto &modified = *modifiedPtr_;
    auto &index    = *indexPtr_;
    auto &freeInd  = *freePtr_;
    auto &loads    = *loadsPtr_;
    struct stat s;
    DV_DEBUG_MSG(this, "loading " << i << " from disk");
    evict();
    if(stat(filename(i).c_str(), &s) != 0)
    {
        HADRONS_ERROR(Io, "disk vector element " + std::to_string(i) + " uninitialised");
    }
-    load(cache[i], filename(i));
+    index[i] = freeInd.top();
    freeInd.pop();
    load(cache[index.at(i)], filename(i));
    loads.push_back(i);
    modified[index.at(i)] = false;
 }
 template <typename T>
 void DiskVectorBase<T>::cacheInsert(const unsigned int i, const T &obj) const
 {
    auto &cache    = *cachePtr_;
    auto &modified = *modifiedPtr_;
    auto &index    = *indexPtr_;
    auto &freeInd  = *freePtr_;
    auto &loads    = *loadsPtr_;
    evict();
-    cache[i] = obj;
+    index[i] = freeInd.top();
    freeInd.pop();
    cache[index.at(i)] = obj;
    loads.push_back(i);
    modified[index.at(i)] = false;
 #ifdef DV_DEBUG
    std::string msg;
@@ -166,7 +166,13 @@ std::string Hadrons::dirname(const std::string &s)
 void Hadrons::makeFileDir(const std::string filename, GridBase *g)
 {
-    if (g->IsBoss())
+    bool doIt = true;
    if (g)
    {
        doIt = g->IsBoss();
    }
    if (doIt)
    {
        std::string dir    = dirname(filename);
        int         status = mkdir(dir);
@@ -32,6 +32,7 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #include <set>
 #include <stack>
 #include <regex>
 #include <Grid/Grid.h>
 #include <cxxabi.h>
@@ -217,15 +218,15 @@ typedef XmlReader ResultReader;
 typedef XmlWriter ResultWriter;
 #endif
-#define RESULT_FILE_NAME(name) \
+#define RESULT_FILE_NAME(name, traj) \
-name + "." + std::to_string(vm().getTrajectory()) + "." + resultFileExt
+name + "." + std::to_string(traj) + "." + resultFileExt
 // recursive mkdir
 #define MAX_PATH_LENGTH 512u
 int         mkdir(const std::string dirName);
 std::string basename(const std::string &s);
 std::string dirname(const std::string &s);
-void        makeFileDir(const std::string filename, GridBase *g);
+void        makeFileDir(const std::string filename, GridBase *g = nullptr);
 // default Schur convention
 #ifndef HADRONS_DEFAULT_SCHUR 
@@ -248,6 +249,47 @@ void        makeFileDir(const std::string filename, GridBase *g);
 // pretty print time profile
 void printTimeProfile(const std::map<std::string, GridTime> &timing, GridTime total);
 // token replacement utility
 // Looks for the first occurrence of <mark><token><mark> in str (e.g. "@traj@"
 // with the default mark) and substitutes it in place with std::to_string(x).
 // str is left untouched when the marked token is absent; later occurrences
 // are not replaced.
 template <typename T>
 void tokenReplace(std::string &str, const std::string token,
                   const T &x, const std::string mark = "@")
 {
    const std::string            pattern = mark + token + mark;
    const std::string::size_type where   = str.find(pattern);

    if (where == std::string::npos)
    {
        return;
    }
    str.replace(where, pattern.size(), std::to_string(x));
 }
 // trajectory range
 // Serializable description of the trajectories to process: the values
 // start, start + step, ... strictly below end, minus those listed in exclude
 // (parsed with strToVec<unsigned int>).
 class TrajRange: Serializable
 {
 public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(TrajRange,
                                    unsigned int, start,
                                    unsigned int, end,
                                    unsigned int, step,
                                    std::string,  exclude);

    // Build the explicit list of trajectory numbers described by the range.
    // A zero step is treated as a degenerate range holding at most start
    // (it used to loop forever), and the iteration stops before t + step
    // could wrap around the unsigned int range.
    inline std::vector<unsigned int> getTrajectoryList(void)
    {
        std::vector<unsigned int> excVec = strToVec<unsigned int>(exclude);
        std::vector<unsigned int> list;

        for (unsigned int t = start; t < end; )
        {
            if (std::find(excVec.begin(), excVec.end(), t) == excVec.end())
            {
                list.push_back(t);
            }
            // here t < end, so end - t cannot underflow; leaving when
            // end - t <= step is equivalent to t + step >= end without
            // computing a possibly overflowing sum
            if ((step == 0) or (end - t <= step))
            {
                break;
            }
            t += step;
        }

        return list;
    }
 };
 END_HADRONS_NAMESPACE
 #include <Hadrons/Exceptions.hpp>
@@ -5,17 +5,17 @@ lib_LIBRARIES = libHadrons.a
 include modules.inc
 libHadrons_a_SOURCES = \
    $(modules_cc)      \
    Application.cc     \
    Environment.cc     \
 	Exceptions.cc      \
    Global.cc          \
    Module.cc		   \
 	TimerArray.cc      \
-	VirtualMachine.cc
+	VirtualMachine.cc  \
 	$(modules_cc)
 libHadrons_adir = $(includedir)/Hadrons
 nobase_libHadrons_a_HEADERS = \
 	$(modules_hpp)            \
 	A2AVectors.hpp            \
 	A2AMatrix.hpp             \
 	Application.hpp           \
@@ -33,4 +33,6 @@ nobase_libHadrons_a_HEADERS = \
 	ModuleFactory.hpp         \
 	Solver.hpp                \
 	TimerArray.hpp            \
-	VirtualMachine.hpp
+	VirtualMachine.hpp        \
 	Utilities/Contractor.hpp  \
 	$(modules_hpp)
@@ -144,7 +144,7 @@ if (env().getGrid()->IsBoss() and !ioStem.empty())\
 {\
    makeFileDir(ioStem, env().getGrid());\
    {\
-        ResultWriter _writer(RESULT_FILE_NAME(ioStem));\
+        ResultWriter _writer(RESULT_FILE_NAME(ioStem, vm().getTrajectory()));\
        write(_writer, name, result);\
    }\
 }
@@ -24,9 +24,11 @@
 #include <Hadrons/Modules/MSolver/Guesser.hpp>
 #include <Hadrons/Modules/MSolver/RBPrecCG.hpp>
 #include <Hadrons/Modules/MSolver/A2AVectors.hpp>
 #include <Hadrons/Modules/MSolver/A2AAslashVectors.hpp>
 #include <Hadrons/Modules/MGauge/UnitEm.hpp>
 #include <Hadrons/Modules/MGauge/StoutSmearing.hpp>
 #include <Hadrons/Modules/MGauge/Unit.hpp>
 #include <Hadrons/Modules/MGauge/Electrify.hpp>
 #include <Hadrons/Modules/MGauge/Random.hpp>
 #include <Hadrons/Modules/MGauge/GaugeFix.hpp>
 #include <Hadrons/Modules/MGauge/FundtoHirep.hpp>
@@ -49,7 +49,8 @@ public:
                                    unsigned int, Ls,
                                    double      , mass,
                                    double      , M5,
-                                    std::string , boundary);
+                                    std::string , boundary,
                                    std::string , twist);
 };
 template <typename FImpl>
@@ -119,8 +120,9 @@ void TDWF<FImpl>::setup(void)
    auto &grb4 = *envGetRbGrid(FermionField);
    auto &g5   = *envGetGrid(FermionField, par().Ls);
    auto &grb5 = *envGetRbGrid(FermionField, par().Ls);
-    std::vector<Complex> boundary = strToVec<Complex>(par().boundary);
+    typename DomainWallFermion<FImpl>::ImplParams implParams;
-    typename DomainWallFermion<FImpl>::ImplParams implParams(boundary);
+    implParams.boundary_phases = strToVec<Complex>(par().boundary);
    implParams.twist_n_2pi_L   = strToVec<Real>(par().twist);
    envCreateDerived(FMat, DomainWallFermion<FImpl>, getName(), par().Ls, U, g5,
                     grb5, g4, grb4, par().mass, par().M5, implParams);
 }
@@ -49,7 +49,8 @@ public:
                                    double      , M5,
                                    double      , b,
                                    double      , c,
-                                    std::string , boundary);
+                                    std::string , boundary,
                                    std::string , twist);
 };
 template <typename FImpl>
@@ -119,8 +120,9 @@ void TMobiusDWF<FImpl>::setup(void)
    auto &grb4 = *envGetRbGrid(FermionField);
    auto &g5   = *envGetGrid(FermionField, par().Ls);
    auto &grb5 = *envGetRbGrid(FermionField, par().Ls);
-    std::vector<Complex> boundary = strToVec<Complex>(par().boundary);
+    typename MobiusFermion<FImpl>::ImplParams implParams;
-    typename MobiusFermion<FImpl>::ImplParams implParams(boundary);
+    implParams.boundary_phases = strToVec<Complex>(par().boundary);
    implParams.twist_n_2pi_L   = strToVec<Real>(par().twist);
    envCreateDerived(FMat, MobiusFermion<FImpl>, getName(), par().Ls, U, g5,
                     grb5, g4, grb4, par().mass, par().M5, par().b, par().c,
                     implParams);
@@ -48,7 +48,8 @@ public:
                                    double      , mass,
                                    double      , M5,
                                    double      , scale,
-                                    std::string , boundary);
+                                    std::string , boundary,
                                    std::string , twist);
 };
 template <typename FImpl>
@@ -118,8 +119,9 @@ void TScaledDWF<FImpl>::setup(void)
    auto &grb4 = *envGetRbGrid(FermionField);
    auto &g5   = *envGetGrid(FermionField, par().Ls);
    auto &grb5 = *envGetRbGrid(FermionField, par().Ls);
-    std::vector<Complex> boundary = strToVec<Complex>(par().boundary);
+    typename ScaledShamirFermion<FImpl>::ImplParams implParams;
-    typename MobiusFermion<FImpl>::ImplParams implParams(boundary);
+    implParams.boundary_phases = strToVec<Complex>(par().boundary);
    implParams.twist_n_2pi_L   = strToVec<Real>(par().twist);
    envCreateDerived(FMat, ScaledShamirFermion<FImpl>, getName(), par().Ls, U, g5,
                     grb5, g4, grb4, par().mass, par().M5, par().scale,
                     implParams);
@@ -47,7 +47,9 @@ public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(WilsonPar,
                                    std::string, gauge,
                                    double     , mass,
-                                    std::string, boundary);
+                                    std::string, boundary,
                                    std::string, string,
                                    std::string, twist);
 };
 template <typename FImpl>
@@ -113,8 +115,9 @@ void TWilson<FImpl>::setup(void)
    auto &U      = envGet(GaugeField, par().gauge);
    auto &grid   = *envGetGrid(FermionField);
    auto &gridRb = *envGetRbGrid(FermionField);
-    std::vector<Complex> boundary = strToVec<Complex>(par().boundary);
+    typename WilsonFermion<FImpl>::ImplParams implParams;
-    typename WilsonFermion<FImpl>::ImplParams implParams(boundary);
+    implParams.boundary_phases = strToVec<Complex>(par().boundary);
    implParams.twist_n_2pi_L   = strToVec<Real>(par().twist);
    envCreateDerived(FMat, WilsonFermion<FImpl>, getName(), 1, U, grid, gridRb,
                     par().mass, implParams);
 }
@@ -51,7 +51,8 @@ public:
 				                    double     , csw_r,
 				                    double     , csw_t,
 				                    WilsonAnisotropyCoefficients ,clover_anisotropy,
-                                    std::string, boundary
+                                    std::string, boundary,
                                    std::string, twist
 				    );
 };
@@ -119,8 +120,9 @@ void TWilsonClover<FImpl>::setup(void)
    auto &U      = envGet(GaugeField, par().gauge);
    auto &grid   = *envGetGrid(FermionField);
    auto &gridRb = *envGetRbGrid(FermionField);
-    std::vector<Complex> boundary = strToVec<Complex>(par().boundary);
+    typename WilsonCloverFermion<FImpl>::ImplParams implParams;
-    typename WilsonCloverFermion<FImpl>::ImplParams implParams(boundary);
+    implParams.boundary_phases = strToVec<Complex>(par().boundary);
    implParams.twist_n_2pi_L   = strToVec<Real>(par().twist);
    envCreateDerived(FMat, WilsonCloverFermion<FImpl>, getName(), 1, U, grid,
                     gridRb, par().mass, par().csw_r, par().csw_t, 
                     par().clover_anisotropy, implParams); 
@@ -50,7 +50,8 @@ public:
                                    double                           , b,
                                    double                           , c,
                                    std::vector<std::complex<double>>, omega,
-                                    std::string                      , boundary);
+                                    std::string                      , boundary,
                                    std::string                      , twist);
 };
 template <typename FImpl>
@@ -127,8 +128,9 @@ void TZMobiusDWF<FImpl>::setup(void)
    auto &g5   = *envGetGrid(FermionField, par().Ls);
    auto &grb5 = *envGetRbGrid(FermionField, par().Ls);
    auto omega = par().omega;
-    std::vector<Complex> boundary = strToVec<Complex>(par().boundary);
+    typename ZMobiusFermion<FImpl>::ImplParams implParams;
-    typename ZMobiusFermion<FImpl>::ImplParams implParams(boundary);
+    implParams.boundary_phases = strToVec<Complex>(par().boundary);
    implParams.twist_n_2pi_L   = strToVec<Real>(par().twist);
    envCreateDerived(FMat, ZMobiusFermion<FImpl>, getName(), par().Ls, U, g5,
                     grb5, g4, grb4, par().mass, par().M5, omega,
                     par().b, par().c, implParams);
@@ -33,10 +33,6 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #include <Hadrons/ModuleFactory.hpp>
 #include <Hadrons/A2AMatrix.hpp>
 #ifndef ASF_IO_TYPE
 #define ASF_IO_TYPE ComplexF
 #endif
 BEGIN_HADRONS_NAMESPACE
 /******************************************************************************
@@ -113,7 +109,7 @@ public:
    typedef A2AMatrixBlockComputation<Complex, 
                                      FermionField, 
                                      A2AAslashFieldMetadata, 
-                                      ASF_IO_TYPE> Computation;
+                                      HADRONS_A2AM_IO_TYPE> Computation;
    typedef AslashFieldKernel<Complex, FImpl> Kernel;
 public:
    // constructor
@@ -196,7 +192,7 @@ void TA2AAslashField<FImpl, PhotonImpl>::execute(void)
        LOG(Message) << "  " << name << std::endl;
    }
    LOG(Message) << "A-slash field size: " << nt << "*" << N_i << "*" << N_j 
-                 << " (filesize " << sizeString(nt*N_i*N_j*sizeof(ASF_IO_TYPE)) 
+                 << " (filesize " << sizeString(nt*N_i*N_j*sizeof(HADRONS_A2AM_IO_TYPE)) 
                 << "/EM field)" << std::endl;
    // preparing "B" complexified fields
@@ -35,10 +35,6 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #include <Hadrons/ModuleFactory.hpp>
 #include <Hadrons/A2AMatrix.hpp>
 #ifndef MF_IO_TYPE
 #define MF_IO_TYPE ComplexF
 #endif
 BEGIN_HADRONS_NAMESPACE
 /******************************************************************************
@@ -118,7 +114,7 @@ public:
    typedef A2AMatrixBlockComputation<Complex, 
                                      FermionField, 
                                      A2AMesonFieldMetadata, 
-                                      MF_IO_TYPE> Computation;
+                                      HADRONS_A2AM_IO_TYPE> Computation;
    typedef MesonFieldKernel<Complex, FImpl> Kernel;
 public:
    // constructor
@@ -248,7 +244,7 @@ void TA2AMesonField<FImpl>::execute(void)
        LOG(Message) << "  " << g << std::endl;
    }
    LOG(Message) << "Meson field size: " << nt << "*" << N_i << "*" << N_j 
-                 << " (filesize " << sizeString(nt*N_i*N_j*sizeof(MF_IO_TYPE)) 
+                 << " (filesize " << sizeString(nt*N_i*N_j*sizeof(HADRONS_A2AM_IO_TYPE)) 
                 << "/momentum/bilinear)" << std::endl;
    auto &ph = envGet(std::vector<ComplexField>, momphName_);
@@ -0,0 +1,34 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Modules/MGauge/Electrify.cc
 Copyright (C) 2015-2018
 Author: Vera Guelpers <Vera.Guelpers@ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Hadrons/Modules/MGauge/Electrify.hpp>
 using namespace Grid;
 using namespace Hadrons;
 using namespace MGauge;
 template class Grid::Hadrons::MGauge::TElectrify<GIMPL>;
@@ -0,0 +1,151 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Modules/MGauge/Electrify.hpp
 Copyright (C) 2015-2018
 Author: Vera Guelpers <Vera.Guelpers@ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef Hadrons_MGauge_Electrify_hpp_
 #define Hadrons_MGauge_Electrify_hpp_
 #include <Hadrons/Global.hpp>
 #include <Hadrons/Module.hpp>
 #include <Hadrons/ModuleFactory.hpp>
 BEGIN_HADRONS_NAMESPACE
 /******************************************************************************
 *                              Electrify gauge                               *
 ******************************************************************************/
 BEGIN_MODULE_NAMESPACE(MGauge)
 /****************************************************************************
 *  Electrify a gauge field:
 *
 *  Ue_mu(x) = U_mu(x)*exp(ieqA_mu(x))
 *
 *  with
 *
 *  - gauge: U_mu(x): gauge field
 *  - emField: A_mu(x): electromagnetic photon field
 *  - e: value for the elementary charge
 *  - q: charge in units of e
 *
 *****************************************************************************/
 // Parameters of the Electrify module (see the description just above):
 // `gauge` and `emField` name the gauge field and the photon field read from
 // the environment, `e` is the value of the elementary charge and `charge` the
 // charge in units of e.
 class ElectrifyPar: Serializable
 {
 public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(ElectrifyPar,
                                    std::string, gauge,
 				    std::string, emField,
 				    double, e,
 				    double, charge);
 };
 // Module building a copy of a gauge field in which every link is multiplied
 // by a phase computed from a photon field (see execute()).
 template <typename GImpl>
 class TElectrify: public Module<ElectrifyPar>
 {
 public:
    GAUGE_TYPE_ALIASES(GImpl,);
 public:
    // type used to read the photon field from the environment
    typedef PhotonR::GaugeField     EmField;
 public:
    // constructor
    TElectrify(const std::string name);
    // destructor
    virtual ~TElectrify(void) {};
    // dependencies/products
    virtual std::vector<std::string> getInput(void);
    virtual std::vector<std::string> getOutput(void);
 protected:
    // setup
    virtual void setup(void);
    // execution
    virtual void execute(void);
 };
 MODULE_REGISTER_TMP(Electrify, TElectrify<GIMPL>, MGauge);
 /******************************************************************************
 *                            TElectrify implementation                             *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 // Forwards the module instance name to the Module<ElectrifyPar> base class;
 // the constructor body itself does nothing.
 template <typename GImpl>
 TElectrify<GImpl>::TElectrify(const std::string name)
 : Module<ElectrifyPar>(name)
 {}
 // dependencies/products ///////////////////////////////////////////////////////
 // Inputs of the module: the gauge field and the photon field named in the
 // parameters, in that order.
 template <typename GImpl>
 std::vector<std::string> TElectrify<GImpl>::getInput(void)
 {
    return {par().gauge, par().emField};
 }
 // Single output of the module: an object registered under the module name.
 template <typename GImpl>
 std::vector<std::string> TElectrify<GImpl>::getOutput(void)
 {
    return {getName()};
 }
 // setup ///////////////////////////////////////////////////////////////////////
 // Registers the objects used by execute().
 template <typename GImpl>
 void TElectrify<GImpl>::setup(void)
 {
    // output gauge field, stored under the module name
    envCreateLat(GaugeField, getName());
    // temporary complex field holding the phase of one direction at a time
    envTmpLat(LatticeComplex, "eiAmu");
 }
 // execution ///////////////////////////////////////////////////////////////////
 // Builds the electrified gauge field direction by direction:
 //   Ue_mu = U_mu * exp(i * e * charge * A_mu)
 // where U is the input gauge field and A the input photon field.
 template <typename GImpl>
 void TElectrify<GImpl>::execute(void)
 {
    LOG(Message) << "Electrify the gauge field " << par().gauge << " using the photon field " 
                  << par().emField << " with charge e*q= " << par().e << "*" << par().charge << std::endl;

    auto &Ue = envGet(GaugeField, getName());
    auto &U  = envGet(GaugeField, par().gauge);
    auto &A  = envGet(EmField,  par().emField);
    envGetTmp(LatticeComplex, eiAmu);

    // i*e*q does not depend on the direction: form it once, with the same
    // (complex * real) product the phase is built from
    const Complex iUnit(0.0, 1.0);
    const Complex ieq = iUnit * static_cast<Real>(par().e * par().charge);

    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
    {
        eiAmu = exp(ieq * PeekIndex<LorentzIndex>(A, mu));
        PokeIndex<LorentzIndex>(Ue, PeekIndex<LorentzIndex>(U, mu) * eiAmu, mu);
    }
 }
 END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
 #endif // Hadrons_MGauge_Electrify_hpp_
@@ -70,7 +70,7 @@ void TStochEm::execute(void)
    LOG(Message) << "Generating stochastic EM potential..." << std::endl;
    std::vector<Real> improvements = strToVec<Real>(par().improvement);
-    PhotonR photon(par().gauge, par().zmScheme, improvements, par().G0_qedInf);
+    PhotonR photon(envGetGrid(EmField), par().gauge, par().zmScheme, improvements);
    auto    &a = envGet(EmField, getName());
    auto    &w = envGet(EmComp, "_" + getName() + "_weight");
@@ -47,8 +47,7 @@ public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(StochEmPar,
                                    PhotonR::Gauge,    gauge,
                                    PhotonR::ZmScheme, zmScheme,
-                                    std::string,       improvement,
+                                    std::string,       improvement);
                                    Real,              G0_qedInf);
 };
 class TStochEm: public Module<StochEmPar>
@@ -62,7 +62,7 @@ void TUnitEm::setup(void)
 // execution ///////////////////////////////////////////////////////////////////
 void TUnitEm::execute(void)
 {
-    PhotonR photon(0, 0); // Just chose arbitrary input values here
+    PhotonR photon(envGetGrid(EmField), 0, 0); // Just chose arbitrary input values here
    auto    &a = envGet(EmField, getName());
    LOG(Message) << "Generating unit EM potential..." << std::endl;
    photon.UnitField(a);
@@ -146,7 +146,7 @@ void TChargedProp::execute(void)
        std::vector<int>    siteCoor;
        LOG(Message) << "Saving momentum-projected propagator to '"
-                     << RESULT_FILE_NAME(par().output) << "'..."
+                     << RESULT_FILE_NAME(par().output, vm().getTrajectory()) << "'..."
                     << std::endl;
        result.projection.resize(par().outputMom.size());
        result.lattice_size = env().getGrid()->_fdimensions;
@@ -462,7 +462,7 @@ void TScalarVP::execute(void)
    if (!par().output.empty())
    {
        LOG(Message) << "Saving momentum-projected HVP to '"
-                     << RESULT_FILE_NAME(par().output) << "'..."
+                     << RESULT_FILE_NAME(par().output, vm().getTrajectory()) << "'..."
                     << std::endl;
        saveResult(par().output, "HVP", outputData);
    }
@@ -239,7 +239,7 @@ void TVPCounterTerms::execute(void)
    if (!par().output.empty())
    {
        LOG(Message) << "Saving momentum-projected correlators to '"
-                     << RESULT_FILE_NAME(par().output) << "'..."
+                     << RESULT_FILE_NAME(par().output, vm().getTrajectory()) << "'..."
                     << std::endl;
        saveResult(par().output, "scalar_loops", outputData);
    }
@@ -0,0 +1,35 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Modules/MSolver/A2AAslashVectors.cc
 Copyright (C) 2015-2018
 Author: Vera Guelpers <Vera.Guelpers@ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Hadrons/Modules/MSolver/A2AAslashVectors.hpp>
 using namespace Grid;
 using namespace Hadrons;
 using namespace MSolver;
 template class Grid::Hadrons::MSolver::TA2AAslashVectors<FIMPL>;
 template class Grid::Hadrons::MSolver::TA2AAslashVectors<ZFIMPL>;
@@ -0,0 +1,194 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Modules/MSolver/A2AAslashVectors.hpp
 Copyright (C) 2015-2018
 Author: Vera Guelpers <Vera.Guelpers@ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef Hadrons_MSolver_A2AAslashVectors_hpp_
 #define Hadrons_MSolver_A2AAslashVectors_hpp_
 #include <Hadrons/Global.hpp>
 #include <Hadrons/Module.hpp>
 #include <Hadrons/ModuleFactory.hpp>
 #include <Hadrons/Solver.hpp>
 #include <Hadrons/A2AVectors.hpp>
 BEGIN_HADRONS_NAMESPACE
 /******************************************************************************
 *                Sequential A-slash insertion on A2A vectors                 *
 ******************************************************************************/
 BEGIN_MODULE_NAMESPACE(MSolver)
 /****************************************************************************
 *  Calculate a sequential propagator on an insertion of i*g_mu*A_mu 
 *  on an A2A vector
 *
 *  vv_i(y) = S(y,x) * i * g_mu*A_mu(x) * v_i(x)
 *
 *  with
 *
 *  - vector: A2A vector v_i(x)
 *  - emField: A_mu(x): electromagnetic photon field
 *  - solver: the solver for calculating the sequential propagator
 *
 *****************************************************************************/
 // Parameters of the A2AAslashVectors module. `vector`, `emField` and `solver`
 // name the environment objects listed in the description just above. `output`
 // and `multiFile` are passed to A2AVectorsIo::write by execute(); nothing is
 // written when `output` is empty.
 class A2AAslashVectorsPar: Serializable
 {
 public:
  GRID_SERIALIZABLE_CLASS_MEMBERS(A2AAslashVectorsPar,
                                  std::string, vector,
                                  std::string, emField,
                                  std::string, solver,
                                  std::string, output,
                                  bool,        multiFile);
 };
 // Module applying, to each vector of an input set, the insertion
 // i*g_mu*A_mu followed by a solve with the configured solver (see execute()).
 template <typename FImpl>
 class TA2AAslashVectors : public Module<A2AAslashVectorsPar>
 {
 public:
    FERM_TYPE_ALIASES(FImpl,);
    SOLVER_TYPE_ALIASES(FImpl,);
 public:
    // type used to read the photon field from the environment
    typedef PhotonR::GaugeField EmField;
 public:
    // constructor
    TA2AAslashVectors(const std::string name);
    // destructor
    virtual ~TA2AAslashVectors(void) {};
    // dependency relation
    virtual std::vector<std::string> getInput(void);
    virtual std::vector<std::string> getOutput(void);
    // setup
    virtual void setup(void);
    // execution
    virtual void execute(void);
 private:
    // Ls of the solver object, assigned in setup(); execute() solves directly
    // on 4d fields when it equals 1 and goes through 5d temporaries otherwise
    unsigned int Ls_;
 };
 MODULE_REGISTER_TMP(A2AAslashVectors, TA2AAslashVectors<FIMPL>, MSolver);
 MODULE_REGISTER_TMP(ZA2AAslashVectors, TA2AAslashVectors<ZFIMPL>, MSolver);
 /******************************************************************************
 *                       TA2AAslashVectors implementation                       *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 // Forwards the module instance name to the Module<A2AAslashVectorsPar> base.
 // Ls_ is not initialised here; it only receives a value in setup().
 template <typename FImpl>
 TA2AAslashVectors<FImpl>::TA2AAslashVectors(const std::string name)
 : Module<A2AAslashVectorsPar>(name)
 {}
 // dependencies/products ///////////////////////////////////////////////////////
 // Inputs of the module: the A2A vector set, the photon field and the solver
 // named in the parameters, in that order.
 template <typename FImpl>
 std::vector<std::string> TA2AAslashVectors<FImpl>::getInput(void)
 {
    return {par().vector, par().emField, par().solver};
 }
 // Single output of the module: an object registered under the module name.
 template <typename FImpl>
 std::vector<std::string> TA2AAslashVectors<FImpl>::getOutput(void)
 {
    return {getName()};
 }
 // setup ///////////////////////////////////////////////////////////////////////
 // Records the solver's Ls and registers the output and temporary fields used
 // by execute().
 template <typename FImpl>
 void TA2AAslashVectors<FImpl>::setup(void)
 {
    // Ls of the solver object; also used below to size the 5d temporaries
    Ls_  = env().getObjectLs(par().solver);
    auto &vvector = envGet(std::vector<FermionField>, par().vector);
    unsigned int Nmodes = vvector.size();
    // output: a vector of Nmodes fermion fields, i.e. one per input vector
    envCreate(std::vector<FermionField>, getName(), 1, 
              Nmodes, envGetGrid(FermionField));
    // temporaries: one field without Ls argument and two created with Ls_
    envTmpLat(FermionField, "v4dtmp");
    envTmpLat(FermionField, "v5dtmp", Ls_);
    envTmpLat(FermionField, "v5dtmp_sol", Ls_);
 }
 // execution ///////////////////////////////////////////////////////////////////
 // For every input vector v_i: builds sum_mu i * A_mu * (g_mu * v_i), applies
 // the solver to it and stores the result in the i-th output field. The output
 // set is optionally written to disk at the end.
 template <typename FImpl>
 void TA2AAslashVectors<FImpl>::execute(void)
 {
    auto &solver = envGet(Solver, par().solver);
    auto &stoch_photon = envGet(EmField,  par().emField);
    auto &vvector = envGet(std::vector<FermionField>, par().vector);
    auto &Aslashv = envGet(std::vector<FermionField>, getName());
    unsigned int Nmodes = vvector.size();
    auto &mat = solver.getFMat();
    envGetTmp(FermionField, v4dtmp);
    envGetTmp(FermionField, v5dtmp);
    envGetTmp(FermionField, v5dtmp_sol);
    Complex ci(0.0,1.0);
    startTimer("Seq Aslash");
    LOG(Message) << "Calculate Sequential propagator on Aslash * v with the A2A vector " 
                 << par().vector << " and the photon field " << par().emField << std::endl;
    for(unsigned int i=0; i<Nmodes; i++)
    {
        // accumulate the source i * A_mu * g_mu * v_i over mu = 0..3
        v4dtmp = zero;
        startTimer("Multiply Aslash");
        for(unsigned int mu=0;mu<=3;mu++)
        {
            Gamma gmu(Gamma::gmu[mu]);
            v4dtmp +=  ci * PeekIndex<LorentzIndex>(stoch_photon, mu) * (gmu * vvector[i]);
        }
        stopTimer("Multiply Aslash");
        startTimer("Inversion");
        if (Ls_ == 1)
        {
            // the solver is applied directly to the 4d source
            solver(Aslashv[i], v4dtmp);
        }
        else
        {
            // import the source into a 5d field, solve, then export the
            // solution back through v4dtmp (which is overwritten)
            mat.ImportPhysicalFermionSource(v4dtmp, v5dtmp);
            solver(v5dtmp_sol, v5dtmp);
            mat.ExportPhysicalFermionSolution(v5dtmp_sol, v4dtmp);
            Aslashv[i] = v4dtmp;
        }
        stopTimer("Inversion");
    }
    stopTimer("Seq Aslash");
    // the result is only written to disk when an output name was given
    if (!par().output.empty())
    {
        startTimer("I/O");
        A2AVectorsIo::write(par().output, Aslashv, par().multiFile, vm().getTrajectory());
        stopTimer("I/O");
    }
 }
 END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
 #endif // Hadrons_MSolver_A2AAslashVectors_hpp_
@@ -0,0 +1,442 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Utilities/Contractor.cc
 Copyright (C) 2015-2018
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Hadrons/Global.hpp>
 #include <Hadrons/A2AMatrix.hpp>
 #include <Hadrons/DiskVector.hpp>
 #include <Hadrons/TimerArray.hpp>
 #include <Hadrons/Utilities/Contractor.hpp>
 // Wrappers selecting between MPI calls (when GRID_COMMS_MPI3 is defined) and
 // single-process fallbacks.
 #ifdef GRID_COMMS_MPI3
 // Stores the size of MPI_COMM_WORLD in nMpi and the rank of this process in
 // rank. NOTE: expands to two statements, so it cannot be the sole body of an
 // unbraced if/for.
 #define GET_RANK(rank, nMpi) \
 MPI_Comm_size(MPI_COMM_WORLD, &(nMpi));\
 MPI_Comm_rank(MPI_COMM_WORLD, &(rank))
 #define BARRIER() MPI_Barrier(MPI_COMM_WORLD)
 // In-place all-reduce (sum, respectively max) of one MPI_DOUBLE over
 // MPI_COMM_WORLD; x must be an lvalue since its address is taken.
 #define GLOBAL_DSUM(x) MPI_Allreduce(MPI_IN_PLACE, &x, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD)
 #define GLOBAL_DMAX(x) MPI_Allreduce(MPI_IN_PLACE, &x, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD)
 #define INIT() MPI_Init(NULL, NULL)
 #define FINALIZE() MPI_Finalize()
 #else
 // Fallbacks: one process with rank 0; every other macro expands to nothing,
 // which leaves the argument of the reductions untouched.
 #define GET_RANK(rank, nMpi) (nMpi) = 1; (rank) = 0
 #define BARRIER()
 #define GLOBAL_DSUM(x)
 #define GLOBAL_DMAX(x)
 #define INIT()
 #define FINALIZE()
 #endif
 using namespace Grid;
 using namespace QCD;
 using namespace Hadrons;
 // Evaluates (t + nt) % nt with nt = par.global.nt; a variable named `par`
 // must be in scope where the macro is used.
 // NOTE(review): presumably meant to wrap time offsets periodically - confirm
 // the signedness of the operands at the call sites.
 #define TIME_MOD(t) (((t) + par.global.nt) % par.global.nt)
 // Aggregate of the contractor parameters. main() fills each member from the
 // node of the same name ("global", "a2aMatrix", "product") of the parameter
 // file.
 struct ContractorPar
 {
    Contractor::GlobalPar                  global;
    std::vector<Contractor::A2AMatrixPar>  a2aMatrix;
    std::vector<Contractor::ProductPar>    product;
 };
 // Recursive step of the Cartesian product of the sets in `times`.
 //   timeSeq: receives one entry per complete combination (appended)
 //   times  : one set of allowed values per position
 //   current: scratch buffer of size times.size(), overwritten during recursion
 //   depth  : number of trailing positions still to be filled
 // Positions are filled left to right and each set is walked in its own
 // (ascending) order, so the rightmost position varies fastest.
 void makeTimeSeq(std::vector<std::vector<unsigned int>> &timeSeq, 
                 const std::vector<std::set<unsigned int>> &times,
                 std::vector<unsigned int> &current,
                 const unsigned int depth)
 {
    // every position is set: record the combination and unwind
    if (depth == 0)
    {
        timeSeq.push_back(current);
        return;
    }

    const auto slot = times.size() - depth;

    for (const unsigned int value: times[slot])
    {
        current[slot] = value;
        makeTimeSeq(timeSeq, times, current, depth - 1);
    }
 }
 // Entry point: appends to timeSeq every combination made of one value from
 // each set of `times`, by starting the recursive overload with a scratch
 // buffer holding one slot per set.
 void makeTimeSeq(std::vector<std::vector<unsigned int>> &timeSeq, 
                 const std::vector<std::set<unsigned int>> &times)
 {
    std::vector<unsigned int> scratch(times.size());

    makeTimeSeq(timeSeq, times, scratch, times.size());
 }
 // Writes one correlator result to a file under `dir`.
 // The file stem is "<term0>_<time0>_<term1>_<time1>_..._<lastTerm>", followed
 // by "_dt_<dt>" when the contraction is not translation-averaged; the final
 // name is produced by RESULT_FILE_NAME from the stem and `traj`.
 // The term list is expected to be non-empty.
 void saveCorrelator(const Contractor::CorrelatorResult &result, const std::string dir, 
                    const unsigned int dt, const unsigned int traj)
 {
    const std::vector<std::string> terms = strToVec<std::string>(result.contraction.terms);
    std::string                    fileStem;

    // every term but the last is followed by its time
    for (unsigned int i = 0; i < terms.size() - 1; i++)
    {
        fileStem += terms[i];
        fileStem += "_";
        fileStem += std::to_string(result.times[i]);
        fileStem += "_";
    }
    fileStem += terms.back();
    if (!result.contraction.translationAverage)
    {
        fileStem += "_dt_";
        fileStem += std::to_string(dt);
    }

    const std::string filename = dir + "/" + RESULT_FILE_NAME(fileStem, traj);

    std::cout << "Saving correlator to '" << filename << "'" << std::endl;
    makeFileDir(dir);

    ResultWriter writer(filename);

    write(writer, fileStem, result);
 }
 // Prints an elapsed time in seconds and the matching bandwidth in GB/s.
 //   bytes: amount of data moved
 //   usec : elapsed time in microseconds on this process
 // The time actually printed is the result of GLOBAL_DMAX applied to usec.
 void printPerf(const double bytes, const double usec)
 {
    double slowest = usec;

    GLOBAL_DMAX(slowest);
    std::cout << slowest/1.0e6 << " sec ";
    std::cout << bytes/slowest*1.0e6/1024/1024/1024 << " GB/s";
 }
 // Prints the time and bandwidth figures of the two-argument overload,
 // followed by a floating-point rate in GFlop/s.
 //   bytes, busec: amount of data moved and the time it took (microseconds)
 //   flops, fusec: operation count and the time it took (microseconds)
 // The rate is computed from the result of GLOBAL_DMAX applied to fusec, the
 // same way the bandwidth figure is computed from the reduced I/O time.
 void printPerf(const double bytes, const double busec, 
               const double flops, const double fusec)
 {
    double maxt;
    printPerf(bytes, busec);
    std::cout << " ";
    maxt = fusec;
    GLOBAL_DMAX(maxt);
    // operations per microsecond divided by 1e3 gives GFlop/s; use the
    // reduced time rather than the local one
    std::cout << flops/maxt/1.0e3 << " GFlop/s";
 }
 // Parses a list of time specifications into a set of time values.
 //   str: tokens as split by strToVec; each token is either a single
 //        non-negative integer "t" or an inclusive range "a..b"
 //   nt : exclusive upper bound for every time value
 // Returns the union of all listed times. A range with a > b contributes
 // nothing.
 // Raises a Range error when a value is >= nt, and also when a token matches
 // neither form, so that a malformed specification is reported instead of
 // being silently left out of the set.
 std::set<unsigned int> parseTimeRange(const std::string str, const unsigned int nt)
 {
    std::regex               rex("([0-9]+)|(([0-9]+)\\.\\.([0-9]+))");
    std::smatch              sm;
    std::vector<std::string> rstr = strToVec<std::string>(str);
    std::set<unsigned int>   tSet;
    for (auto &s: rstr)
    {
        // the whole token has to match one of the two alternatives
        if (!std::regex_match(s, sm, rex))
        {
            HADRONS_ERROR(Range, "invalid time specification '" + s 
                          + "' (from expression '" + str + "')");
        }
        if (sm[1].matched)
        {
            // single time
            unsigned int t;
            t = std::stoi(sm[1].str());
            if (t >= nt)
            {
                HADRONS_ERROR(Range, "time out of range (from expression '" + str + "')");
            }
            tSet.insert(t);
        }
        else if (sm[2].matched)
        {
            // inclusive range ta..tb
            unsigned int ta, tb;
            ta = std::stoi(sm[3].str());
            tb = std::stoi(sm[4].str());
            if ((ta >= nt) or (tb >= nt))
            {
                HADRONS_ERROR(Range, "time out of range (from expression '" + str + "')");
            }
            for (unsigned int ti = ta; ti <= tb; ++ti)
            {
                tSet.insert(ti);
            }
        }
    }
    return tSet;
 }
 int main(int argc, char* argv[])
 {
    // MPI init
    int nMpi, rank;
    INIT();
    GET_RANK(rank, nMpi);
    if (rank != 0)
    {
        std::cout.setstate(std::ios::badbit);
    }
    // parse command line
    std::string   parFilename;
    if (argc != 2)
    {
        std::cerr << "usage: " << argv[0] << " <parameter file>";
        std::cerr << std::endl;
        return EXIT_FAILURE;
    }
    parFilename = argv[1];
    // parse parameter file
    ContractorPar par;
    unsigned int  nMat, nCont;
    XmlReader     reader(parFilename);
    read(reader, "global",    par.global);
    read(reader, "a2aMatrix", par.a2aMatrix);
    read(reader, "product",   par.product);
    nMat  = par.a2aMatrix.size();
    nCont = par.product.size();
    // create diskvectors
    std::map<std::string, EigenDiskVector<ComplexD>> a2aMat;
    unsigned int                                     cacheSize;
    for (auto &p: par.a2aMatrix)
    {
        std::string dirName = par.global.diskVectorDir + "/" + p.name + "." + std::to_string(rank);
        a2aMat.emplace(p.name, EigenDiskVector<ComplexD>(dirName, par.global.nt, p.cacheSize));
    }
    // trajectory loop
    std::vector<unsigned int> tList = par.global.trajCounter.getTrajectoryList();
    unsigned int              indi, inde, indPerRank;
    indPerRank = tList.size()/nMpi;
    indi       = rank*indPerRank;
    BARRIER();
    for (unsigned int tInd = indi; tInd < indi + indPerRank; tInd++)
    {
        unsigned int traj;
        if (tInd < tList.size())
        {
            traj = tList[tInd];
        }
        else
        {
            traj = tList.back();
        }
        if (nMpi > 1)
        {
            if (rank == 0)
            {
                std::cout << ":::::::: Trajectories ";
                for (unsigned int r = 0; r < nMpi - 1; ++r)
                {
                    std::cout << tList[tInd + r*indPerRank] << " ";
                }
                if (tInd + (nMpi - 1)*indPerRank < tList.size())
                {
                    std::cout << tList[tInd + (nMpi - 1)*indPerRank];
                }
                std::cout << std::endl;
            }
        }
        else
        {
            std::cout << ":::::::: Trajectory " << traj << std::endl;
        }
        // load data
        for (auto &p: par.a2aMatrix)
        {
            std::string filename = p.file;
            double      t;
            tokenReplace(filename, "traj", traj);
            std::cout << "======== Loading '" << p.file << "'" << std::endl;
            BARRIER();
            A2AMatrixIo<HADRONS_A2AM_IO_TYPE> a2aIo(filename, p.dataset, par.global.nt);
            a2aIo.load(a2aMat.at(p.name), &t);
            GLOBAL_DMAX(t);
            std::cout << "Read "  << nMpi*a2aIo.getSize() << " bytes in " << t/1.0e6 
                      << " sec, " << nMpi*a2aIo.getSize()/t*1.0e6/1024/1024 
                      << " MB/s"  << std::endl;
        }
        // contract
        EigenDiskVector<ComplexD>::Matrix buf;
        for (auto &p: par.product)
        {
            std::vector<std::string>               term = strToVec<std::string>(p.terms);
            std::vector<std::set<unsigned int>>    times;
            std::vector<std::vector<unsigned int>> timeSeq;
            std::set<unsigned int>                 translations;
            std::vector<A2AMatrixTr<ComplexD>>     lastTerm(par.global.nt);
            A2AMatrix<ComplexD>                    prod, buf, tmp;
            TimerArray                             tAr;
            double                                 fusec, busec, flops, bytes, tusec;
            Contractor::CorrelatorResult           result;             
            BARRIER();
            tAr.startTimer("Total");
            std::cout << "======== Contraction tr(";
            for (unsigned int g = 0; g < term.size(); ++g)
            {
                std::cout << term[g] << ((g == term.size() - 1) ? ')' : '*');
            }
            std::cout << std::endl;
            if (term.size() != p.times.size() + 1)
            {
                HADRONS_ERROR(Size, "number of terms (" + std::to_string(term.size()) 
                            + ") different from number of times (" 
                            + std::to_string(p.times.size() + 1) + ")");
            }
            for (auto &s: p.times)
            {
                times.push_back(parseTimeRange(s, par.global.nt));
            }
            for (auto &m: par.a2aMatrix)
            {
                if (std::find(result.a2aMatrix.begin(), result.a2aMatrix.end(), m) == result.a2aMatrix.end())
                {
                    result.a2aMatrix.push_back(m);
                    tokenReplace(result.a2aMatrix.back().file, "traj", traj);
                }
            }
            result.contraction = p;
            result.correlator.resize(par.global.nt, 0.);
            translations = parseTimeRange(p.translations, par.global.nt);
            makeTimeSeq(timeSeq, times);
            std::cout << timeSeq.size()*translations.size()*(term.size() - 2) << " A*B, "
                    << timeSeq.size()*translations.size()*par.global.nt << " tr(A*B)"
                    << std::endl;
            std::cout << "* Caching transposed last term" << std::endl;
            for (unsigned int t = 0; t < par.global.nt; ++t)
            {
                tAr.startTimer("Disk vector overhead");
                const A2AMatrix<ComplexD> &ref = a2aMat.at(term.back())[t];
                tAr.stopTimer("Disk vector overhead");
                tAr.startTimer("Transpose caching");
                lastTerm[t].resize(ref.rows(), ref.cols());
                parallel_for (unsigned int j = 0; j < ref.cols(); ++j)
                for (unsigned int i = 0; i < ref.rows(); ++i)
                {
                    lastTerm[t](i, j) = ref(i, j);
                }
                tAr.stopTimer("Transpose caching");
            }
            bytes  = par.global.nt*lastTerm[0].rows()*lastTerm[0].cols();
            bytes *= sizeof(ComplexD)*nMpi;
            printPerf(bytes, tAr.getDTimer("Transpose caching"));
            std::cout << std::endl;
            for (unsigned int i = 0; i < timeSeq.size(); ++i)
            {
                unsigned int dti = 0;
                auto         &t = timeSeq[i];
                result.times = t;
                for (unsigned int tLast = 0; tLast < par.global.nt; ++tLast)
                {
                    result.correlator[tLast] = 0.;
                }
                for (auto &dt: translations)
                {
                    std::cout << "* Step " << i*translations.size() + dti + 1
                            << "/" << timeSeq.size()*translations.size()
                            << " -- positions= " << t << ", dt= " << dt << std::endl;
                    if (term.size() > 2)
                    {
                        std::cout << std::setw(10) << "products ";
                    }
                    flops  = 0.;
                    bytes  = 0.;
                    fusec  = tAr.getDTimer("A*B algebra");
                    busec  = tAr.getDTimer("A*B total");
                    tAr.startTimer("Linear algebra");
                    tAr.startTimer("Disk vector overhead");
                    prod = a2aMat.at(term[0])[TIME_MOD(t[0] + dt)];
                    tAr.stopTimer("Disk vector overhead");
                    for (unsigned int j = 1; j < term.size() - 1; ++j)
                    {
                        tAr.startTimer("Disk vector overhead");
                        const A2AMatrix<ComplexD> &ref = a2aMat.at(term[j])[TIME_MOD(t[j] + dt)];
                        tAr.stopTimer("Disk vector overhead");
                        tAr.startTimer("A*B total");
                        tAr.startTimer("A*B algebra");
                        A2AContraction::mul(tmp, prod, ref);
                        tAr.stopTimer("A*B algebra");
                        flops += A2AContraction::mulFlops(prod, ref);
                        prod   = tmp;
                        tAr.stopTimer("A*B total");
                        bytes += 3.*tmp.rows()*tmp.cols()*sizeof(ComplexD);
                    }
                    if (term.size() > 2)
                    {
                        printPerf(bytes*nMpi, tAr.getDTimer("A*B total") - busec,
                                  flops*nMpi, tAr.getDTimer("A*B algebra") - fusec);
                        std::cout << std::endl;
                    }
                    std::cout << std::setw(10) << "traces ";
                    flops  = 0.;
                    bytes  = 0.;
                    fusec  = tAr.getDTimer("tr(A*B)");
                    busec  = tAr.getDTimer("tr(A*B)");
                    for (unsigned int tLast = 0; tLast < par.global.nt; ++tLast)
                    {
                        tAr.startTimer("tr(A*B)");
                        A2AContraction::accTrMul(result.correlator[TIME_MOD(tLast - dt)], prod, lastTerm[tLast]);
                        tAr.stopTimer("tr(A*B)");
                        flops += A2AContraction::accTrMulFlops(prod, lastTerm[tLast]);
                        bytes += 2.*prod.rows()*prod.cols()*sizeof(ComplexD);
                    }
                    tAr.stopTimer("Linear algebra");
                    printPerf(bytes*nMpi, tAr.getDTimer("tr(A*B)") - busec,
                              flops*nMpi, tAr.getDTimer("tr(A*B)") - fusec);
                    std::cout << std::endl;
                    if (!p.translationAverage)
                    {
                        saveCorrelator(result, par.global.output, dt, traj);
                        for (unsigned int tLast = 0; tLast < par.global.nt; ++tLast)
                        {
                            result.correlator[tLast] = 0.;
                        }
                    }
                    dti++;
                }
                if (p.translationAverage)
                {
                    for (unsigned int tLast = 0; tLast < par.global.nt; ++tLast)
                    {
                        result.correlator[tLast] /= translations.size();
                    }
                    saveCorrelator(result, par.global.output, 0, traj);
                }
            }
            tAr.stopTimer("Total");
            printTimeProfile(tAr.getTimings(), tAr.getTimer("Total"));
        }
    }
    FINALIZE();
    return EXIT_SUCCESS;
 }
@@ -0,0 +1,53 @@
 #ifndef  Hadrons_Contractor_hpp_
 #define Hadrons_Contractor_hpp_
 #include <Hadrons/Global.hpp>
 BEGIN_HADRONS_NAMESPACE
 // Parameter and result containers of the A2A contractor program. Every class
 // declares its fields through GRID_SERIALIZABLE_CLASS_MEMBERS, as
 // (type, name) pairs listed after the class name.
 namespace Contractor
 {   
    // Run-wide settings.
    // - trajCounter  : trajectory range (NOTE(review): presumably drives the
    //                  trajectory loop of the contractor -- confirm in Contractor.cc)
    // - nt           : time extent; bounds every time loop of the contractor and
    //                  sets the length of the output correlator
    // - diskVectorDir: NOTE(review): presumably the scratch directory of the disk
    //                  vectors holding the loaded matrices -- confirm
    // - output       : output location handed to saveCorrelator
    class GlobalPar: Serializable
    {
    public:
        GRID_SERIALIZABLE_CLASS_MEMBERS(GlobalPar,
                                        TrajRange, trajCounter,
                                        unsigned int, nt,
                                        std::string, diskVectorDir,
                                        std::string, output);
    };
    // Description of one A2A matrix set to load.
    // - file     : file name; its "traj" token is replaced by the trajectory
    //              number before loading
    // - dataset  : dataset name passed to A2AMatrixIo together with the file
    // - cacheSize: NOTE(review): presumably the disk vector cache size -- its use
    //              is not visible here, confirm
    // - name     : key under which the loaded set is referenced by the products
    class A2AMatrixPar: Serializable
    {
    public:
        GRID_SERIALIZABLE_CLASS_MEMBERS(A2AMatrixPar,
                                        std::string, file,
                                        std::string, dataset,
                                        unsigned int, cacheSize,
                                        std::string, name);
    };
    // Description of one contraction tr(A_0*A_1*...*A_n).
    // - terms             : list of matrix set names, split with strToVec
    // - times             : one time range string per term except the last one
    //                       (the contractor checks #terms == #times + 1)
    // - translations      : time range string giving the time translations dt
    // - translationAverage: when true the correlator is summed over all
    //                       translations, divided by their number and saved once
    //                       per time sequence; otherwise it is saved for each dt
    class ProductPar: Serializable
    {
    public:
        GRID_SERIALIZABLE_CLASS_MEMBERS(ProductPar,
                                        std::string, terms,
                                        std::vector<std::string>, times,
                                        std::string, translations,
                                        bool, translationAverage);
    };
    // Record written for each saved correlator.
    // - a2aMatrix  : matrix descriptions used, with the "traj" token of the file
    //                names already replaced
    // - contraction: the product description that was evaluated
    // - times      : time positions of the terms for this correlator
    // - correlator : correlator values, one per time slice
    class CorrelatorResult: Serializable
    {
    public:
        GRID_SERIALIZABLE_CLASS_MEMBERS(CorrelatorResult,
                                        std::vector<Contractor::A2AMatrixPar>,  a2aMatrix,
                                        ProductPar, contraction,
                                        std::vector<unsigned int>, times,
                                        std::vector<ComplexD>, correlator);
    };
 }
 END_HADRONS_NAMESPACE
 #endif // Hadrons_Contractor_hpp_
@@ -0,0 +1,434 @@
 #include <Hadrons/Global.hpp>
 #include <Hadrons/A2AMatrix.hpp>
 #ifdef USE_MKL
 #include "mkl.h"
 #include "mkl_cblas.h"
 #endif
 using namespace Grid;
 using namespace Hadrons;
 // Thin MPI abstraction: with GRID_COMMS_MPI3 the macros forward to MPI on
 // MPI_COMM_WORLD, otherwise they describe a single-process run.
 // GET_RANK(rank, nMpi) stores the process rank in `rank` and the number of
 // processes in `nMpi`; it is wrapped in do { } while (0) so that it behaves as
 // one statement (safe inside unbraced if/else).
 #ifdef GRID_COMMS_MPI3
 #define GET_RANK(rank, nMpi) \
 do\
 {\
     MPI_Comm_size(MPI_COMM_WORLD, &(nMpi));\
     MPI_Comm_rank(MPI_COMM_WORLD, &(rank));\
 } while (0)
 #define BARRIER() MPI_Barrier(MPI_COMM_WORLD)
 #define INIT() MPI_Init(NULL, NULL)
 #define FINALIZE() MPI_Finalize()
 #else
 #define GET_RANK(rank, nMpi) \
 do\
 {\
     (nMpi) = 1;\
     (rank) = 0;\
 } while (0)
 #define BARRIER()
 #define INIT()
 #define FINALIZE()
 #endif
 template <typename Function, typename MatLeft, typename MatRight>
 inline void trBenchmark(const std::string name, const MatLeft &left,
                        const MatRight &right, const ComplexD ref, Function fn)
 {
    double       t, flops, bytes, n = left[0].rows()*left[0].cols();
    unsigned int nMat = left.size();
    int          nMpi, rank;
    ComplexD     buf;
    t = 0.;
    GET_RANK(rank, nMpi);
    t = -usecond();
    BARRIER();
    for (unsigned int i = rank*nMat/nMpi; i < (rank+1)*nMat/nMpi; ++i)
    {
        fn(buf, left[i], right[i]);      
    }
    BARRIER();
    t += usecond();
    flops = nMat*(6.*n + 2.*(n - 1.));
    bytes = nMat*(2.*n*sizeof(ComplexD));
    if (rank == 0)
    {
        std::cout << std::setw(34) << name << ": diff= "
                  << std::setw(12) << std::norm(buf-ref)
                  << std::setw(10) << t/1.0e6 << " sec "
                  << std::setw(10) << flops/t/1.0e3 << " GFlop/s " 
                  << std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s "
                  << std::endl;
    }
    ::sleep(1);
 }
 // Times one A*B kernel.
 // `fn(res, left[i], right[i])` is called on this rank's share of the matrix
 // pairs; rank 0 then prints the squared norm of the difference between its
 // last result and `ref`, the elapsed time and the derived GFlop/s and GB/s
 // figures (computed from the total number of pairs over all ranks).
 template <typename Function, typename MatV, typename Mat>
 inline void mulBenchmark(const std::string name, const MatV &left,
                         const MatV &right, const Mat &ref, Function fn)
 {
    const double       nRow = left[0].rows(), nCol = left[0].cols();
    const double       nElem = nRow*nCol;
    const unsigned int nPairs = left.size();
    int                nProc, myRank;
    Mat                product(left[0].rows(), left[0].rows());
    double             elapsed, nFlops, nBytes;

    GET_RANK(myRank, nProc);
    // the clock is started before the first barrier and stopped after the last
    elapsed = -usecond();
    BARRIER();
    // contiguous slice of pairs owned by this rank
    unsigned int idx = myRank*nPairs/nProc;
    while (idx < (myRank+1)*nPairs/nProc)
    {
        fn(product, left[idx], right[idx]);
        ++idx;
    }
    BARRIER();
    elapsed += usecond();
    // per output element: nCol complex products (6 flops) and nCol - 1 complex sums (2 flops)
    nFlops = nPairs*(nRow*nRow*(6.*nCol + 2.*(nCol - 1.)));
    // per pair: the two input matrices are read once
    nBytes = nPairs*(2*nCol*nRow*sizeof(ComplexD));
    if (myRank == 0)
    {
        // elapsed is in microseconds
        std::cout << std::setw(34) << name << ": diff= "
                  << std::setw(12) << (product-ref).squaredNorm()
                  << std::setw(10) << elapsed/1.0e6 << " sec "
                  << std::setw(10) << nFlops/elapsed/1.0e3 << " GFlop/s " 
                  << std::setw(10) << nBytes/elapsed*1.0e6/1024/1024/1024 << " GB/s "
                  << std::endl;
    }
    ::sleep(1);
 }
 #ifdef USE_MKL
 // Unconjugated dot product of row `aRow` of `a` with column `aRow` of `b`,
 // i.e. res = sum_j a(aRow, j)*b(j, aRow), computed with cblas_zdotu_sub.
 // Start pointers and strides are derived from the storage order of each
 // matrix type. The order is read from Eigen's IsRowMajor flag and handled by
 // an exhaustive if/else, so pointers and strides are always initialised, even
 // when Options carries bits other than the storage order.
 template <typename MatLeft, typename MatRight>
 static inline void zdotuRow(ComplexD &res, const unsigned int aRow,
                            const MatLeft &a, const MatRight &b)
 {
    const ComplexD *aPt, *bPt;
    unsigned int   aInc, bInc;

    // row aRow of a
    if (MatLeft::IsRowMajor)
    {
        aPt  = a.data() + aRow*a.cols();
        aInc = 1;
    }
    else
    {
        aPt  = a.data() + aRow;
        aInc = a.rows();
    }
    // column aRow of b
    if (MatRight::IsRowMajor)
    {
        bPt  = b.data() + aRow;
        bInc = b.cols();
    }
    else
    {
        bPt  = b.data() + aRow*b.rows();
        bInc = 1;
    }
    cblas_zdotu_sub(a.cols(), aPt, aInc, bPt, bInc, &res);
 }
 // Unconjugated dot product of column `aCol` of `a` with row `aCol` of `b`,
 // i.e. res = sum_i a(i, aCol)*b(aCol, i), computed with cblas_zdotu_sub.
 // Start pointers and strides are derived from the storage order of each
 // matrix type. The order is read from Eigen's IsRowMajor flag and handled by
 // an exhaustive if/else, so pointers and strides are always initialised, even
 // when Options carries bits other than the storage order.
 template <typename MatLeft, typename MatRight>
 static inline void zdotuCol(ComplexD &res, const unsigned int aCol,
                            const MatLeft &a, const MatRight &b)
 {
    const ComplexD *aPt, *bPt;
    unsigned int   aInc, bInc;

    // column aCol of a
    if (MatLeft::IsRowMajor)
    {
        aPt  = a.data() + aCol;
        aInc = a.cols();
    }
    else
    {
        aPt  = a.data() + aCol*a.rows();
        aInc = 1;
    }
    // row aCol of b
    if (MatRight::IsRowMajor)
    {
        bPt  = b.data() + aCol*b.cols();
        bInc = 1;
    }
    else
    {
        bPt  = b.data() + aCol;
        bInc = b.rows();
    }
    cblas_zdotu_sub(a.rows(), aPt, aInc, bPt, bInc, &res);
 }
 #endif
 // Runs every tr(A*B) implementation on `nMat` pairs made of a random ni x nj
 // matrix A (type MatLeft) and a random nj x ni matrix B (type MatRight).
 // All pairs are copies of the same two random matrices, so the trace of the
 // last pair is a valid reference for any of them. Each implementation is
 // timed and checked through trBenchmark; the header is printed by rank 0 only.
 template <typename MatLeft, typename MatRight>
 void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigned int nMat)
 {
    std::vector<MatLeft>  left;
    std::vector<MatRight> right;
    ComplexD              ref;
    int                   rank, nMpi;

    left.resize(nMat, MatLeft::Random(ni, nj));
    right.resize(nMat, MatRight::Random(nj, ni));
    GET_RANK(rank, nMpi);
    if (rank == 0)
    {
        std::cout << "==== tr(A*B) benchmarks" << std::endl;
        std::cout << "A matrices use ";
        if (MatLeft::Options == Eigen::RowMajor)
        {
            std::cout << "row-major ordering" << std::endl;
        }
        else if (MatLeft::Options == Eigen::ColMajor)
        {
            std::cout << "col-major ordering" << std::endl;
        }
        std::cout << "B matrices use ";
        if (MatRight::Options == Eigen::RowMajor)
        {
            std::cout << "row-major ordering" << std::endl;
        }
        else if (MatRight::Options == Eigen::ColMajor)
        {
            std::cout << "col-major ordering" << std::endl;
        }
        std::cout << std::endl;
    }
    BARRIER();
    // reference value computed with a plain Eigen product
    ref = (left.back()*right.back()).trace();
    trBenchmark("Hadrons A2AContraction::accTrMul", left, right, ref,
    [](ComplexD &res, const MatLeft &a, const MatRight &b)
    { 
        res = 0.;
        A2AContraction::accTrMul(res, a, b);
    });
    // threads over the rows of A, partial sums merged in a critical section
    trBenchmark("Naive loop rows first", left, right, ref,
    [](ComplexD &res, const MatLeft &a, const MatRight &b)
    { 
        auto nr = a.rows(), nc = a.cols();
        res = 0.;
        parallel_for (unsigned int i = 0; i < nr; ++i)
        {
            ComplexD tmp = 0.;
            for (unsigned int j = 0; j < nc; ++j)
            {
                tmp += a(i, j)*b(j, i);
            }
            parallel_critical
            {
                res += tmp;
            }
        }
    });
    // threads over the columns of A, partial sums merged in a critical section
    trBenchmark("Naive loop cols first", left, right, ref,
    [](ComplexD &res, const MatLeft &a, const MatRight &b)
    {
        auto nr = a.rows(), nc = a.cols();
        res = 0.;
        parallel_for (unsigned int j = 0; j < nc; ++j)
        {
            ComplexD tmp = 0.;
            for (unsigned int i = 0; i < nr; ++i)
            {
                tmp += a(i, j)*b(j, i);
            }        
            parallel_critical
            {
                res += tmp;
            }
        }
    });
    trBenchmark("Eigen tr(A*B)", left, right, ref,
    [](ComplexD &res, const MatLeft &a, const MatRight &b)
    { 
        res = (a*b).trace();
    });
    // Eigen's dot() conjugates its first operand, hence the explicit
    // conjugate() to obtain the unconjugated product
    trBenchmark("Eigen row-wise dot", left, right, ref,
    [](ComplexD &res, const MatLeft &a, const MatRight &b)
    {
        res = 0.;
        parallel_for (unsigned int r = 0; r < a.rows(); ++r)
        {
            ComplexD tmp;
            tmp = a.row(r).conjugate().dot(b.col(r));
            parallel_critical
            {
                res += tmp;
            }
        }
    });
    trBenchmark("Eigen col-wise dot", left, right, ref,
    [](ComplexD &res, const MatLeft &a, const MatRight &b)
    {
        res = 0.;
        parallel_for (unsigned int c = 0; c < a.cols(); ++c)
        {
            ComplexD tmp;
            tmp = a.col(c).conjugate().dot(b.row(c));
            parallel_critical
            {
                res += tmp;
            }
        }
    });
    // tr(A*B) = sum_ij A(i,j)*B(j,i) = sum of the entries of A o B^T
    trBenchmark("Eigen Hadamard", left, right, ref,
    [](ComplexD &res, const MatLeft &a, const MatRight &b)
    { 
        res = a.cwiseProduct(b.transpose()).sum();
    });
 #ifdef USE_MKL
    trBenchmark("MKL row-wise zdotu", left, right, ref,
    [](ComplexD &res, const MatLeft &a, const MatRight &b)
    {
        res = 0.;
        parallel_for (unsigned int r = 0; r < a.rows(); ++r)
        {
            ComplexD tmp;
            zdotuRow(tmp, r, a, b);
            parallel_critical
            {
                res += tmp;
            }
        }
    });
    trBenchmark("MKL col-wise zdotu", left, right, ref,
    [](ComplexD &res, const MatLeft &a, const MatRight &b)
    {
        res = 0.;
        parallel_for (unsigned int c = 0; c < a.cols(); ++c)
        {
            ComplexD tmp;
            zdotuCol(tmp, c, a, b);
            parallel_critical
            {
                res += tmp;
            }
        }
    });
 #endif
    BARRIER();
    if (rank == 0)
    {
        std::cout << std::endl;
    }
 }
 // Runs every A*B implementation on `nMat` pairs made of a random ni x nj
 // matrix and a random nj x ni matrix, all of type Mat. Every pair is a copy
 // of the same two random matrices; the Eigen product of the last pair is the
 // reference passed to mulBenchmark. The header is printed by rank 0 only.
 template <typename Mat>
 void fullMulBenchmark(const unsigned int ni, const unsigned int nj, const unsigned int nMat)
 {
    std::vector<Mat> lhs, rhs;
    Mat              expected;
    int              myRank, nProc;

    lhs.resize(nMat, Mat::Random(ni, nj));
    rhs.resize(nMat, Mat::Random(nj, ni));
    GET_RANK(myRank, nProc);
    if (myRank == 0)
    {
        const char *ordering = nullptr;

        std::cout << "==== A*B benchmarks" << std::endl;
        std::cout << "all matrices use ";
        if (Mat::Options == Eigen::RowMajor)
        {
            ordering = "row-major ordering";
        }
        else if (Mat::Options == Eigen::ColMajor)
        {
            ordering = "col-major ordering";
        }
        // nothing is printed when the storage order is not recognised
        if (ordering != nullptr)
        {
            std::cout << ordering << std::endl;
        }
        std::cout << std::endl;
    }
    BARRIER();
    expected = lhs.back()*rhs.back();
    mulBenchmark("Hadrons A2AContraction::mul", lhs, rhs, expected,
    [](Mat &out, const Mat &x, const Mat &y)
    { 
        A2AContraction::mul(out, x, y);
    });
    mulBenchmark("Eigen A*B", lhs, rhs, expected,
    [](Mat &out, const Mat &x, const Mat &y)
    { 
        out = x*y;
    });
 #ifdef USE_MKL
    // direct zgemm call, leading dimensions chosen from the storage order
    mulBenchmark("MKL A*B", lhs, rhs, expected,
    [](Mat &out, const Mat &x, const Mat &y)
    {
        const ComplexD one(1., 0.), zero(0., 0.);

        if (Mat::Options == Eigen::RowMajor)
        {
            cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                        x.rows(), y.cols(), x.cols(),
                        &one, x.data(), x.cols(),
                        y.data(), y.cols(),
                        &zero, out.data(), out.cols());
        }
        else if (Mat::Options == Eigen::ColMajor)
        {
            cblas_zgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
                        x.rows(), y.cols(), x.cols(),
                        &one, x.data(), x.rows(),
                        y.data(), y.rows(),
                        &zero, out.data(), out.rows());
        }
    });
 #endif
    BARRIER();
    if (myRank == 0)
    {
        std::cout << std::endl;
    }
 }
 // Entry point of the A2A contraction benchmark.
 // Usage: <program> <Ni> <Nj> <#matrices>
 // Runs the tr(A*B) benchmarks for the four row/column-major combinations of
 // A and B, then the A*B benchmarks for both storage orders.
 // Returns EXIT_FAILURE on a wrong argument count or non-positive sizes.
 int main(int argc, char *argv[])
 {
    // parse command line
    Eigen::Index ni, nj, nMat;
    int          nMpi, rank;

    if (argc != 4)
    {
        std::cerr << "usage: " << argv[0] << " <Ni> <Nj> <#matrices>";
        std::cerr << std::endl;
        return EXIT_FAILURE;
    }
    ni   = std::stoi(argv[1]);
    nj   = std::stoi(argv[2]);
    nMat = std::stoi(argv[3]);
    // the benchmarks access the first and last matrix pair and take the sizes
    // as unsigned values: reject empty or negative problems before starting
    if ((ni <= 0) || (nj <= 0) || (nMat <= 0))
    {
        std::cerr << "error: <Ni>, <Nj> and <#matrices> must be strictly positive";
        std::cerr << std::endl;
        return EXIT_FAILURE;
    }
    INIT();
    GET_RANK(rank, nMpi);
    // run summary, printed by rank 0 only
    if (rank == 0)
    {
        std::cout << "\n*** ALL-TO-ALL MATRIX CONTRACTION BENCHMARK ***\n" << std::endl;
        std::cout << nMat << " couples of " << ni << "x" << nj << " matrices\n" << std::endl;
        std::cout << nMpi << " MPI processes" << std::endl;
 #ifdef GRID_OMP
        #pragma omp parallel
        {
            #pragma omp single
            std::cout << omp_get_num_threads() << " threads\n" << std::endl; 
        }
 #else
        std::cout << "Single-threaded\n" << std::endl; 
 #endif
 #ifdef EIGEN_USE_MKL_ALL
        std::cout << "Eigen uses the MKL" << std::endl;
 #endif
        std::cout << "Eigen uses " << Eigen::nbThreads() << " threads" << std::endl;
 #ifdef USE_MKL
        std::cout << "MKL   uses " << mkl_get_max_threads() << " threads" << std::endl;
 #endif
        std::cout << std::endl;
    }
    fullTrBenchmark<A2AMatrix<ComplexD>, A2AMatrix<ComplexD>>(ni, nj, nMat);
    fullTrBenchmark<A2AMatrix<ComplexD>, A2AMatrixTr<ComplexD>>(ni, nj, nMat);
    fullTrBenchmark<A2AMatrixTr<ComplexD>, A2AMatrix<ComplexD>>(ni, nj, nMat);
    fullTrBenchmark<A2AMatrixTr<ComplexD>, A2AMatrixTr<ComplexD>>(ni, nj, nMat);
    fullMulBenchmark<A2AMatrix<ComplexD>>(ni, nj, nMat);
    fullMulBenchmark<A2AMatrixTr<ComplexD>>(ni, nj, nMat);
    FINALIZE();
    return EXIT_SUCCESS;
 }
@@ -1,4 +1,4 @@
-bin_PROGRAMS = HadronsXmlRun HadronsFermionEP64To32
+bin_PROGRAMS = HadronsXmlRun HadronsFermionEP64To32 HadronsContractor HadronsContractorBenchmark
 HadronsXmlRun_SOURCES = HadronsXmlRun.cc
 HadronsXmlRun_LDADD   = ../libHadrons.a ../../Grid/libGrid.a
@@ -6,3 +6,9 @@ HadronsXmlRun_LDADD   = ../libHadrons.a ../../Grid/libGrid.a
 HadronsFermionEP64To32_SOURCES  = EigenPackCast.cc
 HadronsFermionEP64To32_CXXFLAGS = $(AM_CXXFLAGS) -DFIN=WilsonImplD::FermionField -DFOUT=WilsonImplF::FermionField
 HadronsFermionEP64To32_LDADD    = ../libHadrons.a ../../Grid/libGrid.a
 HadronsContractor_SOURCES = Contractor.cc Contractor.hpp
 HadronsContractor_LDADD   = ../libHadrons.a ../../Grid/libGrid.a
 HadronsContractorBenchmark_SOURCES = ContractorBenchmark.cc
 HadronsContractorBenchmark_LDADD   = ../libHadrons.a ../../Grid/libGrid.a
@@ -20,11 +20,13 @@ modules_cc =\
  Modules/MSink/Point.cc \
  Modules/MSink/Smear.cc \
  Modules/MSolver/A2AVectors.cc \
  Modules/MSolver/A2AAslashVectors.cc \
  Modules/MSolver/RBPrecCG.cc \
  Modules/MSolver/MixedPrecisionRBPrecCG.cc \
  Modules/MSolver/LocalCoherenceLanczos.cc \
  Modules/MGauge/StoutSmearing.cc \
  Modules/MGauge/Unit.cc \
  Modules/MGauge/Electrify.cc \
  Modules/MGauge/UnitEm.cc \
  Modules/MGauge/StochEm.cc \
  Modules/MGauge/Random.cc \
@@ -95,9 +97,11 @@ modules_hpp =\
  Modules/MSolver/Guesser.hpp \
  Modules/MSolver/RBPrecCG.hpp \
  Modules/MSolver/A2AVectors.hpp \
  Modules/MSolver/A2AAslashVectors.hpp \
  Modules/MGauge/UnitEm.hpp \
  Modules/MGauge/StoutSmearing.hpp \
  Modules/MGauge/Unit.hpp \
  Modules/MGauge/Electrify.hpp \
  Modules/MGauge/Random.hpp \
  Modules/MGauge/GaugeFix.hpp \
  Modules/MGauge/FundtoHirep.hpp \
@@ -123,10 +123,13 @@ case ${ac_SFW_FP16} in
      AC_MSG_ERROR(["SFW FP16 option not supported ${ac_SFW_FP16}"]);;
 esac
-############### MKL
+############### Intel libraries
 AC_ARG_ENABLE([mkl],
    [AC_HELP_STRING([--enable-mkl=yes|no|prefix], [enable Intel MKL for LAPACK & FFTW])],
    [ac_MKL=${enable_mkl}], [ac_MKL=no])
 dnl --enable-ipp sets enable_ipp; ac_IPP must be read from it, not from enable_mkl
 AC_ARG_ENABLE([ipp],
    [AC_HELP_STRING([--enable-ipp=yes|no|prefix], [enable Intel IPP for fast CRC32C])],
    [ac_IPP=${enable_ipp}], [ac_IPP=no])
 case ${ac_MKL} in
    no)
@@ -139,6 +142,17 @@ case ${ac_MKL} in
        AC_DEFINE([USE_MKL], [1], [Define to 1 if you use the Intel MKL]);;
 esac
 case ${ac_IPP} in
    no)
        ;;
    yes)
        AC_DEFINE([USE_IPP], [1], [Define to 1 if you use the Intel IPP]);;
    *)
        AM_CXXFLAGS="-I$ac_IPP/include $AM_CXXFLAGS"
        AM_LDFLAGS="-L$ac_IPP/lib $AM_LDFLAGS"
        AC_DEFINE([USE_IPP], [1], [Define to 1 if you use the Intel IPP]);;
 esac
 ############### HDF5
 AC_ARG_WITH([hdf5],
    [AS_HELP_STRING([--with-hdf5=prefix],
@@ -170,7 +184,13 @@ AC_CHECK_FUNCS([gettimeofday])
 if test "${ac_MKL}x" != "nox"; then
    AC_SEARCH_LIBS([mkl_set_interface_layer], [mkl_rt], [],
-                   [AC_MSG_ERROR("MKL enabled but library not found")])
+                   [AC_MSG_ERROR("Intel MKL enabled but library not found")])
 fi
 if test "${ac_IPP}x" != "nox"; then
     AC_SEARCH_LIBS([ippsCRC32C_8u], [ippdc], 
                   [LIBS="${LIBS} -lippdc -lippvm -lipps -lippcore"],
                   [AC_MSG_ERROR("Intel IPP enabled but library not found")])
 fi
 AC_SEARCH_LIBS([__gmpf_init], [gmp],
@@ -485,6 +505,7 @@ DX_INIT_DOXYGEN([$PACKAGE_NAME], [doxygen.cfg])
 ############### Output
 cwd=`pwd -P`; cd ${srcdir}; abs_srcdir=`pwd -P`; cd ${cwd}
 GRID_CXX="$CXX"
 GRID_CXXFLAGS="$AM_CXXFLAGS $CXXFLAGS"
 GRID_LDFLAGS="$AM_LDFLAGS $LDFLAGS"
 GRID_LIBS=$LIBS
@@ -497,6 +518,7 @@ AM_LDFLAGS="-L${cwd}/Grid $AM_LDFLAGS"
 AC_SUBST([AM_CFLAGS])
 AC_SUBST([AM_CXXFLAGS])
 AC_SUBST([AM_LDFLAGS])
 AC_SUBST([GRID_CXX])
 AC_SUBST([GRID_CXXFLAGS])
 AC_SUBST([GRID_LDFLAGS])
 AC_SUBST([GRID_LIBS])
@@ -61,6 +61,10 @@ while test $# -gt 0; do
      echo @GRID_CXXFLAGS@
    ;;
    --cxx)
      echo @GRID_CXX@
    ;;
    --ldflags)
      echo @GRID_LDFLAGS@
    ;;
@@ -0,0 +1,138 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: tests/core/Test_qed.cc
 Copyright (C) 2015-2018
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: James Harrison <J.Harrison@soton.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 #include <Grid/Grid.h>
 using namespace Grid;
 using namespace QCD;
 typedef PeriodicGaugeImpl<QedGImplR>  QedPeriodicGImplR;
 typedef PhotonR::GaugeField           EmField;
 typedef PhotonR::GaugeLinkField       EmComp;
 const int NCONFIGS = 20;
 const int NWILSON  = 10;
 // QED test: generates NCONFIGS stochastic photon fields and, for each one,
 // prints zero-mode and divergence norms and accumulates -2*log(W) for square
 // Wilson loops of size 1..NWILSON (full, timelike and spatial averages).
 // The configuration averages are printed at the end, for comparison with the
 // reference values quoted below.
 int main(int argc, char *argv[])
 {
  // initialization
  Grid_init(&argc, &argv);
  std::cout << GridLogMessage << "Grid initialized" << std::endl;
  // QED stuff
  std::vector<int> latt_size   = GridDefaultLatt();
  std::vector<int> simd_layout = GridDefaultSimd(4, vComplex::Nsimd());
  std::vector<int> mpi_layout  = GridDefaultMpi();
  GridCartesian    grid(latt_size,simd_layout,mpi_layout);
  GridParallelRNG  pRNG(&grid);
  PhotonR          photon(&grid, PhotonR::Gauge::coulomb, PhotonR::ZmScheme::qedL);
  EmField          a(&grid);
  EmField          expA(&grid);
  Complex imag_unit(0, 1);
  Real wlA;
  // accumulators of -2*log(W), entry iw-1 holds the iw x iw loop
  std::vector<Real> logWlAvg(NWILSON, 0.0), logWlTime(NWILSON, 0.0), logWlSpace(NWILSON, 0.0);
  pRNG.SeedFixedIntegers({1, 2, 3, 4});
  std::cout << GridLogMessage << "Wilson loop calculation beginning" << std::endl;
  for(int ic = 0; ic < NCONFIGS; ic++){
      std::cout << GridLogMessage << "Configuration " << ic <<std::endl;
      photon.StochasticField(a, pRNG);
      // Exponentiate photon field
      expA = exp(imag_unit*a);
      // Calculate zero-modes
      std::vector<EmField::vector_object::scalar_object> zm;
      std::cout << GridLogMessage << "Total zero-mode norm 2 " 
                << std::sqrt(norm2(sum(a))) << std::endl;
      std::cout << GridLogMessage << "Spatial zero-mode norm 2" << std::endl;
      sliceSum(a, zm, grid.Nd() - 1);
      for (unsigned int t = 0; t < latt_size.back(); ++t)
      {
        std::cout << GridLogMessage << "t = " << t << " " << std::sqrt(norm2(zm[t])) << std::endl;
      }
      // Calculate divergence (backward finite difference summed over directions)
      EmComp diva(&grid), amu(&grid);
      diva = zero;
      for (unsigned int mu = 0; mu < grid.Nd(); ++mu)
      {
        amu   = peekLorentz(a, mu);
        diva += amu - Cshift(amu, mu, -1);
        // all directions but the last one have been accumulated at this point
        if (mu == grid.Nd() - 2)
        {
          std::cout << GridLogMessage << "Spatial divergence norm 2 " << std::sqrt(norm2(diva)) << std::endl;
        }
      }
      std::cout << GridLogMessage << "Total divergence norm 2 " << std::sqrt(norm2(diva)) << std::endl;
      // Calculate Wilson loops
      for(int iw=1; iw<=NWILSON; iw++){
          wlA = WilsonLoops<QedPeriodicGImplR>::avgWilsonLoop(expA, iw, iw) * 3;
          logWlAvg[iw-1] -= 2*log(wlA);
          wlA = WilsonLoops<QedPeriodicGImplR>::avgTimelikeWilsonLoop(expA, iw, iw) * 3;
          logWlTime[iw-1] -= 2*log(wlA);
          wlA = WilsonLoops<QedPeriodicGImplR>::avgSpatialWilsonLoop(expA, iw, iw) * 3;
          logWlSpace[iw-1] -= 2*log(wlA);
      }
  }
  std::cout << GridLogMessage << "Wilson loop calculation completed" << std::endl;
  // Print configuration-averaged Wilson loops
  // Reference values from A. Portelli's PhD thesis:
  // size  -2*log(W)
  // 1     0.500000000(1)
  // 2     1.369311535(1) 
  // 3     2.305193057(1) 
  // 4     3.261483854(1) 
  // 5     4.228829967(1) 
  // 6     5.203604529(1) 
  // 7     6.183728249(1) 
  // 8     7.167859805(1) 
  // 9     8.155091868(1) 
  // 10    9.144788116(1)
  // loop bound tied to NWILSON so that it always matches the accumulator size
  for(int iw=1; iw<=NWILSON; iw++){
      std::cout << GridLogMessage << iw << 'x' << iw << " Wilson loop" << std::endl;
      std::cout << GridLogMessage << "-2*log(W) average: " << logWlAvg[iw-1]/NCONFIGS << std::endl;
      std::cout << GridLogMessage << "-2*log(W) timelike: " << logWlTime[iw-1]/NCONFIGS << std::endl;
      std::cout << GridLogMessage << "-2*log(W) spatial: " << logWlSpace[iw-1]/NCONFIGS << std::endl;
  }
  // epilogue
  std::cout << GridLogMessage << "Grid is finalizing now" << std::endl;
  Grid_finalize();
  return EXIT_SUCCESS;
 }
@@ -0,0 +1,104 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./tests/Test_dwf_mrhs_cg.cc
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/Grid.h>
 using namespace std;
 using namespace Grid;
 using namespace Grid::QCD;
 int main (int argc, char ** argv)
 {
  typedef LatticeComplex ComplexField; 
  Grid_init(&argc,&argv);
  std::vector<int> latt_size   = GridDefaultLatt();
  int nd   = latt_size.size();
  int ndm1 = nd-1;
  std::vector<int> simd_layout = GridDefaultSimd(nd,vComplex::Nsimd());
  std::vector<int> mpi_layout  = GridDefaultMpi();
  std::vector<int> mpi_split (mpi_layout.size(),1);
  std::cout << " Full " << GridCmdVectorIntToString(latt_size)  << " subgrid"         <<std::endl;
  std::cout << " Full " << GridCmdVectorIntToString(mpi_layout) << " sub communicator"<<std::endl;
  std::cout << " Full " << GridCmdVectorIntToString(simd_layout)<< " simd layout "    <<std::endl;
  GridCartesian         * GridN = new GridCartesian(latt_size,
 						    simd_layout,
 						    mpi_layout);
  std::vector<int> latt_m  = latt_size;   latt_m[nd-1] = 1;
  std::vector<int> mpi_m   = mpi_layout;  mpi_m [nd-1] = 1;
  std::vector<int> simd_m  = GridDefaultSimd(ndm1,vComplex::Nsimd()); simd_m.push_back(1);
  std::cout << " Requesting " << GridCmdVectorIntToString(latt_m)<< " subgrid"         <<std::endl;
  std::cout << " Requesting " << GridCmdVectorIntToString(mpi_m) << " sub communicator"<<std::endl;
  std::cout << " Requesting " << GridCmdVectorIntToString(simd_m)<< " simd layout "    <<std::endl;
  GridCartesian         * Grid_m = new GridCartesian(latt_m,
 						     simd_m,
 						     mpi_m,
 						     *GridN); 
  Complex C(1.0);
  Complex tmp;
  ComplexField Full(GridN); Full = C;
  ComplexField Full_cpy(GridN);
  ComplexField Split(Grid_m);Split= C;
  std::cout << GridLogMessage<< " Full  volume "<< norm2(Full) <<std::endl;
  std::cout << GridLogMessage<< " Split volume "<< norm2(Split) <<std::endl;
  tmp=C;
  GridN->GlobalSum(tmp);
  std::cout << GridLogMessage<< " Full  nodes "<< tmp <<std::endl;
  tmp=C;
  Grid_m->GlobalSum(tmp);
  std::cout << GridLogMessage<< " Split nodes "<< tmp <<std::endl;
  GridN->Barrier();
  auto local_latt = GridN->LocalDimensions();
  Full_cpy = zero;
  std::vector<int> seeds({1,2,3,4});
  GridParallelRNG          RNG(GridN);  RNG.SeedFixedIntegers(seeds);
  random(RNG,Full);
  for(int t=0;t<local_latt[nd-1];t++){
    ExtractSliceLocal(Split,Full,0,t,Tp);
    InsertSliceLocal (Split,Full_cpy,0,t,Tp);
  }
  Full_cpy = Full_cpy - Full;
  std::cout << " NormFull " << norm2(Full)<<std::endl;
  std::cout << " NormDiff " << norm2(Full_cpy)<<std::endl;
  Grid_finalize();
 }
@@ -72,6 +72,7 @@ int main(int argc, char *argv[])
    // set fermion boundary conditions to be periodic space, antiperiodic time.
    std::string boundary = "1 1 1 -1";
    std::string twist    = "0. 0. 0. 0.";
    //stochastic photon field
    MGauge::StochEm::Par photonPar;
@@ -90,6 +91,7 @@ int main(int argc, char *argv[])
        actionPar.M5    = 1.8;
        actionPar.mass  = mass[i];
        actionPar.boundary = boundary;
        actionPar.twist = "0. 0. 0. 0.";
        application.createModule<MAction::DWF>("DWF_" + flavour[i], actionPar);
@@ -126,6 +126,7 @@ inline void makeWilsonAction(Application &application, std::string actionName,
        actionPar.gauge = gaugeField;
        actionPar.mass  = mass;
        actionPar.boundary = boundary;
        actionPar.twist = "0. 0. 0. 0.";
        application.createModule<MAction::Wilson>(actionName, actionPar);
    }
 }
@@ -154,6 +155,7 @@ inline void makeDWFAction(Application &application, std::string actionName,
        actionPar.M5    = M5;
        actionPar.mass  = mass;
        actionPar.boundary = boundary;
        actionPar.twist = "0. 0. 0. 0.";
        application.createModule<MAction::DWF>(actionName, actionPar);
    }
 }
@@ -66,6 +66,7 @@ int main(int argc, char *argv[])
    // set fermion boundary conditions to be periodic space, antiperiodic time.
    std::string boundary = "1 1 1 -1";
    std::string twist = "0. 0. 0. 0.";
    // sink
    MSink::Point::Par sinkPar;
@@ -80,6 +81,7 @@ int main(int argc, char *argv[])
        actionPar.M5    = 1.8;
        actionPar.mass  = mass[i];
        actionPar.boundary = boundary;
        actionPar.twist = twist;
        application.createModule<MAction::DWF>("DWF_" + flavour[i], actionPar);
        // solvers
@@ -72,6 +72,7 @@ int main(int argc, char *argv[])
    // set fermion boundary conditions to be periodic space, antiperiodic time.
    std::string boundary = "1 1 1 -1";
    std::string twist = "0. 0. 0. 0.";
    for (unsigned int i = 0; i < flavour.size(); ++i)
    {
@@ -82,6 +83,7 @@ int main(int argc, char *argv[])
        actionPar.M5    = 1.8;
        actionPar.mass  = mass[i];
        actionPar.boundary = boundary;
        actionPar.twist = twist;
        application.createModule<MAction::DWF>("DWF_" + flavour[i], actionPar);
        // solvers
@@ -0,0 +1,670 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./tests/solver/Test_multigrid_common.h
    Copyright (C) 2015-2018
    Author: Daniel Richtmann <daniel.richtmann@ur.de>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_TEST_MULTIGRID_COMMON_H
 #define GRID_TEST_MULTIGRID_COMMON_H
 namespace Grid {
 // TODO: Can think about having one parameter struct per level and then a
 // vector of these structs. How well would that work together with the
 // serialization strategy of Grid?
 // clang-format off
 struct MultiGridParams : Serializable {
 public:
  // Serializable parameter set steering the multigrid preconditioner.
  // Every std::vector member below carries one entry per coarsening step,
  // i.e. nLevels - 1 entries (this is what checkParameterValidity asserts).
  GRID_SERIALIZABLE_CLASS_MEMBERS(MultiGridParams,
                                  int,                           nLevels,
                                  std::vector<std::vector<int>>, blockSizes,           // per step: blocking factor per dimension
                                  std::vector<double>,           smootherTol,          // per step
                                  std::vector<int>,              smootherMaxOuterIter, // per step
                                  std::vector<int>,              smootherMaxInnerIter, // per step
                                  bool,                          kCycle,
                                  std::vector<double>,           kCycleTol,            // per step
                                  std::vector<int>,              kCycleMaxOuterIter,   // per step
                                  std::vector<int>,              kCycleMaxInnerIter,   // per step
                                  double,                        coarseSolverTol,
                                  int,                           coarseSolverMaxOuterIter,
                                  int,                           coarseSolverMaxInnerIter);

  // Constructor; every argument is optional and the defaults describe a
  // two-level K-cycle setup with 4^4 blocking.
  MultiGridParams(int                           _nLevels                  = 2,
                  std::vector<std::vector<int>> _blockSizes               = {{4, 4, 4, 4}},
                  std::vector<double>           _smootherTol              = {1e-14},
                  std::vector<int>              _smootherMaxOuterIter     = {4},
                  std::vector<int>              _smootherMaxInnerIter     = {4},
                  bool                          _kCycle                   = true,
                  std::vector<double>           _kCycleTol                = {1e-1},
                  std::vector<int>              _kCycleMaxOuterIter       = {2},
                  std::vector<int>              _kCycleMaxInnerIter       = {5},
                  double                        _coarseSolverTol          = 5e-2,
                  int                           _coarseSolverMaxOuterIter = 10,
                  int                           _coarseSolverMaxInnerIter = 500)
    : nLevels(_nLevels),
      blockSizes(_blockSizes),
      smootherTol(_smootherTol),
      smootherMaxOuterIter(_smootherMaxOuterIter),
      smootherMaxInnerIter(_smootherMaxInnerIter),
      kCycle(_kCycle),
      kCycleTol(_kCycleTol),
      kCycleMaxOuterIter(_kCycleMaxOuterIter),
      kCycleMaxInnerIter(_kCycleMaxInnerIter),
      coarseSolverTol(_coarseSolverTol),
      coarseSolverMaxOuterIter(_coarseSolverMaxOuterIter),
      coarseSolverMaxInnerIter(_coarseSolverMaxInnerIter) {
  }
 };
 // clang-format on
 // Sanity-check a MultiGridParams object: every per-level vector must hold
 // exactly nLevels - 1 entries (one per coarsening step).
 //
 // Violations are reported through assert(), so the checks vanish when the
 // translation unit is compiled with NDEBUG.
 //
 // Declared inline because this definition lives in a header; without it,
 // including the header from two translation units of the same program would
 // produce a multiple-definition error at link time.
 inline void checkParameterValidity(MultiGridParams const &params) {
  // Guard the signed -> unsigned conversion below.
  assert(params.nLevels >= 1);
  auto const correctSize = static_cast<std::size_t>(params.nLevels - 1);
  assert(correctSize == params.blockSizes.size());
  assert(correctSize == params.smootherTol.size());
  assert(correctSize == params.smootherMaxOuterIter.size());
  assert(correctSize == params.smootherMaxInnerIter.size());
  assert(correctSize == params.kCycleTol.size());
  assert(correctSize == params.kCycleMaxOuterIter.size());
  assert(correctSize == params.kCycleMaxInnerIter.size());
 }
 // Per-level bookkeeping for the multigrid hierarchy: for every level it holds
 // the grid, the RNG seeds and a parallel RNG seeded with them.
 // Index 0 is the finest level; higher indices are successively coarser.
 struct LevelInfo {
 public:
  std::vector<std::vector<int>> Seeds; // Seeds[level]: fixed integer seeds of PRNGs[level]
  std::vector<GridCartesian *>  Grids; // Grids[0] is the caller's fine grid (not owned)
  std::vector<GridParallelRNG>  PRNGs; // PRNGs[level] lives on Grids[level]

  // Builds mgParams.nLevels levels starting from FineGrid. Each coarser grid
  // is obtained by dividing the full dimensions of the previous level by
  // mgParams.blockSizes[level - 1], reusing that level's SIMD layout and the
  // default MPI layout.
  //
  // NOTE(review): the coarse grids returned by makeFourDimGrid are stored as
  // raw pointers and nothing visible here releases them - presumably they
  // are meant to live for the whole program; confirm.
  LevelInfo(GridCartesian *FineGrid, MultiGridParams const &mgParams) {
    assert(FineGrid != nullptr);
    assert(mgParams.nLevels >= 1); // guards the signed -> unsigned conversion below
    auto nCoarseLevels = mgParams.blockSizes.size();
    assert(nCoarseLevels == static_cast<std::size_t>(mgParams.nLevels - 1));
    // set up values for finest grid
    Grids.push_back(FineGrid);
    Seeds.push_back({1, 2, 3, 4});
    PRNGs.push_back(GridParallelRNG(Grids.back()));
    PRNGs.back().SeedFixedIntegers(Seeds.back());
    // set up values for coarser grids
    for(int level = 1; level < mgParams.nLevels; ++level) {
      auto const &blocking = mgParams.blockSizes[level - 1];
      int         nDim     = Grids[level - 1]->_ndimension;
      auto        tmp      = Grids[level - 1]->_fdimensions;
      assert(static_cast<int>(tmp.size()) == nDim);
      // The loop below reads blocking[0 .. nDim-1]; a shorter vector would be
      // an out-of-bounds read.
      assert(static_cast<int>(blocking.size()) >= nDim);
      Seeds.push_back(std::vector<int>(nDim));
      for(int d = 0; d < nDim; ++d) {
        assert(blocking[d] > 0); // used as a divisor
        tmp[d] /= blocking[d];
        Seeds[level][d] = (level)*nDim + d + 1;
      }
      Grids.push_back(QCD::SpaceTimeGrid::makeFourDimGrid(tmp, Grids[level - 1]->_simd_layout, GridDefaultMpi()));
      PRNGs.push_back(GridParallelRNG(Grids[level]));
      PRNGs[level].SeedFixedIntegers(Seeds[level]);
    }
    std::cout << GridLogMessage << "Constructed " << mgParams.nLevels << " levels" << std::endl;
    for(int level = 0; level < mgParams.nLevels; ++level) {
      std::cout << GridLogMessage << "level = " << level << ":" << std::endl;
      Grids[level]->show_decomposition();
    }
  }
 };
 // Level-count-agnostic interface of a multigrid preconditioner. It lets
 // callers hold a preconditioner through a base pointer without knowing the
 // number of levels that was selected at run time (see createMGInstance).
 template<class Field>
 class MultiGridPreconditionerBase : public LinearFunction<Field> {
 public:
  virtual ~MultiGridPreconditionerBase() = default;

  // Build whatever the hierarchy needs before it can be applied.
  virtual void setup() = 0;

  // Apply the preconditioner to `in`, writing the result to `out`.
  virtual void operator()(Field const &in, Field &out) = 0;

  // Run the built-in consistency checks against the given tolerance.
  virtual void runChecks(RealD tolerance) = 0;

  // Print the accumulated timers.
  virtual void reportTimings() = 0;
 };
 template<class Fobj, class CComplex, int nBasis, int nCoarserLevels, class Matrix>
 class MultiGridPreconditioner : public MultiGridPreconditionerBase<Lattice<Fobj>> {
 public:
  /////////////////////////////////////////////
  // Type Definitions
  /////////////////////////////////////////////
  // clang-format off
  // Aggregation / coarsened-operator types for the step from this level to the next coarser one
  using Aggregates              = Aggregation<Fobj, CComplex, nBasis>;
  using CoarseDiracMatrix       = CoarsenedMatrix<Fobj, CComplex, nBasis>;
  // Vector types on the coarser level
  using CoarseVector            = typename Aggregates::CoarseVector;
  using CoarseSiteVector        = typename Aggregates::siteVector;
  // Operator and vector types on this ("fine") level
  using FineDiracMatrix         = Matrix;
  using FineVector              = typename Aggregates::FineField;
  // Preconditioner type of the next coarser level: one coarser level fewer, acting on the coarse operator
  using NextPreconditionerLevel = MultiGridPreconditioner<CoarseSiteVector, iScalar<CComplex>, nBasis, nCoarserLevels - 1, CoarseDiracMatrix>;
  // clang-format on
  /////////////////////////////////////////////
  // Member Data
  /////////////////////////////////////////////
  // NOTE: the declaration order below is also the initialisation order used by
  // the constructor; _Aggregates and _CoarseMatrix are built from _LevelInfo
  // and the two level indices, so those must stay declared first.

  // Index of this level in _LevelInfo (0 == finest) and of the next coarser one.
  int _CurrentLevel;
  int _NextCoarserLevel;
  // Shared, non-owned parameter set and per-level grid/RNG information.
  MultiGridParams &_MultiGridParams;
  LevelInfo &      _LevelInfo;
  // Non-owned operators on this level: the one being preconditioned and the
  // one handed to the smoother.
  FineDiracMatrix & _FineMatrix;
  FineDiracMatrix & _SmootherMatrix;
  // Subspace used for restriction/prolongation and the operator coarsened with it.
  Aggregates        _Aggregates;
  CoarseDiracMatrix _CoarseMatrix;
  // Owned preconditioner of the next coarser level (created in the constructor).
  std::unique_ptr<NextPreconditionerLevel> _NextPreconditionerLevel;
  // Timers for the phases of setup() ...
  GridStopWatch _SetupTotalTimer;
  GridStopWatch _SetupCreateSubspaceTimer;
  GridStopWatch _SetupProjectToChiralitiesTimer;
  GridStopWatch _SetupCoarsenOperatorTimer;
  GridStopWatch _SetupNextLevelTimer;
  // ... and for the phases of vCycle()/kCycle().
  GridStopWatch _SolveTotalTimer;
  GridStopWatch _SolveRestrictionTimer;
  GridStopWatch _SolveProlongationTimer;
  GridStopWatch _SolveSmootherTimer;
  GridStopWatch _SolveNextLevelTimer;
  /////////////////////////////////////////////
  // Member Functions
  /////////////////////////////////////////////
  // Wires up one level of the hierarchy and recursively creates the next
  // coarser one.
  //
  // The level index is derived from the template parameter: with
  // nCoarserLevels levels still below this one, the instance sits at
  // nLevels - nCoarserLevels - 1, so index 0 is the finest level.
  // The next coarser level receives _CoarseMatrix both as its operator and as
  // its smoother operator.
  MultiGridPreconditioner(MultiGridParams &mgParams, LevelInfo &LvlInfo, FineDiracMatrix &FineMat, FineDiracMatrix &SmootherMat)
    : _CurrentLevel(mgParams.nLevels - nCoarserLevels - 1),
      _NextCoarserLevel(_CurrentLevel + 1),
      _MultiGridParams(mgParams),
      _LevelInfo(LvlInfo),
      _FineMatrix(FineMat),
      _SmootherMatrix(SmootherMat),
      _Aggregates(LvlInfo.Grids[_NextCoarserLevel], LvlInfo.Grids[_CurrentLevel], 0),
      _CoarseMatrix(*LvlInfo.Grids[_NextCoarserLevel]) {
    _NextPreconditionerLevel.reset(new NextPreconditionerLevel(_MultiGridParams, _LevelInfo, _CoarseMatrix, _CoarseMatrix));
    // Must follow the creation above: resetTimers() recurses into the next level.
    resetTimers();
  }
  void setup() {
    _SetupTotalTimer.Start();
    static_assert((nBasis & 0x1) == 0, "MG Preconditioner only supports an even number of basis vectors");
    int nb = nBasis / 2;
    MdagMLinearOperator<FineDiracMatrix, FineVector> fineMdagMOp(_FineMatrix);
    _SetupCreateSubspaceTimer.Start();
    _Aggregates.CreateSubspace(_LevelInfo.PRNGs[_CurrentLevel], fineMdagMOp, nb);
    _SetupCreateSubspaceTimer.Stop();
    _SetupProjectToChiralitiesTimer.Start();
    FineVector tmp1(_Aggregates.subspace[0]._grid);
    FineVector tmp2(_Aggregates.subspace[0]._grid);
    for(int n = 0; n < nb; n++) {
      auto tmp1 = _Aggregates.subspace[n];
      G5C(tmp2, _Aggregates.subspace[n]);
      axpby(_Aggregates.subspace[n], 0.5, 0.5, tmp1, tmp2);
      axpby(_Aggregates.subspace[n + nb], 0.5, -0.5, tmp1, tmp2);
      std::cout << GridLogMG << " Level " << _CurrentLevel << ": Chirally doubled vector " << n << ". "
                << "norm2(vec[" << n << "]) = " << norm2(_Aggregates.subspace[n]) << ". "
                << "norm2(vec[" << n + nb << "]) = " << norm2(_Aggregates.subspace[n + nb]) << std::endl;
    }
    _SetupProjectToChiralitiesTimer.Stop();
    _SetupCoarsenOperatorTimer.Start();
    _CoarseMatrix.CoarsenOperator(_LevelInfo.Grids[_CurrentLevel], fineMdagMOp, _Aggregates);
    _SetupCoarsenOperatorTimer.Stop();
    _SetupNextLevelTimer.Start();
    _NextPreconditionerLevel->setup();
    _SetupNextLevelTimer.Stop();
    _SetupTotalTimer.Stop();
  }
  // Applies one multigrid cycle to `in`, storing the result in `out`.
  // The cycle type is chosen by the kCycle flag of the parameter set.
  virtual void operator()(FineVector const &in, FineVector &out) {
    // Both vectors have to live on this level's grid.
    conformable(_LevelInfo.Grids[_CurrentLevel], in._grid);
    conformable(in, out);
    // TODO: implement a W-cycle
    if(!_MultiGridParams.kCycle) {
      vCycle(in, out);
      return;
    }
    kCycle(in, out);
  }
  // V-cycle: restrict `in` to the coarse level, apply the next level's
  // preconditioner once, prolongate the result into `out`, then post-smooth
  // `out` with FGMRES on the smoother operator. The relative residuals after
  // the coarse-grid correction and after smoothing are logged.
  void vCycle(FineVector const &in, FineVector &out) {
    _SolveTotalTimer.Start();
    RealD inputNorm = norm2(in);

    auto *coarseGrid = _LevelInfo.Grids[_NextCoarserLevel];
    CoarseVector coarseRhs(coarseGrid);
    CoarseVector coarseCorrection(coarseGrid);
    coarseCorrection = zero;
    FineVector residual(in._grid);

    MdagMLinearOperator<FineDiracMatrix, FineVector> fineOp(_FineMatrix);
    MdagMLinearOperator<FineDiracMatrix, FineVector> smootherOp(_SmootherMatrix);

    // Unpreconditioned FGMRES serves as smoother; its iteration budget is outer * inner.
    auto const restartLength = _MultiGridParams.smootherMaxInnerIter[_CurrentLevel];
    auto const iterBudget    = _MultiGridParams.smootherMaxOuterIter[_CurrentLevel] * restartLength;
    TrivialPrecon<FineVector>                      noPrecon;
    FlexibleGeneralisedMinimalResidual<FineVector> smoother(
      _MultiGridParams.smootherTol[_CurrentLevel], iterBudget, noPrecon, restartLength, false);

    // sqrt(|in - M out|^2 / |in|^2), measured with the operator being preconditioned
    auto relativeResidual = [&]() -> RealD {
      fineOp.Op(out, residual);
      residual = in - residual;
      return std::sqrt(norm2(residual) / inputNorm);
    };

    _SolveRestrictionTimer.Start();
    _Aggregates.ProjectToSubspace(coarseRhs, in);
    _SolveRestrictionTimer.Stop();

    _SolveNextLevelTimer.Start();
    (*_NextPreconditionerLevel)(coarseRhs, coarseCorrection);
    _SolveNextLevelTimer.Stop();

    _SolveProlongationTimer.Start();
    _Aggregates.PromoteFromSubspace(coarseCorrection, out);
    _SolveProlongationTimer.Stop();

    auto residualAfterCoarseGridCorrection = relativeResidual();

    _SolveSmootherTimer.Start();
    smoother(smootherOp, in, out);
    _SolveSmootherTimer.Stop();

    auto residualAfterPostSmoother = relativeResidual();

    std::cout << GridLogMG << " Level " << _CurrentLevel << ": V-cycle: Input norm = " << std::sqrt(inputNorm)
              << " Coarse residual = " << residualAfterCoarseGridCorrection << " Post-Smoother residual = " << residualAfterPostSmoother
              << std::endl;
    _SolveTotalTimer.Stop();
  }
  // K-cycle: like the V-cycle, except that the coarse-level system is solved
  // by FGMRES on the coarsened operator, preconditioned with the next level's
  // multigrid preconditioner, instead of applying that preconditioner once.
  void kCycle(FineVector const &in, FineVector &out) {
    _SolveTotalTimer.Start();
    RealD inputNorm = norm2(in);

    auto *coarseGrid = _LevelInfo.Grids[_NextCoarserLevel];
    CoarseVector coarseRhs(coarseGrid);
    CoarseVector coarseCorrection(coarseGrid);
    coarseCorrection = zero;
    FineVector residual(in._grid);

    MdagMLinearOperator<FineDiracMatrix, FineVector>     fineOp(_FineMatrix);
    MdagMLinearOperator<FineDiracMatrix, FineVector>     smootherOp(_SmootherMatrix);
    MdagMLinearOperator<CoarseDiracMatrix, CoarseVector> coarseOp(_CoarseMatrix);

    // Iteration budgets are outer * inner for both solvers.
    auto const smootherRestart = _MultiGridParams.smootherMaxInnerIter[_CurrentLevel];
    auto const smootherBudget  = _MultiGridParams.smootherMaxOuterIter[_CurrentLevel] * smootherRestart;
    auto const kCycleRestart   = _MultiGridParams.kCycleMaxInnerIter[_CurrentLevel];
    auto const kCycleBudget    = _MultiGridParams.kCycleMaxOuterIter[_CurrentLevel] * kCycleRestart;

    // Smoother: unpreconditioned FGMRES on this level.
    TrivialPrecon<FineVector>                      noPrecon;
    FlexibleGeneralisedMinimalResidual<FineVector> smoother(
      _MultiGridParams.smootherTol[_CurrentLevel], smootherBudget, noPrecon, smootherRestart, false);
    // Coarse solve: FGMRES preconditioned by the next coarser level.
    FlexibleGeneralisedMinimalResidual<CoarseVector> coarseSolver(
      _MultiGridParams.kCycleTol[_CurrentLevel], kCycleBudget, *_NextPreconditionerLevel, kCycleRestart, false);

    // sqrt(|in - M out|^2 / |in|^2), measured with the operator being preconditioned
    auto relativeResidual = [&]() -> RealD {
      fineOp.Op(out, residual);
      residual = in - residual;
      return std::sqrt(norm2(residual) / inputNorm);
    };

    _SolveRestrictionTimer.Start();
    _Aggregates.ProjectToSubspace(coarseRhs, in);
    _SolveRestrictionTimer.Stop();

    _SolveNextLevelTimer.Start();
    coarseSolver(coarseOp, coarseRhs, coarseCorrection);
    _SolveNextLevelTimer.Stop();

    _SolveProlongationTimer.Start();
    _Aggregates.PromoteFromSubspace(coarseCorrection, out);
    _SolveProlongationTimer.Stop();

    auto residualAfterCoarseGridCorrection = relativeResidual();

    _SolveSmootherTimer.Start();
    smoother(smootherOp, in, out);
    _SolveSmootherTimer.Stop();

    auto residualAfterPostSmoother = relativeResidual();

    std::cout << GridLogMG << " Level " << _CurrentLevel << ": K-cycle: Input norm = " << std::sqrt(inputNorm)
              << " Coarse residual = " << residualAfterCoarseGridCorrection << " Post-Smoother residual = " << residualAfterPostSmoother
              << std::endl;
    _SolveTotalTimer.Stop();
  }
  // Runs five consistency checks on this level and then recurses into the
  // next coarser level:
  //   1. M v             == (Mdiag + sum over directions of Mdir) v
  //   2. v_i             == P R v_i        for every subspace vector v_i
  //   3. v_c             == R P v_c        for a random coarse vector
  //   4. R D P v_c       == D_c v_c        for a random coarse vector
  //   5. Im(v_c^dag D_c^dag D_c v_c) == 0  for a random coarse vector
  // Each check prints its relative deviation; the first one exceeding
  // `tolerance` terminates the program through abort().
  void runChecks(RealD tolerance) {
    std::vector<FineVector>   fineTmps(7, _LevelInfo.Grids[_CurrentLevel]);
    std::vector<CoarseVector> coarseTmps(4, _LevelInfo.Grids[_NextCoarserLevel]);
    MdagMLinearOperator<FineDiracMatrix, FineVector>     fineMdagMOp(_FineMatrix);
    MdagMLinearOperator<CoarseDiracMatrix, CoarseVector> coarseMdagMOp(_CoarseMatrix);

    // Prints the three-line header that introduces a check.
    auto announce = [this](const char *title) {
      std::cout << GridLogMG << " Level " << _CurrentLevel << ": **************************************************" << std::endl;
      std::cout << GridLogMG << " Level " << _CurrentLevel << title << std::endl;
      std::cout << GridLogMG << " Level " << _CurrentLevel << ": **************************************************" << std::endl;
    };
    // Finishes the current output line with the verdict; aborts on failure.
    auto judge = [tolerance](RealD dev) {
      if(dev > tolerance) {
        std::cout << " > " << tolerance << " -> check failed" << std::endl;
        abort();
      } else {
        std::cout << " < " << tolerance << " -> check passed" << std::endl;
      }
    };

    announce(": MG correctness check: 0 == (M - (Mdiag + Σ_μ Mdir_μ)) * v");
    random(_LevelInfo.PRNGs[_CurrentLevel], fineTmps[0]);
    fineMdagMOp.Op(fineTmps[0], fineTmps[1]);     //     M * v
    fineMdagMOp.OpDiag(fineTmps[0], fineTmps[2]); // Mdiag * v
    fineTmps[4] = zero;
    for(int dir = 0; dir < 4; dir++) { //       Σ_μ Mdir_μ * v
      for(auto disp : {+1, -1}) {
        fineMdagMOp.OpDir(fineTmps[0], fineTmps[3], dir, disp);
        fineTmps[4] = fineTmps[4] + fineTmps[3];
      }
    }
    fineTmps[5] = fineTmps[2] + fineTmps[4]; // (Mdiag + Σ_μ Mdir_μ) * v
    fineTmps[6] = fineTmps[1] - fineTmps[5];
    RealD deviation = std::sqrt(norm2(fineTmps[6]) / norm2(fineTmps[1]));
    std::cout << GridLogMG << " Level " << _CurrentLevel << ": norm2(M * v)                    = " << norm2(fineTmps[1]) << std::endl;
    std::cout << GridLogMG << " Level " << _CurrentLevel << ": norm2(Mdiag * v)                = " << norm2(fineTmps[2]) << std::endl;
    std::cout << GridLogMG << " Level " << _CurrentLevel << ": norm2(Σ_μ Mdir_μ * v)           = " << norm2(fineTmps[4]) << std::endl;
    std::cout << GridLogMG << " Level " << _CurrentLevel << ": norm2((Mdiag + Σ_μ Mdir_μ) * v) = " << norm2(fineTmps[5]) << std::endl;
    std::cout << GridLogMG << " Level " << _CurrentLevel << ": relative deviation              = " << deviation;
    judge(deviation);

    announce(": MG correctness check: 0 == (1 - P R) v");
    // Unsigned index: matches the type of subspace.size().
    for(std::size_t i = 0; i < _Aggregates.subspace.size(); ++i) {
      _Aggregates.ProjectToSubspace(coarseTmps[0], _Aggregates.subspace[i]); //   R v_i
      _Aggregates.PromoteFromSubspace(coarseTmps[0], fineTmps[0]);           // P R v_i
      fineTmps[1] = _Aggregates.subspace[i] - fineTmps[0]; // v_i - P R v_i
      deviation   = std::sqrt(norm2(fineTmps[1]) / norm2(_Aggregates.subspace[i]));
      std::cout << GridLogMG << " Level " << _CurrentLevel << ": Vector " << i << ": norm2(v_i) = " << norm2(_Aggregates.subspace[i])
                << " | norm2(R v_i) = " << norm2(coarseTmps[0]) << " | norm2(P R v_i) = " << norm2(fineTmps[0])
                << " | relative deviation = " << deviation;
      judge(deviation);
    }

    announce(": MG correctness check: 0 == (1 - R P) v_c");
    random(_LevelInfo.PRNGs[_NextCoarserLevel], coarseTmps[0]);
    _Aggregates.PromoteFromSubspace(coarseTmps[0], fineTmps[0]); //   P v_c
    _Aggregates.ProjectToSubspace(coarseTmps[1], fineTmps[0]);   // R P v_c
    coarseTmps[2] = coarseTmps[0] - coarseTmps[1]; // v_c - R P v_c
    deviation     = std::sqrt(norm2(coarseTmps[2]) / norm2(coarseTmps[0]));
    std::cout << GridLogMG << " Level " << _CurrentLevel << ": norm2(v_c) = " << norm2(coarseTmps[0])
              << " | norm2(R P v_c) = " << norm2(coarseTmps[1]) << " | norm2(P v_c) = " << norm2(fineTmps[0])
              << " | relative deviation = " << deviation;
    judge(deviation);

    announce(": MG correctness check: 0 == (R D P - D_c) v_c");
    random(_LevelInfo.PRNGs[_NextCoarserLevel], coarseTmps[0]);
    _Aggregates.PromoteFromSubspace(coarseTmps[0], fineTmps[0]); //     P v_c
    fineMdagMOp.Op(fineTmps[0], fineTmps[1]);                    //   D P v_c
    _Aggregates.ProjectToSubspace(coarseTmps[1], fineTmps[1]);   // R D P v_c
    coarseMdagMOp.Op(coarseTmps[0], coarseTmps[2]); // D_c v_c
    coarseTmps[3] = coarseTmps[1] - coarseTmps[2]; // R D P v_c - D_c v_c
    deviation     = std::sqrt(norm2(coarseTmps[3]) / norm2(coarseTmps[1]));
    std::cout << GridLogMG << " Level " << _CurrentLevel << ": norm2(R D P v_c) = " << norm2(coarseTmps[1])
              << " | norm2(D_c v_c) = " << norm2(coarseTmps[2]) << " | relative deviation = " << deviation;
    judge(deviation);

    announce(": MG correctness check: 0 == |(Im(v_c^dag D_c^dag D_c v_c)|");
    random(_LevelInfo.PRNGs[_NextCoarserLevel], coarseTmps[0]);
    coarseMdagMOp.Op(coarseTmps[0], coarseTmps[1]);    //         D_c v_c
    coarseMdagMOp.AdjOp(coarseTmps[1], coarseTmps[2]); // D_c^dag D_c v_c
    auto dot  = innerProduct(coarseTmps[0], coarseTmps[2]); //v_c^dag D_c^dag D_c v_c
    deviation = std::abs(imag(dot)) / std::abs(real(dot));
    std::cout << GridLogMG << " Level " << _CurrentLevel << ": Re(v_c^dag D_c^dag D_c v_c) = " << real(dot)
              << " | Im(v_c^dag D_c^dag D_c v_c) = " << imag(dot) << " | relative deviation = " << deviation;
    judge(deviation);

    _NextPreconditionerLevel->runChecks(tolerance);
  }
  void reportTimings() {
    // clang-format off
    std::cout << GridLogMG << " Level " << _CurrentLevel << ": Time elapsed: Sum   total            " <<                _SetupTotalTimer.Elapsed() + _SolveTotalTimer.Elapsed() << std::endl;
    std::cout << GridLogMG << " Level " << _CurrentLevel << ": Time elapsed: Setup total            " <<                _SetupTotalTimer.Elapsed() << std::endl;
    std::cout << GridLogMG << " Level " << _CurrentLevel << ": Time elapsed: Setup create subspace  " <<       _SetupCreateSubspaceTimer.Elapsed() << std::endl;
    std::cout << GridLogMG << " Level " << _CurrentLevel << ": Time elapsed: Setup project chiral   " << _SetupProjectToChiralitiesTimer.Elapsed() << std::endl;
    std::cout << GridLogMG << " Level " << _CurrentLevel << ": Time elapsed: Setup coarsen operator " <<      _SetupCoarsenOperatorTimer.Elapsed() << std::endl;
    std::cout << GridLogMG << " Level " << _CurrentLevel << ": Time elapsed: Setup next level       " <<            _SetupNextLevelTimer.Elapsed() << std::endl;
    std::cout << GridLogMG << " Level " << _CurrentLevel << ": Time elapsed: Solve total            " <<                _SolveTotalTimer.Elapsed() << std::endl;
    std::cout << GridLogMG << " Level " << _CurrentLevel << ": Time elapsed: Solve restriction      " <<          _SolveRestrictionTimer.Elapsed() << std::endl;
    std::cout << GridLogMG << " Level " << _CurrentLevel << ": Time elapsed: Solve prolongation     " <<         _SolveProlongationTimer.Elapsed() << std::endl;
    std::cout << GridLogMG << " Level " << _CurrentLevel << ": Time elapsed: Solve smoother         " <<             _SolveSmootherTimer.Elapsed() << std::endl;
    std::cout << GridLogMG << " Level " << _CurrentLevel << ": Time elapsed: Solve next level       " <<            _SolveNextLevelTimer.Elapsed() << std::endl;
    // clang-format on
    _NextPreconditionerLevel->reportTimings();
  }
  void resetTimers() {
    _SetupTotalTimer.Reset();
    _SetupCreateSubspaceTimer.Reset();
    _SetupProjectToChiralitiesTimer.Reset();
    _SetupCoarsenOperatorTimer.Reset();
    _SetupNextLevelTimer.Reset();
    _SolveTotalTimer.Reset();
    _SolveRestrictionTimer.Reset();
    _SolveProlongationTimer.Reset();
    _SolveSmootherTimer.Reset();
    _SolveNextLevelTimer.Reset();
    _NextPreconditionerLevel->resetTimers();
  }
 };
 // Specialization for the coarsest level
 // Specialization for the coarsest level (no coarser levels below it).
 // It terminates the recursion of the primary template: there are no
 // aggregates and no coarse operator here, and applying the preconditioner
 // means solving with unpreconditioned FGMRES on this level's operator.
 template<class Fobj, class CComplex, int nBasis, class Matrix>
 class MultiGridPreconditioner<Fobj, CComplex, nBasis, 0, Matrix> : public MultiGridPreconditionerBase<Lattice<Fobj>> {
 public:
  /////////////////////////////////////////////
  // Type Definitions
  /////////////////////////////////////////////
  typedef Matrix        FineDiracMatrix;
  typedef Lattice<Fobj> FineVector;
  /////////////////////////////////////////////
  // Member Data
  /////////////////////////////////////////////
  int _CurrentLevel;                 // index of this level in _LevelInfo (nLevels - 1)
  MultiGridParams &_MultiGridParams; // shared, non-owned parameter set
  LevelInfo &      _LevelInfo;       // shared, non-owned grids / RNGs
  FineDiracMatrix &_FineMatrix;      // operator solved on this level
  FineDiracMatrix &_SmootherMatrix;  // stored for interface parity; not used by any method of this class
  GridStopWatch _SolveTotalTimer;
  GridStopWatch _SolveSmootherTimer;
  /////////////////////////////////////////////
  // Member Functions
  /////////////////////////////////////////////
  // Same signature as the primary template so the next-finer level can
  // construct either variant with identical arguments.
  MultiGridPreconditioner(MultiGridParams &mgParams, LevelInfo &LvlInfo, FineDiracMatrix &FineMat, FineDiracMatrix &SmootherMat)
    : _CurrentLevel(mgParams.nLevels - (0 + 1))
    , _MultiGridParams(mgParams)
    , _LevelInfo(LvlInfo)
    , _FineMatrix(FineMat)
    , _SmootherMatrix(SmootherMat) {
    resetTimers();
  }
  // Nothing to set up on the coarsest level.
  void setup() override {}
  // Solves with FGMRES (trivial preconditioner) on this level's MdagM
  // operator, using the coarseSolver* parameters; `out` receives the result.
  void operator()(FineVector const &in, FineVector &out) override {
    _SolveTotalTimer.Start();
    conformable(_LevelInfo.Grids[_CurrentLevel], in._grid);
    conformable(in, out);
    // total iteration budget = outer * inner iterations
    auto coarseSolverMaxIter = _MultiGridParams.coarseSolverMaxOuterIter * _MultiGridParams.coarseSolverMaxInnerIter;
    // "Fine" in the names below refers to this (the coarsest) level; there is
    // no coarser level underneath it.
    TrivialPrecon<FineVector>                      fineTrivialPreconditioner;
    FlexibleGeneralisedMinimalResidual<FineVector> fineFGMRES(
      _MultiGridParams.coarseSolverTol, coarseSolverMaxIter, fineTrivialPreconditioner, _MultiGridParams.coarseSolverMaxInnerIter, false);
    MdagMLinearOperator<FineDiracMatrix, FineVector> fineMdagMOp(_FineMatrix);
    _SolveSmootherTimer.Start();
    fineFGMRES(fineMdagMOp, in, out);
    _SolveSmootherTimer.Stop();
    _SolveTotalTimer.Stop();
  }
  // No checks are defined for the coarsest level.
  void runChecks(RealD /*tolerance*/) override {}
  // Prints the two solve timers of this level.
  void reportTimings() override {
    // clang-format off
    std::cout << GridLogMG << " Level " << _CurrentLevel << ": Time elapsed: Solve total            " <<    _SolveTotalTimer.Elapsed() << std::endl;
    std::cout << GridLogMG << " Level " << _CurrentLevel << ": Time elapsed: Solve smoother         " << _SolveSmootherTimer.Elapsed() << std::endl;
    // clang-format on
  }
  // Zeroes the timers of this level.
  void resetTimers() {
    _SolveTotalTimer.Reset();
    _SolveSmootherTimer.Reset();
  }
 };
 // Convenience alias keyed on the total number of levels: the preconditioner
 // template itself counts the levels *below* the finest one, hence nLevels - 1.
 template<class Fobj, class CComplex, int nBasis, int nLevels, class Matrix>
 using NLevelMGPreconditioner = MultiGridPreconditioner<Fobj, CComplex, nBasis, nLevels - 1, Matrix>;
 // Maps the run-time value mgParams.nLevels onto the matching compile-time
 // instantiation of the preconditioner and returns it behind the
 // level-agnostic base interface. Supported values are 2, 3 and 4; any other
 // value logs an error and terminates the program with EXIT_FAILURE.
 template<class Fobj, class CComplex, int nBasis, class Matrix>
 std::unique_ptr<MultiGridPreconditionerBase<Lattice<Fobj>>>
 createMGInstance(MultiGridParams &mgParams, LevelInfo &levelInfo, Matrix &FineMat, Matrix &SmootherMat) {
  typedef std::unique_ptr<MultiGridPreconditionerBase<Lattice<Fobj>>> BasePtr;
  switch(mgParams.nLevels) {
    case 2:
      return BasePtr(new NLevelMGPreconditioner<Fobj, CComplex, nBasis, 2, Matrix>(mgParams, levelInfo, FineMat, SmootherMat));
    case 3:
      return BasePtr(new NLevelMGPreconditioner<Fobj, CComplex, nBasis, 3, Matrix>(mgParams, levelInfo, FineMat, SmootherMat));
    case 4:
      return BasePtr(new NLevelMGPreconditioner<Fobj, CComplex, nBasis, 4, Matrix>(mgParams, levelInfo, FineMat, SmootherMat));
    default:
      std::cout << GridLogError << "We currently only support nLevels ∈ {2, 3, 4}" << std::endl;
      exit(EXIT_FAILURE);
  }
 }
 }
 #endif
@@ -0,0 +1,72 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./tests/solver/Test_staggered_cagmres_unprec.cc
 Copyright (C) 2015-2018
 Author: Daniel Richtmann <daniel.richtmann@ur.de>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 using namespace Grid;
 using namespace Grid::QCD;
 // Smoke test: runs the unpreconditioned communication-avoiding GMRES solver on
 // the improved staggered operator, using a random source and a gauge field from
 // SU3::HotConfiguration. The solver is only executed; nothing is asserted about
 // its result.
 int main (int argc, char ** argv)
 {
  typedef typename ImprovedStaggeredFermionR::FermionField FermionField;
  Grid_init(&argc,&argv);
  // Grid layout is taken from Grid's default lattice / SIMD / MPI settings.
  std::vector<int> latt_size   = GridDefaultLatt();
  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
  std::vector<int> mpi_layout  = GridDefaultMpi();
  GridCartesian               Grid(latt_size,simd_layout,mpi_layout);
  GridRedBlackCartesian     RBGrid(&Grid);
  // Seed the parallel RNG with a fixed integer list, then draw the source
  // before the gauge field (both consume the same RNG).
  std::vector<int> seeds({1,2,3,4});
  GridParallelRNG          pRNG(&Grid);  pRNG.SeedFixedIntegers(seeds);
  FermionField src(&Grid); random(pRNG,src);
  FermionField result(&Grid); result=zero;
  LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu);
  RealD mass=0.5;
  RealD c1=9.0/8.0;
  RealD c2=-1.0/24.0;
  RealD u0=1.0;
  // The same gauge field is passed for both gauge-field arguments of the operator.
  ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0);
  MdagMLinearOperator<ImprovedStaggeredFermionR,FermionField> HermOp(Ds);
  // NOTE(review): arguments are presumably (tolerance, max iterations, restart
  // length) -- confirm against the solver's constructor.
  CommunicationAvoidingGeneralisedMinimalResidual<FermionField> CAGMRES(1.0e-8, 10000, 25);
  CAGMRES(HermOp,src,result);
  Grid_finalize();
 }
@@ -0,0 +1,75 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./tests/solver/Test_staggered_fcagmres_prec.cc
 Copyright (C) 2015-2018
 Author: Daniel Richtmann <daniel.richtmann@ur.de>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 using namespace Grid;
 using namespace Grid::QCD;
 // Smoke test: runs the flexible communication-avoiding GMRES solver with a
 // TrivialPrecon preconditioner on the improved staggered operator, using a
 // random source and a gauge field from SU3::HotConfiguration. The solver is
 // only executed; nothing is asserted about its result.
 int main (int argc, char ** argv)
 {
  typedef typename ImprovedStaggeredFermionR::FermionField FermionField;
  Grid_init(&argc,&argv);
  // Grid layout is taken from Grid's default lattice / SIMD / MPI settings.
  std::vector<int> latt_size   = GridDefaultLatt();
  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
  std::vector<int> mpi_layout  = GridDefaultMpi();
  GridCartesian               Grid(latt_size,simd_layout,mpi_layout);
  GridRedBlackCartesian     RBGrid(&Grid);
  // Seed the parallel RNG with a fixed integer list, then draw the source
  // before the gauge field (both consume the same RNG).
  std::vector<int> seeds({1,2,3,4});
  GridParallelRNG          pRNG(&Grid);  pRNG.SeedFixedIntegers(seeds);
  FermionField src(&Grid); random(pRNG,src);
  FermionField result(&Grid); result=zero;
  LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu);
  RealD mass=0.5;
  RealD c1=9.0/8.0;
  RealD c2=-1.0/24.0;
  RealD u0=1.0;
  // The same gauge field is passed for both gauge-field arguments of the operator.
  ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0);
  MdagMLinearOperator<ImprovedStaggeredFermionR,FermionField> HermOp(Ds);
  TrivialPrecon<FermionField> simple;
  // NOTE(review): arguments are presumably (tolerance, max iterations,
  // preconditioner, restart length) -- confirm against the solver's constructor.
  FlexibleCommunicationAvoidingGeneralisedMinimalResidual<FermionField> FCAGMRES(1.0e-8, 10000, simple, 25);
  FCAGMRES(HermOp,src,result);
  Grid_finalize();
 }
@@ -0,0 +1,75 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./tests/solver/Test_staggered_fgmres_prec.cc
 Copyright (C) 2015-2018
 Author: Daniel Richtmann <daniel.richtmann@ur.de>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 using namespace Grid;
 using namespace Grid::QCD;
 // Smoke test: runs the flexible GMRES solver with a TrivialPrecon
 // preconditioner on the improved staggered operator, using a random source and
 // a gauge field from SU3::HotConfiguration. The solver is only executed;
 // nothing is asserted about its result.
 int main (int argc, char ** argv)
 {
  typedef typename ImprovedStaggeredFermionR::FermionField FermionField;
  Grid_init(&argc,&argv);
  // Grid layout is taken from Grid's default lattice / SIMD / MPI settings.
  std::vector<int> latt_size   = GridDefaultLatt();
  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
  std::vector<int> mpi_layout  = GridDefaultMpi();
  GridCartesian               Grid(latt_size,simd_layout,mpi_layout);
  GridRedBlackCartesian     RBGrid(&Grid);
  // Seed the parallel RNG with a fixed integer list, then draw the source
  // before the gauge field (both consume the same RNG).
  std::vector<int> seeds({1,2,3,4});
  GridParallelRNG          pRNG(&Grid);  pRNG.SeedFixedIntegers(seeds);
  FermionField src(&Grid); random(pRNG,src);
  FermionField result(&Grid); result=zero;
  LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu);
  RealD mass=0.5;
  RealD c1=9.0/8.0;
  RealD c2=-1.0/24.0;
  RealD u0=1.0;
  // The same gauge field is passed for both gauge-field arguments of the operator.
  ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0);
  MdagMLinearOperator<ImprovedStaggeredFermionR,FermionField> HermOp(Ds);
  TrivialPrecon<FermionField> simple;
  // NOTE(review): arguments are presumably (tolerance, max iterations,
  // preconditioner, restart length) -- confirm against the solver's constructor.
  FlexibleGeneralisedMinimalResidual<FermionField> FGMRES(1.0e-8, 10000, simple, 25);
  FGMRES(HermOp,src,result);
  Grid_finalize();
 }
@@ -0,0 +1,72 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./tests/solver/Test_staggered_gmres_unprec.cc
 Copyright (C) 2015-2018
 Author: Daniel Richtmann <daniel.richtmann@ur.de>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 using namespace Grid;
 using namespace Grid::QCD;
 // Smoke test: runs the unpreconditioned GMRES solver on the improved staggered
 // operator, using a random source and a gauge field from
 // SU3::HotConfiguration. The solver is only executed; nothing is asserted about
 // its result.
 int main (int argc, char ** argv)
 {
  typedef typename ImprovedStaggeredFermionR::FermionField FermionField;
  Grid_init(&argc,&argv);
  // Grid layout is taken from Grid's default lattice / SIMD / MPI settings.
  std::vector<int> latt_size   = GridDefaultLatt();
  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
  std::vector<int> mpi_layout  = GridDefaultMpi();
  GridCartesian               Grid(latt_size,simd_layout,mpi_layout);
  GridRedBlackCartesian     RBGrid(&Grid);
  // Seed the parallel RNG with a fixed integer list, then draw the source
  // before the gauge field (both consume the same RNG).
  std::vector<int> seeds({1,2,3,4});
  GridParallelRNG          pRNG(&Grid);  pRNG.SeedFixedIntegers(seeds);
  FermionField src(&Grid); random(pRNG,src);
  FermionField result(&Grid); result=zero;
  LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu);
  RealD mass=0.5;
  RealD c1=9.0/8.0;
  RealD c2=-1.0/24.0;
  RealD u0=1.0;
  // The same gauge field is passed for both gauge-field arguments of the operator.
  ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0);
  MdagMLinearOperator<ImprovedStaggeredFermionR,FermionField> HermOp(Ds);
  // NOTE(review): arguments are presumably (tolerance, max iterations, restart
  // length) -- confirm against the solver's constructor.
  GeneralisedMinimalResidual<FermionField> GMRES(1.0e-8, 10000, 25);
  GMRES(HermOp,src,result);
  Grid_finalize();
 }
@@ -0,0 +1,72 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./tests/solver/Test_staggered_mr_unprec.cc
 Copyright (C) 2015-2018
 Author: Daniel Richtmann <daniel.richtmann@ur.de>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 using namespace Grid;
 using namespace Grid::QCD;
 // Smoke test: runs the unpreconditioned minimal-residual solver on the improved
 // staggered operator, using a random source and a gauge field from
 // SU3::HotConfiguration. The solver is only executed; nothing is asserted about
 // its result.
 int main (int argc, char ** argv)
 {
  typedef typename ImprovedStaggeredFermionR::FermionField FermionField;
  Grid_init(&argc,&argv);
  // Grid layout is taken from Grid's default lattice / SIMD / MPI settings.
  std::vector<int> latt_size   = GridDefaultLatt();
  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
  std::vector<int> mpi_layout  = GridDefaultMpi();
  GridCartesian               Grid(latt_size,simd_layout,mpi_layout);
  GridRedBlackCartesian     RBGrid(&Grid);
  // Seed the parallel RNG with a fixed integer list, then draw the source
  // before the gauge field (both consume the same RNG).
  std::vector<int> seeds({1,2,3,4});
  GridParallelRNG          pRNG(&Grid);  pRNG.SeedFixedIntegers(seeds);
  FermionField src(&Grid); random(pRNG,src);
  FermionField result(&Grid); result=zero;
  LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu);
  RealD mass=0.5;
  RealD c1=9.0/8.0;
  RealD c2=-1.0/24.0;
  RealD u0=1.0;
  // The same gauge field is passed for both gauge-field arguments of the operator.
  ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0);
  MdagMLinearOperator<ImprovedStaggeredFermionR,FermionField> HermOp(Ds);
  // NOTE(review): arguments are presumably (tolerance, max iterations,
  // over-relaxation parameter) -- confirm against the solver's constructor.
  MinimalResidual<FermionField> MR(1.0e-8,10000,0.8);
  MR(HermOp,src,result);
  Grid_finalize();
 }
@@ -0,0 +1,65 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./tests/solver/Test_wilson_cagmres_unprec.cc
 Copyright (C) 2015-2018
 Author: Daniel Richtmann <daniel.richtmann@ur.de>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 using namespace Grid;
 using namespace Grid::QCD;
 // Smoke test: runs the unpreconditioned communication-avoiding GMRES solver on
 // the Wilson operator, using a random source and a gauge field from
 // SU3::HotConfiguration. The solver is only executed; nothing is asserted about
 // its result.
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);
  // Grid layout is taken from Grid's default lattice / SIMD / MPI settings.
  std::vector<int> latt_size   = GridDefaultLatt();
  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
  std::vector<int> mpi_layout  = GridDefaultMpi();
  GridCartesian               Grid(latt_size,simd_layout,mpi_layout);
  GridRedBlackCartesian     RBGrid(&Grid);
  // Seed the parallel RNG with a fixed integer list, then draw the source
  // before the gauge field (both consume the same RNG).
  std::vector<int> seeds({1,2,3,4});
  GridParallelRNG          pRNG(&Grid);  pRNG.SeedFixedIntegers(seeds);
  LatticeFermion src(&Grid); random(pRNG,src);
  LatticeFermion result(&Grid); result=zero;
  LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu);
  RealD mass=0.5;
  WilsonFermionR Dw(Umu,Grid,RBGrid,mass);
  MdagMLinearOperator<WilsonFermionR,LatticeFermion> HermOp(Dw);
  // NOTE(review): arguments are presumably (tolerance, max iterations, restart
  // length) -- confirm against the solver's constructor.
  CommunicationAvoidingGeneralisedMinimalResidual<LatticeFermion> CAGMRES(1.0e-8, 10000, 25);
  CAGMRES(HermOp,src,result);
  Grid_finalize();
 }
@@ -0,0 +1,68 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./tests/solver/Test_wilson_fcagmres_prec.cc
 Copyright (C) 2015-2018
 Author: Daniel Richtmann <daniel.richtmann@ur.de>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 using namespace Grid;
 using namespace Grid::QCD;
 // Smoke test: runs the flexible communication-avoiding GMRES solver with a
 // TrivialPrecon preconditioner on the Wilson operator, using a random source
 // and a gauge field from SU3::HotConfiguration. The solver is only executed;
 // nothing is asserted about its result.
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);
  // Grid layout is taken from Grid's default lattice / SIMD / MPI settings.
  std::vector<int> latt_size   = GridDefaultLatt();
  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
  std::vector<int> mpi_layout  = GridDefaultMpi();
  GridCartesian               Grid(latt_size,simd_layout,mpi_layout);
  GridRedBlackCartesian     RBGrid(&Grid);
  // Seed the parallel RNG with a fixed integer list, then draw the source
  // before the gauge field (both consume the same RNG).
  std::vector<int> seeds({1,2,3,4});
  GridParallelRNG          pRNG(&Grid);  pRNG.SeedFixedIntegers(seeds);
  LatticeFermion src(&Grid); random(pRNG,src);
  LatticeFermion result(&Grid); result=zero;
  LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu);
  RealD mass=0.5;
  WilsonFermionR Dw(Umu,Grid,RBGrid,mass);
  MdagMLinearOperator<WilsonFermionR,LatticeFermion> HermOp(Dw);
  TrivialPrecon<LatticeFermion> simple;
  // NOTE(review): arguments are presumably (tolerance, max iterations,
  // preconditioner, restart length) -- confirm against the solver's constructor.
  FlexibleCommunicationAvoidingGeneralisedMinimalResidual<LatticeFermion> FCAGMRES(1.0e-8, 10000, simple, 25);
  FCAGMRES(HermOp,src,result);
  Grid_finalize();
 }
@@ -0,0 +1,68 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./tests/solver/Test_wilson_fgmres_prec.cc
 Copyright (C) 2015-2018
 Author: Daniel Richtmann <daniel.richtmann@ur.de>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 using namespace Grid;
 using namespace Grid::QCD;
 // Smoke test: runs the flexible GMRES solver with a TrivialPrecon
 // preconditioner on the Wilson operator, using a random source and a gauge
 // field from SU3::HotConfiguration. The solver is only executed; nothing is
 // asserted about its result.
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);
  // Grid layout is taken from Grid's default lattice / SIMD / MPI settings.
  std::vector<int> latt_size   = GridDefaultLatt();
  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
  std::vector<int> mpi_layout  = GridDefaultMpi();
  GridCartesian               Grid(latt_size,simd_layout,mpi_layout);
  GridRedBlackCartesian     RBGrid(&Grid);
  // Seed the parallel RNG with a fixed integer list, then draw the source
  // before the gauge field (both consume the same RNG).
  std::vector<int> seeds({1,2,3,4});
  GridParallelRNG          pRNG(&Grid);  pRNG.SeedFixedIntegers(seeds);
  LatticeFermion src(&Grid); random(pRNG,src);
  LatticeFermion result(&Grid); result=zero;
  LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu);
  RealD mass=0.5;
  WilsonFermionR Dw(Umu,Grid,RBGrid,mass);
  MdagMLinearOperator<WilsonFermionR,LatticeFermion> HermOp(Dw);
  TrivialPrecon<LatticeFermion> simple;
  // NOTE(review): arguments are presumably (tolerance, max iterations,
  // preconditioner, restart length) -- confirm against the solver's constructor.
  FlexibleGeneralisedMinimalResidual<LatticeFermion> FGMRES(1.0e-8, 10000, simple, 25);
  FGMRES(HermOp,src,result);
  Grid_finalize();
 }
@@ -0,0 +1,65 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./tests/solver/Test_wilson_gmres_unprec.cc
 Copyright (C) 2015-2018
 Author: Daniel Richtmann <daniel.richtmann@ur.de>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 using namespace Grid;
 using namespace Grid::QCD;
 // Smoke test: runs the unpreconditioned GMRES solver on the Wilson operator,
 // using a random source and a gauge field from SU3::HotConfiguration. The
 // solver is only executed; nothing is asserted about its result.
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);
  // Grid layout is taken from Grid's default lattice / SIMD / MPI settings.
  std::vector<int> latt_size   = GridDefaultLatt();
  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
  std::vector<int> mpi_layout  = GridDefaultMpi();
  GridCartesian               Grid(latt_size,simd_layout,mpi_layout);
  GridRedBlackCartesian     RBGrid(&Grid);
  // Seed the parallel RNG with a fixed integer list, then draw the source
  // before the gauge field (both consume the same RNG).
  std::vector<int> seeds({1,2,3,4});
  GridParallelRNG          pRNG(&Grid);  pRNG.SeedFixedIntegers(seeds);
  LatticeFermion src(&Grid); random(pRNG,src);
  LatticeFermion result(&Grid); result=zero;
  LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu);
  RealD mass=0.5;
  WilsonFermionR Dw(Umu,Grid,RBGrid,mass);
  MdagMLinearOperator<WilsonFermionR,LatticeFermion> HermOp(Dw);
  // NOTE(review): arguments are presumably (tolerance, max iterations, restart
  // length) -- confirm against the solver's constructor.
  GeneralisedMinimalResidual<LatticeFermion> GMRES(1.0e-8, 10000, 25);
  GMRES(HermOp,src,result);
  Grid_finalize();
 }
@@ -0,0 +1,114 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./tests/solver/Test_wilson_mg.cc
    Copyright (C) 2015-2018
    Author: Daniel Richtmann <daniel.richtmann@ur.de>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 #include <Test_multigrid_common.h>
 using namespace std;
 using namespace Grid;
 using namespace Grid::QCD;
 int main(int argc, char **argv) {
  Grid_init(&argc, &argv);
  GridCartesian *        FGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplex::Nsimd()), GridDefaultMpi());
  GridRedBlackCartesian *FrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(FGrid);
  std::vector<int> fSeeds({1, 2, 3, 4});
  GridParallelRNG  fPRNG(FGrid);
  fPRNG.SeedFixedIntegers(fSeeds);
  // clang-format off
  LatticeFermion    src(FGrid); gaussian(fPRNG, src);
  LatticeFermion result(FGrid); result = zero;
  LatticeGaugeField Umu(FGrid); SU3::HotConfiguration(fPRNG, Umu);
  // clang-format on
  RealD mass = -0.25;
  MultiGridParams mgParams;
  std::string     inputXml{"./mg_params.xml"};
  if(GridCmdOptionExists(argv, argv + argc, "--inputxml")) {
    inputXml = GridCmdOptionPayload(argv, argv + argc, "--inputxml");
    assert(inputXml.length() != 0);
  }
  {
    XmlWriter writer("mg_params_template.xml");
    write(writer, "Params", mgParams);
    std::cout << GridLogMessage << "Written mg_params_template.xml" << std::endl;
    XmlReader reader(inputXml);
    read(reader, "Params", mgParams);
    std::cout << GridLogMessage << "Read in " << inputXml << std::endl;
  }
  checkParameterValidity(mgParams);
  std::cout << mgParams << std::endl;
  LevelInfo levelInfo(FGrid, mgParams);
  // Note: We do chiral doubling, so actually only nbasis/2 full basis vectors are used
  const int nbasis = 40;
  WilsonFermionR Dw(Umu, *FGrid, *FrbGrid, mass);
  MdagMLinearOperator<WilsonFermionR, LatticeFermion> MdagMOpDw(Dw);
  std::cout << GridLogMessage << "**************************************************" << std::endl;
  std::cout << GridLogMessage << "Testing Multigrid for Wilson" << std::endl;
  std::cout << GridLogMessage << "**************************************************" << std::endl;
  TrivialPrecon<LatticeFermion> TrivialPrecon;
  auto MGPreconDw = createMGInstance<vSpinColourVector, vTComplex, nbasis, WilsonFermionR>(mgParams, levelInfo, Dw, Dw);
  MGPreconDw->setup();
  if(GridCmdOptionExists(argv, argv + argc, "--runchecks")) {
    RealD toleranceForMGChecks = (getPrecision<LatticeFermion>::value == 1) ? 1e-6 : 1e-13;
    MGPreconDw->runChecks(toleranceForMGChecks);
  }
  std::vector<std::unique_ptr<OperatorFunction<LatticeFermion>>> solversDw;
  solversDw.emplace_back(new ConjugateGradient<LatticeFermion>(1.0e-12, 50000, false));
  solversDw.emplace_back(new FlexibleGeneralisedMinimalResidual<LatticeFermion>(1.0e-12, 50000, TrivialPrecon, 100, false));
  solversDw.emplace_back(new FlexibleGeneralisedMinimalResidual<LatticeFermion>(1.0e-12, 50000, *MGPreconDw, 100, false));
  for(auto const &solver : solversDw) {
    std::cout << std::endl << "Starting with a new solver" << std::endl;
    result = zero;
    (*solver)(MdagMOpDw, src, result);
  }
  MGPreconDw->reportTimings();
  Grid_finalize();
 }
@@ -0,0 +1,166 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./tests/solver/Test_wilson_mg_mp.cc
    Copyright (C) 2015-2018
    Author: Daniel Richtmann <daniel.richtmann@ur.de>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 #include <Test_multigrid_common.h>
 using namespace std;
 using namespace Grid;
 using namespace Grid::QCD;
 // Driver for the mixed-precision Wilson multigrid test. It reads the multigrid
 // parameters from an XML file (default ./mg_params.xml, overridable with
 // --inputxml), sets up a single-precision multigrid preconditioner and uses it
 // inside the mixed-precision FGMRES solver on a double-precision source.
 // With --runchecks the preconditioner's checks are run after setup.
 // With --docomparison a double-precision multigrid preconditioner is also set
 // up and both the full solves and a single preconditioner application are
 // compared between the two precisions.
 int main(int argc, char **argv) {
  Grid_init(&argc, &argv);
  // clang-format off
  GridCartesian         *FGrid_d   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplexD::Nsimd()), GridDefaultMpi());
  GridCartesian         *FGrid_f   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
  GridRedBlackCartesian *FrbGrid_d = SpaceTimeGrid::makeFourDimRedBlackGrid(FGrid_d);
  GridRedBlackCartesian *FrbGrid_f = SpaceTimeGrid::makeFourDimRedBlackGrid(FGrid_f);
  // clang-format on
  // Fixed seeds; the source is drawn before the gauge field from the same RNG.
  std::vector<int> fSeeds({1, 2, 3, 4});
  GridParallelRNG  fPRNG(FGrid_d);
  fPRNG.SeedFixedIntegers(fSeeds);
  // clang-format off
  LatticeFermionD       src_d(FGrid_d); gaussian(fPRNG, src_d);
  LatticeFermionD resultMGD_d(FGrid_d); resultMGD_d = zero;
  LatticeFermionD resultMGF_d(FGrid_d); resultMGF_d = zero;
  LatticeGaugeFieldD    Umu_d(FGrid_d); SU3::HotConfiguration(fPRNG, Umu_d);
  // The single-precision gauge field is obtained from the double-precision one.
  LatticeGaugeFieldF    Umu_f(FGrid_f); precisionChange(Umu_f, Umu_d);
  // clang-format on
  RealD mass = -0.25;
  MultiGridParams mgParams;
  std::string     inputXml{"./mg_params.xml"};
  if(GridCmdOptionExists(argv, argv + argc, "--inputxml")) {
    inputXml = GridCmdOptionPayload(argv, argv + argc, "--inputxml");
    assert(inputXml.length() != 0);
  }
  {
    // A template holding the current (default) parameters is written first,
    // then the actual parameters are read from the input file.
    XmlWriter writer("mg_params_template.xml");
    write(writer, "Params", mgParams);
    std::cout << GridLogMessage << "Written mg_params_template.xml" << std::endl;
    XmlReader reader(inputXml);
    read(reader, "Params", mgParams);
    std::cout << GridLogMessage << "Read in " << inputXml << std::endl;
  }
  checkParameterValidity(mgParams);
  std::cout << mgParams << std::endl;
  LevelInfo levelInfo_d(FGrid_d, mgParams);
  LevelInfo levelInfo_f(FGrid_f, mgParams);
  // Note: We do chiral doubling, so actually only nbasis/2 full basis vectors are used
  const int nbasis = 40;
  WilsonFermionD Dw_d(Umu_d, *FGrid_d, *FrbGrid_d, mass);
  WilsonFermionF Dw_f(Umu_f, *FGrid_f, *FrbGrid_f, mass);
  // Only the double-precision operator is handed to the outer solvers below.
  MdagMLinearOperator<WilsonFermionD, LatticeFermionD> MdagMOpDw_d(Dw_d);
  std::cout << GridLogMessage << "**************************************************" << std::endl;
  std::cout << GridLogMessage << "Testing single-precision Multigrid for Wilson" << std::endl;
  std::cout << GridLogMessage << "**************************************************" << std::endl;
  // The same Dirac operator is used as fine matrix and as smoother matrix.
  auto MGPreconDw_f = createMGInstance<vSpinColourVectorF, vTComplexF, nbasis, WilsonFermionF>(mgParams, levelInfo_f, Dw_f, Dw_f);
  MGPreconDw_f->setup();
  if(GridCmdOptionExists(argv, argv + argc, "--runchecks")) {
    MGPreconDw_f->runChecks(1e-6);
  }
  MixedPrecisionFlexibleGeneralisedMinimalResidual<LatticeFermionD, LatticeFermionF> MPFGMRESPREC(1.0e-12, 50000, FGrid_f, *MGPreconDw_f, 100, false);
  std::cout << std::endl << "Starting with a new solver" << std::endl;
  MPFGMRESPREC(MdagMOpDw_d, src_d, resultMGF_d);
  MGPreconDw_f->reportTimings();
  if(GridCmdOptionExists(argv, argv + argc, "--docomparison")) {
    std::cout << GridLogMessage << "**************************************************" << std::endl;
    std::cout << GridLogMessage << "Testing double-precision Multigrid for Wilson" << std::endl;
    std::cout << GridLogMessage << "**************************************************" << std::endl;
    auto MGPreconDw_d = createMGInstance<vSpinColourVectorD, vTComplexD, nbasis, WilsonFermionD>(mgParams, levelInfo_d, Dw_d, Dw_d);
    MGPreconDw_d->setup();
    if(GridCmdOptionExists(argv, argv + argc, "--runchecks")) {
      MGPreconDw_d->runChecks(1e-13);
    }
    FlexibleGeneralisedMinimalResidual<LatticeFermionD> FGMRESPREC(1.0e-12, 50000, *MGPreconDw_d, 100, false);
    std::cout << std::endl << "Starting with a new solver" << std::endl;
    FGMRESPREC(MdagMOpDw_d, src_d, resultMGD_d);
    MGPreconDw_d->reportTimings();
    std::cout << GridLogMessage << "**************************************************" << std::endl;
    std::cout << GridLogMessage << "Comparing single-precision Multigrid with double-precision one for Wilson" << std::endl;
    std::cout << GridLogMessage << "**************************************************" << std::endl;
    // Deviation between the two full solves.
    // NOTE(review): axpy_norm presumably returns norm2 (a squared norm) of
    // resultMGD_d - resultMGF_d; if so the "absolute difference" values printed
    // below are squared norms -- confirm.
    LatticeFermionD diffFullSolver(FGrid_d);
    RealD deviationFullSolver = axpy_norm(diffFullSolver, -1.0, resultMGF_d, resultMGD_d);
    // clang-format off
    LatticeFermionF src_f(FGrid_f);    precisionChange(src_f, src_d);
    LatticeFermionF resMGF_f(FGrid_f); resMGF_f = zero;
    LatticeFermionD resMGD_d(FGrid_d); resMGD_d = zero;
    // clang-format on
    // Apply each preconditioner once to the same source (in its own precision).
    (*MGPreconDw_f)(src_f, resMGF_f);
    (*MGPreconDw_d)(src_d, resMGD_d);
    LatticeFermionD diffOnlyMG(FGrid_d);
    LatticeFermionD resMGF_d(FGrid_d);
    // Promote the single-precision result to double before taking the difference.
    precisionChange(resMGF_d, resMGF_f);
    RealD deviationOnlyPrec = axpy_norm(diffOnlyMG, -1.0, resMGF_d, resMGD_d);
    // clang-format off
    std::cout << GridLogMessage << "Absolute difference between FGMRES preconditioned by double and single precision MG:   " << deviationFullSolver                      << std::endl;
    std::cout << GridLogMessage << "Relative deviation  between FGMRES preconditioned by double and single precision MG:   " << deviationFullSolver / norm2(resultMGD_d) << std::endl;
    std::cout << GridLogMessage << "Absolute difference between one iteration of MG Prec in double and single precision:   " << deviationOnlyPrec                        << std::endl;
    std::cout << GridLogMessage << "Relative deviation  between one iteration of MG Prec in double and single precision:   " << deviationOnlyPrec / norm2(resMGD_d)      << std::endl;
    // clang-format on
  }
  Grid_finalize();
 }
@@ -0,0 +1,65 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./tests/solver/Test_wilson_mr_unprec.cc
 Copyright (C) 2015-2018
 Author: Daniel Richtmann <daniel.richtmann@ur.de>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 using namespace Grid;
 using namespace Grid::QCD;
 int main (int argc, char ** argv)
 {
  // Smoke test: runs the MinimalResidual solver on the MdagM operator built
  // from a Wilson fermion matrix, with a random source and a hot SU(3) gauge
  // configuration. The solution is not inspected after the solve.
  Grid_init(&argc,&argv);

  // Lattice, SIMD and MPI layouts are taken from Grid's defaults.
  std::vector<int> latt_size   = GridDefaultLatt();
  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
  std::vector<int> mpi_layout  = GridDefaultMpi();
  GridCartesian               Grid(latt_size,simd_layout,mpi_layout);
  GridRedBlackCartesian     RBGrid(&Grid);

  // Fixed seeds so that every run draws the same source and gauge field.
  std::vector<int> seeds({1,2,3,4});
  GridParallelRNG          pRNG(&Grid);  pRNG.SeedFixedIntegers(seeds);

  // The source is drawn before the gauge field; keep this order so the random
  // stream is consumed exactly as before.
  LatticeFermion src(&Grid); random(pRNG,src);
  LatticeFermion result(&Grid); result=zero;
  LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu);

  RealD mass=0.5;
  WilsonFermionR Dw(Umu,Grid,RBGrid,mass);
  MdagMLinearOperator<WilsonFermionR,LatticeFermion> HermOp(Dw);

  // NOTE(review): arguments are presumably (tolerance, max iterations,
  // over-relaxation parameter) -- confirm against MinimalResidual.h.
  MinimalResidual<LatticeFermion> MR(1.0e-8,10000,0.8);
  MR(HermOp,src,result);

  Grid_finalize();
 }
@@ -0,0 +1,71 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./tests/solver/Test_wilsonclover_cagmres_unprec.cc
 Copyright (C) 2015-2018
 Author: Daniel Richtmann <daniel.richtmann@ur.de>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 using namespace Grid;
 using namespace Grid::QCD;
 int main (int argc, char ** argv)
 {
  // Smoke test: runs the CommunicationAvoidingGeneralisedMinimalResidual
  // solver on the MdagM operator built from a Wilson clover fermion matrix,
  // with a random source and a hot SU(3) gauge configuration. The solution is
  // not inspected after the solve.
  Grid_init(&argc,&argv);

  // Lattice, SIMD and MPI layouts are taken from Grid's defaults.
  std::vector<int> latt_size   = GridDefaultLatt();
  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
  std::vector<int> mpi_layout  = GridDefaultMpi();
  GridCartesian               Grid(latt_size,simd_layout,mpi_layout);
  GridRedBlackCartesian     RBGrid(&Grid);

  // Fixed seeds so that every run draws the same source and gauge field.
  std::vector<int> seeds({1,2,3,4});
  GridParallelRNG          pRNG(&Grid);  pRNG.SeedFixedIntegers(seeds);

  typedef typename WilsonCloverFermionR::FermionField FermionField;
  typename WilsonCloverFermionR::ImplParams params;
  WilsonAnisotropyCoefficients anis;

  // The source is drawn before the gauge field; keep this order so the random
  // stream is consumed exactly as before.
  FermionField src(&Grid); random(pRNG,src);
  FermionField result(&Grid); result=zero;
  LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu);

  RealD mass  = 0.5;
  RealD csw_r = 1.0;
  RealD csw_t = 1.0;
  WilsonCloverFermionR Dwc(Umu,Grid,RBGrid,mass,csw_r,csw_t,anis,params);
  MdagMLinearOperator<WilsonCloverFermionR,FermionField> HermOp(Dwc);

  // NOTE(review): arguments are presumably (tolerance, max iterations,
  // restart length) -- confirm against the solver header.
  CommunicationAvoidingGeneralisedMinimalResidual<FermionField> CAGMRES(1.0e-8, 10000, 25);
  CAGMRES(HermOp,src,result);

  Grid_finalize();
 }
@@ -0,0 +1,74 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./tests/solver/Test_wilsonclover_fcagmres_prec.cc
 Copyright (C) 2015-2018
 Author: Daniel Richtmann <daniel.richtmann@ur.de>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 using namespace Grid;
 using namespace Grid::QCD;
 int main (int argc, char ** argv)
 {
  // Smoke test: runs the FlexibleCommunicationAvoidingGeneralisedMinimalResidual
  // solver, preconditioned with TrivialPrecon, on the MdagM operator built from
  // a Wilson clover fermion matrix, with a random source and a hot SU(3) gauge
  // configuration. The solution is not inspected after the solve.
  Grid_init(&argc,&argv);

  // Lattice, SIMD and MPI layouts are taken from Grid's defaults.
  std::vector<int> latt_size   = GridDefaultLatt();
  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
  std::vector<int> mpi_layout  = GridDefaultMpi();
  GridCartesian               Grid(latt_size,simd_layout,mpi_layout);
  GridRedBlackCartesian     RBGrid(&Grid);

  // Fixed seeds so that every run draws the same source and gauge field.
  std::vector<int> seeds({1,2,3,4});
  GridParallelRNG          pRNG(&Grid);  pRNG.SeedFixedIntegers(seeds);

  typedef typename WilsonCloverFermionR::FermionField FermionField;
  typename WilsonCloverFermionR::ImplParams params;
  WilsonAnisotropyCoefficients anis;

  // The source is drawn before the gauge field; keep this order so the random
  // stream is consumed exactly as before.
  FermionField src(&Grid); random(pRNG,src);
  FermionField result(&Grid); result=zero;
  LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu);

  RealD mass  = 0.5;
  RealD csw_r = 1.0;
  RealD csw_t = 1.0;
  WilsonCloverFermionR Dwc(Umu,Grid,RBGrid,mass,csw_r,csw_t,anis,params);
  MdagMLinearOperator<WilsonCloverFermionR,FermionField> HermOp(Dwc);

  TrivialPrecon<FermionField> simple;
  // NOTE(review): arguments are presumably (tolerance, max iterations,
  // preconditioner, restart length) -- confirm against the solver header.
  FlexibleCommunicationAvoidingGeneralisedMinimalResidual<FermionField> FCAGMRES(1.0e-8, 10000, simple, 25);
  FCAGMRES(HermOp,src,result);

  Grid_finalize();
 }
@@ -0,0 +1,74 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./tests/solver/Test_wilsonclover_fgmres_prec.cc
 Copyright (C) 2015-2018
 Author: Daniel Richtmann <daniel.richtmann@ur.de>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 using namespace Grid;
 using namespace Grid::QCD;
 int main (int argc, char ** argv)
 {
  // Smoke test: runs the FlexibleGeneralisedMinimalResidual solver,
  // preconditioned with TrivialPrecon, on the MdagM operator built from a
  // Wilson clover fermion matrix, with a random source and a hot SU(3) gauge
  // configuration. The solution is not inspected after the solve.
  Grid_init(&argc,&argv);

  // Lattice, SIMD and MPI layouts are taken from Grid's defaults.
  std::vector<int> latt_size   = GridDefaultLatt();
  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
  std::vector<int> mpi_layout  = GridDefaultMpi();
  GridCartesian               Grid(latt_size,simd_layout,mpi_layout);
  GridRedBlackCartesian     RBGrid(&Grid);

  // Fixed seeds so that every run draws the same source and gauge field.
  std::vector<int> seeds({1,2,3,4});
  GridParallelRNG          pRNG(&Grid);  pRNG.SeedFixedIntegers(seeds);

  typedef typename WilsonCloverFermionR::FermionField FermionField;
  typename WilsonCloverFermionR::ImplParams params;
  WilsonAnisotropyCoefficients anis;

  // The source is drawn before the gauge field; keep this order so the random
  // stream is consumed exactly as before.
  FermionField src(&Grid); random(pRNG,src);
  FermionField result(&Grid); result=zero;
  LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu);

  RealD mass  = 0.5;
  RealD csw_r = 1.0;
  RealD csw_t = 1.0;
  WilsonCloverFermionR Dwc(Umu,Grid,RBGrid,mass,csw_r,csw_t,anis,params);
  MdagMLinearOperator<WilsonCloverFermionR,FermionField> HermOp(Dwc);

  TrivialPrecon<FermionField> simple;
  // NOTE(review): arguments are presumably (tolerance, max iterations,
  // preconditioner, restart length) -- confirm against the solver header.
  FlexibleGeneralisedMinimalResidual<FermionField> FGMRES(1.0e-8, 10000, simple, 25);
  FGMRES(HermOp,src,result);

  Grid_finalize();
 }
@@ -0,0 +1,71 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./tests/solver/Test_wilsonclover_gmres_unprec.cc
 Copyright (C) 2015-2018
 Author: Daniel Richtmann <daniel.richtmann@ur.de>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 using namespace Grid;
 using namespace Grid::QCD;
 int main (int argc, char ** argv)
 {
  // Smoke test: runs the GeneralisedMinimalResidual solver on the MdagM
  // operator built from a Wilson clover fermion matrix, with a random source
  // and a hot SU(3) gauge configuration. The solution is not inspected after
  // the solve.
  Grid_init(&argc,&argv);

  // Lattice, SIMD and MPI layouts are taken from Grid's defaults.
  std::vector<int> latt_size   = GridDefaultLatt();
  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
  std::vector<int> mpi_layout  = GridDefaultMpi();
  GridCartesian               Grid(latt_size,simd_layout,mpi_layout);
  GridRedBlackCartesian     RBGrid(&Grid);

  // Fixed seeds so that every run draws the same source and gauge field.
  std::vector<int> seeds({1,2,3,4});
  GridParallelRNG          pRNG(&Grid);  pRNG.SeedFixedIntegers(seeds);

  typedef typename WilsonCloverFermionR::FermionField FermionField;
  typename WilsonCloverFermionR::ImplParams params;
  WilsonAnisotropyCoefficients anis;

  // The source is drawn before the gauge field; keep this order so the random
  // stream is consumed exactly as before.
  FermionField src(&Grid); random(pRNG,src);
  FermionField result(&Grid); result=zero;
  LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu);

  RealD mass  = 0.5;
  RealD csw_r = 1.0;
  RealD csw_t = 1.0;
  WilsonCloverFermionR Dwc(Umu,Grid,RBGrid,mass,csw_r,csw_t,anis,params);
  MdagMLinearOperator<WilsonCloverFermionR,FermionField> HermOp(Dwc);

  // NOTE(review): arguments are presumably (tolerance, max iterations,
  // restart length) -- confirm against the solver header.
  GeneralisedMinimalResidual<FermionField> GMRES(1.0e-8, 10000, 25);
  GMRES(HermOp,src,result);

  Grid_finalize();
 }
@@ -0,0 +1,117 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./tests/solver/Test_wilsonclover_mg.cc
    Copyright (C) 2015-2018
    Author: Daniel Richtmann <daniel.richtmann@ur.de>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 #include <Test_multigrid_common.h>
 using namespace std;
 using namespace Grid;
 using namespace Grid::QCD;
 int main(int argc, char **argv) {
  Grid_init(&argc, &argv);
  GridCartesian *        FGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplex::Nsimd()), GridDefaultMpi());
  GridRedBlackCartesian *FrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(FGrid);
  std::vector<int> fSeeds({1, 2, 3, 4});
  GridParallelRNG  fPRNG(FGrid);
  fPRNG.SeedFixedIntegers(fSeeds);
  // clang-format off
  LatticeFermion    src(FGrid); gaussian(fPRNG, src);
  LatticeFermion result(FGrid); result = zero;
  LatticeGaugeField Umu(FGrid); SU3::HotConfiguration(fPRNG, Umu);
  // clang-format on
  RealD mass  = -0.25;
  RealD csw_r = 1.0;
  RealD csw_t = 1.0;
  MultiGridParams mgParams;
  std::string     inputXml{"./mg_params.xml"};
  if(GridCmdOptionExists(argv, argv + argc, "--inputxml")) {
    inputXml = GridCmdOptionPayload(argv, argv + argc, "--inputxml");
    assert(inputXml.length() != 0);
  }
  {
    XmlWriter writer("mg_params_template.xml");
    write(writer, "Params", mgParams);
    std::cout << GridLogMessage << "Written mg_params_template.xml" << std::endl;
    XmlReader reader(inputXml);
    read(reader, "Params", mgParams);
    std::cout << GridLogMessage << "Read in " << inputXml << std::endl;
  }
  checkParameterValidity(mgParams);
  std::cout << mgParams << std::endl;
  LevelInfo levelInfo(FGrid, mgParams);
  // Note: We do chiral doubling, so actually only nbasis/2 full basis vectors are used
  const int nbasis = 40;
  WilsonCloverFermionR Dwc(Umu, *FGrid, *FrbGrid, mass, csw_r, csw_t);
  MdagMLinearOperator<WilsonCloverFermionR, LatticeFermion> MdagMOpDwc(Dwc);
  std::cout << GridLogMessage << "**************************************************" << std::endl;
  std::cout << GridLogMessage << "Testing Multigrid for Wilson Clover" << std::endl;
  std::cout << GridLogMessage << "**************************************************" << std::endl;
  TrivialPrecon<LatticeFermion> TrivialPrecon;
  auto MGPreconDwc = createMGInstance<vSpinColourVector, vTComplex, nbasis, WilsonCloverFermionR>(mgParams, levelInfo, Dwc, Dwc);
  MGPreconDwc->setup();
  if(GridCmdOptionExists(argv, argv + argc, "--runchecks")) {
    RealD toleranceForMGChecks = (getPrecision<LatticeFermion>::value == 1) ? 1e-6 : 1e-13;
    MGPreconDwc->runChecks(toleranceForMGChecks);
  }
  std::vector<std::unique_ptr<OperatorFunction<LatticeFermion>>> solversDwc;
  solversDwc.emplace_back(new ConjugateGradient<LatticeFermion>(1.0e-12, 50000, false));
  solversDwc.emplace_back(new FlexibleGeneralisedMinimalResidual<LatticeFermion>(1.0e-12, 50000, TrivialPrecon, 100, false));
  solversDwc.emplace_back(new FlexibleGeneralisedMinimalResidual<LatticeFermion>(1.0e-12, 50000, *MGPreconDwc, 100, false));
  for(auto const &solver : solversDwc) {
    std::cout << std::endl << "Starting with a new solver" << std::endl;
    result = zero;
    (*solver)(MdagMOpDwc, src, result);
    std::cout << std::endl;
  }
  MGPreconDwc->reportTimings();
  Grid_finalize();
 }
@@ -0,0 +1,169 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./tests/solver/Test_wilsonclover_mg_mp.cc
    Copyright (C) 2015-2018
    Author: Daniel Richtmann <daniel.richtmann@ur.de>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 #include <Test_multigrid_common.h>
 using namespace std;
 using namespace Grid;
 using namespace Grid::QCD;
 int main(int argc, char **argv) {
  // Driver for the mixed-precision Wilson clover multigrid test: solves a
  // double-precision system with MixedPrecisionFlexibleGeneralisedMinimalResidual
  // preconditioned by a single-precision multigrid. With --docomparison it
  // additionally runs a double-precision multigrid and reports the differences.
  Grid_init(&argc, &argv);

  // clang-format off
  GridCartesian         *FGrid_d   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplexD::Nsimd()), GridDefaultMpi());
  GridCartesian         *FGrid_f   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
  GridRedBlackCartesian *FrbGrid_d = SpaceTimeGrid::makeFourDimRedBlackGrid(FGrid_d);
  GridRedBlackCartesian *FrbGrid_f = SpaceTimeGrid::makeFourDimRedBlackGrid(FGrid_f);
  // clang-format on

  // Fixed seeds; random fields are generated on the double-precision grid.
  std::vector<int> fSeeds({1, 2, 3, 4});
  GridParallelRNG  fPRNG(FGrid_d);
  fPRNG.SeedFixedIntegers(fSeeds);

  // The single-precision gauge field is a precision-changed copy of the
  // double-precision one, so both operators see the same configuration.
  // clang-format off
  LatticeFermionD       src_d(FGrid_d); gaussian(fPRNG, src_d);
  LatticeFermionD resultMGD_d(FGrid_d); resultMGD_d = zero;
  LatticeFermionD resultMGF_d(FGrid_d); resultMGF_d = zero;
  LatticeGaugeFieldD    Umu_d(FGrid_d); SU3::HotConfiguration(fPRNG, Umu_d);
  LatticeGaugeFieldF    Umu_f(FGrid_f); precisionChange(Umu_f, Umu_d);
  // clang-format on

  RealD mass  = -0.25;
  RealD csw_r = 1.0;
  RealD csw_t = 1.0;

  // Parameter file defaults to ./mg_params.xml unless --inputxml overrides it.
  MultiGridParams mgParams;
  std::string     inputXml{"./mg_params.xml"};
  if(GridCmdOptionExists(argv, argv + argc, "--inputxml")) {
    inputXml = GridCmdOptionPayload(argv, argv + argc, "--inputxml");
    assert(inputXml.length() != 0);
  }
  {
    // A template is written from the default-constructed parameters before the
    // user's file is read.
    XmlWriter writer("mg_params_template.xml");
    write(writer, "Params", mgParams);
    std::cout << GridLogMessage << "Written mg_params_template.xml" << std::endl;
    XmlReader reader(inputXml);
    read(reader, "Params", mgParams);
    std::cout << GridLogMessage << "Read in " << inputXml << std::endl;
  }
  checkParameterValidity(mgParams);
  std::cout << mgParams << std::endl;

  LevelInfo levelInfo_d(FGrid_d, mgParams);
  LevelInfo levelInfo_f(FGrid_f, mgParams);

  // Note: We do chiral doubling, so actually only nbasis/2 full basis vectors are used
  const int nbasis = 40;

  WilsonCloverFermionD Dwc_d(Umu_d, *FGrid_d, *FrbGrid_d, mass, csw_r, csw_t);
  WilsonCloverFermionF Dwc_f(Umu_f, *FGrid_f, *FrbGrid_f, mass, csw_r, csw_t);

  // Only the double-precision MdagM operator is handed to the outer solvers.
  MdagMLinearOperator<WilsonCloverFermionD, LatticeFermionD> MdagMOpDwc_d(Dwc_d);

  std::cout << GridLogMessage << "**************************************************" << std::endl;
  std::cout << GridLogMessage << "Testing single-precision Multigrid for Wilson Clover" << std::endl;
  std::cout << GridLogMessage << "**************************************************" << std::endl;

  auto MGPreconDwc_f = createMGInstance<vSpinColourVectorF, vTComplexF, nbasis, WilsonCloverFermionF>(mgParams, levelInfo_f, Dwc_f, Dwc_f);
  MGPreconDwc_f->setup();
  if(GridCmdOptionExists(argv, argv + argc, "--runchecks")) {
    MGPreconDwc_f->runChecks(1e-6);
  }

  MixedPrecisionFlexibleGeneralisedMinimalResidual<LatticeFermionD, LatticeFermionF> MPFGMRESPREC(
    1.0e-12, 50000, FGrid_f, *MGPreconDwc_f, 100, false);
  std::cout << std::endl << "Starting with a new solver" << std::endl;
  MPFGMRESPREC(MdagMOpDwc_d, src_d, resultMGF_d);
  MGPreconDwc_f->reportTimings();

  if(GridCmdOptionExists(argv, argv + argc, "--docomparison")) {
    std::cout << GridLogMessage << "**************************************************" << std::endl;
    std::cout << GridLogMessage << "Testing double-precision Multigrid for Wilson Clover" << std::endl;
    std::cout << GridLogMessage << "**************************************************" << std::endl;

    auto MGPreconDwc_d = createMGInstance<vSpinColourVectorD, vTComplexD, nbasis, WilsonCloverFermionD>(mgParams, levelInfo_d, Dwc_d, Dwc_d);
    MGPreconDwc_d->setup();
    if(GridCmdOptionExists(argv, argv + argc, "--runchecks")) {
      MGPreconDwc_d->runChecks(1e-13);
    }

    FlexibleGeneralisedMinimalResidual<LatticeFermionD> FGMRESPREC(1.0e-12, 50000, *MGPreconDwc_d, 100, false);
    std::cout << std::endl << "Starting with a new solver" << std::endl;
    FGMRESPREC(MdagMOpDwc_d, src_d, resultMGD_d);
    MGPreconDwc_d->reportTimings();

    std::cout << GridLogMessage << "**************************************************" << std::endl;
    std::cout << GridLogMessage << "Comparing single-precision Multigrid with double-precision one for Wilson Clover" << std::endl;
    std::cout << GridLogMessage << "**************************************************" << std::endl;

    // Difference between the two full solves.
    // NOTE(review): axpy_norm / norm2 presumably return squared norms, in which
    // case the reported numbers are squared quantities -- confirm.
    LatticeFermionD diffFullSolver(FGrid_d);
    RealD deviationFullSolver = axpy_norm(diffFullSolver, -1.0, resultMGF_d, resultMGD_d);

    // Difference between a single application of each preconditioner to the
    // same source.
    // clang-format off
    LatticeFermionF src_f(FGrid_f);    precisionChange(src_f, src_d);
    LatticeFermionF resMGF_f(FGrid_f); resMGF_f = zero;
    LatticeFermionD resMGD_d(FGrid_d); resMGD_d = zero;
    // clang-format on
    (*MGPreconDwc_f)(src_f, resMGF_f);
    (*MGPreconDwc_d)(src_d, resMGD_d);

    // Promote the single-precision result so both live on the double grid.
    LatticeFermionD diffOnlyMG(FGrid_d);
    LatticeFermionD resMGF_d(FGrid_d);
    precisionChange(resMGF_d, resMGF_f);
    RealD deviationOnlyPrec = axpy_norm(diffOnlyMG, -1.0, resMGF_d, resMGD_d);

    // Labels are padded to equal width so the four values line up.
    // clang-format off
    std::cout << GridLogMessage << "Absolute difference between FGMRES preconditioned by double and single precision MG:   " << deviationFullSolver                      << std::endl;
    std::cout << GridLogMessage << "Relative deviation  between FGMRES preconditioned by double and single precision MG:   " << deviationFullSolver / norm2(resultMGD_d) << std::endl;
    std::cout << GridLogMessage << "Absolute difference between one iteration of MG Prec in double and single precision:   " << deviationOnlyPrec                        << std::endl;
    std::cout << GridLogMessage << "Relative deviation  between one iteration of MG Prec in double and single precision:   " << deviationOnlyPrec / norm2(resMGD_d)      << std::endl;
    // clang-format on
  }

  Grid_finalize();
 }
@@ -0,0 +1,71 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./tests/solver/Test_wilsonclover_mr_unprec.cc
 Copyright (C) 2015-2018
 Author: Daniel Richtmann <daniel.richtmann@ur.de>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 using namespace Grid;
 using namespace Grid::QCD;
 int main (int argc, char ** argv)
 {
  // Smoke test: runs the MinimalResidual solver on the MdagM operator built
  // from a Wilson clover fermion matrix, with a random source and a hot SU(3)
  // gauge configuration. The solution is not inspected after the solve.
  Grid_init(&argc,&argv);

  // Lattice, SIMD and MPI layouts are taken from Grid's defaults.
  std::vector<int> latt_size   = GridDefaultLatt();
  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
  std::vector<int> mpi_layout  = GridDefaultMpi();
  GridCartesian               Grid(latt_size,simd_layout,mpi_layout);
  GridRedBlackCartesian     RBGrid(&Grid);

  // Fixed seeds so that every run draws the same source and gauge field.
  std::vector<int> seeds({1,2,3,4});
  GridParallelRNG          pRNG(&Grid);  pRNG.SeedFixedIntegers(seeds);

  typedef typename WilsonCloverFermionR::FermionField FermionField;
  typename WilsonCloverFermionR::ImplParams params;
  WilsonAnisotropyCoefficients anis;

  // The source is drawn before the gauge field; keep this order so the random
  // stream is consumed exactly as before.
  FermionField src(&Grid); random(pRNG,src);
  FermionField result(&Grid); result=zero;
  LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu);

  RealD mass  = 0.5;
  RealD csw_r = 1.0;
  RealD csw_t = 1.0;
  WilsonCloverFermionR Dwc(Umu,Grid,RBGrid,mass,csw_r,csw_t,anis,params);
  MdagMLinearOperator<WilsonCloverFermionR,FermionField> HermOp(Dwc);

  // NOTE(review): arguments are presumably (tolerance, max iterations,
  // over-relaxation parameter) -- confirm against MinimalResidual.h.
  MinimalResidual<FermionField> MR(1.0e-8,10000,0.8);
  MR(HermOp,src,result);

  Grid_finalize();
 }
		`@@ -0,0 +1,3 @@`
							`#include <Grid/GridCore.h>`

							// Definition of Grid::BinaryIO::latticeWriteMaxRetry, initialised to -1.
							// NOTE(review): -1 presumably means "no retry limit / retries disabled" -- confirm against BinaryIO.
							`int Grid::BinaryIO::latticeWriteMaxRetry = -1;`