Check in compact version of wilson clover fermions

2025-04-03 02:35:55 +01:00 · 2022-02-01 21:39:52 +01:00 · 2022-02-01 21:39:52 +01:00 · 3082ab8252
commit 3082ab8252
parent add86cd7f4
4 changed files with 1313 additions and 0 deletions
--- a/Grid/qcd/action/fermion/CompactWilsonCloverFermion.h
+++ b/Grid/qcd/action/fermion/CompactWilsonCloverFermion.h
@ -0,0 +1,494 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: ./lib/qcd/action/fermion/CompactWilsonCloverFermion.h
+
+    Copyright (C) 2020 - 2022
+
+    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
+    Author: Nils Meyer <nils.meyer@ur.de>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+/*  END LEGAL */
+
+#pragma once
+
+#include <Grid/qcd/action/fermion/WilsonCloverTypes.h>
+#include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
+
+NAMESPACE_BEGIN(Grid);
+
+// see Grid/qcd/action/fermion/WilsonCloverFermion.h for description
+//
+// Modifications done here:
+//
+// Original: clover term = 12x12 matrix per site
+//
+// But: Only two diagonal 6x6 hermitian blocks are non-zero (also true for original, verified by running)
+// Sufficient to store/transfer only the real parts of the diagonal and one triangular part
+// 2 * (6 + 15 * 2) = 72 real or 36 complex words to be stored/transfered
+//
+// Here: Above but diagonal as complex numbers, i.e., need to store/transfer
+// 2 * (6 * 2 + 15 * 2) = 84 real or 42 complex words
+//
+// Words per site and improvement compared to original (combined with the input and output spinors):
+//
+// - Original: 2*12 + 12*12 = 168 words -> 1.00 x less
+// - Minimal:  2*12 + 36    =  60 words -> 2.80 x less
+// - Here:     2*12 + 42    =  66 words -> 2.55 x less
+//
+// These improvements directly translate to wall-clock time
+//
+// Data layout:
+//
+// - diagonal and triangle part as separate lattice fields,
+//   this was faster than as 1 combined field on all tested machines
+// - diagonal: as expected
+// - triangle: store upper right triangle in row major order
+// - graphical:
+//        0  1  2  3  4
+//           5  6  7  8
+//              9 10 11 = upper right triangle indices
+//                12 13
+//                   14
+//     0
+//        1
+//           2
+//              3       = diagonal indices
+//                 4
+//                    5
+//     0
+//     1  5
+//     2  6  9          = lower left triangle indices
+//     3  7 10 12
+//     4  8 11 13 14
+//
+// Impact on total memory consumption:
+// - Original: (2 * 1 + 8 * 1/2) 12x12 matrices = 6 12x12 matrices = 864 complex words per site
+// - Here:     (2 * 1 + 4 * 1/2) diagonal parts = 4 diagonal parts =  24 complex words per site
+//           + (2 * 1 + 4 * 1/2) triangle parts = 4 triangle parts =  60 complex words per site
+//                                                                 =  84 complex words per site
+
+template<class Impl>
+class CompactWilsonCloverFermion : public WilsonFermion<Impl>,
+                                   public WilsonCloverHelpers<Impl>,
+                                   public CompactWilsonCloverHelpers<Impl> {
+  /////////////////////////////////////////////
+  // Sizes
+  /////////////////////////////////////////////
+
+public:
+
+  INHERIT_COMPACT_CLOVER_SIZES(Impl);
+
+  /////////////////////////////////////////////
+  // Type definitions
+  /////////////////////////////////////////////
+
+public:
+
+  INHERIT_IMPL_TYPES(Impl);
+  INHERIT_CLOVER_TYPES(Impl);
+  INHERIT_COMPACT_CLOVER_TYPES(Impl);
+
+  typedef WilsonFermion<Impl>              WilsonBase;
+  typedef WilsonCloverHelpers<Impl>        Helpers;
+  typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
+
+  /////////////////////////////////////////////
+  // Constructors
+  /////////////////////////////////////////////
+
+public:
+
+  CompactWilsonCloverFermion(GaugeField& _Umu,
+			    GridCartesian& Fgrid,
+			    GridRedBlackCartesian& Hgrid,
+			    const RealD _mass,
+			    const RealD _csw_r = 0.0,
+			    const RealD _csw_t = 0.0,
+			    const RealD _cF = 1.0,
+			    const WilsonAnisotropyCoefficients& clover_anisotropy = WilsonAnisotropyCoefficients(),
+			    const ImplParams& impl_p = ImplParams())
+    : WilsonBase(_Umu, Fgrid, Hgrid, _mass, impl_p, clover_anisotropy)
+    , csw_r(_csw_r)
+    , csw_t(_csw_t)
+    , cF(_cF)
+    , open_boundaries(impl_p.boundary_phases[Nd-1] == 0.0)
+    , Diagonal(&Fgrid),        Triangle(&Fgrid)
+    , DiagonalEven(&Hgrid),    TriangleEven(&Hgrid)
+    , DiagonalOdd(&Hgrid),     TriangleOdd(&Hgrid)
+    , DiagonalInv(&Fgrid),     TriangleInv(&Fgrid)
+    , DiagonalInvEven(&Hgrid), TriangleInvEven(&Hgrid)
+    , DiagonalInvOdd(&Hgrid),  TriangleInvOdd(&Hgrid)
+    , Tmp(&Fgrid)
+    , BoundaryMask(&Fgrid)
+    , BoundaryMaskEven(&Hgrid), BoundaryMaskOdd(&Hgrid)
+  {
+    csw_r *= 0.5;
+    csw_t *= 0.5;
+    if (clover_anisotropy.isAnisotropic)
+      csw_r /= clover_anisotropy.xi_0;
+
+    ImportGauge(_Umu);
+    if (open_boundaries)
+      CompactHelpers::SetupMasks(this->BoundaryMask, this->BoundaryMaskEven, this->BoundaryMaskOdd);
+  }
+
+  /////////////////////////////////////////////
+  // Member functions (implementing interface)
+  /////////////////////////////////////////////
+
+public:
+
+  virtual void Instantiatable() {};
+  int          ConstEE()     override { return 0; };
+  int          isTrivialEE() override { return 0; };
+
+
+  void Dhop(const FermionField& in, FermionField& out, int dag) override {
+    WilsonBase::Dhop(in, out, dag);
+    if(open_boundaries) ApplyBoundaryMask(out);
+  }
+
+  void DhopOE(const FermionField& in, FermionField& out, int dag) override {
+    WilsonBase::DhopOE(in, out, dag);
+    if(open_boundaries) ApplyBoundaryMask(out);
+  }
+
+  void DhopEO(const FermionField& in, FermionField& out, int dag) override {
+    WilsonBase::DhopEO(in, out, dag);
+    if(open_boundaries) ApplyBoundaryMask(out);
+  }
+
+  void DhopDir(const FermionField& in, FermionField& out, int dir, int disp) override {
+    WilsonBase::DhopDir(in, out, dir, disp);
+    if(this->open_boundaries) ApplyBoundaryMask(out);
+  }
+
+  void DhopDirAll(const FermionField& in, std::vector<FermionField>& out) /* override */ {
+    WilsonBase::DhopDirAll(in, out);
+    if(this->open_boundaries) {
+      for(auto& o : out) ApplyBoundaryMask(o);
+    }
+  }
+
+  void M(const FermionField& in, FermionField& out) override {
+    out.Checkerboard() = in.Checkerboard();
+    WilsonBase::Dhop(in, out, DaggerNo); // call base to save applying bc
+    Mooee(in, Tmp);
+    axpy(out, 1.0, out, Tmp);
+    if(open_boundaries) ApplyBoundaryMask(out);
+  }
+
+  void Mdag(const FermionField& in, FermionField& out) override {
+    out.Checkerboard() = in.Checkerboard();
+    WilsonBase::Dhop(in, out, DaggerYes);  // call base to save applying bc
+    MooeeDag(in, Tmp);
+    axpy(out, 1.0, out, Tmp);
+    if(open_boundaries) ApplyBoundaryMask(out);
+  }
+
+  void Meooe(const FermionField& in, FermionField& out) override {
+    WilsonBase::Meooe(in, out);
+    if(open_boundaries) ApplyBoundaryMask(out);
+  }
+
+  void MeooeDag(const FermionField& in, FermionField& out) override {
+    WilsonBase::MeooeDag(in, out);
+    if(open_boundaries) ApplyBoundaryMask(out);
+  }
+
+  void Mooee(const FermionField& in, FermionField& out) override {
+    if(in.Grid()->_isCheckerBoarded) {
+      if(in.Checkerboard() == Odd) {
+        MooeeInternal(in, out, DiagonalOdd, TriangleOdd);
+      } else {
+        MooeeInternal(in, out, DiagonalEven, TriangleEven);
+      }
+    } else {
+      MooeeInternal(in, out, Diagonal, Triangle);
+    }
+    if(open_boundaries) ApplyBoundaryMask(out);
+  }
+
+  void MooeeDag(const FermionField& in, FermionField& out) override {
+    Mooee(in, out); // blocks are hermitian
+  }
+
+  void MooeeInv(const FermionField& in, FermionField& out) override {
+    if(in.Grid()->_isCheckerBoarded) {
+      if(in.Checkerboard() == Odd) {
+        MooeeInternal(in, out, DiagonalInvOdd, TriangleInvOdd);
+      } else {
+        MooeeInternal(in, out, DiagonalInvEven, TriangleInvEven);
+      }
+    } else {
+      MooeeInternal(in, out, DiagonalInv, TriangleInv);
+    }
+    if(open_boundaries) ApplyBoundaryMask(out);
+  }
+
+  void MooeeInvDag(const FermionField& in, FermionField& out) override {
+    MooeeInv(in, out); // blocks are hermitian
+  }
+
+  void Mdir(const FermionField& in, FermionField& out, int dir, int disp) override {
+    DhopDir(in, out, dir, disp);
+  }
+
+  void MdirAll(const FermionField& in, std::vector<FermionField>& out) override {
+    DhopDirAll(in, out);
+  }
+
+  void MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) override {
+    assert(!open_boundaries); // TODO check for changes required for open bc
+
+    // NOTE: code copied from original clover term
+    conformable(X.Grid(), Y.Grid());
+    conformable(X.Grid(), force.Grid());
+    GaugeLinkField force_mu(force.Grid()), lambda(force.Grid());
+    GaugeField clover_force(force.Grid());
+    PropagatorField Lambda(force.Grid());
+
+    // Guido: Here we are hitting some performance issues:
+    // need to extract the components of the DoubledGaugeField
+    // for each call
+    // Possible solution
+    // Create a vector object to store them? (cons: wasting space)
+    std::vector<GaugeLinkField> U(Nd, this->Umu.Grid());
+
+    Impl::extractLinkField(U, this->Umu);
+
+    force = Zero();
+    // Derivative of the Wilson hopping term
+    this->DhopDeriv(force, X, Y, dag);
+
+    ///////////////////////////////////////////////////////////
+    // Clover term derivative
+    ///////////////////////////////////////////////////////////
+    Impl::outerProductImpl(Lambda, X, Y);
+    //std::cout << "Lambda:" << Lambda << std::endl;
+
+    Gamma::Algebra sigma[] = {
+        Gamma::Algebra::SigmaXY,
+        Gamma::Algebra::SigmaXZ,
+        Gamma::Algebra::SigmaXT,
+        Gamma::Algebra::MinusSigmaXY,
+        Gamma::Algebra::SigmaYZ,
+        Gamma::Algebra::SigmaYT,
+        Gamma::Algebra::MinusSigmaXZ,
+        Gamma::Algebra::MinusSigmaYZ,
+        Gamma::Algebra::SigmaZT,
+        Gamma::Algebra::MinusSigmaXT,
+        Gamma::Algebra::MinusSigmaYT,
+        Gamma::Algebra::MinusSigmaZT};
+
+    /*
+      sigma_{\mu \nu}=
+      | 0         sigma[0]  sigma[1]  sigma[2] |
+      | sigma[3]    0       sigma[4]  sigma[5] |
+      | sigma[6]  sigma[7]     0      sigma[8] |
+      | sigma[9]  sigma[10] sigma[11]   0      |
+    */
+
+    int count = 0;
+    clover_force = Zero();
+    for (int mu = 0; mu < 4; mu++)
+    {
+      force_mu = Zero();
+      for (int nu = 0; nu < 4; nu++)
+      {
+        if (mu == nu)
+          continue;
+
+        RealD factor;
+        if (nu == 4 || mu == 4)
+        {
+          factor = 2.0 * csw_t;
+        }
+        else
+        {
+          factor = 2.0 * csw_r;
+        }
+        PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
+        Impl::TraceSpinImpl(lambda, Slambda);                   // traceSpin ok
+        force_mu -= factor*Helpers::Cmunu(U, lambda, mu, nu);   // checked
+        count++;
+      }
+
+      pokeLorentz(clover_force, U[mu] * force_mu, mu);
+    }
+    //clover_force *= csw;
+    force += clover_force;
+  }
+
+  void MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override {
+    assert(0);
+  }
+
+  void MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override {
+    assert(0);
+  }
+
+  /////////////////////////////////////////////
+  // Member functions (internals)
+  /////////////////////////////////////////////
+
+  void MooeeInternal(const FermionField&        in,
+                     FermionField&              out,
+                     const CloverDiagonalField& diagonal,
+                     const CloverTriangleField& triangle) {
+    assert(in.Checkerboard() == Odd || in.Checkerboard() == Even);
+    out.Checkerboard() = in.Checkerboard();
+    conformable(in, out);
+    conformable(in, diagonal);
+    conformable(in, triangle);
+
+    CompactHelpers::MooeeKernel(diagonal.oSites(), 1, in, out, diagonal, triangle);
+  }
+
+  /////////////////////////////////////////////
+  // Helpers
+  /////////////////////////////////////////////
+
+  void ImportGauge(const GaugeField& _Umu) override {
+    // NOTE: parts copied from original implementation
+
+    // Import gauge into base class
+    double t0 = usecond();
+    WilsonBase::ImportGauge(_Umu); // NOTE: called here and in wilson constructor -> performed twice, but can't avoid that
+
+    // Initialize temporary variables
+    double t1 = usecond();
+    conformable(_Umu.Grid(), this->GaugeGrid());
+    GridBase* grid = _Umu.Grid();
+    typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);
+    CloverField TmpOriginal(grid);
+
+    // Compute the field strength terms mu>nu
+    double t2 = usecond();
+    WilsonLoops<Impl>::FieldStrength(Bx, _Umu, Zdir, Ydir);
+    WilsonLoops<Impl>::FieldStrength(By, _Umu, Zdir, Xdir);
+    WilsonLoops<Impl>::FieldStrength(Bz, _Umu, Ydir, Xdir);
+    WilsonLoops<Impl>::FieldStrength(Ex, _Umu, Tdir, Xdir);
+    WilsonLoops<Impl>::FieldStrength(Ey, _Umu, Tdir, Ydir);
+    WilsonLoops<Impl>::FieldStrength(Ez, _Umu, Tdir, Zdir);
+
+    // Compute the Clover Operator acting on Colour and Spin
+    // multiply here by the clover coefficients for the anisotropy
+    double t3 = usecond();
+    TmpOriginal  = Helpers::fillCloverYZ(Bx) * csw_r;
+    TmpOriginal += Helpers::fillCloverXZ(By) * csw_r;
+    TmpOriginal += Helpers::fillCloverXY(Bz) * csw_r;
+    TmpOriginal += Helpers::fillCloverXT(Ex) * csw_t;
+    TmpOriginal += Helpers::fillCloverYT(Ey) * csw_t;
+    TmpOriginal += Helpers::fillCloverZT(Ez) * csw_t;
+    TmpOriginal += this->diag_mass;
+
+    // Convert the data layout of the clover term
+    double t4 = usecond();
+    CompactHelpers::ConvertLayout(TmpOriginal, Diagonal, Triangle);
+
+    // Possible modify the boundary values
+    double t5 = usecond();
+    if(open_boundaries) CompactHelpers::ModifyBoundaries(Diagonal, Triangle, csw_t, cF, this->diag_mass);
+
+    // Invert the clover term in the improved layout
+    double t6 = usecond();
+    CompactHelpers::Invert(Diagonal, Triangle, DiagonalInv, TriangleInv);
+
+    // Fill the remaining clover fields
+    double t7 = usecond();
+    pickCheckerboard(Even, DiagonalEven,    Diagonal);
+    pickCheckerboard(Even, TriangleEven,    Triangle);
+    pickCheckerboard(Odd,  DiagonalOdd,     Diagonal);
+    pickCheckerboard(Odd,  TriangleOdd,     Triangle);
+    pickCheckerboard(Even, DiagonalInvEven, DiagonalInv);
+    pickCheckerboard(Even, TriangleInvEven, TriangleInv);
+    pickCheckerboard(Odd,  DiagonalInvOdd,  DiagonalInv);
+    pickCheckerboard(Odd,  TriangleInvOdd,  TriangleInv);
+
+    // Report timings
+    double t8 = usecond();
+#if 0
+    std::cout << GridLogMessage << "CompactWilsonCloverFermion::ImportGauge timings:"
+              << " WilsonFermion::Importgauge = " << (t1 - t0) / 1e6
+              << ", allocations = "               << (t2 - t1) / 1e6
+              << ", field strength = "            << (t3 - t2) / 1e6
+              << ", fill clover = "               << (t4 - t3) / 1e6
+              << ", convert = "                   << (t5 - t4) / 1e6
+              << ", boundaries = "                << (t6 - t5) / 1e6
+              << ", inversions = "                << (t7 - t6) / 1e6
+              << ", pick cbs = "                  << (t8 - t7) / 1e6
+              << ", total = "                     << (t8 - t0) / 1e6
+              << std::endl;
+#endif
+  }
+
+  /////////////////////////////////////////////
+  // Helpers
+  /////////////////////////////////////////////
+
+private:
+
+  template<class Field>
+  const MaskField* getCorrectMaskField(const Field &in) const {
+    if(in.Grid()->_isCheckerBoarded) {
+      if(in.Checkerboard() == Odd) {
+        return &this->BoundaryMaskOdd;
+      } else {
+        return &this->BoundaryMaskEven;
+      }
+    } else {
+      return &this->BoundaryMask;
+    }
+  }
+
+  template<class Field>
+  void ApplyBoundaryMask(Field& f) {
+    const MaskField* m = getCorrectMaskField(f); assert(m != nullptr);
+    assert(m != nullptr);
+    CompactHelpers::ApplyBoundaryMask(f, *m);
+  }
+
+  /////////////////////////////////////////////
+  // Member Data
+  /////////////////////////////////////////////
+
+public:
+
+  RealD csw_r;
+  RealD csw_t;
+  RealD cF;
+
+  bool open_boundaries;
+
+  CloverDiagonalField Diagonal,    DiagonalEven,    DiagonalOdd;
+  CloverDiagonalField DiagonalInv, DiagonalInvEven, DiagonalInvOdd;
+
+  CloverTriangleField Triangle,    TriangleEven,    TriangleOdd;
+  CloverTriangleField TriangleInv, TriangleInvEven, TriangleInvOdd;
+
+  FermionField Tmp;
+
+  MaskField BoundaryMask, BoundaryMaskEven, BoundaryMaskOdd;
+};
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/WilsonCloverHelpers.h
+++ b/Grid/qcd/action/fermion/WilsonCloverHelpers.h
@ -208,4 +208,554 @@ public:
  }
 };

+
+template<class Impl> class CompactWilsonCloverHelpers {
+public:
+
+  INHERIT_COMPACT_CLOVER_SIZES(Impl);
+
+  INHERIT_IMPL_TYPES(Impl);
+  INHERIT_CLOVER_TYPES(Impl);
+  INHERIT_COMPACT_CLOVER_TYPES(Impl);
+
+  #if 0
+  static accelerator_inline typename SiteCloverTriangle::vector_type triangle_elem(const SiteCloverTriangle& triangle, int block, int i, int j) {
+    assert(i != j);
+    if(i < j) {
+      return triangle()(block)(triangle_index(i, j));
+    } else { // i > j
+      return conjugate(triangle()(block)(triangle_index(i, j)));
+    }
+  }
+  #else
+  template<typename vobj>
+  static accelerator_inline vobj triangle_elem(const iImplCloverTriangle<vobj>& triangle, int block, int i, int j) {
+    assert(i != j);
+    if(i < j) {
+      return triangle()(block)(triangle_index(i, j));
+    } else { // i > j
+      return conjugate(triangle()(block)(triangle_index(i, j)));
+    }
+  }
+  #endif
+
+  static accelerator_inline int triangle_index(int i, int j) {
+    if(i == j)
+      return 0;
+    else if(i < j)
+      return Nred * (Nred - 1) / 2 - (Nred - i) * (Nred - i - 1) / 2 + j - i - 1;
+    else // i > j
+      return Nred * (Nred - 1) / 2 - (Nred - j) * (Nred - j - 1) / 2 + i - j - 1;
+  }
+
+  static void MooeeKernel_gpu(int                        Nsite,
+                              int                        Ls,
+                              const FermionField&        in,
+                              FermionField&              out,
+                              const CloverDiagonalField& diagonal,
+                              const CloverTriangleField& triangle) {
+    autoView(diagonal_v, diagonal, AcceleratorRead);
+    autoView(triangle_v, triangle, AcceleratorRead);
+    autoView(in_v,       in,       AcceleratorRead);
+    autoView(out_v,      out,      AcceleratorWrite);
+
+    typedef decltype(coalescedRead(out_v[0])) CalcSpinor;
+
+    const uint64_t NN = Nsite * Ls;
+
+    accelerator_for(ss, NN, Simd::Nsimd(), {
+      int sF = ss;
+      int sU = ss/Ls;
+      CalcSpinor res;
+      CalcSpinor in_t = in_v(sF);
+      auto diagonal_t = diagonal_v(sU);
+      auto triangle_t = triangle_v(sU);
+      for(int block=0; block<Nhs; block++) {
+        int s_start = block*Nhs;
+        for(int i=0; i<Nred; i++) {
+          int si = s_start + i/Nc, ci = i%Nc;
+          res()(si)(ci) = diagonal_t()(block)(i) * in_t()(si)(ci);
+          for(int j=0; j<Nred; j++) {
+            if (j == i) continue;
+            int sj = s_start + j/Nc, cj = j%Nc;
+            res()(si)(ci) = res()(si)(ci) + triangle_elem(triangle_t, block, i, j) * in_t()(sj)(cj);
+          };
+        };
+      };
+      coalescedWrite(out_v[sF], res);
+    });
+  }
+
+  static void MooeeKernel_cpu(int                        Nsite,
+                              int                        Ls,
+                              const FermionField&        in,
+                              FermionField&              out,
+                              const CloverDiagonalField& diagonal,
+                              const CloverTriangleField& triangle) {
+    autoView(diagonal_v, diagonal, CpuRead);
+    autoView(triangle_v, triangle, CpuRead);
+    autoView(in_v,       in,       CpuRead);
+    autoView(out_v,      out,      CpuWrite);
+
+    typedef SiteSpinor CalcSpinor;
+
+#if defined(A64FX) || defined(A64FXFIXEDSIZE)
+#define PREFETCH_CLOVER(BASE) {                                     \
+    uint64_t base;                                                  \
+    int pf_dist_L1 = 1;                                             \
+    int pf_dist_L2 = -5; /* -> penalty -> disable */                \
+                                                                    \
+    if ((pf_dist_L1 >= 0) && (sU + pf_dist_L1 < Nsite)) {           \
+      base = (uint64_t)&diag_t()(pf_dist_L1+BASE)(0);               \
+      svprfd(svptrue_b64(), (int64_t*)(base +    0), SV_PLDL1STRM); \
+      svprfd(svptrue_b64(), (int64_t*)(base +  256), SV_PLDL1STRM); \
+      svprfd(svptrue_b64(), (int64_t*)(base +  512), SV_PLDL1STRM); \
+      svprfd(svptrue_b64(), (int64_t*)(base +  768), SV_PLDL1STRM); \
+      svprfd(svptrue_b64(), (int64_t*)(base + 1024), SV_PLDL1STRM); \
+      svprfd(svptrue_b64(), (int64_t*)(base + 1280), SV_PLDL1STRM); \
+    }                                                               \
+                                                                    \
+    if ((pf_dist_L2 >= 0) && (sU + pf_dist_L2 < Nsite)) {           \
+      base = (uint64_t)&diag_t()(pf_dist_L2+BASE)(0);               \
+      svprfd(svptrue_b64(), (int64_t*)(base +    0), SV_PLDL2STRM); \
+      svprfd(svptrue_b64(), (int64_t*)(base +  256), SV_PLDL2STRM); \
+      svprfd(svptrue_b64(), (int64_t*)(base +  512), SV_PLDL2STRM); \
+      svprfd(svptrue_b64(), (int64_t*)(base +  768), SV_PLDL2STRM); \
+      svprfd(svptrue_b64(), (int64_t*)(base + 1024), SV_PLDL2STRM); \
+      svprfd(svptrue_b64(), (int64_t*)(base + 1280), SV_PLDL2STRM); \
+    }                                                               \
+  }
+// TODO: Implement/generalize this for other architectures
+// I played around a bit on KNL (see below) but didn't bring anything
+// #elif defined(AVX512)
+// #define PREFETCH_CLOVER(BASE) {                              \
+//     uint64_t base;                                           \
+//     int pf_dist_L1 = 1;                                      \
+//     int pf_dist_L2 = +4;                                     \
+//                                                              \
+//     if ((pf_dist_L1 >= 0) && (sU + pf_dist_L1 < Nsite)) {    \
+//       base = (uint64_t)&diag_t()(pf_dist_L1+BASE)(0);        \
+//       _mm_prefetch((const char*)(base +    0), _MM_HINT_T0); \
+//       _mm_prefetch((const char*)(base +   64), _MM_HINT_T0); \
+//       _mm_prefetch((const char*)(base +  128), _MM_HINT_T0); \
+//       _mm_prefetch((const char*)(base +  192), _MM_HINT_T0); \
+//       _mm_prefetch((const char*)(base +  256), _MM_HINT_T0); \
+//       _mm_prefetch((const char*)(base +  320), _MM_HINT_T0); \
+//     }                                                        \
+//                                                              \
+//     if ((pf_dist_L2 >= 0) && (sU + pf_dist_L2 < Nsite)) {    \
+//       base = (uint64_t)&diag_t()(pf_dist_L2+BASE)(0);        \
+//       _mm_prefetch((const char*)(base +    0), _MM_HINT_T1); \
+//       _mm_prefetch((const char*)(base +   64), _MM_HINT_T1); \
+//       _mm_prefetch((const char*)(base +  128), _MM_HINT_T1); \
+//       _mm_prefetch((const char*)(base +  192), _MM_HINT_T1); \
+//       _mm_prefetch((const char*)(base +  256), _MM_HINT_T1); \
+//       _mm_prefetch((const char*)(base +  320), _MM_HINT_T1); \
+//     }                                                        \
+//   }
+#else
+#define PREFETCH_CLOVER(BASE)
+#endif
+
+    const uint64_t NN = Nsite * Ls;
+
+    thread_for(ss, NN, {
+      int sF = ss;
+      int sU = ss/Ls;
+      CalcSpinor res;
+      CalcSpinor in_t = in_v[sF];
+      auto diag_t     = diagonal_v[sU]; // "diag" instead of "diagonal" here to make code below easier to read
+      auto triangle_t = triangle_v[sU];
+
+      // upper half
+      PREFETCH_CLOVER(0);
+
+      auto in_cc_0_0 = conjugate(in_t()(0)(0)); // Nils: reduces number
+      auto in_cc_0_1 = conjugate(in_t()(0)(1)); // of conjugates from
+      auto in_cc_0_2 = conjugate(in_t()(0)(2)); // 30 to 20
+      auto in_cc_1_0 = conjugate(in_t()(1)(0));
+      auto in_cc_1_1 = conjugate(in_t()(1)(1));
+
+      res()(0)(0) =               diag_t()(0)( 0) * in_t()(0)(0)
+                  +           triangle_t()(0)( 0) * in_t()(0)(1)
+                  +           triangle_t()(0)( 1) * in_t()(0)(2)
+                  +           triangle_t()(0)( 2) * in_t()(1)(0)
+                  +           triangle_t()(0)( 3) * in_t()(1)(1)
+                  +           triangle_t()(0)( 4) * in_t()(1)(2);
+
+      res()(0)(1) =           triangle_t()(0)( 0) * in_cc_0_0;
+      res()(0)(1) =               diag_t()(0)( 1) * in_t()(0)(1)
+                  +           triangle_t()(0)( 5) * in_t()(0)(2)
+                  +           triangle_t()(0)( 6) * in_t()(1)(0)
+                  +           triangle_t()(0)( 7) * in_t()(1)(1)
+                  +           triangle_t()(0)( 8) * in_t()(1)(2)
+                  + conjugate(       res()(0)( 1));
+
+      res()(0)(2) =           triangle_t()(0)( 1) * in_cc_0_0
+                  +           triangle_t()(0)( 5) * in_cc_0_1;
+      res()(0)(2) =               diag_t()(0)( 2) * in_t()(0)(2)
+                  +           triangle_t()(0)( 9) * in_t()(1)(0)
+                  +           triangle_t()(0)(10) * in_t()(1)(1)
+                  +           triangle_t()(0)(11) * in_t()(1)(2)
+                  + conjugate(       res()(0)( 2));
+
+      res()(1)(0) =           triangle_t()(0)( 2) * in_cc_0_0
+                  +           triangle_t()(0)( 6) * in_cc_0_1
+                  +           triangle_t()(0)( 9) * in_cc_0_2;
+      res()(1)(0) =               diag_t()(0)( 3) * in_t()(1)(0)
+                  +           triangle_t()(0)(12) * in_t()(1)(1)
+                  +           triangle_t()(0)(13) * in_t()(1)(2)
+                  + conjugate(       res()(1)( 0));
+
+      res()(1)(1) =           triangle_t()(0)( 3) * in_cc_0_0
+                  +           triangle_t()(0)( 7) * in_cc_0_1
+                  +           triangle_t()(0)(10) * in_cc_0_2
+                  +           triangle_t()(0)(12) * in_cc_1_0;
+      res()(1)(1) =               diag_t()(0)( 4) * in_t()(1)(1)
+                  +           triangle_t()(0)(14) * in_t()(1)(2)
+                  + conjugate(       res()(1)( 1));
+
+      res()(1)(2) =           triangle_t()(0)( 4) * in_cc_0_0
+                  +           triangle_t()(0)( 8) * in_cc_0_1
+                  +           triangle_t()(0)(11) * in_cc_0_2
+                  +           triangle_t()(0)(13) * in_cc_1_0
+                  +           triangle_t()(0)(14) * in_cc_1_1;
+      res()(1)(2) =               diag_t()(0)( 5) * in_t()(1)(2)
+                  + conjugate(       res()(1)( 2));
+
+      vstream(out_v[sF]()(0)(0), res()(0)(0));
+      vstream(out_v[sF]()(0)(1), res()(0)(1));
+      vstream(out_v[sF]()(0)(2), res()(0)(2));
+      vstream(out_v[sF]()(1)(0), res()(1)(0));
+      vstream(out_v[sF]()(1)(1), res()(1)(1));
+      vstream(out_v[sF]()(1)(2), res()(1)(2));
+
+      // lower half
+      PREFETCH_CLOVER(1);
+
+      auto in_cc_2_0 = conjugate(in_t()(2)(0));
+      auto in_cc_2_1 = conjugate(in_t()(2)(1));
+      auto in_cc_2_2 = conjugate(in_t()(2)(2));
+      auto in_cc_3_0 = conjugate(in_t()(3)(0));
+      auto in_cc_3_1 = conjugate(in_t()(3)(1));
+
+      res()(2)(0) =               diag_t()(1)( 0) * in_t()(2)(0)
+                  +           triangle_t()(1)( 0) * in_t()(2)(1)
+                  +           triangle_t()(1)( 1) * in_t()(2)(2)
+                  +           triangle_t()(1)( 2) * in_t()(3)(0)
+                  +           triangle_t()(1)( 3) * in_t()(3)(1)
+                  +           triangle_t()(1)( 4) * in_t()(3)(2);
+
+      res()(2)(1) =           triangle_t()(1)( 0) * in_cc_2_0;
+      res()(2)(1) =               diag_t()(1)( 1) * in_t()(2)(1)
+                  +           triangle_t()(1)( 5) * in_t()(2)(2)
+                  +           triangle_t()(1)( 6) * in_t()(3)(0)
+                  +           triangle_t()(1)( 7) * in_t()(3)(1)
+                  +           triangle_t()(1)( 8) * in_t()(3)(2)
+                  + conjugate(       res()(2)( 1));
+
+      res()(2)(2) =           triangle_t()(1)( 1) * in_cc_2_0
+                  +           triangle_t()(1)( 5) * in_cc_2_1;
+      res()(2)(2) =               diag_t()(1)( 2) * in_t()(2)(2)
+                  +           triangle_t()(1)( 9) * in_t()(3)(0)
+                  +           triangle_t()(1)(10) * in_t()(3)(1)
+                  +           triangle_t()(1)(11) * in_t()(3)(2)
+                  + conjugate(       res()(2)( 2));
+
+      res()(3)(0) =           triangle_t()(1)( 2) * in_cc_2_0
+                  +           triangle_t()(1)( 6) * in_cc_2_1
+                  +           triangle_t()(1)( 9) * in_cc_2_2;
+      res()(3)(0) =               diag_t()(1)( 3) * in_t()(3)(0)
+                  +           triangle_t()(1)(12) * in_t()(3)(1)
+                  +           triangle_t()(1)(13) * in_t()(3)(2)
+                  + conjugate(       res()(3)( 0));
+
+      res()(3)(1) =           triangle_t()(1)( 3) * in_cc_2_0
+                  +           triangle_t()(1)( 7) * in_cc_2_1
+                  +           triangle_t()(1)(10) * in_cc_2_2
+                  +           triangle_t()(1)(12) * in_cc_3_0;
+      res()(3)(1) =               diag_t()(1)( 4) * in_t()(3)(1)
+                  +           triangle_t()(1)(14) * in_t()(3)(2)
+                  + conjugate(       res()(3)( 1));
+
+      res()(3)(2) =           triangle_t()(1)( 4) * in_cc_2_0
+                  +           triangle_t()(1)( 8) * in_cc_2_1
+                  +           triangle_t()(1)(11) * in_cc_2_2
+                  +           triangle_t()(1)(13) * in_cc_3_0
+                  +           triangle_t()(1)(14) * in_cc_3_1;
+      res()(3)(2) =               diag_t()(1)( 5) * in_t()(3)(2)
+                  + conjugate(       res()(3)( 2));
+
+      vstream(out_v[sF]()(2)(0), res()(2)(0));
+      vstream(out_v[sF]()(2)(1), res()(2)(1));
+      vstream(out_v[sF]()(2)(2), res()(2)(2));
+      vstream(out_v[sF]()(3)(0), res()(3)(0));
+      vstream(out_v[sF]()(3)(1), res()(3)(1));
+      vstream(out_v[sF]()(3)(2), res()(3)(2));
+    });
+  }
+
+  static void MooeeKernel(int                        Nsite,
+                          int                        Ls,
+                          const FermionField&        in,
+                          FermionField&              out,
+                          const CloverDiagonalField& diagonal,
+                          const CloverTriangleField& triangle) {
+#if defined(GRID_CUDA) || defined(GRID_HIP)
+    MooeeKernel_gpu(Nsite, Ls, in, out, diagonal, triangle);
+#else
+    MooeeKernel_cpu(Nsite, Ls, in, out, diagonal, triangle);
+#endif
+  }
+
+  static void Invert(const CloverDiagonalField& diagonal,
+                     const CloverTriangleField& triangle,
+                     CloverDiagonalField&       diagonalInv,
+                     CloverTriangleField&       triangleInv) {
+    conformable(diagonal, diagonalInv);
+    conformable(triangle, triangleInv);
+    conformable(diagonal, triangle);
+
+    diagonalInv.Checkerboard() = diagonal.Checkerboard();
+    triangleInv.Checkerboard() = triangle.Checkerboard();
+
+    GridBase* grid = diagonal.Grid();
+
+    long lsites = grid->lSites();
+
+    typedef typename SiteCloverDiagonal::scalar_object scalar_object_diagonal;
+    typedef typename SiteCloverTriangle::scalar_object scalar_object_triangle;
+
+    autoView(diagonal_v,  diagonal,  CpuRead);
+    autoView(triangle_v,  triangle,  CpuRead);
+    autoView(diagonalInv_v, diagonalInv, CpuWrite);
+    autoView(triangleInv_v, triangleInv, CpuWrite);
+
+    thread_for(site, lsites, { // NOTE: Not on GPU because of Eigen & (peek/poke)LocalSite
+      Eigen::MatrixXcd clover_inv_eigen = Eigen::MatrixXcd::Zero(Ns*Nc, Ns*Nc);
+      Eigen::MatrixXcd clover_eigen = Eigen::MatrixXcd::Zero(Ns*Nc, Ns*Nc);
+
+      scalar_object_diagonal diagonal_tmp     = Zero();
+      scalar_object_diagonal diagonal_inv_tmp = Zero();
+      scalar_object_triangle triangle_tmp     = Zero();
+      scalar_object_triangle triangle_inv_tmp = Zero();
+
+      Coordinate lcoor;
+      grid->LocalIndexToLocalCoor(site, lcoor);
+
+      peekLocalSite(diagonal_tmp, diagonal_v, lcoor);
+      peekLocalSite(triangle_tmp, triangle_v, lcoor);
+
+      // TODO: can we save time here by inverting the two 6x6 hermitian matrices separately?
+      for (long s_row=0;s_row<Ns;s_row++) {
+        for (long s_col=0;s_col<Ns;s_col++) {
+          if(abs(s_row - s_col) > 1 || s_row + s_col == 3) continue;
+          int block       = s_row / Nhs;
+          int s_row_block = s_row % Nhs;
+          int s_col_block = s_col % Nhs;
+          for (long c_row=0;c_row<Nc;c_row++) {
+            for (long c_col=0;c_col<Nc;c_col++) {
+              int i = s_row_block * Nc + c_row;
+              int j = s_col_block * Nc + c_col;
+              if(i == j)
+                clover_eigen(s_row*Nc+c_row, s_col*Nc+c_col) = static_cast<ComplexD>(TensorRemove(diagonal_tmp()(block)(i)));
+              else
+                clover_eigen(s_row*Nc+c_row, s_col*Nc+c_col) = static_cast<ComplexD>(TensorRemove(triangle_elem(triangle_tmp, block, i, j)));
+            }
+          }
+        }
+      }
+
+      clover_inv_eigen = clover_eigen.inverse();
+
+      for (long s_row=0;s_row<Ns;s_row++) {
+        for (long s_col=0;s_col<Ns;s_col++) {
+          if(abs(s_row - s_col) > 1 || s_row + s_col == 3) continue;
+          int block       = s_row / Nhs;
+          int s_row_block = s_row % Nhs;
+          int s_col_block = s_col % Nhs;
+          for (long c_row=0;c_row<Nc;c_row++) {
+            for (long c_col=0;c_col<Nc;c_col++) {
+              int i = s_row_block * Nc + c_row;
+              int j = s_col_block * Nc + c_col;
+              if(i == j)
+                diagonal_inv_tmp()(block)(i) = clover_inv_eigen(s_row*Nc+c_row, s_col*Nc+c_col);
+              else if(i < j)
+                triangle_inv_tmp()(block)(triangle_index(i, j)) = clover_inv_eigen(s_row*Nc+c_row, s_col*Nc+c_col);
+              else
+                continue;
+            }
+          }
+        }
+      }
+
+      pokeLocalSite(diagonal_inv_tmp, diagonalInv_v, lcoor);
+      pokeLocalSite(triangle_inv_tmp, triangleInv_v, lcoor);
+    });
+  }
+
+  static void ConvertLayout(const CloverField&   full,
+                            CloverDiagonalField& diagonal,
+                            CloverTriangleField& triangle) {
+    conformable(full, diagonal);
+    conformable(full, triangle);
+
+    diagonal.Checkerboard() = full.Checkerboard();
+    triangle.Checkerboard() = full.Checkerboard();
+
+    autoView(full_v,     full,     AcceleratorRead);
+    autoView(diagonal_v, diagonal, AcceleratorWrite);
+    autoView(triangle_v, triangle, AcceleratorWrite);
+
+    // NOTE: this function cannot be 'private' since nvcc forbids this for kernels
+    accelerator_for(ss, full.Grid()->oSites(), 1, {
+      for(int s_row = 0; s_row < Ns; s_row++) {
+        for(int s_col = 0; s_col < Ns; s_col++) {
+          if(abs(s_row - s_col) > 1 || s_row + s_col == 3) continue;
+          int block       = s_row / Nhs;
+          int s_row_block = s_row % Nhs;
+          int s_col_block = s_col % Nhs;
+          for(int c_row = 0; c_row < Nc; c_row++) {
+            for(int c_col = 0; c_col < Nc; c_col++) {
+              int i = s_row_block * Nc + c_row;
+              int j = s_col_block * Nc + c_col;
+              if(i == j)
+                diagonal_v[ss]()(block)(i) = full_v[ss]()(s_row, s_col)(c_row, c_col);
+              else if(i < j)
+                triangle_v[ss]()(block)(triangle_index(i, j)) = full_v[ss]()(s_row, s_col)(c_row, c_col);
+              else
+                continue;
+            }
+          }
+        }
+      }
+    });
+  }
+
+
+  static void ConvertLayout(const CloverDiagonalField& diagonal,
+                            const CloverTriangleField& triangle,
+                            CloverField&               full) {
+    conformable(full, diagonal);
+    conformable(full, triangle);
+
+    full.Checkerboard() = diagonal.Checkerboard();
+
+    full = Zero();
+
+    autoView(diagonal_v, diagonal, AcceleratorRead);
+    autoView(triangle_v, triangle, AcceleratorRead);
+    autoView(full_v,     full,     AcceleratorWrite);
+
+    // NOTE: this function cannot be 'private' since nvcc forbids this for kernels
+    accelerator_for(ss, full.Grid()->oSites(), 1, {
+      for(int s_row = 0; s_row < Ns; s_row++) {
+        for(int s_col = 0; s_col < Ns; s_col++) {
+          if(abs(s_row - s_col) > 1 || s_row + s_col == 3) continue;
+          int block       = s_row / Nhs;
+          int s_row_block = s_row % Nhs;
+          int s_col_block = s_col % Nhs;
+          for(int c_row = 0; c_row < Nc; c_row++) {
+            for(int c_col = 0; c_col < Nc; c_col++) {
+              int i = s_row_block * Nc + c_row;
+              int j = s_col_block * Nc + c_col;
+              if(i == j)
+                full_v[ss]()(s_row, s_col)(c_row, c_col) = diagonal_v[ss]()(block)(i);
+              else
+                full_v[ss]()(s_row, s_col)(c_row, c_col) = triangle_elem(triangle_v[ss], block, i, j);
+            }
+          }
+        }
+      }
+    });
+  }
+
+  static void ModifyBoundaries(CloverDiagonalField& diagonal, CloverTriangleField& triangle, RealD csw_t, RealD cF, RealD diag_mass) {
+    // Checks/grid
+    double t0 = usecond();
+    conformable(diagonal, triangle);
+    GridBase* grid = diagonal.Grid();
+
+    // Determine the boundary coordinates/sites
+    double t1 = usecond();
+    int t_dir = Nd - 1;
+    Lattice<iScalar<vInteger>> t_coor(grid);
+    LatticeCoordinate(t_coor, t_dir);
+    int T = grid->GlobalDimensions()[t_dir];
+
+    // Set off-diagonal parts at boundary to zero -- OK
+    double t2 = usecond();
+    CloverTriangleField zeroTriangle(grid);
+    zeroTriangle.Checkerboard() = triangle.Checkerboard();
+    zeroTriangle = Zero();
+    triangle = where(t_coor == 0,   zeroTriangle, triangle);
+    triangle = where(t_coor == T-1, zeroTriangle, triangle);
+
+    // Set diagonal to unity (scaled correctly) -- OK
+    double t3 = usecond();
+    CloverDiagonalField tmp(grid);
+    tmp.Checkerboard() = diagonal.Checkerboard();
+    tmp                = -1.0 * csw_t + diag_mass;
+    diagonal           = where(t_coor == 0,   tmp, diagonal);
+    diagonal           = where(t_coor == T-1, tmp, diagonal);
+
+    // Correct values next to boundary
+    double t4 = usecond();
+    if(cF != 1.0) {
+      tmp = cF - 1.0;
+      tmp += diagonal;
+      diagonal = where(t_coor == 1,   tmp, diagonal);
+      diagonal = where(t_coor == T-2, tmp, diagonal);
+    }
+
+    // Report timings
+    double t5 = usecond();
+#if 0
+    std::cout << GridLogMessage << "CompactWilsonCloverHelpers::ModifyBoundaries timings:"
+              << " checks = "          << (t1 - t0) / 1e6
+              << ", coordinate = "     << (t2 - t1) / 1e6
+              << ", off-diag zero = "  << (t3 - t2) / 1e6
+              << ", diagonal unity = " << (t4 - t3) / 1e6
+              << ", near-boundary = "  << (t5 - t4) / 1e6
+              << ", total = "          << (t5 - t0) / 1e6
+              << std::endl;
+#endif
+  }
+
+  template<class Field, class Mask>
+  static strong_inline void ApplyBoundaryMask(Field& f, const Mask& m) {
+    conformable(f, m);
+    auto grid  = f.Grid();
+    const int Nsite = grid->oSites();
+    const int Nsimd = grid->Nsimd();
+    autoView(f_v, f, AcceleratorWrite);
+    autoView(m_v, m, AcceleratorRead);
+    // NOTE: this function cannot be 'private' since nvcc forbids this for kernels
+    accelerator_for(ss, Nsite, Nsimd, {
+      coalescedWrite(f_v[ss], m_v(ss) * f_v(ss));
+    });
+  }
+
+  template<class MaskField>
+  static void SetupMasks(MaskField& full, MaskField& even, MaskField& odd) {
+    assert(even.Grid()->_isCheckerBoarded && even.Checkerboard() == Even);
+    assert(odd.Grid()->_isCheckerBoarded  && odd.Checkerboard()  == Odd);
+    assert(!full.Grid()->_isCheckerBoarded);
+
+    GridBase* grid = full.Grid();
+    int t_dir = Nd-1;
+    Lattice<iScalar<vInteger>> t_coor(grid);
+    LatticeCoordinate(t_coor, t_dir);
+    int T = grid->GlobalDimensions()[t_dir];
+
+    MaskField zeroMask(grid); zeroMask = Zero();
+    full = 1.0;
+    full = where(t_coor == 0,   zeroMask, full);
+    full = where(t_coor == T-1, zeroMask, full);
+
+    pickCheckerboard(Even, even, full);
+    pickCheckerboard(Odd,  odd,  full);
+  }
+};
+
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/WilsonCloverTypes.h
+++ b/Grid/qcd/action/fermion/WilsonCloverTypes.h
@ -42,8 +42,51 @@ public:
  typedef Lattice<SiteClover> CloverField;
 };

+template<class Impl>
+class CompactWilsonCloverTypes {
+public:
+  INHERIT_IMPL_TYPES(Impl);
+
+  static_assert(Nd == 4 && Nc == 3 && Ns == 4 && Impl::Dimension == 3, "Wrong dimensions");
+
+  static constexpr int Nred      = Nc * Nhs;        // 6
+  static constexpr int Nblock    = Nhs;             // 2
+  static constexpr int Ndiagonal = Nred;            // 6
+  static constexpr int Ntriangle = (Nred - 1) * Nc; // 15
+
+  template<typename vtype> using iImplCloverDiagonal = iScalar<iVector<iVector<vtype, Ndiagonal>, Nblock>>;
+  template<typename vtype> using iImplCloverTriangle = iScalar<iVector<iVector<vtype, Ntriangle>, Nblock>>;
+
+  typedef iImplCloverDiagonal<Simd> SiteCloverDiagonal;
+  typedef iImplCloverTriangle<Simd> SiteCloverTriangle;
+  typedef iSinglet<Simd>            SiteMask;
+
+  typedef Lattice<SiteCloverDiagonal> CloverDiagonalField;
+  typedef Lattice<SiteCloverTriangle> CloverTriangleField;
+  typedef Lattice<SiteMask>           MaskField;
+};
+
 #define INHERIT_CLOVER_TYPES(Impl)                                 \
  typedef typename WilsonCloverTypes<Impl>::SiteClover SiteClover; \
  typedef typename WilsonCloverTypes<Impl>::CloverField CloverField;

+#define INHERIT_COMPACT_CLOVER_TYPES(Impl) \
+  typedef typename CompactWilsonCloverTypes<Impl>::SiteCloverDiagonal  SiteCloverDiagonal; \
+  typedef typename CompactWilsonCloverTypes<Impl>::SiteCloverTriangle  SiteCloverTriangle; \
+  typedef typename CompactWilsonCloverTypes<Impl>::SiteMask            SiteMask; \
+  typedef typename CompactWilsonCloverTypes<Impl>::CloverDiagonalField CloverDiagonalField; \
+  typedef typename CompactWilsonCloverTypes<Impl>::CloverTriangleField CloverTriangleField; \
+  typedef typename CompactWilsonCloverTypes<Impl>::MaskField           MaskField; \
+  /* ugly duplication but needed inside functionality classes */ \
+  template<typename vtype> using iImplCloverDiagonal = \
+    iScalar<iVector<iVector<vtype, CompactWilsonCloverTypes<Impl>::Ndiagonal>, CompactWilsonCloverTypes<Impl>::Nblock>>; \
+  template<typename vtype> using iImplCloverTriangle = \
+    iScalar<iVector<iVector<vtype, CompactWilsonCloverTypes<Impl>::Ntriangle>, CompactWilsonCloverTypes<Impl>::Nblock>>;
+
+#define INHERIT_COMPACT_CLOVER_SIZES(Impl)                                    \
+  static constexpr int Nred      = CompactWilsonCloverTypes<Impl>::Nred;      \
+  static constexpr int Nblock    = CompactWilsonCloverTypes<Impl>::Nblock;    \
+  static constexpr int Ndiagonal = CompactWilsonCloverTypes<Impl>::Ndiagonal; \
+  static constexpr int Ntriangle = CompactWilsonCloverTypes<Impl>::Ntriangle;
+
 NAMESPACE_END(Grid);
--- a/tests/core/Test_compact_wilson_clover_speedup.cc
+++ b/tests/core/Test_compact_wilson_clover_speedup.cc
@ -0,0 +1,226 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: ./tests/core/Test_compact_wilson_clover_speedup.cc
+
+    Copyright (C) 2020 - 2022
+
+    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
+    Author: Nils Meyer       <nils.meyer@ur.de>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#include <Grid/Grid.h>
+
+using namespace Grid;
+
+NAMESPACE_BEGIN(CommandlineHelpers);
+
+static bool checkPresent(int* argc, char*** argv, const std::string& option) {
+  return GridCmdOptionExists(*argv, *argv + *argc, option);
+}
+
+static std::string getContent(int* argc, char*** argv, const std::string& option) {
+  return GridCmdOptionPayload(*argv, *argv + *argc, option);
+}
+
+static int readInt(int* argc, char*** argv, std::string&& option, int defaultValue) {
+  std::string arg;
+  int         ret = defaultValue;
+  if(checkPresent(argc, argv, option)) {
+    arg = getContent(argc, argv, option);
+    GridCmdOptionInt(arg, ret);
+  }
+  return ret;
+}
+
+static float readFloat(int* argc, char*** argv, std::string&& option, float defaultValue) {
+  std::string arg;
+  float       ret = defaultValue;
+  if(checkPresent(argc, argv, option)) {
+    arg = getContent(argc, argv, option);
+    GridCmdOptionFloat(arg, ret);
+  }
+  return ret;
+}
+
+NAMESPACE_END(CommandlineHelpers);
+
+
+#define _grid_printf(LOGGER, ...) \
+  { \
+    if((LOGGER).isActive()) { /* this makes it safe to put, e.g., norm2 in the calling code w.r.t. performance */ \
+      char _printf_buf[1024]; \
+      std::sprintf(_printf_buf, __VA_ARGS__); \
+      std::cout << (LOGGER) << _printf_buf; \
+      fflush(stdout); \
+    } \
+  }
+#define grid_printf_msg(...) _grid_printf(GridLogMessage, __VA_ARGS__)
+
+
+template<typename Field>
+bool resultsAgree(const Field& ref, const Field& res, const std::string& name) {
+  RealD checkTolerance = (getPrecision<Field>::value == 2) ? 1e-15 : 1e-7;
+  Field diff(ref.Grid());
+  diff = ref - res;
+  auto absDev = norm2(diff);
+  auto relDev = absDev / norm2(ref);
+  std::cout << GridLogMessage
+            << "norm2(reference), norm2(" << name << "), abs. deviation, rel. deviation: " << norm2(ref) << " "
+            << norm2(res) << " " << absDev << " " << relDev << " -> check "
+            << ((relDev < checkTolerance) ? "passed" : "failed") << std::endl;
+
+  return relDev <= checkTolerance;
+}
+
+
+template<typename vCoeff_t>
+void runBenchmark(int* argc, char*** argv) {
+  // read from command line
+  const int   nIter        = CommandlineHelpers::readInt(     argc, argv, "--niter", 1000);
+  const RealD mass         = CommandlineHelpers::readFloat(   argc, argv, "--mass",  0.5);
+  const RealD csw          = CommandlineHelpers::readFloat(   argc, argv, "--csw",   1.0);
+  const RealD cF           = CommandlineHelpers::readFloat(   argc, argv, "--cF",    1.0);
+  const bool  antiPeriodic = CommandlineHelpers::checkPresent(argc, argv, "--antiperiodic");
+
+  // precision
+  static_assert(getPrecision<vCoeff_t>::value == 2 || getPrecision<vCoeff_t>::value == 1, "Incorrect precision"); // double or single
+  std::string precision = (getPrecision<vCoeff_t>::value == 2 ? "double" : "single");
+
+  // setup grids
+  GridCartesian*         UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vCoeff_t::Nsimd()), GridDefaultMpi());
+  GridRedBlackCartesian* UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  // clang-format on
+
+  // setup rng
+  std::vector<int> seeds({1, 2, 3, 4});
+  GridParallelRNG  pRNG(UGrid);
+  pRNG.SeedFixedIntegers(seeds);
+
+  // type definitions
+  typedef WilsonImpl<vCoeff_t, FundamentalRepresentation, CoeffReal> WImpl;
+  typedef WilsonCloverFermion<WImpl> WilsonCloverOperator;
+  typedef CompactWilsonCloverFermion<WImpl> CompactWilsonCloverOperator;
+  typedef typename WilsonCloverOperator::FermionField Fermion;
+  typedef typename WilsonCloverOperator::GaugeField Gauge;
+
+  // setup fields
+  Fermion src(UGrid); random(pRNG, src);
+  Fermion ref(UGrid); ref = Zero();
+  Fermion res(UGrid); res = Zero();
+  Fermion hop(UGrid); hop = Zero();
+  Fermion diff(UGrid); diff = Zero();
+  Gauge   Umu(UGrid); SU3::HotConfiguration(pRNG, Umu);
+
+  // setup boundary phases
+  typename WilsonCloverOperator::ImplParams implParams;
+  std::vector<Complex> boundary_phases(Nd, 1.);
+  if(antiPeriodic) boundary_phases[Nd-1] = -1.;
+  implParams.boundary_phases = boundary_phases;
+  WilsonAnisotropyCoefficients anisParams;
+
+  // misc stuff needed for benchmarks
+  double volume=1.0; for(int mu=0; mu<Nd; mu++) volume*=UGrid->_fdimensions[mu];
+
+  // setup fermion operators
+  WilsonCloverOperator        Dwc(        Umu, *UGrid, *UrbGrid, mass, csw, csw,     anisParams, implParams);
+  CompactWilsonCloverOperator Dwc_compact(Umu, *UGrid, *UrbGrid, mass, csw, csw, cF, anisParams, implParams);
+
+  // now test the conversions
+  typename CompactWilsonCloverOperator::CloverField         tmp_ref(UGrid);  tmp_ref  = Dwc.CloverTerm;
+  typename CompactWilsonCloverOperator::CloverField         tmp_res(UGrid);  tmp_res  = Zero();
+  typename CompactWilsonCloverOperator::CloverField         tmp_diff(UGrid); tmp_diff = Zero();
+  typename CompactWilsonCloverOperator::CloverDiagonalField diagonal(UGrid); diagonal = Zero();
+  typename CompactWilsonCloverOperator::CloverTriangleField triangle(UGrid); diagonal = Zero();
+  CompactWilsonCloverOperator::CompactHelpers::ConvertLayout(tmp_ref, diagonal, triangle);
+  CompactWilsonCloverOperator::CompactHelpers::ConvertLayout(diagonal, triangle, tmp_res);
+  tmp_diff = tmp_ref - tmp_res;
+  std::cout << GridLogMessage << "conversion: ref, res, diff, eps"
+            << " " << norm2(tmp_ref)
+            << " " << norm2(tmp_res)
+            << " " << norm2(tmp_diff)
+            << " " << norm2(tmp_diff) / norm2(tmp_ref)
+            << std::endl;
+
+  // performance per site (use minimal values necessary)
+  double hop_flop_per_site            = 1320; // Rich's Talk + what Peter uses
+  double hop_byte_per_site            = (8 * 9 + 9 * 12) * 2 * getPrecision<vCoeff_t>::value * 4;
+  double clov_flop_per_site           = 504; // Rich's Talk and 1412.2629
+  double clov_byte_per_site           = (2 * 18 + 12 + 12) * 2 * getPrecision<vCoeff_t>::value * 4;
+  double clov_flop_per_site_performed = 1128;
+  double clov_byte_per_site_performed = (12 * 12 + 12 + 12) * 2 * getPrecision<vCoeff_t>::value * 4;
+
+  // total performance numbers
+  double hop_gflop_total            = volume * nIter * hop_flop_per_site / 1e9;
+  double hop_gbyte_total            = volume * nIter * hop_byte_per_site / 1e9;
+  double clov_gflop_total           = volume * nIter * clov_flop_per_site / 1e9;
+  double clov_gbyte_total           = volume * nIter * clov_byte_per_site / 1e9;
+  double clov_gflop_performed_total = volume * nIter * clov_flop_per_site_performed / 1e9;
+  double clov_gbyte_performed_total = volume * nIter * clov_byte_per_site_performed / 1e9;
+
+  // warmup + measure dhop
+  for(auto n : {1, 2, 3, 4, 5}) Dwc.Dhop(src, hop, 0);
+  double t0 = usecond();
+  for(int n = 0; n < nIter; n++) Dwc.Dhop(src, hop, 0);
+  double t1 = usecond();
+  double secs_hop = (t1-t0)/1e6;
+  grid_printf_msg("Performance(%35s, %s): %2.4f s, %6.0f GFlop/s, %6.0f GByte/s, speedup vs ref = %.2f, fraction of hop = %.2f\n",
+              "hop", precision.c_str(), secs_hop, hop_gflop_total/secs_hop, hop_gbyte_total/secs_hop, 0.0, secs_hop/secs_hop);
+
+#define BENCH_CLOVER_KERNEL(KERNEL) { \
+  /* warmup + measure reference clover */ \
+  for(auto n : {1, 2, 3, 4, 5}) Dwc.KERNEL(src, ref); \
+  double t2 = usecond(); \
+  for(int n = 0; n < nIter; n++) Dwc.KERNEL(src, ref); \
+  double t3 = usecond(); \
+  double secs_ref = (t3-t2)/1e6; \
+  grid_printf_msg("Performance(%35s, %s): %2.4f s, %6.0f GFlop/s, %6.0f GByte/s, speedup vs ref = %.2f, fraction of hop = %.2f\n", \
+                  "reference_"#KERNEL, precision.c_str(), secs_ref, clov_gflop_total/secs_ref, clov_gbyte_total/secs_ref, secs_ref/secs_ref, secs_ref/secs_hop); \
+  grid_printf_msg("Performance(%35s, %s): %2.4f s, %6.0f GFlop/s, %6.0f GByte/s, speedup vs ref = %.2f, fraction of hop = %.2f\n", /* to see how well the ET performs */  \
+                  "reference_"#KERNEL"_performed", precision.c_str(), secs_ref, clov_gflop_performed_total/secs_ref, clov_gbyte_performed_total/secs_ref, secs_ref/secs_ref, secs_ref/secs_hop); \
+\
+  /* warmup + measure compact clover */ \
+  for(auto n : {1, 2, 3, 4, 5}) Dwc_compact.KERNEL(src, res); \
+  double t4 = usecond(); \
+  for(int n = 0; n < nIter; n++) Dwc_compact.KERNEL(src, res); \
+  double t5 = usecond(); \
+  double secs_res = (t5-t4)/1e6; \
+  grid_printf_msg("Performance(%35s, %s): %2.4f s, %6.0f GFlop/s, %6.0f GByte/s, speedup vs ref = %.2f, fraction of hop = %.2f\n", \
+                  "compact_"#KERNEL, precision.c_str(), secs_res, clov_gflop_total/secs_res, clov_gbyte_total/secs_res, secs_ref/secs_res, secs_res/secs_hop); \
+  assert(resultsAgree(ref, res, #KERNEL)); \
+}
+
+  BENCH_CLOVER_KERNEL(Mooee);
+  BENCH_CLOVER_KERNEL(MooeeDag);
+  BENCH_CLOVER_KERNEL(MooeeInv);
+  BENCH_CLOVER_KERNEL(MooeeInvDag);
+
+  grid_printf_msg("finalize %s\n", precision.c_str());
+}
+
+int main(int argc, char** argv) {
+  Grid_init(&argc, &argv);
+
+  runBenchmark<vComplexD>(&argc, &argv);
+  runBenchmark<vComplexF>(&argc, &argv);
+
+  Grid_finalize();
+}