From 873519e96046acfd0844a7d07d540d989a7a6204 Mon Sep 17 00:00:00 2001 From: Michael Marshall <43034299+mmphys@users.noreply.github.com> Date: Mon, 14 Dec 2020 16:06:10 +0000 Subject: [PATCH 01/16] Enable existing conserved current code for CUDA (compiles OK for CUDA 10.1). Add option to Test_cayley_mres to load a configuration --- .../implementation/CayleyFermion5DImplementation.h | 4 ++-- tests/debug/Test_cayley_mres.cc | 14 ++++++++++++-- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h index b3fbe096..f11e9c44 100644 --- a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h @@ -642,7 +642,7 @@ void CayleyFermion5D::ContractConservedCurrent( PropagatorField &q_in_1, Current curr_type, unsigned int mu) { -#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) +#if (!defined(GRID_HIP)) Gamma::Algebra Gmu [] = { Gamma::Algebra::GammaX, Gamma::Algebra::GammaY, @@ -826,7 +826,7 @@ void CayleyFermion5D::SeqConservedCurrent(PropagatorField &q_in, } #endif -#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) +#if (!defined(GRID_HIP)) int tshift = (mu == Nd-1) ? 1 : 0; //////////////////////////////////////////////// // GENERAL CAYLEY CASE diff --git a/tests/debug/Test_cayley_mres.cc b/tests/debug/Test_cayley_mres.cc index 2e56fa81..5282c756 100644 --- a/tests/debug/Test_cayley_mres.cc +++ b/tests/debug/Test_cayley_mres.cc @@ -108,8 +108,18 @@ int main (int argc, char ** argv) GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); LatticeGaugeField Umu(UGrid); - SU::ColdConfiguration(Umu); - // SU::HotConfiguration(RNG4,Umu); + if( argc > 1 && argv[1][0] != '-' ) + { + std::cout<::ColdConfiguration(Umu); + // SU::HotConfiguration(RNG4,Umu); + } RealD mass=0.3; RealD M5 =1.0; From 4dd9e39e0d465e7cad3aef001dc0edf5e65b0ea6 Mon Sep 17 00:00:00 2001 From: Nils Meyer Date: Sat, 19 Dec 2020 00:54:31 +0100 Subject: [PATCH 02/16] up to +36% performance gain for dslash/dwf on QPACE 4 using GCC 10.1.1 --- .../implementation/WilsonKernelsAsmA64FX.h | 268 +- .../WilsonKernelsAsmBodyA64FX.h | 105 +- Grid/simd/Fujitsu_A64FX_asm_double.h | 148 +- Grid/simd/Fujitsu_A64FX_asm_single.h | 148 +- Grid/simd/Fujitsu_A64FX_intrin_double.h | 160 +- Grid/simd/Fujitsu_A64FX_intrin_single.h | 160 +- Grid/simd/Fujitsu_A64FX_undef.h | 1 + Grid/simd/gridverter.py | 2377 ----------------- 8 files changed, 447 insertions(+), 2920 deletions(-) delete mode 100755 Grid/simd/gridverter.py diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h index 2e587dfa..ffec05a0 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h @@ -38,9 +38,6 @@ Author: Nils Meyer Regensburg University // undefine everything related to kernels #include -// enable A64FX body -#define WILSONKERNELSASMBODYA64FX -//#pragma message("A64FX Dslash: WilsonKernelsAsmBodyA64FX.h") /////////////////////////////////////////////////////////// // If we are A64FX specialise the single precision routine @@ -63,119 +60,89 @@ Author: Nils Meyer Regensburg University #define INTERIOR_AND_EXTERIOR #undef INTERIOR #undef EXTERIOR + +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void 
WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif + #undef INTERIOR_AND_EXTERIOR #define INTERIOR #undef EXTERIOR + +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif + #undef INTERIOR_AND_EXTERIOR #undef INTERIOR #define EXTERIOR + +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void 
WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif + ///////////////////////////////////////////////////////////////// @@ -185,119 +152,89 @@ WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldV #define INTERIOR_AND_EXTERIOR #undef INTERIOR #undef EXTERIOR + +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif + #undef INTERIOR_AND_EXTERIOR #define INTERIOR #undef EXTERIOR + +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) 
-#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif + #undef INTERIOR_AND_EXTERIOR #undef INTERIOR #define EXTERIOR + +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif + // undefine @@ -330,119 +267,89 @@ WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFie #define INTERIOR_AND_EXTERIOR #undef INTERIOR #undef EXTERIOR + +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif + #undef INTERIOR_AND_EXTERIOR #define INTERIOR #undef EXTERIOR + +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void 
WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif + #undef INTERIOR_AND_EXTERIOR #undef INTERIOR #define EXTERIOR + +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif + ///////////////////////////////////////////////////////////////// // XYZT vectorised, dag Kernel, double @@ -451,124 +358,93 @@ WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldV #define INTERIOR_AND_EXTERIOR #undef INTERIOR #undef EXTERIOR + +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, 
FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif + #undef INTERIOR_AND_EXTERIOR #define INTERIOR #undef EXTERIOR + +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif + #undef INTERIOR_AND_EXTERIOR #undef INTERIOR #define EXTERIOR + +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif + // undefs -#undef WILSONKERNELSASMBODYA64FX #include #endif //A64FXASM diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h 
index 406e5c25..83588a7d 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h @@ -25,6 +25,11 @@ Author: Nils Meyer Regensburg University See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ /* END LEGAL */ + +// GCC 10 messes up SVE instruction scheduling using -O3 only, +// using -O3 -fno-schedule-insns -fno-schedule-insns2 does wonders +// performance is better than armclang 20.2 + #ifdef KERNEL_DAG #define DIR0_PROJ XP_PROJ #define DIR1_PROJ YP_PROJ @@ -97,7 +102,7 @@ Author: Nils Meyer Regensburg University PROJ; \ MAYBEPERM(PERMUTE_DIR,perm); \ } else { \ - LOAD_CHI(base); \ + LOAD_CHI(base); \ } \ base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \ MULT_2SPIN_1(Dir); \ @@ -110,6 +115,15 @@ Author: Nils Meyer Regensburg University } \ RECON; \ +/* +NB: picking PREFETCH_GAUGE_L2(Dir+4); here results in performance penalty + though I expected that it would improve on performance + + if (s == 0) { \ + if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \ + } \ +*/ + #define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ PREFETCH1_CHIMU(base); \ @@ -126,73 +140,63 @@ Author: Nils Meyer Regensburg University #define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ basep = st.GetPFInfo(nent,plocal); nent++; \ - if ( local ) { \ - LOAD_CHIMU(base); \ - LOAD_TABLE(PERMUTE_DIR); \ - PROJ; \ - MAYBEPERM(PERMUTE_DIR,perm); \ - }else if ( st.same_node[Dir] ) {LOAD_CHI(base);} \ - base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \ - if ( local || st.same_node[Dir] ) { \ - MULT_2SPIN_1(Dir); \ - PREFETCH_CHIMU(base); \ - /* PREFETCH_GAUGE_L1(NxtDir); */ \ - MULT_2SPIN_2; \ - if (s == 0) { \ - if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \ - } \ - RECON; \ - PREFETCH_CHIMU_L2(basep); \ - } else { PREFETCH_CHIMU(base); } \ + if ( local ) { \ + LOAD_CHIMU(base); \ + LOAD_TABLE(PERMUTE_DIR); \ + PROJ; \ + MAYBEPERM(PERMUTE_DIR,perm); \ + }else if ( st.same_node[Dir] ) {LOAD_CHI(base);} \ + if ( local || st.same_node[Dir] ) { \ + MULT_2SPIN_1(Dir); \ + MULT_2SPIN_2; \ + RECON; \ + } \ + base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \ + PREFETCH_CHIMU(base); \ + PREFETCH_CHIMU_L2(basep); \ #define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ PREFETCH1_CHIMU(base); \ + { ZERO_PSI; } \ ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) #define RESULT(base,basep) SAVE_RESULT(base,basep); #endif + //////////////////////////////////////////////////////////////////////////////// // Post comms kernel //////////////////////////////////////////////////////////////////////////////// #ifdef EXTERIOR - #define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ - base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ - if((!local)&&(!st.same_node[Dir]) ) { \ - LOAD_CHI(base); \ + base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ + if((!local)&&(!st.same_node[Dir]) ) { \ + LOAD_CHI(base); \ MULT_2SPIN_1(Dir); \ - PREFETCH_CHIMU(base); \ - /* PREFETCH_GAUGE_L1(NxtDir); */ \ MULT_2SPIN_2; \ - if (s == 0) { \ - if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \ - } \ - RECON; \ - nmu++; \ + RECON; \ + nmu++; \ } -#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ - nmu=0; \ - base = 
st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++;\ - if((!local)&&(!st.same_node[Dir]) ) { \ - LOAD_CHI(base); \ +#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ + nmu=0; \ + { ZERO_PSI;} \ + base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ + if((!local)&&(!st.same_node[Dir]) ) { \ + LOAD_CHI(base); \ MULT_2SPIN_1(Dir); \ - PREFETCH_CHIMU(base); \ - /* PREFETCH_GAUGE_L1(NxtDir); */ \ MULT_2SPIN_2; \ - if (s == 0) { \ - if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \ - } \ - RECON; \ - nmu++; \ + RECON; \ + nmu++; \ } #define RESULT(base,basep) if (nmu){ ADD_RESULT(base,base);} #endif + + { int nmu; int local,perm, ptype; @@ -209,7 +213,6 @@ Author: Nils Meyer Regensburg University int ssn=ssU+1; if(ssn>=nmax) ssn=0; // int sUn=lo.Reorder(ssn); int sUn=ssn; - LOCK_GAUGE(0); #else int sU =ssU; int ssn=ssU+1; if(ssn>=nmax) ssn=0; @@ -295,6 +298,11 @@ Author: Nils Meyer Regensburg University std::cout << "----------------------------------------------------" << std::endl; #endif + // DC ZVA test + // { uint64_t basestore = (uint64_t)&out[ss]; + // PREFETCH_RESULT_L2_STORE(basestore); } + + ASM_LEG(Ym,Zm,PERMUTE_DIR2,DIR5_PROJ,DIR5_RECON); #ifdef SHOW @@ -308,6 +316,11 @@ Author: Nils Meyer Regensburg University std::cout << "----------------------------------------------------" << std::endl; #endif + // DC ZVA test + //{ uint64_t basestore = (uint64_t)&out[ss]; + // PREFETCH_RESULT_L2_STORE(basestore); } + + ASM_LEG(Zm,Tm,PERMUTE_DIR1,DIR6_PROJ,DIR6_RECON); #ifdef SHOW @@ -321,6 +334,11 @@ Author: Nils Meyer Regensburg University std::cout << "----------------------------------------------------" << std::endl; #endif + // DC ZVA test + //{ uint64_t basestore = (uint64_t)&out[ss]; + // PREFETCH_RESULT_L2_STORE(basestore); + //} + ASM_LEG(Tm,Xp,PERMUTE_DIR0,DIR7_PROJ,DIR7_RECON); #ifdef SHOW @@ -341,6 +359,7 @@ Author: Nils Meyer Regensburg University base = (uint64_t) &out[ss]; basep= st.GetPFInfo(nent,plocal); ent++; basep = (uint64_t) &out[ssn]; + //PREFETCH_RESULT_L1_STORE(base); RESULT(base,basep); #ifdef SHOW diff --git a/Grid/simd/Fujitsu_A64FX_asm_double.h b/Grid/simd/Fujitsu_A64FX_asm_double.h index 76c556d7..bbc4efe7 100644 --- a/Grid/simd/Fujitsu_A64FX_asm_double.h +++ b/Grid/simd/Fujitsu_A64FX_asm_double.h @@ -38,10 +38,11 @@ Author: Nils Meyer #define LOCK_GAUGE(A) #define UNLOCK_GAUGE(A) #define MASK_REGS DECLARATIONS_A64FXd -#define SAVE_RESULT(A,B) RESULT_A64FXd(A); PREFETCH_RESULT_L2_STORE(B) +#define SAVE_RESULT(A,B) RESULT_A64FXd(A); #define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXd(Dir) #define MULT_2SPIN_2 MULT_2SPIN_2_A64FXd #define LOAD_CHI(base) LOAD_CHI_A64FXd(base) +#define ZERO_PSI ZERO_PSI_A64FXd #define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base) #define XP_PROJ XP_PROJ_A64FXd #define YP_PROJ YP_PROJ_A64FXd @@ -70,11 +71,18 @@ Author: Nils Meyer #define MAYBEPERM(Dir,perm) if (Dir != 3) { if (perm) { PERMUTE; } } // DECLARATIONS #define DECLARATIONS_A64FXd \ + uint64_t baseU; \ const uint64_t lut[4][8] = { \ {4, 5, 6, 7, 0, 1, 2, 3}, \ {2, 3, 0, 1, 6, 7, 4, 5}, \ {1, 0, 3, 2, 5, 4, 7, 6}, \ {0, 1, 2, 4, 5, 6, 7, 8} };\ +asm ( \ + "ptrue p5.d \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ asm ( \ "fmov z31.d , 0 \n\t" \ : \ @@ -130,7 +138,7 @@ asm ( \ // PREFETCH_GAUGE_L2 (prefetch to L2) #define 
PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A) \ { \ - const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \ + const auto & ref(U[sUn](A)); baseU = (uint64_t)&ref + 3 * 3 * 64; \ asm ( \ "prfd PLDL2STRM, p5, [%[fetchptr], -4, mul vl] \n\t" \ "prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ @@ -149,7 +157,7 @@ asm ( \ // PREFETCH_GAUGE_L1 (prefetch to L1) #define PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A) \ { \ - const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ + const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ asm ( \ "prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ "prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ @@ -163,12 +171,12 @@ asm ( \ #define LOAD_CHI_A64FXd(base) \ { \ asm ( \ - "ldr z12, [%[fetchptr], 0, mul vl] \n\t" \ - "ldr z13, [%[fetchptr], 1, mul vl] \n\t" \ - "ldr z14, [%[fetchptr], 2, mul vl] \n\t" \ - "ldr z15, [%[fetchptr], 3, mul vl] \n\t" \ - "ldr z16, [%[fetchptr], 4, mul vl] \n\t" \ - "ldr z17, [%[fetchptr], 5, mul vl] \n\t" \ + "ld1d { z12.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1d { z13.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1d { z14.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ + "ld1d { z15.d }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ + "ld1d { z16.d }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ + "ld1d { z17.d }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ : \ : [fetchptr] "r" (base) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -178,19 +186,18 @@ asm ( \ #define LOAD_CHIMU_INTERLEAVED_A64FXd(base) \ { \ asm ( \ - "ptrue p5.d \n\t" \ - "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \ - "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \ - "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \ - "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \ - "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \ - "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \ - "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \ - "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \ - "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \ - "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \ - "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \ - "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \ + "ld1d { z12.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1d { z21.d }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ + "ld1d { z15.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1d { z18.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1d { z13.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1d { z22.d }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ + "ld1d { z16.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1d { z19.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1d { z14.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ + "ld1d { z23.d }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ + "ld1d { z17.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ + "ld1d { z20.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ : \ : [fetchptr] "r" (base + 2 * 3 * 64) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -201,19 +208,18 @@ asm ( \ { \ const SiteSpinor & ref(in[offset]); \ asm ( \ - "ptrue p5.d \n\t" \ - "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \ - "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \ - "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \ - "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \ - "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \ - "ldr z20, [%[fetchptr], 2, 
mul vl] \n\t" \ - "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \ - "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \ - "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \ - "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \ - "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \ - "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \ + "ld1d { z12.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1d { z18.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1d { z13.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1d { z19.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1d { z14.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ + "ld1d { z20.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ + "ld1d { z15.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1d { z21.d }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ + "ld1d { z16.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1d { z22.d }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ + "ld1d { z17.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ + "ld1d { z23.d }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ : \ : [fetchptr] "r" (&ref[2][0]) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -224,19 +230,18 @@ asm ( \ { \ const SiteSpinor & ref(in[offset]); \ asm ( \ - "ptrue p5.d \n\t" \ - "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \ - "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \ - "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \ - "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \ - "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \ - "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \ - "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \ - "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \ - "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \ - "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \ - "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \ - "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \ + "ld1d { z12.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1d { z21.d }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ + "ld1d { z13.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1d { z22.d }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ + "ld1d { z14.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ + "ld1d { z23.d }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ + "ld1d { z15.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1d { z18.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1d { z16.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1d { z19.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1d { z17.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ + "ld1d { z20.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ : \ : [fetchptr] "r" (&ref[2][0]) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -293,17 +298,16 @@ asm ( \ ); // LOAD_GAUGE -#define LOAD_GAUGE \ - const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ +#define LOAD_GAUGE(A) \ { \ + const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ asm ( \ - "ptrue p5.d \n\t" \ - "ldr z24, [%[fetchptr], -6, mul vl] \n\t" \ - "ldr z25, [%[fetchptr], -3, mul vl] \n\t" \ - "ldr z26, [%[fetchptr], 0, mul vl] \n\t" \ - "ldr z27, [%[fetchptr], -5, mul vl] \n\t" \ - "ldr z28, [%[fetchptr], -2, mul vl] \n\t" \ - "ldr z29, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1d { z24.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1d { z25.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1d { z26.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1d { z27.d }, p5/z, 
[%[fetchptr], -5, mul vl] \n\t" \ + "ld1d { z28.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1d { z29.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ : \ : [fetchptr] "r" (baseU + 2 * 3 * 64) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -312,14 +316,14 @@ asm ( \ // MULT_2SPIN #define MULT_2SPIN_1_A64FXd(A) \ { \ - const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ + const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ asm ( \ - "ldr z24, [%[fetchptr], -6, mul vl] \n\t" \ - "ldr z25, [%[fetchptr], -3, mul vl] \n\t" \ - "ldr z26, [%[fetchptr], 0, mul vl] \n\t" \ - "ldr z27, [%[fetchptr], -5, mul vl] \n\t" \ - "ldr z28, [%[fetchptr], -2, mul vl] \n\t" \ - "ldr z29, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1d { z24.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1d { z25.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1d { z26.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1d { z27.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1d { z28.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1d { z29.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ "movprfx z18.d, p5/m, z31.d \n\t" \ "fcmla z18.d, p5/m, z24.d, z12.d, 0 \n\t" \ "movprfx z21.d, p5/m, z31.d \n\t" \ @@ -338,9 +342,9 @@ asm ( \ "fcmla z22.d, p5/m, z25.d, z15.d, 90 \n\t" \ "fcmla z20.d, p5/m, z26.d, z12.d, 90 \n\t" \ "fcmla z23.d, p5/m, z26.d, z15.d, 90 \n\t" \ - "ldr z24, [%[fetchptr], -4, mul vl] \n\t" \ - "ldr z25, [%[fetchptr], -1, mul vl] \n\t" \ - "ldr z26, [%[fetchptr], 2, mul vl] \n\t" \ + "ld1d { z24.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ + "ld1d { z25.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ + "ld1d { z26.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ : \ : [fetchptr] "r" (baseU + 2 * 3 * 64) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -560,7 +564,6 @@ asm ( \ #define TM_PROJ_A64FXd \ { \ asm ( \ - "ptrue p5.d \n\t" \ "fsub z12.d, p5/m, z12.d, z18.d \n\t" \ "fsub z13.d, p5/m, z13.d, z19.d \n\t" \ "fsub z14.d, p5/m, z14.d, z20.d \n\t" \ @@ -715,7 +718,6 @@ asm ( \ // ZERO_PSI #define ZERO_PSI_A64FXd \ asm ( \ - "ptrue p5.d \n\t" \ "fmov z0.d , 0 \n\t" \ "fmov z1.d , 0 \n\t" \ "fmov z2.d , 0 \n\t" \ @@ -733,13 +735,13 @@ asm ( \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ ); -// PREFETCH_RESULT_L2_STORE (prefetch store to L2) +// PREFETCH_RESULT_L2_STORE (uses DC ZVA for cache line zeroing) #define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXd(base) \ { \ asm ( \ - "prfd PSTL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ - "prfd PSTL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ - "prfd PSTL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ + "dc zva, %[fetchptr]\n\t" \ + "dc zva, %[fetchptr]\n\t" \ + "dc zva, %[fetchptr]\n\t" \ : \ : [fetchptr] "r" (base) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ diff --git a/Grid/simd/Fujitsu_A64FX_asm_single.h b/Grid/simd/Fujitsu_A64FX_asm_single.h index d809f83b..e629f617 100644 --- a/Grid/simd/Fujitsu_A64FX_asm_single.h +++ 
b/Grid/simd/Fujitsu_A64FX_asm_single.h @@ -38,10 +38,11 @@ Author: Nils Meyer #define LOCK_GAUGE(A) #define UNLOCK_GAUGE(A) #define MASK_REGS DECLARATIONS_A64FXf -#define SAVE_RESULT(A,B) RESULT_A64FXf(A); PREFETCH_RESULT_L2_STORE(B) +#define SAVE_RESULT(A,B) RESULT_A64FXf(A); #define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXf(Dir) #define MULT_2SPIN_2 MULT_2SPIN_2_A64FXf #define LOAD_CHI(base) LOAD_CHI_A64FXf(base) +#define ZERO_PSI ZERO_PSI_A64FXf #define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base) #define XP_PROJ XP_PROJ_A64FXf #define YP_PROJ YP_PROJ_A64FXf @@ -70,11 +71,18 @@ Author: Nils Meyer #define MAYBEPERM(A,perm) if (perm) { PERMUTE; } // DECLARATIONS #define DECLARATIONS_A64FXf \ + uint64_t baseU; \ const uint32_t lut[4][16] = { \ {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}, \ {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}, \ {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}, \ {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14} }; \ +asm ( \ + "ptrue p5.s \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ asm ( \ "fmov z31.s , 0 \n\t" \ : \ @@ -130,7 +138,7 @@ asm ( \ // PREFETCH_GAUGE_L2 (prefetch to L2) #define PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A) \ { \ - const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \ + const auto & ref(U[sUn](A)); baseU = (uint64_t)&ref + 3 * 3 * 64; \ asm ( \ "prfd PLDL2STRM, p5, [%[fetchptr], -4, mul vl] \n\t" \ "prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ @@ -149,7 +157,7 @@ asm ( \ // PREFETCH_GAUGE_L1 (prefetch to L1) #define PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A) \ { \ - const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ + const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ asm ( \ "prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ "prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ @@ -163,12 +171,12 @@ asm ( \ #define LOAD_CHI_A64FXf(base) \ { \ asm ( \ - "ldr z12, [%[fetchptr], 0, mul vl] \n\t" \ - "ldr z13, [%[fetchptr], 1, mul vl] \n\t" \ - "ldr z14, [%[fetchptr], 2, mul vl] \n\t" \ - "ldr z15, [%[fetchptr], 3, mul vl] \n\t" \ - "ldr z16, [%[fetchptr], 4, mul vl] \n\t" \ - "ldr z17, [%[fetchptr], 5, mul vl] \n\t" \ + "ld1w { z12.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1w { z13.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1w { z14.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ + "ld1w { z15.s }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ + "ld1w { z16.s }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ + "ld1w { z17.s }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ : \ : [fetchptr] "r" (base) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -178,19 +186,18 @@ asm ( \ #define LOAD_CHIMU_INTERLEAVED_A64FXf(base) \ { \ asm ( \ - "ptrue p5.s \n\t" \ - "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \ - "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \ - "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \ - "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \ - "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \ - "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \ - "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \ - "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \ - "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \ - "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \ - "ldr z17, 
[%[fetchptr], -1, mul vl] \n\t" \ - "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \ + "ld1w { z12.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1w { z21.s }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ + "ld1w { z15.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1w { z18.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1w { z13.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1w { z22.s }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ + "ld1w { z16.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1w { z19.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1w { z14.s }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ + "ld1w { z23.s }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ + "ld1w { z17.s }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ + "ld1w { z20.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ : \ : [fetchptr] "r" (base + 2 * 3 * 64) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -201,19 +208,18 @@ asm ( \ { \ const SiteSpinor & ref(in[offset]); \ asm ( \ - "ptrue p5.s \n\t" \ - "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \ - "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \ - "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \ - "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \ - "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \ - "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \ - "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \ - "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \ - "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \ - "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \ - "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \ - "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \ + "ld1w { z12.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1w { z18.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1w { z13.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1w { z19.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1w { z14.s }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ + "ld1w { z20.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ + "ld1w { z15.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1w { z21.s }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ + "ld1w { z16.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1w { z22.s }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ + "ld1w { z17.s }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ + "ld1w { z23.s }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ : \ : [fetchptr] "r" (&ref[2][0]) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -224,19 +230,18 @@ asm ( \ { \ const SiteSpinor & ref(in[offset]); \ asm ( \ - "ptrue p5.s \n\t" \ - "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \ - "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \ - "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \ - "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \ - "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \ - "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \ - "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \ - "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \ - "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \ - "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \ - "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \ - "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \ + "ld1w { z12.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1w { z21.s }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ + "ld1w { z13.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1w { z22.s }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ + "ld1w { z14.s }, p5/z, [%[fetchptr], -4, mul vl] 
\n\t" \ + "ld1w { z23.s }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ + "ld1w { z15.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1w { z18.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1w { z16.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1w { z19.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1w { z17.s }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ + "ld1w { z20.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ : \ : [fetchptr] "r" (&ref[2][0]) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -293,17 +298,16 @@ asm ( \ ); // LOAD_GAUGE -#define LOAD_GAUGE \ - const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ +#define LOAD_GAUGE(A) \ { \ + const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ asm ( \ - "ptrue p5.s \n\t" \ - "ldr z24, [%[fetchptr], -6, mul vl] \n\t" \ - "ldr z25, [%[fetchptr], -3, mul vl] \n\t" \ - "ldr z26, [%[fetchptr], 0, mul vl] \n\t" \ - "ldr z27, [%[fetchptr], -5, mul vl] \n\t" \ - "ldr z28, [%[fetchptr], -2, mul vl] \n\t" \ - "ldr z29, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1w { z24.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1w { z25.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1w { z26.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1w { z27.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1w { z28.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1w { z29.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ : \ : [fetchptr] "r" (baseU + 2 * 3 * 64) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -312,14 +316,14 @@ asm ( \ // MULT_2SPIN #define MULT_2SPIN_1_A64FXf(A) \ { \ - const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ + const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ asm ( \ - "ldr z24, [%[fetchptr], -6, mul vl] \n\t" \ - "ldr z25, [%[fetchptr], -3, mul vl] \n\t" \ - "ldr z26, [%[fetchptr], 0, mul vl] \n\t" \ - "ldr z27, [%[fetchptr], -5, mul vl] \n\t" \ - "ldr z28, [%[fetchptr], -2, mul vl] \n\t" \ - "ldr z29, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1w { z24.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1w { z25.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1w { z26.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1w { z27.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1w { z28.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1w { z29.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ "movprfx z18.s, p5/m, z31.s \n\t" \ "fcmla z18.s, p5/m, z24.s, z12.s, 0 \n\t" \ "movprfx z21.s, p5/m, z31.s \n\t" \ @@ -338,9 +342,9 @@ asm ( \ "fcmla z22.s, p5/m, z25.s, z15.s, 90 \n\t" \ "fcmla z20.s, p5/m, z26.s, z12.s, 90 \n\t" \ "fcmla z23.s, p5/m, z26.s, z15.s, 90 \n\t" \ - "ldr z24, [%[fetchptr], -4, mul vl] \n\t" \ - "ldr z25, [%[fetchptr], -1, mul vl] \n\t" \ - "ldr z26, [%[fetchptr], 2, mul vl] \n\t" \ + "ld1w { z24.s }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ + "ld1w { z25.s }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ + "ld1w { z26.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ : \ : [fetchptr] "r" (baseU + 2 * 3 * 64) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -560,7 +564,6 @@ asm ( \ #define TM_PROJ_A64FXf \ { \ asm ( \ - "ptrue p5.s \n\t" \ "fsub 
z12.s, p5/m, z12.s, z18.s \n\t" \ "fsub z13.s, p5/m, z13.s, z19.s \n\t" \ "fsub z14.s, p5/m, z14.s, z20.s \n\t" \ @@ -715,7 +718,6 @@ asm ( \ // ZERO_PSI #define ZERO_PSI_A64FXf \ asm ( \ - "ptrue p5.s \n\t" \ "fmov z0.s , 0 \n\t" \ "fmov z1.s , 0 \n\t" \ "fmov z2.s , 0 \n\t" \ @@ -733,13 +735,13 @@ asm ( \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ ); -// PREFETCH_RESULT_L2_STORE (prefetch store to L2) +// PREFETCH_RESULT_L2_STORE (uses DC ZVA for cache line zeroing) #define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXf(base) \ { \ asm ( \ - "prfd PSTL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ - "prfd PSTL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ - "prfd PSTL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ + "dc zva, %[fetchptr]\n\t" \ + "dc zva, %[fetchptr]\n\t" \ + "dc zva, %[fetchptr]\n\t" \ : \ : [fetchptr] "r" (base) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ diff --git a/Grid/simd/Fujitsu_A64FX_intrin_double.h b/Grid/simd/Fujitsu_A64FX_intrin_double.h index 232610f2..361246fc 100644 --- a/Grid/simd/Fujitsu_A64FX_intrin_double.h +++ b/Grid/simd/Fujitsu_A64FX_intrin_double.h @@ -38,10 +38,11 @@ Author: Nils Meyer #define LOCK_GAUGE(A) #define UNLOCK_GAUGE(A) #define MASK_REGS DECLARATIONS_A64FXd -#define SAVE_RESULT(A,B) RESULT_A64FXd(A); PREFETCH_RESULT_L2_STORE(B) +#define SAVE_RESULT(A,B) RESULT_A64FXd(A); #define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXd(Dir) #define MULT_2SPIN_2 MULT_2SPIN_2_A64FXd #define LOAD_CHI(base) LOAD_CHI_A64FXd(base) +#define ZERO_PSI ZERO_PSI_A64FXd #define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base) #define XP_PROJ XP_PROJ_A64FXd #define YP_PROJ YP_PROJ_A64FXd @@ -70,6 +71,7 @@ Author: Nils Meyer #define MAYBEPERM(Dir,perm) if (Dir != 3) { if (perm) { PERMUTE; } } // DECLARATIONS #define DECLARATIONS_A64FXd \ + uint64_t baseU; \ const uint64_t lut[4][8] = { \ {4, 5, 6, 7, 0, 1, 2, 3}, \ {2, 3, 0, 1, 6, 7, 4, 5}, \ @@ -126,18 +128,18 @@ Author: Nils Meyer // RESULT #define RESULT_A64FXd(base) \ { \ - svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64), result_00); \ - svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64), result_01); \ - svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64), result_02); \ - svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64), result_10); \ - svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64), result_11); \ - svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64), result_12); \ - svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64), result_20); \ - svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64), result_21); \ - svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64), result_22); \ - svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64), result_30); \ - svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64), result_31); \ - svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64), result_32); \ + svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-6), result_00); \ + svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-5), result_01); \ + svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-4), result_02); \ + svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-3), result_10); \ + svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-2), result_11); \ + 
svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-1), result_12); \ + svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(0), result_20); \ + svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(1), result_21); \ + svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(2), result_22); \ + svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(3), result_30); \ + svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(4), result_31); \ + svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(5), result_32); \ } // PREFETCH_CHIMU_L2 (prefetch to L2) #define PREFETCH_CHIMU_L2_INTERNAL_A64FXd(base) \ @@ -156,7 +158,7 @@ Author: Nils Meyer // PREFETCH_GAUGE_L2 (prefetch to L2) #define PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A) \ { \ - const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \ + const auto & ref(U[sUn](A)); baseU = (uint64_t)&ref + 3 * 3 * 64; \ svprfd(pg1, (int64_t*)(baseU + -256), SV_PLDL2STRM); \ svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL2STRM); \ svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL2STRM); \ @@ -170,7 +172,7 @@ Author: Nils Meyer // PREFETCH_GAUGE_L1 (prefetch to L1) #define PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A) \ { \ - const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ + const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL1STRM); \ svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL1STRM); \ svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL1STRM); \ @@ -178,62 +180,62 @@ Author: Nils Meyer // LOAD_CHI #define LOAD_CHI_A64FXd(base) \ { \ - Chi_00 = svld1(pg1, (float64_t*)(base + 0 * 64)); \ - Chi_01 = svld1(pg1, (float64_t*)(base + 1 * 64)); \ - Chi_02 = svld1(pg1, (float64_t*)(base + 2 * 64)); \ - Chi_10 = svld1(pg1, (float64_t*)(base + 3 * 64)); \ - Chi_11 = svld1(pg1, (float64_t*)(base + 4 * 64)); \ - Chi_12 = svld1(pg1, (float64_t*)(base + 5 * 64)); \ + Chi_00 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(0)); \ + Chi_01 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(1)); \ + Chi_02 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(2)); \ + Chi_10 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(3)); \ + Chi_11 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(4)); \ + Chi_12 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(5)); \ } // LOAD_CHIMU #define LOAD_CHIMU_INTERLEAVED_A64FXd(base) \ { \ - Chimu_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64)); \ - Chimu_30 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64)); \ - Chimu_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64)); \ - Chimu_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64)); \ - Chimu_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64)); \ - Chimu_31 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64)); \ - Chimu_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64)); \ - Chimu_21 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64)); \ - Chimu_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64)); \ - Chimu_32 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64)); \ - Chimu_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64)); \ - Chimu_22 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64)); \ + Chimu_00 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-6)); \ + Chimu_30 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(3)); \ + Chimu_10 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-3)); \ + Chimu_20 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(0)); \ + Chimu_01 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 
64), (int64_t)(-5)); \ + Chimu_31 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(4)); \ + Chimu_11 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-2)); \ + Chimu_21 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(1)); \ + Chimu_02 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-4)); \ + Chimu_32 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(5)); \ + Chimu_12 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-1)); \ + Chimu_22 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(2)); \ } // LOAD_CHIMU_0213 #define LOAD_CHIMU_0213_A64FXd \ { \ const SiteSpinor & ref(in[offset]); \ - Chimu_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64)); \ - Chimu_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64)); \ - Chimu_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64)); \ - Chimu_21 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64)); \ - Chimu_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64)); \ - Chimu_22 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64)); \ - Chimu_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64)); \ - Chimu_30 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64)); \ - Chimu_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64)); \ - Chimu_31 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64)); \ - Chimu_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64)); \ - Chimu_32 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64)); \ + Chimu_00 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-6)); \ + Chimu_20 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(0)); \ + Chimu_01 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-5)); \ + Chimu_21 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(1)); \ + Chimu_02 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-4)); \ + Chimu_22 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(2)); \ + Chimu_10 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-3)); \ + Chimu_30 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(3)); \ + Chimu_11 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-2)); \ + Chimu_31 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(4)); \ + Chimu_12 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-1)); \ + Chimu_32 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(5)); \ } // LOAD_CHIMU_0312 #define LOAD_CHIMU_0312_A64FXd \ { \ const SiteSpinor & ref(in[offset]); \ - Chimu_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64)); \ - Chimu_30 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64)); \ - Chimu_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64)); \ - Chimu_31 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64)); \ - Chimu_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64)); \ - Chimu_32 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64)); \ - Chimu_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64)); \ - Chimu_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64)); \ - Chimu_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64)); \ - Chimu_21 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64)); \ - Chimu_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64)); \ - Chimu_22 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64)); \ + Chimu_00 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-6)); \ + Chimu_30 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(3)); \ + 
Chimu_01 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-5)); \ + Chimu_31 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(4)); \ + Chimu_02 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-4)); \ + Chimu_32 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(5)); \ + Chimu_10 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-3)); \ + Chimu_20 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(0)); \ + Chimu_11 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-2)); \ + Chimu_21 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(1)); \ + Chimu_12 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-1)); \ + Chimu_22 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(2)); \ } // LOAD_TABLE0 #define LOAD_TABLE0 \ @@ -261,26 +263,26 @@ Author: Nils Meyer Chi_12 = svtbl(Chi_12, table0); // LOAD_GAUGE -#define LOAD_GAUGE \ - const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ +#define LOAD_GAUGE(A) \ { \ - U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \ - U_10 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -3 * 64)); \ - U_20 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 0 * 64)); \ - U_01 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -5 * 64)); \ - U_11 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -2 * 64)); \ - U_21 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \ + const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ + U_00 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-6)); \ + U_10 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-3)); \ + U_20 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(0)); \ + U_01 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-5)); \ + U_11 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-2)); \ + U_21 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(1)); \ } // MULT_2SPIN #define MULT_2SPIN_1_A64FXd(A) \ { \ - const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ - U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \ - U_10 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -3 * 64)); \ - U_20 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 0 * 64)); \ - U_01 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -5 * 64)); \ - U_11 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -2 * 64)); \ - U_21 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \ + const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ + U_00 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-6)); \ + U_10 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-3)); \ + U_20 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(0)); \ + U_01 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-5)); \ + U_11 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-2)); \ + U_21 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(1)); \ UChi_00 = svcmla_x(pg1, zero0, U_00, Chi_00, 0); \ UChi_10 = svcmla_x(pg1, zero0, U_00, Chi_10, 0); \ UChi_01 = svcmla_x(pg1, zero0, U_10, Chi_00, 0); \ @@ -293,9 +295,9 @@ Author: Nils Meyer UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_10, 90); \ UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_00, 90); \ UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_10, 90); \ - U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -4 * 64)); \ - U_10 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -1 * 64)); \ - U_20 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 2 * 64)); \ + U_00 = 
svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-4)); \ + U_10 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-1)); \ + U_20 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(2)); \ } // MULT_2SPIN_BACKEND #define MULT_2SPIN_2_A64FXd \ @@ -570,12 +572,12 @@ Author: Nils Meyer result_31 = svdup_f64(0.); \ result_32 = svdup_f64(0.); -// PREFETCH_RESULT_L2_STORE (prefetch store to L2) +// PREFETCH_RESULT_L2_STORE (uses DC ZVA for cache line zeroing) #define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXd(base) \ { \ - svprfd(pg1, (int64_t*)(base + 0), SV_PSTL2STRM); \ - svprfd(pg1, (int64_t*)(base + 256), SV_PSTL2STRM); \ - svprfd(pg1, (int64_t*)(base + 512), SV_PSTL2STRM); \ + asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 0) : "memory" ); \ + asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 1) : "memory" ); \ + asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 2) : "memory" ); \ } // PREFETCH_RESULT_L1_STORE (prefetch store to L1) #define PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXd(base) \ diff --git a/Grid/simd/Fujitsu_A64FX_intrin_single.h b/Grid/simd/Fujitsu_A64FX_intrin_single.h index 180e5f4f..30273b6e 100644 --- a/Grid/simd/Fujitsu_A64FX_intrin_single.h +++ b/Grid/simd/Fujitsu_A64FX_intrin_single.h @@ -38,10 +38,11 @@ Author: Nils Meyer #define LOCK_GAUGE(A) #define UNLOCK_GAUGE(A) #define MASK_REGS DECLARATIONS_A64FXf -#define SAVE_RESULT(A,B) RESULT_A64FXf(A); PREFETCH_RESULT_L2_STORE(B) +#define SAVE_RESULT(A,B) RESULT_A64FXf(A); #define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXf(Dir) #define MULT_2SPIN_2 MULT_2SPIN_2_A64FXf #define LOAD_CHI(base) LOAD_CHI_A64FXf(base) +#define ZERO_PSI ZERO_PSI_A64FXf #define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base) #define XP_PROJ XP_PROJ_A64FXf #define YP_PROJ YP_PROJ_A64FXf @@ -70,6 +71,7 @@ Author: Nils Meyer #define MAYBEPERM(A,perm) if (perm) { PERMUTE; } // DECLARATIONS #define DECLARATIONS_A64FXf \ + uint64_t baseU; \ const uint32_t lut[4][16] = { \ {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}, \ {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}, \ @@ -126,18 +128,18 @@ Author: Nils Meyer // RESULT #define RESULT_A64FXf(base) \ { \ - svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64), result_00); \ - svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64), result_01); \ - svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64), result_02); \ - svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64), result_10); \ - svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64), result_11); \ - svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64), result_12); \ - svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64), result_20); \ - svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64), result_21); \ - svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64), result_22); \ - svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64), result_30); \ - svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64), result_31); \ - svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64), result_32); \ + svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-6), result_00); \ + svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-5), result_01); \ + svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-4), result_02); \ + svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-3), result_10); \ + svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-2), result_11); \ + svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-1), 
result_12); \ + svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(0), result_20); \ + svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(1), result_21); \ + svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(2), result_22); \ + svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(3), result_30); \ + svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(4), result_31); \ + svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(5), result_32); \ } // PREFETCH_CHIMU_L2 (prefetch to L2) #define PREFETCH_CHIMU_L2_INTERNAL_A64FXf(base) \ @@ -156,7 +158,7 @@ Author: Nils Meyer // PREFETCH_GAUGE_L2 (prefetch to L2) #define PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A) \ { \ - const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \ + const auto & ref(U[sUn](A)); baseU = (uint64_t)&ref + 3 * 3 * 64; \ svprfd(pg1, (int64_t*)(baseU + -256), SV_PLDL2STRM); \ svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL2STRM); \ svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL2STRM); \ @@ -170,7 +172,7 @@ Author: Nils Meyer // PREFETCH_GAUGE_L1 (prefetch to L1) #define PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A) \ { \ - const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ + const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL1STRM); \ svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL1STRM); \ svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL1STRM); \ @@ -178,62 +180,62 @@ Author: Nils Meyer // LOAD_CHI #define LOAD_CHI_A64FXf(base) \ { \ - Chi_00 = svld1(pg1, (float32_t*)(base + 0 * 64)); \ - Chi_01 = svld1(pg1, (float32_t*)(base + 1 * 64)); \ - Chi_02 = svld1(pg1, (float32_t*)(base + 2 * 64)); \ - Chi_10 = svld1(pg1, (float32_t*)(base + 3 * 64)); \ - Chi_11 = svld1(pg1, (float32_t*)(base + 4 * 64)); \ - Chi_12 = svld1(pg1, (float32_t*)(base + 5 * 64)); \ + Chi_00 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(0)); \ + Chi_01 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(1)); \ + Chi_02 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(2)); \ + Chi_10 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(3)); \ + Chi_11 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(4)); \ + Chi_12 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(5)); \ } // LOAD_CHIMU #define LOAD_CHIMU_INTERLEAVED_A64FXf(base) \ { \ - Chimu_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64)); \ - Chimu_30 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64)); \ - Chimu_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64)); \ - Chimu_20 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64)); \ - Chimu_01 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64)); \ - Chimu_31 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64)); \ - Chimu_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64)); \ - Chimu_21 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64)); \ - Chimu_02 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64)); \ - Chimu_32 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64)); \ - Chimu_12 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64)); \ - Chimu_22 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64)); \ + Chimu_00 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-6)); \ + Chimu_30 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(3)); \ + Chimu_10 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-3)); \ + Chimu_20 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(0)); \ + Chimu_01 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-5)); \ + Chimu_31 = svld1_vnum(pg1, 
(float32_t*)(base + 2 * 3 * 64), (int64_t)(4)); \ + Chimu_11 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-2)); \ + Chimu_21 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(1)); \ + Chimu_02 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-4)); \ + Chimu_32 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(5)); \ + Chimu_12 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-1)); \ + Chimu_22 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(2)); \ } // LOAD_CHIMU_0213 #define LOAD_CHIMU_0213_A64FXf \ { \ const SiteSpinor & ref(in[offset]); \ - Chimu_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64)); \ - Chimu_20 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64)); \ - Chimu_01 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64)); \ - Chimu_21 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64)); \ - Chimu_02 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64)); \ - Chimu_22 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64)); \ - Chimu_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64)); \ - Chimu_30 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64)); \ - Chimu_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64)); \ - Chimu_31 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64)); \ - Chimu_12 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64)); \ - Chimu_32 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64)); \ + Chimu_00 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-6)); \ + Chimu_20 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(0)); \ + Chimu_01 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-5)); \ + Chimu_21 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(1)); \ + Chimu_02 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-4)); \ + Chimu_22 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(2)); \ + Chimu_10 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-3)); \ + Chimu_30 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(3)); \ + Chimu_11 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-2)); \ + Chimu_31 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(4)); \ + Chimu_12 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-1)); \ + Chimu_32 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(5)); \ } // LOAD_CHIMU_0312 #define LOAD_CHIMU_0312_A64FXf \ { \ const SiteSpinor & ref(in[offset]); \ - Chimu_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64)); \ - Chimu_30 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64)); \ - Chimu_01 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64)); \ - Chimu_31 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64)); \ - Chimu_02 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64)); \ - Chimu_32 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64)); \ - Chimu_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64)); \ - Chimu_20 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64)); \ - Chimu_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64)); \ - Chimu_21 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64)); \ - Chimu_12 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64)); \ - Chimu_22 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64)); \ + Chimu_00 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-6)); \ + Chimu_30 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(3)); \ + Chimu_01 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 
* 64), (int64_t)(-5)); \ + Chimu_31 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(4)); \ + Chimu_02 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-4)); \ + Chimu_32 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(5)); \ + Chimu_10 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-3)); \ + Chimu_20 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(0)); \ + Chimu_11 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-2)); \ + Chimu_21 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(1)); \ + Chimu_12 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-1)); \ + Chimu_22 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(2)); \ } // LOAD_TABLE0 #define LOAD_TABLE0 \ @@ -261,26 +263,26 @@ Author: Nils Meyer Chi_12 = svtbl(Chi_12, table0); // LOAD_GAUGE -#define LOAD_GAUGE \ - const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ +#define LOAD_GAUGE(A) \ { \ - U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \ - U_10 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -3 * 64)); \ - U_20 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 0 * 64)); \ - U_01 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -5 * 64)); \ - U_11 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -2 * 64)); \ - U_21 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \ + const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ + U_00 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-6)); \ + U_10 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-3)); \ + U_20 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(0)); \ + U_01 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-5)); \ + U_11 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-2)); \ + U_21 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(1)); \ } // MULT_2SPIN #define MULT_2SPIN_1_A64FXf(A) \ { \ - const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ - U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \ - U_10 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -3 * 64)); \ - U_20 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 0 * 64)); \ - U_01 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -5 * 64)); \ - U_11 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -2 * 64)); \ - U_21 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \ + const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ + U_00 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-6)); \ + U_10 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-3)); \ + U_20 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(0)); \ + U_01 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-5)); \ + U_11 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-2)); \ + U_21 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(1)); \ UChi_00 = svcmla_x(pg1, zero0, U_00, Chi_00, 0); \ UChi_10 = svcmla_x(pg1, zero0, U_00, Chi_10, 0); \ UChi_01 = svcmla_x(pg1, zero0, U_10, Chi_00, 0); \ @@ -293,9 +295,9 @@ Author: Nils Meyer UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_10, 90); \ UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_00, 90); \ UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_10, 90); \ - U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -4 * 64)); \ - U_10 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -1 * 64)); \ - U_20 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 2 * 64)); \ + U_00 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), 
(int64_t)(-4)); \ + U_10 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-1)); \ + U_20 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(2)); \ } // MULT_2SPIN_BACKEND #define MULT_2SPIN_2_A64FXf \ @@ -570,12 +572,12 @@ Author: Nils Meyer result_31 = svdup_f32(0.); \ result_32 = svdup_f32(0.); -// PREFETCH_RESULT_L2_STORE (prefetch store to L2) +// PREFETCH_RESULT_L2_STORE (uses DC ZVA for cache line zeroing) #define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXf(base) \ { \ - svprfd(pg1, (int64_t*)(base + 0), SV_PSTL2STRM); \ - svprfd(pg1, (int64_t*)(base + 256), SV_PSTL2STRM); \ - svprfd(pg1, (int64_t*)(base + 512), SV_PSTL2STRM); \ + asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 0) : "memory" ); \ + asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 1) : "memory" ); \ + asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 2) : "memory" ); \ } // PREFETCH_RESULT_L1_STORE (prefetch store to L1) #define PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXf(base) \ diff --git a/Grid/simd/Fujitsu_A64FX_undef.h b/Grid/simd/Fujitsu_A64FX_undef.h index 81eec37a..51762a60 100644 --- a/Grid/simd/Fujitsu_A64FX_undef.h +++ b/Grid/simd/Fujitsu_A64FX_undef.h @@ -46,6 +46,7 @@ Author: Nils Meyer #undef MULT_2SPIN_2 #undef MAYBEPERM #undef LOAD_CHI +#undef ZERO_PSI #undef XP_PROJ #undef YP_PROJ #undef ZP_PROJ diff --git a/Grid/simd/gridverter.py b/Grid/simd/gridverter.py deleted file mode 100755 index f00a5019..00000000 --- a/Grid/simd/gridverter.py +++ /dev/null @@ -1,2377 +0,0 @@ -#!/usr/bin/python3 - -import re -import argparse -import sys - -# Grid for A64FX -# -# * should align std::vector to (multiples of) cache block size = 256 bytes - -# place benchmark runtime in cycles here ! -measured_cycles = 690 #1500 #775 #1500 - - -# command line parser -parser = argparse.ArgumentParser(description="Dslash generator.") -parser.add_argument("--single", action="store_true", default="False") -parser.add_argument("--double", action="store_true", default="True") -parser.add_argument("--debug", action="store_true", default="False") -parser.add_argument("--gridbench", action="store_true", default="False") -args = parser.parse_args() - -print(args) - -ASM_LOAD_CHIMU = True # load chimu -ASM_LOAD_GAUGE = True # load gauge -ASM_LOAD_TABLE = True # load table -ASM_STORE = True # store result - -# Disable all loads and stores in asm for benchmarking purposes -#DISABLE_ASM_LOAD_STORE = True -DISABLE_ASM_LOAD_STORE = False - -if DISABLE_ASM_LOAD_STORE: - ASM_LOAD_CHIMU = True # load chimu - ASM_LOAD_GAUGE = True # load gauge - ASM_LOAD_TABLE = True # load table - ASM_STORE = False # store result - -# Alternative implementation using PROJ specific loads works, -# but be careful with predication - -ALTERNATIVE_LOADS = False -#ALTERNATIVE_LOADS = not ALTERNATIVE_LOADS # True - -# Alternative register mapping, -# must use with my_wilson4.h and my_wilson4pf.h - -ALTERNATIVE_REGISTER_MAPPING = False -#ALTERNATIVE_REGISTER_MAPPING = not ALTERNATIVE_REGISTER_MAPPING - -if ALTERNATIVE_REGISTER_MAPPING == True: - ALTERNATIVE_LOADS = False - -# use movprfx -MOVPRFX = False -MOVPRFX = not MOVPRFX - - -PREFETCH = False -PREFETCH = not PREFETCH # True - -PRECISION = 'double' # DP by default -PRECSUFFIX = 'A64FXd' -if args.single == True: - PRECISION = 'single' - PRECSUFFIX = 'A64FXf' - -_DEBUG = False #True # insert debugging output -if args.debug == True: - _DEBUG = True - -GRIDBENCH = False -if args.gridbench == True: - GRIDBENCH = True - -print("PRECISION = ", PRECISION) 
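
A note on the load/store rewrite running through the hunks above: svld1(pg1, ptr + k * 64) and svld1_vnum(pg1, ptr, k) address the same 64 bytes only because the A64FX vector length is 512 bits, but the _vnum form lets GCC pick the immediate-offset addressing mode of ld1d/st1d ([xN, #k, mul vl]) instead of materialising every address in a general-purpose register first. The immediate reaches -8 to +7 vector lengths, which the -6..+5 indices used here stay inside. A minimal sketch of the equivalence, assuming VL = 512 bits and -march=armv8.2-a+sve (the helper names and base argument are illustrative, not from the patch):

    #include <arm_sve.h>

    // Old form: the byte offset is folded into the pointer, so the compiler
    // computes base + 128 into a scratch register before the load.
    svfloat64_t load_offset(const float64_t *base, svbool_t pg) {
      return svld1(pg, (const float64_t *)((const uint8_t *)base + 2 * 64));
    }

    // New form: with VL = 512 bits this reads the same bytes, but maps
    // directly to "ld1d { z0.d }, p0/z, [x0, #2, mul vl]".
    svfloat64_t load_vnum(const float64_t *base, svbool_t pg) {
      return svld1_vnum_f64(pg, base, 2);
    }
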
-print("DEBUG = ", _DEBUG) -print("ALTERNATIVE_LOADS = ", ALTERNATIVE_LOADS) -print("ALTERNATIVE_REGISTER_MAPPING = ", ALTERNATIVE_REGISTER_MAPPING) -print("MOVPRFX = ", MOVPRFX) -print("DISABLE_ASM_LOAD_STORE = ", DISABLE_ASM_LOAD_STORE) -print("GRIDBENCH = ", GRIDBENCH) - -print("") - -#sys.exit(0) - - -#_DEBUG = True # insert debugging output - -FETCH_BASE_PTR_COLOR_OFFSET = 2 # offset for scalar plus signed immediate addressing -STORE_BASE_PTR_COLOR_OFFSET = 2 - -# 64-bit gp register usage !!! armclang 20.0 complains about the register choice !!! -# table address: x30 -# data address: x29 -# store address: x28 -# debug address: r8 - -# Max performance of complex FMA using FCMLA instruction -# is 25% peak. -# -# Issue latency of FCMLA is 2 cycles. -# Need 2 FCMLA instructions for complex FMA. -# Complete complex FMA takes 4 cycles. -# Peak throughput is 4 * 8 Flops DP = 32 Flops DP in 4 cycles. -# A64FX FMA throughput is 4 * 8 * 2 * 2 = 132 Flops DP in 4 cycles. -# -> 25% peak FMA -# -# In: 3x 512 bits = 192 bytes -# Out: 1x 512 bits = 64 bytes -# Tot: 4x 512 bits = 256 bytes -# -# 256 bytes * 2.2 GHz = 563.2 GB/s (base 10), 524 GB/s (base 2) - -OPT = """ -* interleave prefetching and compute in MULT_2SPIN -* could test storing U's in MULT_2SPIN to L1d for cache line update -* structure reordering: MAYBEPERM after MULT_2SPIN ? -""" - -filename = 'XXX' -LEGAL = """/************************************************************************************* - - Grid physics library, www.github.com/paboyle/Grid - - Source file: {} - - Copyright (C) 2020 - -Author: Nils Meyer - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
- - See the full license in the file "LICENSE" in the top level distribution directory -*************************************************************************************/ -/* END LEGAL */ -""" - -class Register: - - def __init__(self, variable, asmreg='X', predication=False): - global d - x = 'Y' - if predication == False: - x = asmreg # + d['asmsuffix'] - else: - x = asmreg - self.asmreg = x - self.asmregwithsuffix = asmreg + d['asmsuffix'] - self.asmregbyte = asmreg + '.b' - self.name = variable - self.asmname = variable - self.asmnamebyte = variable + '.b' - self.predication = predication - - d['registers'] += 1 - - def define(self, statement): - global d - d['C'] += F'#define {self.name} {statement}' - #d['A'] += F'#define {self.name} {statement}' - - def declare(self, predication=False): - global d - - if self.predication == False: - d['C'] += F' Simd {self.name}; \\\n' - - predtype = 'svfloat64_t' - if PRECISION == 'single': - predtype = 'svfloat32_t' - - d['I'] += F' {predtype} {self.name}; \\\n' - else: - d['I'] += F' svbool_t {self.name}; \\\n' - #d['A'] += F'#define {self.name} {self.asmreg} \n' - - def loadpredication(self, target='A'): - global d - if (target == 'A'): - d['A'] += F' "ptrue {self.asmregwithsuffix} \\n\\t" \\\n' - d['asmclobber'].append(F'"{self.asmreg}"') - - def loadtable(self, t): - global d - d['load'] += d['factor'] - gpr = d['asmtableptr'] - - cast = 'uint64_t' - #asm_opcode = 'ld1d' - #if PRECISION == 'single': - # asm_opcode = 'ld1w' - # cast = 'uint32_t' - asm_opcode = 'ldr' - if PRECISION == 'single': - asm_opcode = 'ldr' - cast = 'uint32_t' - - d['I'] += F' {self.name} = svld1(pg1, ({cast}*)&lut[{t}]); \\\n' - - # using immediate index break-out works - if asm_opcode == 'ldr': - # ldr version - d['A'] += F' "{asm_opcode} {self.asmreg}, [%[tableptr], %[index], mul vl] \\n\\t" \\\n' - else: - # ld1 version - d['A'] += F' "{asm_opcode} {{ {self.asmregwithsuffix} }}, {pg1.asmreg}/z, [%[tableptr], %[index], mul vl] \\n\\t" \\\n' - - d['asminput'].append(F'[tableptr] "r" (&lut[0])') - d['asminput'].append(F'[index] "i" ({t})') - d['asmclobber'].append(F'"memory"') - d['asmclobber'].append(F'"cc"') - - def load(self, address, target='ALL', cast='float64_t', colors=3, offset=FETCH_BASE_PTR_COLOR_OFFSET): - global d - d['load'] += d['factor'] - indices = re.findall(r'\d+', address) - index = (int(indices[0]) - offset) * colors + int(indices[1]) - - #asm_opcode = 'ld1d' - #if PRECISION == 'single': - #asm_opcode = 'ld1w' - # cast = 'float32_t' - - asm_opcode = 'ldr' - if PRECISION == 'single': - asm_opcode = 'ldr' - cast = 'float32_t' - - gpr = d['asmfetchbaseptr'] - intrinfetchbase = d['intrinfetchbase'] - if (target in ['ALL', 'C']): - d['C'] += F' {self.name} = {address}; \\\n' - if (target in ['ALL', 'I']): -# d['I'] += F' {self.name} = svldnt1(pg1, ({cast}*)({intrinfetchbase} + {index} * 64)); \\\n' - d['I'] += F' {self.name} = svld1(pg1, ({cast}*)({intrinfetchbase} + {index} * 64)); \\\n' - if (target in ['ALL', 'A']): - if asm_opcode == 'ldr': - d['A'] += F' "{asm_opcode} {self.asmreg}, [%[fetchptr], {index}, mul vl] \\n\\t" \\\n' - else: - d['A'] += F' "{asm_opcode} {{ {self.asmregwithsuffix} }}, {pg1.asmreg}/z, [%[fetchptr], {index}, mul vl] \\n\\t" \\\n' - - def store(self, address, cast='float64_t', colors=3, offset=STORE_BASE_PTR_COLOR_OFFSET): - global d - d['store'] += d['factor'] - indices = re.findall(r'\d+', address) - index = (int(indices[0]) - offset) * colors + int(indices[1]) - - #asm_opcode = 'stnt1d' - #if PRECISION == 'single': - # 
asm_opcode = 'stnt1w' - # cast = 'float32_t' - asm_opcode = 'str' - if PRECISION == 'single': - asm_opcode = 'str' - cast = 'float32_t' - - intrinstorebase = d['intrinstorebase'] - - d['C'] += F' {address} = {self.name}; \\\n' - #d['I'] += F' svstnt1(pg1, ({cast}*)({intrinstorebase} + {index} * 64), {self.name}); \\\n' - d['I'] += F' svst1(pg1, ({cast}*)({intrinstorebase} + {index} * 64), {self.name}); \\\n' - if asm_opcode == 'str': - d['A'] += F' "{asm_opcode} {self.asmreg}, [%[storeptr], {index}, mul vl] \\n\\t" \\\n' - else: - d['A'] += F' "{asm_opcode} {{ {self.asmregwithsuffix} }}, {pg1.asmreg}, [%[storeptr], {index}, mul vl] \\n\\t" \\\n' - - def movestr(self, str): - global d - #d['move'] += d['factor'] - d['I'] += F' {self.name} = {str}; \\\n' - - def move(self, op1): - global d - d['move'] += d['factor'] - d['C'] += F' {self.name} = {op1.name}; \\\n' - d['I'] += F' {self.name} = {op1.name}; \\\n' - d['A'] += F' "mov {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix} \\n\\t" \\\n' - - # a = a + b , a = b + c - def add(self, op1, op2=None): - global d - d['add'] += d['factor'] - if op2 is None: - d['C'] += F' {self.name} = {self.name} + {op1.name}; \\\n' - d['I'] += F' {self.name} = svadd_x(pg1, {self.name}, {op1.name}); \\\n' - d['A'] += F' "fadd {self.asmregwithsuffix}, {pg1.asmreg}/m, {self.asmregwithsuffix}, {op1.asmregwithsuffix} \\n\\t" \\\n' - else: - d['C'] += F' {self.name} = {op1.name} + {op2.name}; \\\n' - d['I'] += F' {self.name} = svadd_x(pg1, {op1.name}, {op2.name}); \\\n' - d['A'] += F' "fadd {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix} \\n\\t" \\\n' - - # a = a -b , a = b - c - def sub(self, op1, op2=None): - global d - d['sub'] += d['factor'] - if op2 is None: - d['C'] += F' {self.name} = {self.name} - {op1.name}; \\\n' - d['I'] += F' {self.name} = svsub_x(pg1, {self.name}, {op1.name}); \\\n' - d['A'] += F' "fsub {self.asmregwithsuffix}, {pg1.asmreg}/m, {self.asmregwithsuffix}, {op1.asmregwithsuffix} \\n\\t" \\\n' - else: - d['C'] += F' {self.name} = {op1.name} - {op2.name}; \\\n' - d['I'] += F' {self.name} = svsub_x(pg1, {op1.name}, {op2.name}); \\\n' - d['A'] += F' "fsub {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix} \\n\\t" \\\n' - - # a = a * b , a = b * c - def mul(self, op1, op2): - global d - d['mul'] += 2 * d['factor'] - d['C'] += F' {self.name} = {op1.name} * {op2.name}; \\\n' - d['I'] += F' {self.name} = __svzero({self.name}); \\\n' - d['I'] += F' {self.name} = svcmla_x(pg1, {self.name}, {op1.name}, {op2.name}, 0); \\\n' - d['I'] += F' {self.name} = svcmla_x(pg1, {self.name}, {op1.name}, {op2.name}, 90); \\\n' - d['A'] += F' "mov {self.asmregwithsuffix} , 0 \\n\\t" \\\n' - d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 0 \\n\\t" \\\n' - d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 90 \\n\\t" \\\n' - - def mul0(self, op1, op2, op3=None, constructive=False): - global d - d['mul'] += d['factor'] - - # no movprfx intrinsics support - if constructive == True: - d['movprfx'] += d['factor'] - d['I'] += F' {self.name} = svcmla_x(pg1, {op1.name}, {op2.name}, {op3.name}, 0); \\\n' - d['A'] += F' "movprfx {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix} \\n\\t" \\\n' - d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op2.asmregwithsuffix}, {op3.asmregwithsuffix}, 0 \\n\\t" \\\n' - else: - d['C'] += F' {self.name} = 
{op1.name} * {op2.name}; \\\n' - d['I'] += F' {self.name} = svcmla_x(pg1, {self.name}, {op1.name}, {op2.name}, 0); \\\n' - d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 0 \\n\\t" \\\n' - - def mul1(self, op1, op2): - global d - d['mul'] += d['factor'] - d['I'] += F' {self.name} = svcmla_x(pg1, {self.name}, {op1.name}, {op2.name}, 90); \\\n' - d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 90 \\n\\t" \\\n' - - def mac(self, op1, op2): - global d - d['mac'] += 2 * d['factor'] - d['C'] += F' {self.name} = {self.name} + {op1.name} * {op2.name}; \\\n' - d['I'] += F' {self.name} = svcmla_x(pg1, {self.name}, {op1.name}, {op2.name}, 0); \\\n' - d['I'] += F' {self.name} = svcmla_x(pg1, {self.name}, {op1.name}, {op2.name}, 90); \\\n' - d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 0 \\n\\t" \\\n' - d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 90 \\n\\t" \\\n' - - def mac0(self, op1, op2): - global d - d['mac'] += d['factor'] - d['C'] += F' {self.name} = {self.name} + {op1.name} * {op2.name}; \\\n' - d['I'] += F' {self.name} = svcmla_x(pg1, {self.name}, {op1.name}, {op2.name}, 0); \\\n' - d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 0 \\n\\t" \\\n' - - def mac1(self, op1, op2): - global d - d['mac'] += d['factor'] - d['I'] += F' {self.name} = svcmla_x(pg1, {self.name}, {op1.name}, {op2.name}, 90); \\\n' - d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 90 \\n\\t" \\\n' - - def zero(self, zeroreg=False): - d['zero'] += d['factor'] - d['C'] += F' {self.name} = 0; \\\n' - #d['I'] += F' {self.name} = __svzero({self.name}); \\\n' only armclang - - if PRECISION == 'double': - d['I'] += F' {self.name} = svdup_f64(0.); \\\n' - else: - d['I'] += F' {self.name} = svdup_f32(0.); \\\n' - - if zeroreg == True: - d['A'] += F' "fmov {self.asmregwithsuffix} , 0 \\n\\t" \\\n' - else: - #using mov z, zero0 issue 1c, FLA, latency 6c - #d['A'] += F' "mov {self.asmregwithsuffix} , {zero0.asmregwithsuffix} \\n\\t" \\\n' - - #using mov z, 0 issue 1c, FLA, latency 6c - d['A'] += F' "fmov {self.asmregwithsuffix} , 0 \\n\\t" \\\n' - - #using xor z, z, z issue 0.5c, FL*, latency 4c - #d['A'] += F' "eor {self.asmregwithsuffix}, {pg1.asmreg}/m, {self.asmregwithsuffix}, {self.asmregwithsuffix} \\n\\t" \\\n' - - #using and z, z, zero0 issue 0.5c, FL*, latency 4c - #d['A'] += F' "and {self.asmregwithsuffix}, {self.asmregwithsuffix} , {zero0.asmregwithsuffix} \\n\\t" \\\n' - - #using sub z, z, z issue 0.5c, FL*, latency 9c - #d['A'] += F' "sub {self.asmregwithsuffix}, {self.asmregwithsuffix}, {self.asmregwithsuffix} \\n\\t" \\\n' - - # without table - def timesI(self, op1, tempreg=None, tablereg=None): - global d - d['timesI'] += d['factor'] - d['C'] += F' {self.name} = timesI({op1.name}); \\\n' - # correct if DEBUG enabled, wrong if DEBUG disabled; no idea what's causing this - #table.load('table2', target='I', cast='uint64_t') - #d['I'] += F' {self.name} = svtbl({op1.name}, {tablereg.name}); \\\n' - #d['I'] += F' {self.name} = svneg_x(pg2, {self.name}); \\\n' - # timesI using trn tested, works but tbl should be faster - d['I'] += F' {tempreg.name} = svtrn2({op1.name}, {op1.name}); \\\n' - d['I'] += F' {tempreg.name} = svneg_x(pg1, {tempreg.name}); \\\n' - 
d['I'] += F' {self.name} = svtrn1({tempreg.name}, {op1.name}); \\\n' - d['A'] += F' "trn2 {tempreg.asmregwithsuffix}, {op1.asmregwithsuffix}, {op1.asmregwithsuffix} \\n\\t" \\\n' - d['A'] += F' "fneg {tempreg.asmregwithsuffix}, {pg1.asmreg}/m, {tempreg.asmregwithsuffix} \\n\\t" \\\n' - d['A'] += F' "trn1 {self.asmregwithsuffix}, {tempreg.asmregwithsuffix}, {op1.asmregwithsuffix} \\n\\t" \\\n' - - def addTimesI(self, op1, op2=None, constructive=False): - global d - d['addTimesI'] += d['factor'] - - if op2 is None: - d['C'] += F' {self.name} = {self.name} + timesI({op1.name}); \\\n' - else: - d['C'] += F' {self.name} = {op1.name} + timesI({op2.name}); \\\n' - - # no movprfx intrinsics support - if constructive == True: - d['movprfx'] += d['factor'] - d['I'] += F' {self.name} = svcadd_x(pg1, {op1.name}, {op2.name}, 90); \\\n' - d['A'] += F' "movprfx {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix} \\n\\t" \\\n' - d['A'] += F' "fcadd {self.asmregwithsuffix}, {pg1.asmreg}/m, {self.asmregwithsuffix}, {op2.asmregwithsuffix}, 90 \\n\\t" \\\n' - else: - if op2 is None: - d['C'] += F' {self.name} = {self.name} + timesI({op1.name}); \\\n' - d['I'] += F' {self.name} = svcadd_x(pg1, {self.name}, {op1.name}, 90); \\\n' - d['A'] += F' "fcadd {self.asmregwithsuffix}, {pg1.asmreg}/m, {self.asmregwithsuffix}, {op1.asmregwithsuffix}, 90 \\n\\t" \\\n' - else: - d['C'] += F' {self.name} = {op1.name} + timesI({op2.name}); \\\n' - d['I'] += F' {self.name} = svcadd_x(pg1, {op1.name}, {op2.name}, 90); \\\n' - d['A'] += F' "fcadd {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 90 \\n\\t" \\\n' - - def subTimesI(self, op1, op2=None, constructive=False): - global d - d['subTimesI'] += d['factor'] - - # no movprfx intrinsics support - if constructive == True: - d['movprfx'] += d['factor'] - d['I'] += F' {self.name} = svcadd_x(pg1, {op1.name}, {op2.name}, 270); \\\n' - d['A'] += F' "movprfx {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix} \\n\\t" \\\n' - d['A'] += F' "fcadd {self.asmregwithsuffix}, {pg1.asmreg}/m, {self.asmregwithsuffix}, {op2.asmregwithsuffix}, 270 \\n\\t" \\\n' - else: - if op2 is None: - d['C'] += F' {self.name} = {self.name} - timesI({op1.name}); \\\n' - d['I'] += F' {self.name} = svcadd_x(pg1, {self.name}, {op1.name}, 270); \\\n' - d['A'] += F' "fcadd {self.asmregwithsuffix}, {pg1.asmreg}/m, {self.asmregwithsuffix}, {op1.asmregwithsuffix}, 270 \\n\\t" \\\n' - else: - d['C'] += F' {self.name} = {op1.name} - timesI({op2.name}); \\\n' - d['I'] += F' {self.name} = svcadd_x(pg1, {op1.name}, {op2.name}, 270); \\\n' - d['A'] += F' "fcadd {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 270 \\n\\t" \\\n' - - # timesMinusI is not used, def is probably wrong !!!! 
OPTIMIZATION with table - def timesMinusI(self, op1): - global d - d['timesMinusI'] += d['factor'] - d['C'] += F' {self.name} = timesMinusI({self.name}); \\\n' - d['I'] += F' {self.name} = svtrn1({op1.name}, {op1.name}); \\\n' - d['I'] += F' {self.name} = svneg_x(pg1, {self.name}); \\\n' - d['I'] += F' {self.name} = svtrn1({op1.name}, {self.name}); \\\n' - - def permute(self, dir, tablereg=None): - global d - d['permutes'] += d['factor'] - - d['C'] += F' permute{dir}({self.name}, {self.name}); \\\n' - - d['I'] += F' {self.name} = svtbl({self.name}, {tablereg.name}); \\\n' - d['A'] += F' "tbl {self.asmregwithsuffix}, {{ {self.asmregwithsuffix} }}, {tablereg.asmregwithsuffix} \\n\\t" \\\n' - - # if dir == 0: - # d['I'] += F' {self.name} = svext({self.name}, {self.name}, 4); \\\n' - # # this might not work, see intrinsics assembly - # # d['A'] += F' ext {self.name}, {self.name}, {self.name}, #4 \\\n' - # # use registers directly - # d['A'] += F' "ext {self.asmregbyte}, {self.asmregbyte}, {self.asmregbyte}, 32 \\n\\t" \\\n' - # - # elif dir in [1, 2]: - # d['I'] += F' {self.name} = svtbl({self.name}, {tablereg.name}); \\\n' - # d['A'] += F' "tbl {self.asmregwithsuffix}, {{ {self.asmregwithsuffix} }}, {tablereg.asmregwithsuffix} \\n\\t" \\\n' - - def debug(self): - global d - typecast = d['cfloat'] - gpr = d['asmdebugptr'] - vregs = d['asmclobberlist'] - if (d['debug'] == True): - d['C'] += F'std::cout << "{self.name} -- " << {self.name} << std::endl; \\\n' - - d['I'] += F'svst1(pg1, ({typecast}*)&debugreg.v, {self.name}); \\\n' - d['I'] += F'std::cout << "{self.name} -- " << debugreg << std::endl; \\\n' - #d['I'] += F'std::cout << "{self.name} -- " << {self.name} << std::endl; \\\n' - - d['A'] += F'asm ( \\\n' - d['A'] += F' " DMB SY \\n\\t " " DSB SY \\n\\t " " ISB SY \\n\\t " \\\n' # memory barrier - d['A'] += F' "str {self.asmreg}, [%[ptr]] \\n\\t" \\\n' - d['A'] += F' " DMB SY \\n\\t " " DSB SY \\n\\t " " ISB SY \\n\\t " \\\n' # memory barrier - d['A'] += F' : "=m" (debugreg.v) \\\n' - d['A'] += F' : [ptr] "r" (&debugreg.v) \\\n' - d['A'] += F' : "p5", "cc", "memory" \\\n' - d['A'] += F'); \\\n' - d['A'] += F'std::cout << "{self.name} -- " << debugreg << std::endl; \\\n' - # this form of addressing is not valid! 
- #d['A'] += F' "str {self.asmreg}, %[ptr] \\n\\t" \\\n' -# end Register - -def define(s, target='ALL'): - x = F'#define {s} \n' - global d - if (target in ['ALL', 'C']): - d['C'] += x - if (target in ['ALL', 'I']): - d['I'] += x - if (target in ['ALL', 'A']): - d['A'] += x - -def definemultiline(s): - x = F'#define {s} \\\n' - global d - d['C'] += x - d['I'] += x - d['A'] += x - -def write(s, target='ALL'): - x = F'{s}\n' - global d - if (target in ['ALL', 'C']): - d['C'] += x - if (target in ['ALL', 'I']): - d['I'] += x - if (target in ['ALL', 'A']): - d['A'] += x - -def curlyopen(): - write(F'{{ \\') - -def curlyclose(): - write(F'}}') - -def newline(target='ALL'): - global d - - if target == 'A': - if d['A'][-2:] == '\\\n': - d['A'] = d['A'][:-2] + '\n\n' - else: - if d['C'][-2:] == '\\\n': - d['C'] = d['C'][:-2] + '\n\n' - if d['I'][-2:] == '\\\n': - d['I'] = d['I'][:-2] + '\n\n' - if d['A'][-2:] == '\\\n': - d['A'] = d['A'][:-2] + '\n\n' - -# load the base pointer for fetches -def fetch_base_ptr(address, target='A'): - global d - #d['load'] += d['factor'] - - # DEBUG - #colors=3 - #indices = re.findall(r'\d+', address) - #index = (int(indices[0]) - FETCH_BASE_PTR_COLOR_OFFSET) * colors + int(indices[1]) - #print(F'{address} (base)') - - vregs = d['asmclobberlist'] - if target == 'A': - d['asminput'].append(F'[fetchptr] "r" ({address})') - d['asmclobber'].extend(vregs) - d['asmclobber'].append(F'"memory"') - d['asmclobber'].append(F'"cc"') - if target == 'I': - #print("intrinfetchbase = ", address) - d['intrinfetchbase'] = address - -# load the base pointer for stores -def store_base_ptr(address, target='A'): - global d - #d['load'] += d['factor'] - gpr = d['asmstorebaseptr'] - vregs = d['asmclobberlist'] - if target == 'A': - d['asminput'].append(F'[storeptr] "r" ({address})') - d['asmclobber'].extend(vregs) - d['asmclobber'].append(F'"memory"') - d['asmclobber'].append(F'"cc"') - if target == 'I': - d['intrinstorebase'] = address - -def prefetch_L1(address, offset): - global d - multiplier = 4 # offset in CL, have to multiply by 4 - policy = "PLDL1STRM" # weak - #policy = "PLDL1KEEP" # strong - - d['I'] += F' svprfd(pg1, (int64_t*)({address} + {offset * multiplier * 64}), SV_{policy}); \\\n' - d['A'] += F' "prfd {policy}, {pg1.asmreg}, [%[fetchptr], {offset * multiplier}, mul vl] \\n\\t" \\\n' - -def prefetch_L2(address, offset): - global d - multiplier = 4 # offset in CL, have to multiply by 4 - policy = "PLDL2STRM" # weak - #policy = "PLDL2KEEP" # strong - - d['I'] += F' svprfd(pg1, (int64_t*)({address} + {offset * multiplier * 64}), SV_{policy}); \\\n' - d['A'] += F' "prfd {policy}, {pg1.asmreg}, [%[fetchptr], {offset * multiplier}, mul vl] \\n\\t" \\\n' - #d['A'] += - -def prefetch_L2_store(address, offset): - global d - multiplier = 4 # offset in CL, have to multiply by 4 - policy = "PSTL2STRM" # weak - #policy = "PSTL2KEEP" # strong - - d['I'] += F' svprfd(pg1, (int64_t*)({address} + {offset * multiplier * 64}), SV_{policy}); \\\n' - d['A'] += F' "prfd {policy}, {pg1.asmreg}, [%[fetchptr], {offset * multiplier}, mul vl] \\n\\t" \\\n' - -def prefetch_L1_store(address, offset): - global d - multiplier = 4 # offset in CL, have to multiply by 4 - policy = "PSTL1STRM" # weak - #policy = "PSTL2KEEP" # strong - - d['I'] += F' svprfd(pg1, (int64_t*)({address} + {offset * multiplier * 64}), SV_{policy}); \\\n' - d['A'] += F' "prfd {policy}, {pg1.asmreg}, [%[fetchptr], {offset * multiplier}, mul vl] \\n\\t" \\\n' - - -def asmopen(): - #write('asm volatile ( \\', target='A') - write('asm 
( \\', target='A') - - # DEBUG - #write(F' " DMB SY \\n\\t " " DSB SY \\n\\t " " ISB SY \\n\\t " \\', target='A') # memory barrier - #write('asm volatile ( \\', target='A') - -def asmclose(): - global d - - #print(d['asminput']) - - asmin = d['asminput'] - asmin_s = '' - if len(asmin) > 0: - asmin = list(dict.fromkeys(asmin)) # remove duplicates - #print(asmin) - for el in asmin: - asmin_s += el + ',' - asmin_s = asmin_s[:-1] - #print("-> ", asmin_s) - - d['asminput'] = [] - - asmout = d['asmoutput'] - asmout_s = '' - if len(asmout) > 0: - asmout = list(dict.fromkeys(asmout)) # remove duplicates - for el in asmout: - asmout_s += el + ',' - asmout_s = asmout_s[:-1] - - d['asmoutput'] = [] - - # DEBUG put all regs into clobber by default - d['asmclobber'].extend(d['asmclobberlist']) - - asmclobber = d['asmclobber'] - asmclobber_s = '' - #print(asmclobber) - if len(asmclobber) > 0: - asmclobber = list(dict.fromkeys(asmclobber)) # remove duplicates - for el in asmclobber: - asmclobber_s += el + ',' - asmclobber_s = asmclobber_s[:-1] - - d['asmclobber'] = [] - - # DEBUG - #write(F' " DMB SY \\n\\t " " DSB SY \\n\\t " " ISB SY \\n\\t " \\', target='A') # memory barrier - - - write(F' : {asmout_s} \\', target='A') - write(F' : {asmin_s} \\', target='A') - write(F' : {asmclobber_s} \\', target='A') - write('); \\', target='A') - -# -------------------------------------------------------------------------------- - -# string of vector registers to be used in clobber list -#clobberlist = ['"p0"'] -clobberlist = ['"p5"'] -clobberlist.append('"cc"') -for i in range(0, 32): - clobberlist.append(F'"z{i}"') - -d = { -'debug': _DEBUG, -'C': '', -'I': '', -'A': '', -'asmsuffix': '.d', # double precision by default -'cfloat': 'float64_t', -'registers': 0, -'load': 0, -'store': 0, -'move': 0, -'movprfx': 0, -'zero': 0, -'add': 0, -'sub': 0, -'mul': 0, -'mac': 0, -'permutes': 0, -'neg': 0, -'addTimesI': 0, -'subTimesI': 0, -'timesI': 0, -'timesMinusI': 0, -'flops': 0, -'factor': 1, # multiplicity -'asmtableptr': 'x30', -'asmfetchbaseptr': 'x29', -'asmstorebaseptr': 'x28', -'asmdebugptr': 'r12', -'asminput': [], -'asmoutput': [], -'asmclobber': [], -'asmclobberlist': clobberlist, -'intrinfetchbase': '', -'intrinstorebase': '', -'cycles_LOAD_CHIMU': 0, -'cycles_PROJ': 0, -'cycles_PERM': 0, -'cycles_MULT_2SPIN': 0, -'cycles_RECON': 0, -'cycles_RESULT': 0, -'cycles_ZERO_PSI': 0, -'cycles_PREFETCH_L1': 0, -'cycles_PREFETCH_L2': 0 -} - -if PRECISION == 'single': - d['asmsuffix'] = '.s' - d['cfloat'] = 'float32_t' - -# -------------------------------------------------------------------------------- -# Grid -# -------------------------------------------------------------------------------- - -# Variables / Registers -result_00 = Register('result_00', asmreg='z0') -result_01 = Register('result_01', asmreg='z1') -result_02 = Register('result_02', asmreg='z2') -result_10 = Register('result_10', asmreg='z3') -result_11 = Register('result_11', asmreg='z4') -result_12 = Register('result_12', asmreg='z5') -result_20 = Register('result_20', asmreg='z6') -result_21 = Register('result_21', asmreg='z7') -result_22 = Register('result_22', asmreg='z8') -result_30 = Register('result_30', asmreg='z9') -result_31 = Register('result_31', asmreg='z10') -result_32 = Register('result_32', asmreg='z11') # 12 Regs -Chi_00 = Register('Chi_00', asmreg='z12') -Chi_01 = Register('Chi_01', asmreg='z13') -Chi_02 = Register('Chi_02', asmreg='z14') -Chi_10 = Register('Chi_10', asmreg='z15') -Chi_11 = Register('Chi_11', asmreg='z16') -Chi_12 = 
Register('Chi_12', asmreg='z17') # 6 -UChi_00 = Register('UChi_00', asmreg='z18') -UChi_01 = Register('UChi_01', asmreg='z19') -UChi_02 = Register('UChi_02', asmreg='z20') -UChi_10 = Register('UChi_10', asmreg='z21') -UChi_11 = Register('UChi_11', asmreg='z22') -UChi_12 = Register('UChi_12', asmreg='z23') # 6 -U_00 = Register('U_00', asmreg='z24') -U_10 = Register('U_10', asmreg='z25') -U_20 = Register('U_20', asmreg='z26') -U_01 = Register('U_01', asmreg='z27') -U_11 = Register('U_11', asmreg='z28') -U_21 = Register('U_21', asmreg='z29') # 6 -> 30 Registers - -table0 = Register('table0', asmreg='z30') -zero0 = Register('zero0', asmreg='z31') # 2 -> 32 Registers -# can't overload temp1 / table due to type mismatch using intrinsics :( -# typecasting SVE intrinsics variables is not allowed - -pg1 = Register('pg1', predication=True, asmreg='p5') -#pg2 = Register('pg2', predication=True, asmreg='p1') - -# Overloaded with Chi_* and UChi_* -Chimu_00 = Register('Chimu_00', asmreg=Chi_00.asmreg) -Chimu_01 = Register('Chimu_01', asmreg=Chi_01.asmreg) -Chimu_02 = Register('Chimu_02', asmreg=Chi_02.asmreg) -Chimu_10 = Register('Chimu_10', asmreg=Chi_10.asmreg) -Chimu_11 = Register('Chimu_11', asmreg=Chi_11.asmreg) -Chimu_12 = Register('Chimu_12', asmreg=Chi_12.asmreg) -if ALTERNATIVE_REGISTER_MAPPING == False: - Chimu_20 = Register('Chimu_20', asmreg=UChi_00.asmreg) - Chimu_21 = Register('Chimu_21', asmreg=UChi_01.asmreg) - Chimu_22 = Register('Chimu_22', asmreg=UChi_02.asmreg) - Chimu_30 = Register('Chimu_30', asmreg=UChi_10.asmreg) - Chimu_31 = Register('Chimu_31', asmreg=UChi_11.asmreg) - Chimu_32 = Register('Chimu_32', asmreg=UChi_12.asmreg) # 12 Registers -else: # wilson4.h - Chimu_20 = Register('Chimu_20', asmreg=U_00.asmreg) - Chimu_21 = Register('Chimu_21', asmreg=U_10.asmreg) - Chimu_22 = Register('Chimu_22', asmreg=U_20.asmreg) - Chimu_30 = Register('Chimu_30', asmreg=U_01.asmreg) - Chimu_31 = Register('Chimu_31', asmreg=U_11.asmreg) - Chimu_32 = Register('Chimu_32', asmreg=U_21.asmreg) - -# debugging output -def debugall(msg=None, group='ALL'): - global d - if (d['debug'] == False): - return - write(F'std::cout << std::endl << "DEBUG -- {msg}" << std::endl; \\') - if (group in ['ALL', 'result']): - result_00.debug() - result_01.debug() - result_02.debug() - result_10.debug() - result_11.debug() - result_12.debug() - result_20.debug() - result_21.debug() - result_22.debug() - result_30.debug() - result_31.debug() - result_32.debug() - if (group in ['ALL', 'Chi']): - Chi_00.debug() - Chi_01.debug() - Chi_02.debug() - Chi_10.debug() - Chi_11.debug() - Chi_12.debug() - if (group in ['ALL', 'UChi']): - UChi_00.debug() - UChi_01.debug() - UChi_02.debug() - UChi_10.debug() - UChi_11.debug() - UChi_12.debug() - if (group in ['ALL', 'U']): - U_00.debug() - U_10.debug() - U_20.debug() - U_01.debug() - U_11.debug() - U_21.debug() - if (group in ['ALL', 'Chimu']): - Chimu_00.debug() - Chimu_01.debug() - Chimu_02.debug() - Chimu_10.debug() - Chimu_11.debug() - Chimu_12.debug() - Chimu_20.debug() - Chimu_21.debug() - Chimu_22.debug() - Chimu_30.debug() - Chimu_31.debug() - Chimu_32.debug() - -# -------------------------------------------------------------------------------- -# Output -# -------------------------------------------------------------------------------- - -if ALTERNATIVE_LOADS == True: - define(F'LOAD_CHIMU_0213_PLUG LOAD_CHIMU_0213_{PRECSUFFIX}') - define(F'LOAD_CHIMU_0312_PLUG LOAD_CHIMU_0312_{PRECSUFFIX}') - define(F'LOAD_CHIMU(x)') -else: - #define(F'LOAD_CHIMU_{PRECSUFFIX}(x) 
LOAD_CHIMU_INTERLEAVED_{PRECSUFFIX}(x)') - define(F'LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_{PRECSUFFIX}(base)') - -if PREFETCH: - define(F'PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_{PRECSUFFIX}(A)') - define(F'PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_{PRECSUFFIX}(A)') - define(F'PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_{PRECSUFFIX}(A)') - define(F'PREFETCH_GAUGE_L2(A) PREFETCH_GAUGE_L2_INTERNAL_{PRECSUFFIX}(A)') - define(F'PF_GAUGE(A)') - define(F'PREFETCH_RESULT_L2_STORE(A) PREFETCH_RESULT_L2_STORE_INTERNAL_{PRECSUFFIX}(A)') - define(F'PREFETCH_RESULT_L1_STORE(A) PREFETCH_RESULT_L1_STORE_INTERNAL_{PRECSUFFIX}(A)') - define(F'PREFETCH1_CHIMU(A) PREFETCH_CHIMU_L1(A)') -# define(F'PREFETCH1_CHIMU(A)') - define(F'PREFETCH_CHIMU(A) PREFETCH_CHIMU_L1(A)') -# define(F'PREFETCH_CHIMU(A)') -else: - define(F'PREFETCH_CHIMU_L1(A)') - define(F'PREFETCH_GAUGE_L1(A)') - define(F'PREFETCH_CHIMU_L2(A)') - define(F'PREFETCH_GAUGE_L2(A)') - define(F'PF_GAUGE(A)') - define(F'PREFETCH1_CHIMU(A)') - define(F'PREFETCH_CHIMU(A)') - define(F'PREFETCH_RESULT_L2_STORE(A)') - -# standard defines -define(F'LOCK_GAUGE(A)') -define(F'UNLOCK_GAUGE(A)') -define(F'MASK_REGS DECLARATIONS_{PRECSUFFIX}') -define(F'SAVE_RESULT(A,B) RESULT_{PRECSUFFIX}(A); PREFETCH_RESULT_L2_STORE(B)') -define(F'MULT_2SPIN_1(Dir) MULT_2SPIN_1_{PRECSUFFIX}(Dir)') -define(F'MULT_2SPIN_2 MULT_2SPIN_2_{PRECSUFFIX}') -define(F'LOAD_CHI(base) LOAD_CHI_{PRECSUFFIX}(base)') -# don't need zero psi, everything is done in recons -#define(F'ZERO_PSI ZERO_PSI_{PRECSUFFIX}') -define(F'ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_{PRECSUFFIX}; RESULT_{PRECSUFFIX}(base)') -# loads projections -define(F'XP_PROJ XP_PROJ_{PRECSUFFIX}') -define(F'YP_PROJ YP_PROJ_{PRECSUFFIX}') -define(F'ZP_PROJ ZP_PROJ_{PRECSUFFIX}') -define(F'TP_PROJ TP_PROJ_{PRECSUFFIX}') -define(F'XM_PROJ XM_PROJ_{PRECSUFFIX}') -define(F'YM_PROJ YM_PROJ_{PRECSUFFIX}') -define(F'ZM_PROJ ZM_PROJ_{PRECSUFFIX}') -define(F'TM_PROJ TM_PROJ_{PRECSUFFIX}') -# recons -define(F'XP_RECON XP_RECON_{PRECSUFFIX}') -define(F'XM_RECON XM_RECON_{PRECSUFFIX}') -define(F'XM_RECON_ACCUM XM_RECON_ACCUM_{PRECSUFFIX}') -define(F'YM_RECON_ACCUM YM_RECON_ACCUM_{PRECSUFFIX}') -define(F'ZM_RECON_ACCUM ZM_RECON_ACCUM_{PRECSUFFIX}') -define(F'TM_RECON_ACCUM TM_RECON_ACCUM_{PRECSUFFIX}') -define(F'XP_RECON_ACCUM XP_RECON_ACCUM_{PRECSUFFIX}') -define(F'YP_RECON_ACCUM YP_RECON_ACCUM_{PRECSUFFIX}') -define(F'ZP_RECON_ACCUM ZP_RECON_ACCUM_{PRECSUFFIX}') -define(F'TP_RECON_ACCUM TP_RECON_ACCUM_{PRECSUFFIX}') -# new permutes -define(F'PERMUTE_DIR0 0') -define(F'PERMUTE_DIR1 1') -define(F'PERMUTE_DIR2 2') -define(F'PERMUTE_DIR3 3') -define(F'PERMUTE PERMUTE_{PRECSUFFIX};') -# load table -#define(F'MAYBEPERM(A,perm) if (perm) {{ A ; }}') -if PRECISION == 'double': - define(F'LOAD_TABLE(Dir) if (Dir == 0) {{ LOAD_TABLE0; }} else if (Dir == 1) {{ LOAD_TABLE1; }} else if (Dir == 2) {{ LOAD_TABLE2; }}') - define(F'MAYBEPERM(Dir,perm) if (Dir != 3) {{ if (perm) {{ PERMUTE; }} }}') -else: - define(F'LOAD_TABLE(Dir) if (Dir == 0) {{ LOAD_TABLE0; }} else if (Dir == 1) {{ LOAD_TABLE1 }} else if (Dir == 2) {{ LOAD_TABLE2; }} else if (Dir == 3) {{ LOAD_TABLE3; }}') - define(F'MAYBEPERM(A,perm) if (perm) {{ PERMUTE; }}') - - - -write('// DECLARATIONS') -definemultiline(F'DECLARATIONS_{PRECSUFFIX}') -# debugging register -if d['debug'] == True: - write(' Simd debugreg; \\') -# perm tables -if PRECISION == 'double': - write(' const uint64_t lut[4][8] = { \\') - write(' {4, 5, 6, 7, 0, 1, 2, 3}, \\') #0 = 
swap register halves - write(' {2, 3, 0, 1, 6, 7, 4, 5}, \\') #1 = swap halves of halves - write(' {1, 0, 3, 2, 5, 4, 7, 6}, \\') #2 = swap re/im - write(' {0, 1, 2, 4, 5, 6, 7, 8} };\\') #3 = identity -else: - write(' const uint32_t lut[4][16] = { \\') - write(' {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}, \\') #0 = swap register halves - write(' {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}, \\') #1 = swap halves of halves - write(' {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}, \\') #2 = swap halves of halves of halves - write(' {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14} }; \\') #3 = swap re/im - -#newline(target='A') -result_00.declare() -result_01.declare() -result_02.declare() -result_10.declare() -result_11.declare() -result_12.declare() -result_20.declare() -result_21.declare() -result_22.declare() -result_30.declare() -result_31.declare() -result_32.declare() # 12 -Chi_00.declare() -Chi_01.declare() -Chi_02.declare() -Chi_10.declare() -Chi_11.declare() -Chi_12.declare() # 6 -UChi_00.declare() -UChi_01.declare() -UChi_02.declare() -UChi_10.declare() -UChi_11.declare() -UChi_12.declare() # 6 -U_00.declare() -U_10.declare() -U_20.declare() -U_01.declare() -U_11.declare() -U_21.declare() # 6 -> 30 regs - -# all predications true -pg1.declare() -if PRECISION == 'double': - pg1.movestr('svptrue_b64()') -else: - pg1.movestr('svptrue_b32()') - -# tables -if PRECISION == 'double': - write(' svuint64_t table0; \\', target='I') # -> 31 regs -else: - write(' svuint32_t table0; \\', target='I') # -> 31 regs - -zero0.declare() - -# zero register -asmopen() -zero0.zero(zeroreg=True) -asmclose() -newline() - -define('Chimu_00 Chi_00', target='I') -define('Chimu_01 Chi_01', target='I') -define('Chimu_02 Chi_02', target='I') -define('Chimu_10 Chi_10', target='I') -define('Chimu_11 Chi_11', target='I') -define('Chimu_12 Chi_12', target='I') -if ALTERNATIVE_REGISTER_MAPPING == False: - define('Chimu_20 UChi_00', target='I') - define('Chimu_21 UChi_01', target='I') - define('Chimu_22 UChi_02', target='I') - define('Chimu_30 UChi_10', target='I') - define('Chimu_31 UChi_11', target='I') - define('Chimu_32 UChi_12', target='I') -else: # wilson4.h - define('Chimu_20 U_00', target='I') - define('Chimu_21 U_10', target='I') - define('Chimu_22 U_20', target='I') - define('Chimu_30 U_01', target='I') - define('Chimu_31 U_11', target='I') - define('Chimu_32 U_21', target='I') -newline() - - -d['cycles_RESULT'] += 12 -write('// RESULT') -definemultiline(F'RESULT_{PRECSUFFIX}(base)') -if ASM_STORE: - curlyopen() - #write(' SiteSpinor & ref(out[ss]); \\') - asmopen() - #pg1.loadpredication() - #store_base_ptr("&ref[0][0]") - #store_base_ptr(F"&ref[{STORE_BASE_PTR_COLOR_OFFSET}][0]") - store_base_ptr(F"base + {STORE_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='I') - store_base_ptr(F"base + {STORE_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='A') - result_00.store("ref[0][0]") - result_01.store("ref[0][1]") - result_02.store("ref[0][2]") - result_10.store("ref[1][0]") - result_11.store("ref[1][1]") - result_12.store("ref[1][2]") - result_20.store("ref[2][0]") - result_21.store("ref[2][1]") - result_22.store("ref[2][2]") - result_30.store("ref[3][0]") - result_31.store("ref[3][1]") - result_32.store("ref[3][2]") - asmclose() - debugall('RESULT', group='result') - curlyclose() -newline() - -# prefetch spinors from memory into L2 cache -d['factor'] = 0 -d['cycles_PREFETCH_L2'] += 0 * d['factor'] -write('// PREFETCH_CHIMU_L2 (prefetch to L2)') 
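
The generator being deleted here still emits SV_PSTL2STRM store prefetches (prefetch_L2_store above), while the hand-edited headers earlier in this patch replace the PREFETCH_RESULT_L2_STORE internals with dc zva. That instruction zeroes and allocates a whole write block in cache rather than fetching it from memory, so the full-line stores of the result spinor skip the read-for-ownership; the macros step by 256 bytes, matching the A64FX cache line, and the result spinor is 12 vectors x 64 B = 768 B, hence exactly three dc zva per site. This is only safe while RESULT_A64FXd/f really overwrite all 768 bytes; note that the single-precision SAVE_RESULT above no longer invokes PREFETCH_RESULT_L2_STORE at all. A minimal sketch for querying the ZVA block size at runtime, not part of the patch:

    #include <cstdint>
    #include <cstdio>

    // DCZID_EL0 is readable at EL0: bits [3:0] hold log2 of the DC ZVA
    // block size in 4-byte words; bit 4 set means DC ZVA is prohibited.
    int main() {
      uint64_t dczid;
      asm volatile("mrs %0, dczid_el0" : "=r"(dczid));
      if (dczid & (1ull << 4))
        std::puts("DC ZVA prohibited");
      else
        std::printf("DC ZVA block: %llu bytes\n",
                    (unsigned long long)(4ull << (dczid & 0xfull)));
      return 0;
    }
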
-definemultiline(F'PREFETCH_CHIMU_L2_INTERNAL_{PRECSUFFIX}(base)') -curlyopen() -fetch_base_ptr(F"base") -asmopen() -#pg1.loadpredication() -#fetch_base_ptr(F"&ref[{FETCH_BASE_PTR_COLOR_OFFSET}][0]") -fetch_base_ptr(F"base", target='A') -prefetch_L2(F"base", 0) -prefetch_L2(F"base", 1) -prefetch_L2(F"base", 2) -asmclose() -curlyclose() -newline() - -# prefetch spinors from memory into L1 cache -d['factor'] = 0 -d['cycles_PREFETCH_L1'] += 0 * d['factor'] -write('// PREFETCH_CHIMU_L1 (prefetch to L1)') -definemultiline(F'PREFETCH_CHIMU_L1_INTERNAL_{PRECSUFFIX}(base)') -curlyopen() -fetch_base_ptr(F"base") -asmopen() -#pg1.loadpredication() -fetch_base_ptr(F"base", target='A') -prefetch_L1(F"base", 0) -prefetch_L1(F"base", 1) -prefetch_L1(F"base", 2) -asmclose() -curlyclose() -newline() - -# prefetch gauge from memory into L2 cache -d['factor'] = 0 -d['cycles_PREFETCH_L2'] += 0 * d['factor'] -write('// PREFETCH_GAUGE_L2 (prefetch to L2)') -definemultiline(F'PREFETCH_GAUGE_L2_INTERNAL_{PRECSUFFIX}(A)') -curlyopen() -if GRIDBENCH: # referencing differs in Grid and GridBench - write(' const auto & ref(U[sUn][A]); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \\') -else: - write(' const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \\') -asmopen() -#pg1.loadpredication() -#fetch_base_ptr(F"&ref[{FETCH_BASE_PTR_COLOR_OFFSET}][0]") -fetch_base_ptr(F"baseU", target='A') -prefetch_L2(F"baseU", -1) -prefetch_L2(F"baseU", 0) -prefetch_L2(F"baseU", 1) -prefetch_L2(F"baseU", 2) -prefetch_L2(F"baseU", 3) -prefetch_L2(F"baseU", 4) -prefetch_L2(F"baseU", 5) -prefetch_L2(F"baseU", 6) -prefetch_L2(F"baseU", 7) -#prefetch_L2(F"baseU", 8) -asmclose() -curlyclose() -newline() - -# prefetch gauge from memory into L1 cache -d['factor'] = 0 -d['cycles_PREFETCH_L1'] += 0 * d['factor'] -write('// PREFETCH_GAUGE_L1 (prefetch to L1)') -definemultiline(F'PREFETCH_GAUGE_L1_INTERNAL_{PRECSUFFIX}(A)') -curlyopen() -if GRIDBENCH: # referencing differs in Grid and GridBench - write(' const auto & ref(U[sU][A]); uint64_t baseU = (uint64_t)&ref; \\') -else: - write(' const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \\') -asmopen() -#pg1.loadpredication() -#fetch_base_ptr(F"&ref[{FETCH_BASE_PTR_COLOR_OFFSET}][0]") -fetch_base_ptr(F"baseU", target='A') -prefetch_L1(F"baseU", 0) -prefetch_L1(F"baseU", 1) -prefetch_L1(F"baseU", 2) -asmclose() -curlyclose() -newline() - -d['factor'] = 0 -write('// LOAD_CHI') -definemultiline(F'LOAD_CHI_{PRECSUFFIX}(base)') -if ASM_LOAD_CHIMU: - curlyopen() - #write(' const SiteSpinor & ref(in[offset]); \\') - asmopen() - #fetch_base_ptr(F"base + {FETCH_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='I') - #fetch_base_ptr(F"base + {FETCH_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='A') - fetch_base_ptr(F"base", target='I') - fetch_base_ptr(F"base", target='A') - - Chi_00.load("ref[0][0]", offset=0) - Chi_01.load("ref[0][1]", offset=0) - Chi_02.load("ref[0][2]", offset=0) - Chi_10.load("ref[1][0]", offset=0) - Chi_11.load("ref[1][1]", offset=0) - Chi_12.load("ref[1][2]", offset=0) - asmclose() - debugall('LOAD_CHI', group='Chi') - curlyclose() -newline() - - - -d['factor'] = 8 -# 12 loads = 12 issues, load latency = 8+1 cycles -# (not perfectly clear to me from docs) -d['cycles_LOAD_CHIMU'] += 11 * d['factor'] -write('// LOAD_CHIMU') -definemultiline(F'LOAD_CHIMU_INTERLEAVED_{PRECSUFFIX}(base)') -if ASM_LOAD_CHIMU: - curlyopen() - #write(' const SiteSpinor & ref(in[offset]); \\') - asmopen() - pg1.loadpredication() - #fetch_base_ptr("&ref[0][0]") - 
#fetch_base_ptr(F"&ref[{FETCH_BASE_PTR_COLOR_OFFSET}][0]") - fetch_base_ptr(F"base + {FETCH_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='I') - fetch_base_ptr(F"base + {FETCH_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='A') - # Chimu_00.load("ref[0][0]") - # Chimu_01.load("ref[0][1]") - # Chimu_02.load("ref[0][2]") - # Chimu_10.load("ref[1][0]") - # Chimu_11.load("ref[1][1]") - # Chimu_12.load("ref[1][2]") - # Chimu_20.load("ref[2][0]") - # Chimu_21.load("ref[2][1]") - # Chimu_22.load("ref[2][2]") - # Chimu_30.load("ref[3][0]") - # Chimu_31.load("ref[3][1]") - # Chimu_32.load("ref[3][2]") - - Chimu_00.load("ref[0][0]") # minimum penalty for all directions - Chimu_30.load("ref[3][0]") - Chimu_10.load("ref[1][0]") - Chimu_20.load("ref[2][0]") - - Chimu_01.load("ref[0][1]") - Chimu_31.load("ref[3][1]") - Chimu_11.load("ref[1][1]") - Chimu_21.load("ref[2][1]") - - Chimu_02.load("ref[0][2]") - Chimu_32.load("ref[3][2]") - Chimu_12.load("ref[1][2]") - Chimu_22.load("ref[2][2]") - asmclose() - debugall('LOAD_CHIMU', group='Chimu') - curlyclose() -newline() - -# alternative load chimu: dirac order 0213 -# placed into asm (...) -d['factor'] = 0 -d['cycles_LOAD_CHIMU'] += 11 * d['factor'] -write('// LOAD_CHIMU_0213') -definemultiline(F'LOAD_CHIMU_0213_{PRECSUFFIX}') -if ASM_LOAD_CHIMU: - curlyopen() - write(' const SiteSpinor & ref(in[offset]); \\') - asmopen() - pg1.loadpredication() - fetch_base_ptr(F"&ref[{FETCH_BASE_PTR_COLOR_OFFSET}][0]") - Chimu_00.load("ref[0][0]") # reordered - Chimu_20.load("ref[2][0]") - - Chimu_01.load("ref[0][1]") - Chimu_21.load("ref[2][1]") - - Chimu_02.load("ref[0][2]") - Chimu_22.load("ref[2][2]") - - Chimu_10.load("ref[1][0]") - Chimu_30.load("ref[3][0]") - - Chimu_11.load("ref[1][1]") - Chimu_31.load("ref[3][1]") - - Chimu_12.load("ref[1][2]") - Chimu_32.load("ref[3][2]") - asmclose() - debugall('LOAD_CHIMU_0213', group='Chimu') - curlyclose() -newline() - -# alternative load chimu: dirac order 0312 -# placed into asm (...) 
-d['factor'] = 0 -d['cycles_LOAD_CHIMU'] += 11 * d['factor'] -write('// LOAD_CHIMU_0312') -definemultiline(F'LOAD_CHIMU_0312_{PRECSUFFIX}') -if ASM_LOAD_CHIMU: - curlyopen() - write(' const SiteSpinor & ref(in[offset]); \\') - asmopen() - pg1.loadpredication() - fetch_base_ptr(F"&ref[{FETCH_BASE_PTR_COLOR_OFFSET}][0]") - Chimu_00.load("ref[0][0]") # reordered - Chimu_30.load("ref[3][0]") - - Chimu_01.load("ref[0][1]") - Chimu_31.load("ref[3][1]") - - Chimu_02.load("ref[0][2]") - Chimu_32.load("ref[3][2]") - - Chimu_10.load("ref[1][0]") - Chimu_20.load("ref[2][0]") - - Chimu_11.load("ref[1][1]") - Chimu_21.load("ref[2][1]") - - Chimu_12.load("ref[1][2]") - Chimu_22.load("ref[2][2]") - asmclose() - debugall('LOAD_CHIMU_0312', group='Chimu') - curlyclose() -newline() - -d['factor'] = 2 -d['cycles_PERM'] += 1 * d['factor'] -write('// LOAD_TABLE0') -definemultiline(F'LOAD_TABLE0') -asmopen() -table0.loadtable(0) -asmclose() -newline() - -d['factor'] = 2 -d['cycles_PERM'] += 1 * d['factor'] -write('// LOAD_TABLE1') -definemultiline(F'LOAD_TABLE1') -asmopen() -table0.loadtable(1) -asmclose() -newline() - -d['factor'] = 2 -d['cycles_PERM'] += 1 * d['factor'] -write('// LOAD_TABLE2') -definemultiline(F'LOAD_TABLE2') -asmopen() -table0.loadtable(2) -asmclose() -newline() - -d['factor'] = 0 -d['cycles_PERM'] += 1 * d['factor'] -write('// LOAD_TABLE3') -definemultiline(F'LOAD_TABLE3') -asmopen() -table0.loadtable(3) -asmclose() -newline() - -d['factor'] = 2 # factor is 2 -d['cycles_PERM'] += 6 * d['factor'] -write('// PERMUTE') -definemultiline(F'PERMUTE_{PRECSUFFIX}') -debugall('PERM PRE', group='Chi') -asmopen() -#table0.loadtable(2) -Chi_00.permute(2, table0) -Chi_01.permute(2, table0) -Chi_02.permute(2, table0) -Chi_10.permute(2, table0) -Chi_11.permute(2, table0) -Chi_12.permute(2, table0) -asmclose() -debugall('PERM POST', group='Chi') -newline() - -write('// LOAD_GAUGE') -definemultiline(F'LOAD_GAUGE') -if GRIDBENCH: # referencing differs in Grid and GridBench - write(' const auto & ref(U[sU][A]); uint64_t baseU = (uint64_t)&ref; \\') -else: - write(' const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \\') -curlyopen() -asmopen() -pg1.loadpredication() -fetch_base_ptr(F"baseU + {FETCH_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='I') -if ASM_LOAD_GAUGE: - fetch_base_ptr(F"baseU + {FETCH_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='A') - U_00.load("ref[0][0]") - U_10.load("ref[1][0]") - U_20.load("ref[2][0]") - U_01.load("ref[0][1]") - U_11.load("ref[1][1]") - U_21.load("ref[2][1]") -asmclose() -curlyclose() -newline() - -d['factor'] = 8 # MULT_2SPIN executes 1 time per direction = 8 times total -# assume all U loads are hidden -# FCMLA issue latency = 2 cycles -# measurement: latency = 16 cycles if FULLY pipelined !? 
-# spec says 6+6+9 cycles -# 6 rounds of FCMLA, each with 6 FCMLA -> 21 - 6*2 = 9 -d['cycles_MULT_2SPIN'] += 6 * 21 * d['factor'] -write('// MULT_2SPIN') -definemultiline(F'MULT_2SPIN_1_{PRECSUFFIX}(A)') -curlyopen() -#write(' const auto & ref(U[sU][A]); \\') -if GRIDBENCH: # referencing differs in Grid and GridBench - write(' const auto & ref(U[sU][A]); uint64_t baseU = (uint64_t)&ref; \\') -else: - write(' const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \\') -asmopen() -#pg1.loadpredication() -#fetch_base_ptr("&ref[0][0]") -fetch_base_ptr(F"baseU + {FETCH_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='I') -fetch_base_ptr(F"baseU + {FETCH_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='A') -#fetch_base_ptr(F"(uint64_t)&ref[{FETCH_BASE_PTR_COLOR_OFFSET}][0]", target='I') -#fetch_base_ptr(F"(uint64_t)&ref[{FETCH_BASE_PTR_COLOR_OFFSET}][0]", target='A') -#fetch_base_ptr(F"&ref[0][{FETCH_BASE_PTR_COLOR_OFFSET}]") -if ASM_LOAD_GAUGE: - U_00.load("ref[0][0]") - U_10.load("ref[1][0]") - U_20.load("ref[2][0]") - U_01.load("ref[0][1]") - U_11.load("ref[1][1]") - U_21.load("ref[2][1]") - -if MOVPRFX == False: - UChi_00.zero() # implementation specific - UChi_10.zero() - UChi_01.zero() - UChi_11.zero() - UChi_02.zero() - UChi_12.zero() - - # round 1 - UChi_00.mul0(U_00, Chi_00) # FCMLA latency is 6+6+9 cycles - UChi_10.mul0(U_00, Chi_10) - UChi_01.mul0(U_10, Chi_00) - UChi_11.mul0(U_10, Chi_10) - UChi_02.mul0(U_20, Chi_00) - UChi_12.mul0(U_20, Chi_10) -else: - # round 1 - UChi_00.mul0(zero0, U_00, Chi_00, constructive=True) # FCMLA latency is 6+6+9 cycles - UChi_10.mul0(zero0, U_00, Chi_10, constructive=True) - UChi_01.mul0(zero0, U_10, Chi_00, constructive=True) - UChi_11.mul0(zero0, U_10, Chi_10, constructive=True) - UChi_02.mul0(zero0, U_20, Chi_00, constructive=True) - UChi_12.mul0(zero0, U_20, Chi_10, constructive=True) - -# round 2 -UChi_00.mul1(U_00, Chi_00) -UChi_10.mul1(U_00, Chi_10) -UChi_01.mul1(U_10, Chi_00) -UChi_11.mul1(U_10, Chi_10) -UChi_02.mul1(U_20, Chi_00) -UChi_12.mul1(U_20, Chi_10) # Chi_00 and Chi_10 available from here - -if ASM_LOAD_GAUGE: - U_00.load("ref[0][2]") # U_00, U_10, U_20 overloaded - U_10.load("ref[1][2]") # early load - U_20.load("ref[2][2]") # A --> -asmclose() -debugall('MULT_2SPIN_1', group='UChi') -curlyclose() -newline() - -write('// MULT_2SPIN_BACKEND') -definemultiline(F'MULT_2SPIN_2_{PRECSUFFIX}') -curlyopen() -asmopen() -# round 3 -UChi_00.mac0(U_01, Chi_01) # armclang separates fcmla(..., 0) and -UChi_10.mac0(U_01, Chi_11) # fcmla(..., 90) -UChi_01.mac0(U_11, Chi_01) # autonomously using intrinsics -UChi_11.mac0(U_11, Chi_11) -UChi_02.mac0(U_21, Chi_01) -UChi_12.mac0(U_21, Chi_11) -# round 4 -UChi_00.mac1(U_01, Chi_01) -UChi_10.mac1(U_01, Chi_11) -UChi_01.mac1(U_11, Chi_01) -UChi_11.mac1(U_11, Chi_11) -UChi_02.mac1(U_21, Chi_01) -UChi_12.mac1(U_21, Chi_11) -# round 5 -UChi_00.mac0(U_00, Chi_02) # <-- A -UChi_10.mac0(U_00, Chi_12) -UChi_01.mac0(U_10, Chi_02) -UChi_11.mac0(U_10, Chi_12) -UChi_02.mac0(U_20, Chi_02) -UChi_12.mac0(U_20, Chi_12) -# round 6 -UChi_00.mac1(U_00, Chi_02) -UChi_10.mac1(U_00, Chi_12) -UChi_01.mac1(U_10, Chi_02) -UChi_11.mac1(U_10, Chi_12) -UChi_02.mac1(U_20, Chi_02) -UChi_12.mac1(U_20, Chi_12) -asmclose() -debugall('MULT_2SPIN_2', group='UChi') -curlyclose() -newline() - - -#// hspin(0)=fspin(0)+timesI(fspin(3)); -#// hspin(1)=fspin(1)+timesI(fspin(2)); -d['factor'] = 1 -# FCADD issue latency = 1, latency is 6+9 -d['cycles_PROJ'] += 15 * d['factor'] -write('// XP_PROJ') -definemultiline(F'XP_PROJ_{PRECSUFFIX}') -if 
ALTERNATIVE_LOADS == True: - write(' LOAD_CHIMU_0312_PLUG \\') -curlyopen() -asmopen() -#pg1.loadpredication() -Chi_00.addTimesI(Chimu_00, Chimu_30) -Chi_01.addTimesI(Chimu_01, Chimu_31) -Chi_02.addTimesI(Chimu_02, Chimu_32) -Chi_10.addTimesI(Chimu_10, Chimu_20) -Chi_11.addTimesI(Chimu_11, Chimu_21) -Chi_12.addTimesI(Chimu_12, Chimu_22) -asmclose() -debugall('XP_PROJ', group='Chi') -curlyclose() -newline() - -#// fspin(0)=hspin(0); -#// fspin(1)=hspin(1); -#// fspin(2)=timesMinusI(hspin(1)); -#// fspin(3)=timesMinusI(hspin(0)); -# does not occur in GridBench -d['factor'] = 0 -d['cycles_RECON'] += 15 * d['factor'] -write('// XP_RECON') -definemultiline(F'XP_RECON_{PRECSUFFIX}') -asmopen() -#pg1.loadpredication() -if MOVPRFX == False: - result_20.zero() - result_21.zero() - result_22.zero() - result_30.zero() - result_31.zero() - result_32.zero() - - result_20.subTimesI(UChi_10) - result_21.subTimesI(UChi_11) - result_22.subTimesI(UChi_12) - result_30.subTimesI(UChi_00) - result_31.subTimesI(UChi_01) - result_32.subTimesI(UChi_02) -else: - result_20.subTimesI(zero0, UChi_10, constructive=True) - result_21.subTimesI(zero0, UChi_11, constructive=True) - result_22.subTimesI(zero0, UChi_12, constructive=True) - result_30.subTimesI(zero0, UChi_00, constructive=True) - result_31.subTimesI(zero0, UChi_01, constructive=True) - result_32.subTimesI(zero0, UChi_02, constructive=True) - -result_00.move(UChi_00) # don't reorder ! -result_01.move(UChi_01) -result_02.move(UChi_02) -result_10.move(UChi_10) -result_11.move(UChi_11) -result_12.move(UChi_12) - -# result_00.add(UChi_00) # faster than move? -# result_01.add(UChi_01) -# result_02.add(UChi_02) -# result_10.add(UChi_10) -# result_11.add(UChi_11) -# result_12.add(UChi_12) -asmclose() -debugall('XP_RECON', group='result') -newline() - - -d['factor'] = 1 -# FCADD issue latency = 1, latency is 6+9 -d['cycles_RECON'] += 15 * d['factor'] -write('// XP_RECON_ACCUM') -definemultiline(F'XP_RECON_ACCUM_{PRECSUFFIX}') -asmopen() -#pg1.loadpredication() -# result_20.subTimesI(UChi_10) -# result_21.subTimesI(UChi_11) -# result_22.subTimesI(UChi_12) -# result_30.subTimesI(UChi_00) -# result_31.subTimesI(UChi_01) -# result_32.subTimesI(UChi_02) -# -# result_00.add(UChi_00) # reordered -# result_01.add(UChi_01) -# result_02.add(UChi_02) -# result_10.add(UChi_10) -# result_11.add(UChi_11) -# result_12.add(UChi_12) - -result_30.subTimesI(UChi_00) # reordered -result_00.add(UChi_00) - -result_31.subTimesI(UChi_01) -result_01.add(UChi_01) - -result_32.subTimesI(UChi_02) -result_02.add(UChi_02) - -result_20.subTimesI(UChi_10) -result_10.add(UChi_10) - -result_21.subTimesI(UChi_11) -result_11.add(UChi_11) - -result_22.subTimesI(UChi_12) -result_12.add(UChi_12) -asmclose() -debugall('XP_RECON_ACCUM', group='result') -newline() - -d['factor'] = 1 -# add/sub issue latency = 1, latency is 9 -d['cycles_PROJ'] += 9 * d['factor'] -write('// YP_PROJ') -definemultiline(F'YP_PROJ_{PRECSUFFIX}') -if ALTERNATIVE_LOADS == True: - write(' LOAD_CHIMU_0312_PLUG \\') -curlyopen() -asmopen() -#pg1.loadpredication() -Chi_00.sub(Chimu_00, Chimu_30) -Chi_01.sub(Chimu_01, Chimu_31) -Chi_02.sub(Chimu_02, Chimu_32) -Chi_10.add(Chimu_10, Chimu_20) -Chi_11.add(Chimu_11, Chimu_21) -Chi_12.add(Chimu_12, Chimu_22) -asmclose() -debugall('YP_PROJ', group='Chi') -curlyclose() -newline() - -d['factor'] = 1 -# FCADD issue latency = 1, latency is 6+9 -d['cycles_PROJ'] += 15 * d['factor'] -write('// ZP_PROJ') -definemultiline(F'ZP_PROJ_{PRECSUFFIX}') -if ALTERNATIVE_LOADS == True: - write(' 
LOAD_CHIMU_0213_PLUG \\') -curlyopen() -asmopen() -#pg1.loadpredication() -Chi_00.addTimesI(Chimu_00, Chimu_20) -Chi_01.addTimesI(Chimu_01, Chimu_21) -Chi_02.addTimesI(Chimu_02, Chimu_22) -Chi_10.subTimesI(Chimu_10, Chimu_30) -Chi_11.subTimesI(Chimu_11, Chimu_31) -Chi_12.subTimesI(Chimu_12, Chimu_32) -asmclose() -debugall('ZP_PROJ', group='Chi') -curlyclose() -newline() - -d['factor'] = 1 -# add/sub issue latency = 1, latency is 9 -d['cycles_PROJ'] += 9 * d['factor'] -write('// TP_PROJ') -definemultiline(F'TP_PROJ_{PRECSUFFIX}') -if ALTERNATIVE_LOADS == True: - write(' LOAD_CHIMU_0213_PLUG \\') -curlyopen() -asmopen() -#pg1.loadpredication() -Chi_00.add(Chimu_00, Chimu_20) -Chi_01.add(Chimu_01, Chimu_21) -Chi_02.add(Chimu_02, Chimu_22) -Chi_10.add(Chimu_10, Chimu_30) -Chi_11.add(Chimu_11, Chimu_31) -Chi_12.add(Chimu_12, Chimu_32) -asmclose() -debugall('TP_PROJ', group='Chi') -curlyclose() -newline() - -#// hspin(0)=fspin(0)-timesI(fspin(3)); -#// hspin(1)=fspin(1)-timesI(fspin(2)); - -d['factor'] = 1 -# FCADD issue latency = 1, latency is 6+9 -d['cycles_PROJ'] += 15 * d['factor'] -write('// XM_PROJ') -definemultiline(F'XM_PROJ_{PRECSUFFIX}') -if ALTERNATIVE_LOADS == True: - write(' LOAD_CHIMU_0312_PLUG \\') -curlyopen() -asmopen() -#pg1.loadpredication() -Chi_00.subTimesI(Chimu_00, Chimu_30) -Chi_01.subTimesI(Chimu_01, Chimu_31) -Chi_02.subTimesI(Chimu_02, Chimu_32) -Chi_10.subTimesI(Chimu_10, Chimu_20) -Chi_11.subTimesI(Chimu_11, Chimu_21) -Chi_12.subTimesI(Chimu_12, Chimu_22) -asmclose() -debugall('XM_PROJ sub', group='Chi') -curlyclose() -newline() - -d['factor'] = 1 -d['cycles_RECON'] += 15 * d['factor'] -write('// XM_RECON') -definemultiline(F'XM_RECON_{PRECSUFFIX}') -asmopen() -#pg1.loadpredication() - -# only necessary if not zeroed before -if MOVPRFX == False: - result_20.zero() - result_21.zero() - result_22.zero() - result_30.zero() - result_31.zero() - result_32.zero() - - result_20.addTimesI(UChi_10) # <-- - result_21.addTimesI(UChi_11) - result_22.addTimesI(UChi_12) - result_30.addTimesI(UChi_00) - result_31.addTimesI(UChi_01) - result_32.addTimesI(UChi_02) -else: - result_20.addTimesI(zero0, UChi_10, constructive=True) # <-- - result_21.addTimesI(zero0, UChi_11, constructive=True) - result_22.addTimesI(zero0, UChi_12, constructive=True) - result_30.addTimesI(zero0, UChi_00, constructive=True) - result_31.addTimesI(zero0, UChi_01, constructive=True) - result_32.addTimesI(zero0, UChi_02, constructive=True) - -result_00.move(UChi_00) -result_01.move(UChi_01) -result_02.move(UChi_02) -result_10.move(UChi_10) -result_11.move(UChi_11) -result_12.move(UChi_12) -asmclose() -debugall('XM_RECON result', group='result') -newline() - -d['factor'] = 1 -# add/sub issue latency = 1, latency is 9 -d['cycles_PROJ'] += 9 * d['factor'] -write('// YM_PROJ') -definemultiline(F'YM_PROJ_{PRECSUFFIX}') -if ALTERNATIVE_LOADS == True: - write(' LOAD_CHIMU_0312_PLUG \\') -curlyopen() -asmopen() -#pg1.loadpredication() -Chi_00.add(Chimu_00, Chimu_30) -Chi_01.add(Chimu_01, Chimu_31) -Chi_02.add(Chimu_02, Chimu_32) -Chi_10.sub(Chimu_10, Chimu_20) -Chi_11.sub(Chimu_11, Chimu_21) -Chi_12.sub(Chimu_12, Chimu_22) -asmclose() -debugall('YM_PROJ', group='Chi') -curlyclose() -newline() - -d['factor'] = 1 -# FCADD issue latency = 1, latency is 6+9 -d['cycles_PROJ'] += 15 * d['factor'] -write('// ZM_PROJ') -definemultiline(F'ZM_PROJ_{PRECSUFFIX}') -if ALTERNATIVE_LOADS == True: - write(' LOAD_CHIMU_0213_PLUG \\') -curlyopen() -asmopen() -#pg1.loadpredication() -Chi_00.subTimesI(Chimu_00, Chimu_20) 
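#// editor's note, filling in the comment pattern used for XP/XM above:
#// hspin(0)=fspin(0)-timesI(fspin(2));
#// hspin(1)=fspin(1)+timesI(fspin(3));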
-Chi_01.subTimesI(Chimu_01, Chimu_21) -Chi_02.subTimesI(Chimu_02, Chimu_22) -Chi_10.addTimesI(Chimu_10, Chimu_30) -Chi_11.addTimesI(Chimu_11, Chimu_31) -Chi_12.addTimesI(Chimu_12, Chimu_32) -asmclose() -debugall('ZM_PROJ', group='Chi') -curlyclose() -newline() - -d['factor'] = 1 -# add/sub issue latency = 1, latency is 9 -d['cycles_PROJ'] += 9 * d['factor'] -write('// TM_PROJ') -definemultiline(F'TM_PROJ_{PRECSUFFIX}') -if ALTERNATIVE_LOADS == True: - write(' LOAD_CHIMU_0213_PLUG \\') -curlyopen() -asmopen() -pg1.loadpredication() -Chi_00.sub(Chimu_00, Chimu_20) -Chi_01.sub(Chimu_01, Chimu_21) -Chi_02.sub(Chimu_02, Chimu_22) -Chi_10.sub(Chimu_10, Chimu_30) -Chi_11.sub(Chimu_11, Chimu_31) -Chi_12.sub(Chimu_12, Chimu_32) -asmclose() -debugall('TM_PROJ', group='Chi') -curlyclose() -newline() - -# does not occur in GridBench -d['factor'] = 0 -# add/sub issue latency = 1, latency is 9 -d['cycles_RECON'] += 15 * d['factor'] -write('// XM_RECON_ACCUM') -definemultiline(F'XM_RECON_ACCUM_{PRECSUFFIX}') -asmopen() -# result_20.addTimesI(UChi_10) -# result_21.addTimesI(UChi_11) -# result_22.addTimesI(UChi_12) -# result_30.addTimesI(UChi_00) -# result_31.addTimesI(UChi_01) -# result_32.addTimesI(UChi_02) -# -# # result_00.move(UChi_00) -# # result_01.move(UChi_01) -# # result_02.move(UChi_02) -# # result_10.move(UChi_10) -# # result_11.move(UChi_11) -# # result_12.move(UChi_12) -# -# # faster than move ? -# result_00.add(UChi_00) -# result_01.add(UChi_01) -# result_02.add(UChi_02) -# result_10.add(UChi_10) -# result_11.add(UChi_11) -# result_12.add(UChi_12) - -result_30.addTimesI(UChi_00) # reordered -result_31.addTimesI(UChi_01) -result_32.addTimesI(UChi_02) - -result_20.addTimesI(UChi_10) -result_21.addTimesI(UChi_11) -result_22.addTimesI(UChi_12) - -result_00.add(UChi_00) -result_01.add(UChi_01) -result_02.add(UChi_02) -result_10.add(UChi_10) -result_11.add(UChi_11) -result_12.add(UChi_12) -asmclose() -debugall('XM_RECON_ACCUM', group='result') -newline() - - - -d['factor'] = 1 -d['cycles_RECON'] += 9 * d['factor'] -write('// YP_RECON_ACCUM') -definemultiline(F'YP_RECON_ACCUM_{PRECSUFFIX}') -asmopen() -#pg1.loadpredication() -# result_00.add(UChi_00) -# result_01.add(UChi_01) -# result_02.add(UChi_02) -# result_10.add(UChi_10) -# result_11.add(UChi_11) -# result_12.add(UChi_12) -# result_20.add(UChi_10) -# result_21.add(UChi_11) -# result_22.add(UChi_12) -# result_30.sub(UChi_00) -# result_31.sub(UChi_01) -# result_32.sub(UChi_02) - -result_00.add(UChi_00) # reordered -result_30.sub(UChi_00) - -result_01.add(UChi_01) -result_31.sub(UChi_01) - -result_02.add(UChi_02) -result_32.sub(UChi_02) - -result_10.add(UChi_10) -result_20.add(UChi_10) - -result_11.add(UChi_11) -result_21.add(UChi_11) - -result_12.add(UChi_12) -result_22.add(UChi_12) -asmclose() -debugall('YP_RECON_ACCUM', group='result') -newline() - -d['factor'] = 1 -d['cycles_RECON'] += 9 * d['factor'] -write('// YM_RECON_ACCUM') -definemultiline(F'YM_RECON_ACCUM_{PRECSUFFIX}') -asmopen() -#pg1.loadpredication() -# result_00.add(UChi_00) -# result_01.add(UChi_01) -# result_02.add(UChi_02) -# result_10.add(UChi_10) -# result_11.add(UChi_11) -# result_12.add(UChi_12) -# result_20.sub(UChi_10) -# result_21.sub(UChi_11) -# result_22.sub(UChi_12) -# result_30.add(UChi_00) -# result_31.add(UChi_01) -# result_32.add(UChi_02) - -result_00.add(UChi_00) # reordered -result_30.add(UChi_00) - -result_01.add(UChi_01) -result_31.add(UChi_01) - -result_02.add(UChi_02) -result_32.add(UChi_02) - -result_10.add(UChi_10) -result_20.sub(UChi_10) - 
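# editor's note on the "reordered" interleave above and below: alternating
# each accumulate with the matching subtract keeps adjacent instructions
# independent, which plausibly helps the two FL pipes dual-issue (cf. the
# static FLA/FLB slot accounting at the end of this script).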
-result_11.add(UChi_11) -result_21.sub(UChi_11) - -result_12.add(UChi_12) -result_22.sub(UChi_12) -asmclose() -debugall('YM_RECON_ACCUM', group='result') -newline() - -d['factor'] = 1 -d['cycles_RECON'] += 15 * d['factor'] -write('// ZP_RECON_ACCUM') -definemultiline(F'ZP_RECON_ACCUM_{PRECSUFFIX}') -asmopen() -#pg1.loadpredication() -# result_20.subTimesI(UChi_00) -# result_21.subTimesI(UChi_01) -# result_22.subTimesI(UChi_02) -# result_30.addTimesI(UChi_10) -# result_31.addTimesI(UChi_11) -# result_32.addTimesI(UChi_12) -# -# result_00.add(UChi_00) -# result_01.add(UChi_01) -# result_02.add(UChi_02) -# result_10.add(UChi_10) -# result_11.add(UChi_11) -# result_12.add(UChi_12) -result_20.subTimesI(UChi_00) # reordered -result_00.add(UChi_00) - -result_21.subTimesI(UChi_01) -result_01.add(UChi_01) - -result_22.subTimesI(UChi_02) -result_02.add(UChi_02) - -result_30.addTimesI(UChi_10) -result_10.add(UChi_10) - -result_31.addTimesI(UChi_11) -result_11.add(UChi_11) - -result_32.addTimesI(UChi_12) -result_12.add(UChi_12) -asmclose() -debugall('ZP_RECON_ACCUM', group='result') -newline() - -d['factor'] = 1 -d['cycles_RECON'] += 15 * d['factor'] -write('// ZM_RECON_ACCUM') -definemultiline(F'ZM_RECON_ACCUM_{PRECSUFFIX}') -asmopen() -#pg1.loadpredication() -# result_20.addTimesI(UChi_00) -# result_21.addTimesI(UChi_01) -# result_22.addTimesI(UChi_02) -# result_30.subTimesI(UChi_10) -# result_31.subTimesI(UChi_11) -# result_32.subTimesI(UChi_12) -# -# result_00.add(UChi_00) -# result_01.add(UChi_01) -# result_02.add(UChi_02) -# result_10.add(UChi_10) -# result_11.add(UChi_11) -# result_12.add(UChi_12) -result_20.addTimesI(UChi_00) # reordered -result_00.add(UChi_00) - -result_21.addTimesI(UChi_01) -result_01.add(UChi_01) - -result_22.addTimesI(UChi_02) -result_02.add(UChi_02) - -result_30.subTimesI(UChi_10) -result_10.add(UChi_10) - -result_31.subTimesI(UChi_11) -result_11.add(UChi_11) - -result_32.subTimesI(UChi_12) -result_12.add(UChi_12) -asmclose() -debugall('ZM_RECON_ACCUM', group='result') -newline() - -d['factor'] = 1 -d['cycles_RECON'] += 9 * d['factor'] -write('// TP_RECON_ACCUM') -definemultiline(F'TP_RECON_ACCUM_{PRECSUFFIX}') -asmopen() -#pg1.loadpredication() -# result_00.add(UChi_00) -# result_01.add(UChi_01) -# result_02.add(UChi_02) -# result_10.add(UChi_10) -# result_11.add(UChi_11) -# result_12.add(UChi_12) -# result_20.add(UChi_00) -# result_21.add(UChi_01) -# result_22.add(UChi_02) -# result_30.add(UChi_10) -# result_31.add(UChi_11) -# result_32.add(UChi_12) - -result_00.add(UChi_00) # reordered -result_20.add(UChi_00) - -result_01.add(UChi_01) -result_21.add(UChi_01) - -result_02.add(UChi_02) -result_22.add(UChi_02) - -result_10.add(UChi_10) -result_30.add(UChi_10) - -result_11.add(UChi_11) -result_31.add(UChi_11) - -result_12.add(UChi_12) -result_32.add(UChi_12) -asmclose() -debugall('TP_RECON_ACCUM', group='result') -newline() - -d['factor'] = 1 -d['cycles_RECON'] += 9 * d['factor'] -write('// TM_RECON_ACCUM') -definemultiline(F'TM_RECON_ACCUM_{PRECSUFFIX}') -asmopen() -#pg1.loadpredication() -# result_00.add(UChi_00) -# result_01.add(UChi_01) -# result_02.add(UChi_02) -# result_10.add(UChi_10) -# result_11.add(UChi_11) -# result_12.add(UChi_12) -# result_20.sub(UChi_00) -# result_21.sub(UChi_01) -# result_22.sub(UChi_02) -# result_30.sub(UChi_10) -# result_31.sub(UChi_11) -# result_32.sub(UChi_12) - -result_00.add(UChi_00) # reordered -result_20.sub(UChi_00) - -result_01.add(UChi_01) -result_21.sub(UChi_01) - -result_02.add(UChi_02) -result_22.sub(UChi_02) - 
-result_10.add(UChi_10) -result_30.sub(UChi_10) - -result_11.add(UChi_11) -result_31.sub(UChi_11) - -result_12.add(UChi_12) -result_32.sub(UChi_12) -asmclose() -debugall('TM_RECON_ACCUM', group='result') -newline() - -d['factor'] = 0 -# have 12 instructions -# picking dual issue versions -d['cycles_ZERO_PSI'] += 6 * d['factor'] -write('// ZERO_PSI') -definemultiline(F'ZERO_PSI_{PRECSUFFIX}') -asmopen() -pg1.loadpredication() -result_00.zero() -result_01.zero() -result_02.zero() -result_10.zero() -result_11.zero() -result_12.zero() -result_20.zero() -result_21.zero() -result_22.zero() -result_30.zero() -result_31.zero() -result_32.zero() -asmclose() -#debugall('ZERO_PSI', group='result') -newline() - -# prefetch store spinors to L2 cache -d['factor'] = 0 -d['cycles_PREFETCH_L2'] += 0 * d['factor'] -write('// PREFETCH_RESULT_L2_STORE (prefetch store to L2)') -definemultiline(F'PREFETCH_RESULT_L2_STORE_INTERNAL_{PRECSUFFIX}(base)') -curlyopen() -fetch_base_ptr(F"base") -asmopen() -fetch_base_ptr(F"base", target='A') -prefetch_L2_store(F"base", 0) -prefetch_L2_store(F"base", 1) -prefetch_L2_store(F"base", 2) -asmclose() -curlyclose() -newline() - -# prefetch store spinors to L1 cache -d['factor'] = 0 -d['cycles_PREFETCH_L1'] += 0 * d['factor'] -write('// PREFETCH_RESULT_L1_STORE (prefetch store to L1)') -definemultiline(F'PREFETCH_RESULT_L1_STORE_INTERNAL_{PRECSUFFIX}(base)') -curlyopen() -fetch_base_ptr(F"base") -asmopen() -fetch_base_ptr(F"base", target='A') -prefetch_L1_store(F"base", 0) -prefetch_L1_store(F"base", 1) -prefetch_L1_store(F"base", 2) -asmclose() -curlyclose() -newline() - - -d['factor'] = 0 -write('// ADD_RESULT_INTERNAL') -definemultiline(F'ADD_RESULT_INTERNAL_{PRECSUFFIX}') -asmopen() -result_00.add(Chimu_00) -result_01.add(Chimu_01) -result_02.add(Chimu_02) -result_10.add(Chimu_10) -result_11.add(Chimu_11) -result_12.add(Chimu_12) -result_20.add(Chimu_20) -result_21.add(Chimu_21) -result_22.add(Chimu_22) -result_30.add(Chimu_30) -result_31.add(Chimu_31) -result_32.add(Chimu_32) -asmclose() -#debugall('ZERO_PSI', group='result') -newline() - -# -------------------------------------------------------------------------------- - -# C -f = open('w.h', 'w') -f.write(d['C']) -f.close() - -# intrin -f = open('wi.h', 'w') -f.write(d['I']) -f.close() - -filename = '' -if PRECISION == 'double': - filename = "Fujitsu_A64FX_intrin_double.h" -else: - filename = "Fujitsu_A64FX_intrin_single.h" -f = open(filename, 'w') -f.write(LEGAL.format(filename)) -f.write(d['I']) -f.close() - - -# asm -f = open('wa.h', 'w') -f.write(d['A']) -f.close() - -filename = '' -if PRECISION == 'double': - filename = "Fujitsu_A64FX_asm_double.h" -else: - filename = "Fujitsu_A64FX_asm_single.h" -f = open(filename, 'w') -f.write(LEGAL.format(filename)) -f.write(d['A']) -f.close() - - -# arithmetics instruction count, mul/mac = 2 instructions each -d['acount'] = d['add'] + d['sub'] + \ - d['mul'] + d['mac'] + d['addTimesI'] + d['subTimesI'] - -# permutations -d['permutes'] += 2*d['timesI'] + 1*d['timesMinusI'] -d['neg'] = 1*d['timesI'] + 1*d['timesMinusI'] - -# instruction count, mul/mac = 2 instructions each, +/- *i = 3 instructions each -d['icount'] = d['load'] + d['store'] + d['move'] + d['add'] + d['sub'] + \ - d['mul'] + d['mac'] + d['permutes'] + d['neg'] + \ - d['addTimesI'] + d['subTimesI'] + d['zero'] + d['movprfx'] - -# flops -d['flops'] = 4*d['mac'] + 3*d['mul'] + d['add'] + d['sub'] + \ - d['addTimesI'] + d['subTimesI'] - - - - - -print('Statistics') -print('') -print('Type Occurences Total / Arith 
instructions') -print('-------------------------------------------------------------------') -print('Variables {:4d}'.format(d['registers'])) -print('') -print('load {:4d}'.format(d['load'])) -print('store {:4d}'.format(d['store'])) -print('move {:4d}'.format(d['move'])) -print('movprfx {:4d}'.format(d['movprfx'])) -print('zero {:4d}'.format(d['zero'])) -print('negate {:4d}'.format(d['neg'])) - - -print('add {:4d} {:0.2f} / {:0.2f}'.\ - format(d['add'], d['add'] / d['icount'], d['add'] / d['acount'])) -print('sub {:4d} {:0.2f} / {:0.2f}'.\ - format(d['sub'], d['sub'] / d['icount'], d['sub'] / d['acount'])) -print('mul {:4d} {:0.2f} / {:0.2f}'.\ - format(d['mul'], 2*d['mul'] / d['icount'], 2*d['mul'] / d['acount'])) -print('mac {:4d} {:0.2f} / {:0.2f}'.\ - format(d['mac'], 2*d['mac'] / d['icount'], 2*d['mac'] / d['acount'])) -print('addTimesI {:4d} {:0.2f} / {:0.2f}'.\ - format(d['addTimesI'], 2*d['addTimesI'] / d['icount'], 2*d['addTimesI'] / d['acount'])) -print('subTimesI {:4d} {:0.2f} / {:0.2f}'.\ - format(d['subTimesI'], 2*d['subTimesI'] / d['icount'], 2*d['subTimesI'] / d['acount'])) - -print('timesI {:4d}'.format(d['timesI'])) -print('timesMinusI {:4d}'.format(d['timesMinusI'])) -print('permutes {:4d} {:0.2f}'.\ - format(d['permutes'], d['permutes'] / d['icount'])) -print('') -print('flops {:4d}'.format(d['flops'])) -print('instruction count {:4d}'.format(d['icount'])) -print('arith. instruction count {:4d} {:0.2f}'.\ - format(d['acount'], d['acount'] / d['icount'])) - - -# ---- static pipeline resources consumption ---- -FLA = 0 -FLA += 2 * d['mac'] + 2 * d['mul'] -FLA += 1 * d['addTimesI'] + 1 * d['subTimesI'] -FLA += 1 * d['move'] -FLA += 1 * d['permutes'] -FLA += 1 * d['store'] -FLA += 1 * d['zero'] - -FLB = 0 -FLB += 1 * d['addTimesI'] + 1 * d['subTimesI'] - -FLAB = 0 -FLAB += 1 * d['mac'] + 1 * d['mul'] -FLAB += 1 * d['add'] + 1 * d['sub'] -FLAB += 1 * d['neg'] + 1 * d['movprfx'] -#FLAB += 1 * d['zero'] - - -FL_slots = 2 * d['icount'] -FL_micro_ops = FLA + FLB + FLAB - -print('') -print('------------------------------------------------------------------') -print('') -print('Static FL slot usage') -print('') -print(' FLA {:4d}'.format(FLA)) -print(' FLB {:4d}'.format(FLB)) -print(' FLA/B {:4d}'.format(FLAB)) - -print('') -print('Static FL slot efficiency') -print('') -print(' Total FL slots {:4d}'.format(FL_slots)) -print(' FL slots occupied {:4d}'.format(FL_micro_ops)) -print(' FL slot efficiency {:0.2f}'.format(FL_micro_ops / FL_slots)) - -cycles_total = d['cycles_ZERO_PSI'] + d['cycles_LOAD_CHIMU'] + \ - d['cycles_PROJ'] + d['cycles_PERM'] + d['cycles_MULT_2SPIN'] + \ - d['cycles_RECON'] + d['cycles_RESULT'] -cycles_total_hidden = d['cycles_ZERO_PSI'] + \ - d['cycles_PROJ'] + d['cycles_MULT_2SPIN'] + \ - d['cycles_RECON'] - -# ---- dynamic estimate ---- - -print('') -print('Dynamic cycles estimate (incl. 
latencies)') -print('') -print(' ZERO_PSI {:4d}'.format(d['cycles_ZERO_PSI'])) -print(' LOAD_CHIMU {:4d}'.format(d['cycles_LOAD_CHIMU'])) -print(' PROJ {:4d}'.format(d['cycles_PROJ'])) -print(' PERM {:4d}'.format(d['cycles_PERM'])) -print(' MULT_2SPIN {:4d}'.format(d['cycles_MULT_2SPIN'])) -print(' RECON {:4d}'.format(d['cycles_RECON'])) -print(' STORE {:4d}'.format(d['cycles_RESULT'])) -print('') -print(' Sum {:4d}'.format(cycles_total)) -print('') -print(' Sum* {:4d}'.format(cycles_total_hidden)) -print(' Total FL slots* {:4d}'.format(cycles_total_hidden * 2)) -print(' FL slots occupied* {:4d}'.format(FL_micro_ops)) -print(' FL slot efficiency* {:0.2f}'.format(FL_micro_ops / (2*cycles_total_hidden))) -print('') -print(' *load/store/PERM hidden') - -estimated_cycles = cycles_total_hidden -# Estimate percent peak DP; dual issue, fma -pp = 100 * 4 * d['flops'] / (2*2*8*estimated_cycles) -print('') -print('Model prediction') -print('') -print(' Cycles* {:4d}'.format(estimated_cycles)) -print(' Percent peak* {:4.1f} %'.format(pp)) - -# estimated RF throughput in GB/s @ 2.2 GHz -tp10 = (d['load'] + d['store']) * 64 * 2.2 / estimated_cycles -tp2 = (d['load'] + d['store']) * 64 * 1000.**3 * 2.2 / 1024.**3 / estimated_cycles -print('') -print(' Estimated RF throughput* {:4.1f} GB/s'.\ - format(tp10)) -print(' Estimated RF throughput* {:4.1f} GiB/s'.\ - format(tp2)) - -# ---- dynamic pipeline resources consumption ---- - -runtime = measured_cycles # runtime in cycles -pp_runtime = 100 * 4 * d['flops'] / (2*2*8*runtime) -runtime_FL_slots = 2 * runtime -delta = runtime - estimated_cycles - - -print('') -print('------------------------------------------------------------------') -print('') -print('Dynamic runtime analysis (cycles from measurements)') -print('') -print(' Cycles {:4d}'.format(runtime)) -print(' Percent peak {:4.1f} %'.format(pp_runtime)) -print(' Deviation from estimate {:4d} {:4.2f} %'.\ - format(delta, 100. 
* abs(delta/runtime))) -print(' Deviation per direction {:4.1f}'.format(delta/8)) - -# estimated RF throughput in GB/s @ 2.2 GHz -tp10_rt = (d['load'] + d['store']) * 64 * 2.2 / runtime -tp2_rt = (d['load'] + d['store']) * 64 * 1000.**3 * 2.2 / 1024.**3 / runtime -print('') -print(' RF throughput {:4.1f} GB/s'.\ - format(tp10_rt)) -print(' RF throughput {:4.1f} GiB/s'.\ - format(tp2_rt)) -print('') -print(' Total FL slots {:4d}'.format(runtime_FL_slots)) -print(' FL slots occupied {:4d}'.format(FL_micro_ops)) -print(' FL slot efficiency {:0.2f}'.format(FL_micro_ops / runtime_FL_slots)) -print('') From 909acd55cd36c4b567cab30d311aab6b8674288d Mon Sep 17 00:00:00 2001 From: Nils Meyer Date: Sat, 19 Dec 2020 02:00:22 +0100 Subject: [PATCH 03/16] vnum variant for prefetches --- Grid/simd/Fujitsu_A64FX_intrin_double.h | 36 ++++++++++++------------- Grid/simd/Fujitsu_A64FX_intrin_single.h | 36 ++++++++++++------------- 2 files changed, 36 insertions(+), 36 deletions(-) diff --git a/Grid/simd/Fujitsu_A64FX_intrin_double.h b/Grid/simd/Fujitsu_A64FX_intrin_double.h index 361246fc..f195e3c5 100644 --- a/Grid/simd/Fujitsu_A64FX_intrin_double.h +++ b/Grid/simd/Fujitsu_A64FX_intrin_double.h @@ -144,38 +144,38 @@ Author: Nils Meyer // PREFETCH_CHIMU_L2 (prefetch to L2) #define PREFETCH_CHIMU_L2_INTERNAL_A64FXd(base) \ { \ - svprfd(pg1, (int64_t*)(base + 0), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(base + 256), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(base + 512), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)0), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)4), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)8), SV_PLDL2STRM); \ } // PREFETCH_CHIMU_L1 (prefetch to L1) #define PREFETCH_CHIMU_L1_INTERNAL_A64FXd(base) \ { \ - svprfd(pg1, (int64_t*)(base + 0), SV_PLDL1STRM); \ - svprfd(pg1, (int64_t*)(base + 256), SV_PLDL1STRM); \ - svprfd(pg1, (int64_t*)(base + 512), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)0), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)4), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)8), SV_PLDL1STRM); \ } // PREFETCH_GAUGE_L2 (prefetch to L2) #define PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A) \ { \ const auto & ref(U[sUn](A)); baseU = (uint64_t)&ref + 3 * 3 * 64; \ - svprfd(pg1, (int64_t*)(baseU + -256), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 768), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 1024), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 1280), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 1536), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 1792), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)-4), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)0), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)4), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)8), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)12), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)16), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)20), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)24), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)28), SV_PLDL2STRM); \ } // PREFETCH_GAUGE_L1 (prefetch to L1) #define PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A) \ { \ const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ - svprfd(pg1, 
(int64_t*)(baseU + 0), SV_PLDL1STRM); \ - svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL1STRM); \ - svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)0), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)4), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)8), SV_PLDL1STRM); \ } // LOAD_CHI #define LOAD_CHI_A64FXd(base) \ diff --git a/Grid/simd/Fujitsu_A64FX_intrin_single.h b/Grid/simd/Fujitsu_A64FX_intrin_single.h index 30273b6e..0b874f02 100644 --- a/Grid/simd/Fujitsu_A64FX_intrin_single.h +++ b/Grid/simd/Fujitsu_A64FX_intrin_single.h @@ -144,38 +144,38 @@ Author: Nils Meyer // PREFETCH_CHIMU_L2 (prefetch to L2) #define PREFETCH_CHIMU_L2_INTERNAL_A64FXf(base) \ { \ - svprfd(pg1, (int64_t*)(base + 0), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(base + 256), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(base + 512), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(0), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(4), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(8), SV_PLDL2STRM); \ } // PREFETCH_CHIMU_L1 (prefetch to L1) #define PREFETCH_CHIMU_L1_INTERNAL_A64FXf(base) \ { \ - svprfd(pg1, (int64_t*)(base + 0), SV_PLDL1STRM); \ - svprfd(pg1, (int64_t*)(base + 256), SV_PLDL1STRM); \ - svprfd(pg1, (int64_t*)(base + 512), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(0), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(4), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(8), SV_PLDL1STRM); \ } // PREFETCH_GAUGE_L2 (prefetch to L2) #define PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A) \ { \ const auto & ref(U[sUn](A)); baseU = (uint64_t)&ref + 3 * 3 * 64; \ - svprfd(pg1, (int64_t*)(baseU + -256), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 768), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 1024), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 1280), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 1536), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 1792), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(-4), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(0), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(4), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(8), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(12), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(16), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(20), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(24), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(28), SV_PLDL2STRM); \ } // PREFETCH_GAUGE_L1 (prefetch to L1) #define PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A) \ { \ const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ - svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL1STRM); \ - svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL1STRM); \ - svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(0), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(4), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(8), SV_PLDL1STRM); \ } // LOAD_CHI #define LOAD_CHI_A64FXf(base) \ From 4b882e8056b2c9dd6dceab2729104e5e615835ae Mon Sep 17 00:00:00 2001 From: Nils Meyer Date: Sat, 19 Dec 2020 03:09:20 +0100 Subject: [PATCH 04/16] fixed lost bracket --- 
Grid/simd/Fujitsu_A64FX_intrin_double.h | 36 ++++++++++++------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/Grid/simd/Fujitsu_A64FX_intrin_double.h b/Grid/simd/Fujitsu_A64FX_intrin_double.h index f195e3c5..b645c365 100644 --- a/Grid/simd/Fujitsu_A64FX_intrin_double.h +++ b/Grid/simd/Fujitsu_A64FX_intrin_double.h @@ -144,38 +144,38 @@ Author: Nils Meyer // PREFETCH_CHIMU_L2 (prefetch to L2) #define PREFETCH_CHIMU_L2_INTERNAL_A64FXd(base) \ { \ - svprfd_vnum(pg1, (void*)(base), (int64_t)0), SV_PLDL2STRM); \ - svprfd_vnum(pg1, (void*)(base), (int64_t)4), SV_PLDL2STRM); \ - svprfd_vnum(pg1, (void*)(base), (int64_t)8), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(0), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(4), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(8), SV_PLDL2STRM); \ } // PREFETCH_CHIMU_L1 (prefetch to L1) #define PREFETCH_CHIMU_L1_INTERNAL_A64FXd(base) \ { \ - svprfd_vnum(pg1, (void*)(base), (int64_t)0), SV_PLDL1STRM); \ - svprfd_vnum(pg1, (void*)(base), (int64_t)4), SV_PLDL1STRM); \ - svprfd_vnum(pg1, (void*)(base), (int64_t)8), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(0), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(4), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(8), SV_PLDL1STRM); \ } // PREFETCH_GAUGE_L2 (prefetch to L2) #define PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A) \ { \ const auto & ref(U[sUn](A)); baseU = (uint64_t)&ref + 3 * 3 * 64; \ - svprfd_vnum(pg1, (void*)(baseU), (int64_t)-4), SV_PLDL2STRM); \ - svprfd_vnum(pg1, (void*)(baseU), (int64_t)0), SV_PLDL2STRM); \ - svprfd_vnum(pg1, (void*)(baseU), (int64_t)4), SV_PLDL2STRM); \ - svprfd_vnum(pg1, (void*)(baseU), (int64_t)8), SV_PLDL2STRM); \ - svprfd_vnum(pg1, (void*)(baseU), (int64_t)12), SV_PLDL2STRM); \ - svprfd_vnum(pg1, (void*)(baseU), (int64_t)16), SV_PLDL2STRM); \ - svprfd_vnum(pg1, (void*)(baseU), (int64_t)20), SV_PLDL2STRM); \ - svprfd_vnum(pg1, (void*)(baseU), (int64_t)24), SV_PLDL2STRM); \ - svprfd_vnum(pg1, (void*)(baseU), (int64_t)28), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(-4), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(0), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(4), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(8), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(12), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(16), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(20), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(24), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(28), SV_PLDL2STRM); \ } // PREFETCH_GAUGE_L1 (prefetch to L1) #define PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A) \ { \ const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ - svprfd_vnum(pg1, (void*)(baseU), (int64_t)0), SV_PLDL1STRM); \ - svprfd_vnum(pg1, (void*)(baseU), (int64_t)4), SV_PLDL1STRM); \ - svprfd_vnum(pg1, (void*)(baseU), (int64_t)8), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(0), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(4), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(8), SV_PLDL1STRM); \ } // LOAD_CHI #define LOAD_CHI_A64FXd(base) \ From 6013183361d88fe7179b4fcf6b8321c0621b09ba Mon Sep 17 00:00:00 2001 From: Nils Meyer Date: Sat, 19 Dec 2020 03:25:01 +0100 Subject: [PATCH 05/16] removed Asm impls --- Grid/simd/Fujitsu_A64FX_asm_double.h | 781 --------------------------- Grid/simd/Fujitsu_A64FX_asm_single.h 
| 781 --------------------------- 2 files changed, 1562 deletions(-) delete mode 100644 Grid/simd/Fujitsu_A64FX_asm_double.h delete mode 100644 Grid/simd/Fujitsu_A64FX_asm_single.h diff --git a/Grid/simd/Fujitsu_A64FX_asm_double.h b/Grid/simd/Fujitsu_A64FX_asm_double.h deleted file mode 100644 index bbc4efe7..00000000 --- a/Grid/simd/Fujitsu_A64FX_asm_double.h +++ /dev/null @@ -1,781 +0,0 @@ -/************************************************************************************* - - Grid physics library, www.github.com/paboyle/Grid - - Source file: Fujitsu_A64FX_asm_double.h - - Copyright (C) 2020 - -Author: Nils Meyer - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - - See the full license in the file "LICENSE" in the top level distribution directory -*************************************************************************************/ -/* END LEGAL */ -#define LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_A64FXd(base) -#define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXd(A) -#define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A) -#define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXd(A) -#define PREFETCH_GAUGE_L2(A) PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A) -#define PF_GAUGE(A) -#define PREFETCH_RESULT_L2_STORE(A) PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXd(A) -#define PREFETCH_RESULT_L1_STORE(A) PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXd(A) -#define PREFETCH1_CHIMU(A) PREFETCH_CHIMU_L1(A) -#define PREFETCH_CHIMU(A) PREFETCH_CHIMU_L1(A) -#define LOCK_GAUGE(A) -#define UNLOCK_GAUGE(A) -#define MASK_REGS DECLARATIONS_A64FXd -#define SAVE_RESULT(A,B) RESULT_A64FXd(A); -#define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXd(Dir) -#define MULT_2SPIN_2 MULT_2SPIN_2_A64FXd -#define LOAD_CHI(base) LOAD_CHI_A64FXd(base) -#define ZERO_PSI ZERO_PSI_A64FXd -#define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base) -#define XP_PROJ XP_PROJ_A64FXd -#define YP_PROJ YP_PROJ_A64FXd -#define ZP_PROJ ZP_PROJ_A64FXd -#define TP_PROJ TP_PROJ_A64FXd -#define XM_PROJ XM_PROJ_A64FXd -#define YM_PROJ YM_PROJ_A64FXd -#define ZM_PROJ ZM_PROJ_A64FXd -#define TM_PROJ TM_PROJ_A64FXd -#define XP_RECON XP_RECON_A64FXd -#define XM_RECON XM_RECON_A64FXd -#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXd -#define YM_RECON_ACCUM YM_RECON_ACCUM_A64FXd -#define ZM_RECON_ACCUM ZM_RECON_ACCUM_A64FXd -#define TM_RECON_ACCUM TM_RECON_ACCUM_A64FXd -#define XP_RECON_ACCUM XP_RECON_ACCUM_A64FXd -#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXd -#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXd -#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXd -#define PERMUTE_DIR0 0 -#define PERMUTE_DIR1 1 -#define PERMUTE_DIR2 2 -#define PERMUTE_DIR3 3 -#define PERMUTE PERMUTE_A64FXd; -#define LOAD_TABLE(Dir) if (Dir == 0) { LOAD_TABLE0; } else if (Dir == 1) { LOAD_TABLE1; } else if (Dir == 2) { LOAD_TABLE2; } -#define MAYBEPERM(Dir,perm) if (Dir != 3) { if (perm) { PERMUTE; } } 
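// Editor's sketch of how the kernel body consumes these macros (illustration
// only -- the authoritative sequence lives in WilsonKernelsAsmBodyA64FX.h;
// Dir and perm come from the stencil entry there):
//
//   MASK_REGS;                       // declare registers, set p5, zero z31
//   LOAD_CHIMU(base);                // 12 spinor vectors -> z12..z23
//   XP_PROJ;                         // 4-spinor -> half spinor
//   LOAD_TABLE(PERMUTE_DIR0);        // permute pattern into z30
//   MAYBEPERM(PERMUTE_DIR0, perm);   // lane shuffle if the hop wraps a node
//   MULT_2SPIN_1(Dir); MULT_2SPIN_2; // U times chi
//   XP_RECON;                        // reconstruct the 4-spinor result
//   ...                              // likewise for the other seven legs
//   SAVE_RESULT(base, basep);
//
// Note in passing: lut[3] below is labelled identity but reads
// {0, 1, 2, 4, 5, 6, 7, 8}; it is dead data in double precision, since
// MAYBEPERM above skips Dir == 3 and LOAD_TABLE never selects index 3.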
-// DECLARATIONS -#define DECLARATIONS_A64FXd \ - uint64_t baseU; \ - const uint64_t lut[4][8] = { \ - {4, 5, 6, 7, 0, 1, 2, 3}, \ - {2, 3, 0, 1, 6, 7, 4, 5}, \ - {1, 0, 3, 2, 5, 4, 7, 6}, \ - {0, 1, 2, 4, 5, 6, 7, 8} };\ -asm ( \ - "ptrue p5.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -asm ( \ - "fmov z31.d , 0 \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// RESULT -#define RESULT_A64FXd(base) \ -{ \ -asm ( \ - "str z0, [%[storeptr], -6, mul vl] \n\t" \ - "str z1, [%[storeptr], -5, mul vl] \n\t" \ - "str z2, [%[storeptr], -4, mul vl] \n\t" \ - "str z3, [%[storeptr], -3, mul vl] \n\t" \ - "str z4, [%[storeptr], -2, mul vl] \n\t" \ - "str z5, [%[storeptr], -1, mul vl] \n\t" \ - "str z6, [%[storeptr], 0, mul vl] \n\t" \ - "str z7, [%[storeptr], 1, mul vl] \n\t" \ - "str z8, [%[storeptr], 2, mul vl] \n\t" \ - "str z9, [%[storeptr], 3, mul vl] \n\t" \ - "str z10, [%[storeptr], 4, mul vl] \n\t" \ - "str z11, [%[storeptr], 5, mul vl] \n\t" \ - : \ - : [storeptr] "r" (base + 2 * 3 * 64) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// PREFETCH_CHIMU_L2 (prefetch to L2) -#define PREFETCH_CHIMU_L2_INTERNAL_A64FXd(base) \ -{ \ -asm ( \ - "prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (base) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// PREFETCH_CHIMU_L1 (prefetch to L1) -#define PREFETCH_CHIMU_L1_INTERNAL_A64FXd(base) \ -{ \ -asm ( \ - "prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ - "prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ - "prfd PLDL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (base) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// PREFETCH_GAUGE_L2 (prefetch to L2) -#define PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A) \ -{ \ - const auto & ref(U[sUn](A)); baseU = (uint64_t)&ref + 3 * 3 * 64; \ -asm ( \ - "prfd PLDL2STRM, p5, [%[fetchptr], -4, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 12, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 16, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 20, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 24, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 28, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (baseU) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// 
PREFETCH_GAUGE_L1 (prefetch to L1) -#define PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A) \ -{ \ - const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ -asm ( \ - "prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ - "prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ - "prfd PLDL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (baseU) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// LOAD_CHI -#define LOAD_CHI_A64FXd(base) \ -{ \ -asm ( \ - "ld1d { z12.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1d { z13.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "ld1d { z14.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ - "ld1d { z15.d }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ - "ld1d { z16.d }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ - "ld1d { z17.d }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (base) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// LOAD_CHIMU -#define LOAD_CHIMU_INTERLEAVED_A64FXd(base) \ -{ \ -asm ( \ - "ld1d { z12.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1d { z21.d }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ - "ld1d { z15.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1d { z18.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1d { z13.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1d { z22.d }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ - "ld1d { z16.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1d { z19.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "ld1d { z14.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ - "ld1d { z23.d }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ - "ld1d { z17.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ - "ld1d { z20.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (base + 2 * 3 * 64) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// LOAD_CHIMU_0213 -#define LOAD_CHIMU_0213_A64FXd \ -{ \ - const SiteSpinor & ref(in[offset]); \ -asm ( \ - "ld1d { z12.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1d { z18.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1d { z13.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1d { z19.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "ld1d { z14.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ - "ld1d { z20.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ - "ld1d { z15.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1d { z21.d }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ - "ld1d { z16.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1d { z22.d }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ - "ld1d { z17.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ - "ld1d { z23.d }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (&ref[2][0]) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// LOAD_CHIMU_0312 -#define LOAD_CHIMU_0312_A64FXd \ -{ \ - const SiteSpinor & ref(in[offset]); \ -asm ( \ - "ld1d { z12.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1d { z21.d }, p5/z, [%[fetchptr], 3, mul vl] 
\n\t" \ - "ld1d { z13.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1d { z22.d }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ - "ld1d { z14.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ - "ld1d { z23.d }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ - "ld1d { z15.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1d { z18.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1d { z16.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1d { z19.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "ld1d { z17.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ - "ld1d { z20.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (&ref[2][0]) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// LOAD_TABLE0 -#define LOAD_TABLE0 \ -asm ( \ - "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ - : \ - : [tableptr] "r" (&lut[0]),[index] "i" (0) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// LOAD_TABLE1 -#define LOAD_TABLE1 \ -asm ( \ - "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ - : \ - : [tableptr] "r" (&lut[0]),[index] "i" (1) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// LOAD_TABLE2 -#define LOAD_TABLE2 \ -asm ( \ - "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ - : \ - : [tableptr] "r" (&lut[0]),[index] "i" (2) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// LOAD_TABLE3 -#define LOAD_TABLE3 \ -asm ( \ - "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ - : \ - : [tableptr] "r" (&lut[0]),[index] "i" (3) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// PERMUTE -#define PERMUTE_A64FXd \ -asm ( \ - "tbl z12.d, { z12.d }, z30.d \n\t" \ - "tbl z13.d, { z13.d }, z30.d \n\t" \ - "tbl z14.d, { z14.d }, z30.d \n\t" \ - "tbl z15.d, { z15.d }, z30.d \n\t" \ - "tbl z16.d, { z16.d }, z30.d \n\t" \ - "tbl z17.d, { z17.d }, z30.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// LOAD_GAUGE -#define LOAD_GAUGE(A) \ -{ \ - const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ -asm ( \ - "ld1d { z24.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1d { z25.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1d { z26.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1d { z27.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1d { z28.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1d { z29.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (baseU + 2 * 3 * 64) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// MULT_2SPIN 
-#define MULT_2SPIN_1_A64FXd(A) \ -{ \ - const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ -asm ( \ - "ld1d { z24.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1d { z25.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1d { z26.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1d { z27.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1d { z28.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1d { z29.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "movprfx z18.d, p5/m, z31.d \n\t" \ - "fcmla z18.d, p5/m, z24.d, z12.d, 0 \n\t" \ - "movprfx z21.d, p5/m, z31.d \n\t" \ - "fcmla z21.d, p5/m, z24.d, z15.d, 0 \n\t" \ - "movprfx z19.d, p5/m, z31.d \n\t" \ - "fcmla z19.d, p5/m, z25.d, z12.d, 0 \n\t" \ - "movprfx z22.d, p5/m, z31.d \n\t" \ - "fcmla z22.d, p5/m, z25.d, z15.d, 0 \n\t" \ - "movprfx z20.d, p5/m, z31.d \n\t" \ - "fcmla z20.d, p5/m, z26.d, z12.d, 0 \n\t" \ - "movprfx z23.d, p5/m, z31.d \n\t" \ - "fcmla z23.d, p5/m, z26.d, z15.d, 0 \n\t" \ - "fcmla z18.d, p5/m, z24.d, z12.d, 90 \n\t" \ - "fcmla z21.d, p5/m, z24.d, z15.d, 90 \n\t" \ - "fcmla z19.d, p5/m, z25.d, z12.d, 90 \n\t" \ - "fcmla z22.d, p5/m, z25.d, z15.d, 90 \n\t" \ - "fcmla z20.d, p5/m, z26.d, z12.d, 90 \n\t" \ - "fcmla z23.d, p5/m, z26.d, z15.d, 90 \n\t" \ - "ld1d { z24.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ - "ld1d { z25.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ - "ld1d { z26.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (baseU + 2 * 3 * 64) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// MULT_2SPIN_BACKEND -#define MULT_2SPIN_2_A64FXd \ -{ \ -asm ( \ - "fcmla z18.d, p5/m, z27.d, z13.d, 0 \n\t" \ - "fcmla z21.d, p5/m, z27.d, z16.d, 0 \n\t" \ - "fcmla z19.d, p5/m, z28.d, z13.d, 0 \n\t" \ - "fcmla z22.d, p5/m, z28.d, z16.d, 0 \n\t" \ - "fcmla z20.d, p5/m, z29.d, z13.d, 0 \n\t" \ - "fcmla z23.d, p5/m, z29.d, z16.d, 0 \n\t" \ - "fcmla z18.d, p5/m, z27.d, z13.d, 90 \n\t" \ - "fcmla z21.d, p5/m, z27.d, z16.d, 90 \n\t" \ - "fcmla z19.d, p5/m, z28.d, z13.d, 90 \n\t" \ - "fcmla z22.d, p5/m, z28.d, z16.d, 90 \n\t" \ - "fcmla z20.d, p5/m, z29.d, z13.d, 90 \n\t" \ - "fcmla z23.d, p5/m, z29.d, z16.d, 90 \n\t" \ - "fcmla z18.d, p5/m, z24.d, z14.d, 0 \n\t" \ - "fcmla z21.d, p5/m, z24.d, z17.d, 0 \n\t" \ - "fcmla z19.d, p5/m, z25.d, z14.d, 0 \n\t" \ - "fcmla z22.d, p5/m, z25.d, z17.d, 0 \n\t" \ - "fcmla z20.d, p5/m, z26.d, z14.d, 0 \n\t" \ - "fcmla z23.d, p5/m, z26.d, z17.d, 0 \n\t" \ - "fcmla z18.d, p5/m, z24.d, z14.d, 90 \n\t" \ - "fcmla z21.d, p5/m, z24.d, z17.d, 90 \n\t" \ - "fcmla z19.d, p5/m, z25.d, z14.d, 90 \n\t" \ - "fcmla z22.d, p5/m, z25.d, z17.d, 90 \n\t" \ - "fcmla z20.d, p5/m, z26.d, z14.d, 90 \n\t" \ - "fcmla z23.d, p5/m, z26.d, z17.d, 90 \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// XP_PROJ -#define XP_PROJ_A64FXd \ -{ \ -asm ( \ - "fcadd z12.d, p5/m, z12.d, z21.d, 90 \n\t" \ - "fcadd z13.d, p5/m, z13.d, z22.d, 90 \n\t" \ - "fcadd z14.d, p5/m, z14.d, z23.d, 90 \n\t" \ - "fcadd z15.d, p5/m, z15.d, z18.d, 90 \n\t" \ - "fcadd z16.d, p5/m, z16.d, z19.d, 90 \n\t" \ - "fcadd z17.d, p5/m, z17.d, z20.d, 90 \n\t" \ - : \ - : \ - : 
"p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// XP_RECON -#define XP_RECON_A64FXd \ -asm ( \ - "movprfx z6.d, p5/m, z31.d \n\t" \ - "fcadd z6.d, p5/m, z6.d, z21.d, 270 \n\t" \ - "movprfx z7.d, p5/m, z31.d \n\t" \ - "fcadd z7.d, p5/m, z7.d, z22.d, 270 \n\t" \ - "movprfx z8.d, p5/m, z31.d \n\t" \ - "fcadd z8.d, p5/m, z8.d, z23.d, 270 \n\t" \ - "movprfx z9.d, p5/m, z31.d \n\t" \ - "fcadd z9.d, p5/m, z9.d, z18.d, 270 \n\t" \ - "movprfx z10.d, p5/m, z31.d \n\t" \ - "fcadd z10.d, p5/m, z10.d, z19.d, 270 \n\t" \ - "movprfx z11.d, p5/m, z31.d \n\t" \ - "fcadd z11.d, p5/m, z11.d, z20.d, 270 \n\t" \ - "mov z0.d, p5/m, z18.d \n\t" \ - "mov z1.d, p5/m, z19.d \n\t" \ - "mov z2.d, p5/m, z20.d \n\t" \ - "mov z3.d, p5/m, z21.d \n\t" \ - "mov z4.d, p5/m, z22.d \n\t" \ - "mov z5.d, p5/m, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// XP_RECON_ACCUM -#define XP_RECON_ACCUM_A64FXd \ -asm ( \ - "fcadd z9.d, p5/m, z9.d, z18.d, 270 \n\t" \ - "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ - "fcadd z10.d, p5/m, z10.d, z19.d, 270 \n\t" \ - "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ - "fcadd z11.d, p5/m, z11.d, z20.d, 270 \n\t" \ - "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ - "fcadd z6.d, p5/m, z6.d, z21.d, 270 \n\t" \ - "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ - "fcadd z7.d, p5/m, z7.d, z22.d, 270 \n\t" \ - "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ - "fcadd z8.d, p5/m, z8.d, z23.d, 270 \n\t" \ - "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// YP_PROJ -#define YP_PROJ_A64FXd \ -{ \ -asm ( \ - "fsub z12.d, p5/m, z12.d, z21.d \n\t" \ - "fsub z13.d, p5/m, z13.d, z22.d \n\t" \ - "fsub z14.d, p5/m, z14.d, z23.d \n\t" \ - "fadd z15.d, p5/m, z15.d, z18.d \n\t" \ - "fadd z16.d, p5/m, z16.d, z19.d \n\t" \ - "fadd z17.d, p5/m, z17.d, z20.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// ZP_PROJ -#define ZP_PROJ_A64FXd \ -{ \ -asm ( \ - "fcadd z12.d, p5/m, z12.d, z18.d, 90 \n\t" \ - "fcadd z13.d, p5/m, z13.d, z19.d, 90 \n\t" \ - "fcadd z14.d, p5/m, z14.d, z20.d, 90 \n\t" \ - "fcadd z15.d, p5/m, z15.d, z21.d, 270 \n\t" \ - "fcadd z16.d, p5/m, z16.d, z22.d, 270 \n\t" \ - "fcadd z17.d, p5/m, z17.d, z23.d, 270 \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// TP_PROJ -#define TP_PROJ_A64FXd \ -{ \ -asm ( \ - "fadd z12.d, p5/m, z12.d, z18.d \n\t" \ - "fadd z13.d, p5/m, z13.d, z19.d \n\t" \ - "fadd z14.d, p5/m, z14.d, z20.d \n\t" \ - "fadd z15.d, p5/m, z15.d, z21.d \n\t" \ - "fadd z16.d, p5/m, z16.d, z22.d \n\t" \ - "fadd z17.d, p5/m, z17.d, z23.d \n\t" \ - : \ - : \ - : 
"p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// XM_PROJ -#define XM_PROJ_A64FXd \ -{ \ -asm ( \ - "fcadd z12.d, p5/m, z12.d, z21.d, 270 \n\t" \ - "fcadd z13.d, p5/m, z13.d, z22.d, 270 \n\t" \ - "fcadd z14.d, p5/m, z14.d, z23.d, 270 \n\t" \ - "fcadd z15.d, p5/m, z15.d, z18.d, 270 \n\t" \ - "fcadd z16.d, p5/m, z16.d, z19.d, 270 \n\t" \ - "fcadd z17.d, p5/m, z17.d, z20.d, 270 \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// XM_RECON -#define XM_RECON_A64FXd \ -asm ( \ - "movprfx z6.d, p5/m, z31.d \n\t" \ - "fcadd z6.d, p5/m, z6.d, z21.d, 90 \n\t" \ - "movprfx z7.d, p5/m, z31.d \n\t" \ - "fcadd z7.d, p5/m, z7.d, z22.d, 90 \n\t" \ - "movprfx z8.d, p5/m, z31.d \n\t" \ - "fcadd z8.d, p5/m, z8.d, z23.d, 90 \n\t" \ - "movprfx z9.d, p5/m, z31.d \n\t" \ - "fcadd z9.d, p5/m, z9.d, z18.d, 90 \n\t" \ - "movprfx z10.d, p5/m, z31.d \n\t" \ - "fcadd z10.d, p5/m, z10.d, z19.d, 90 \n\t" \ - "movprfx z11.d, p5/m, z31.d \n\t" \ - "fcadd z11.d, p5/m, z11.d, z20.d, 90 \n\t" \ - "mov z0.d, p5/m, z18.d \n\t" \ - "mov z1.d, p5/m, z19.d \n\t" \ - "mov z2.d, p5/m, z20.d \n\t" \ - "mov z3.d, p5/m, z21.d \n\t" \ - "mov z4.d, p5/m, z22.d \n\t" \ - "mov z5.d, p5/m, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// YM_PROJ -#define YM_PROJ_A64FXd \ -{ \ -asm ( \ - "fadd z12.d, p5/m, z12.d, z21.d \n\t" \ - "fadd z13.d, p5/m, z13.d, z22.d \n\t" \ - "fadd z14.d, p5/m, z14.d, z23.d \n\t" \ - "fsub z15.d, p5/m, z15.d, z18.d \n\t" \ - "fsub z16.d, p5/m, z16.d, z19.d \n\t" \ - "fsub z17.d, p5/m, z17.d, z20.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// ZM_PROJ -#define ZM_PROJ_A64FXd \ -{ \ -asm ( \ - "fcadd z12.d, p5/m, z12.d, z18.d, 270 \n\t" \ - "fcadd z13.d, p5/m, z13.d, z19.d, 270 \n\t" \ - "fcadd z14.d, p5/m, z14.d, z20.d, 270 \n\t" \ - "fcadd z15.d, p5/m, z15.d, z21.d, 90 \n\t" \ - "fcadd z16.d, p5/m, z16.d, z22.d, 90 \n\t" \ - "fcadd z17.d, p5/m, z17.d, z23.d, 90 \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// TM_PROJ -#define TM_PROJ_A64FXd \ -{ \ -asm ( \ - "fsub z12.d, p5/m, z12.d, z18.d \n\t" \ - "fsub z13.d, p5/m, z13.d, z19.d \n\t" \ - "fsub z14.d, p5/m, z14.d, z20.d \n\t" \ - "fsub z15.d, p5/m, z15.d, z21.d \n\t" \ - "fsub z16.d, p5/m, z16.d, z22.d \n\t" \ - "fsub z17.d, p5/m, z17.d, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// XM_RECON_ACCUM -#define XM_RECON_ACCUM_A64FXd \ -asm ( \ - "fcadd z9.d, p5/m, z9.d, z18.d, 90 \n\t" \ - "fcadd z10.d, p5/m, z10.d, z19.d, 90 \n\t" \ - "fcadd z11.d, p5/m, z11.d, z20.d, 90 \n\t" \ - "fcadd z6.d, p5/m, z6.d, 
z21.d, 90 \n\t" \ - "fcadd z7.d, p5/m, z7.d, z22.d, 90 \n\t" \ - "fcadd z8.d, p5/m, z8.d, z23.d, 90 \n\t" \ - "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ - "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ - "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ - "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ - "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ - "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// YP_RECON_ACCUM -#define YP_RECON_ACCUM_A64FXd \ -asm ( \ - "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ - "fsub z9.d, p5/m, z9.d, z18.d \n\t" \ - "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ - "fsub z10.d, p5/m, z10.d, z19.d \n\t" \ - "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ - "fsub z11.d, p5/m, z11.d, z20.d \n\t" \ - "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ - "fadd z6.d, p5/m, z6.d, z21.d \n\t" \ - "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ - "fadd z7.d, p5/m, z7.d, z22.d \n\t" \ - "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ - "fadd z8.d, p5/m, z8.d, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// YM_RECON_ACCUM -#define YM_RECON_ACCUM_A64FXd \ -asm ( \ - "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ - "fadd z9.d, p5/m, z9.d, z18.d \n\t" \ - "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ - "fadd z10.d, p5/m, z10.d, z19.d \n\t" \ - "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ - "fadd z11.d, p5/m, z11.d, z20.d \n\t" \ - "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ - "fsub z6.d, p5/m, z6.d, z21.d \n\t" \ - "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ - "fsub z7.d, p5/m, z7.d, z22.d \n\t" \ - "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ - "fsub z8.d, p5/m, z8.d, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// ZP_RECON_ACCUM -#define ZP_RECON_ACCUM_A64FXd \ -asm ( \ - "fcadd z6.d, p5/m, z6.d, z18.d, 270 \n\t" \ - "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ - "fcadd z7.d, p5/m, z7.d, z19.d, 270 \n\t" \ - "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ - "fcadd z8.d, p5/m, z8.d, z20.d, 270 \n\t" \ - "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ - "fcadd z9.d, p5/m, z9.d, z21.d, 90 \n\t" \ - "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ - "fcadd z10.d, p5/m, z10.d, z22.d, 90 \n\t" \ - "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ - "fcadd z11.d, p5/m, z11.d, z23.d, 90 \n\t" \ - "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// ZM_RECON_ACCUM -#define ZM_RECON_ACCUM_A64FXd \ -asm ( \ - "fcadd z6.d, p5/m, z6.d, z18.d, 90 \n\t" \ - "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ - "fcadd z7.d, p5/m, z7.d, z19.d, 90 \n\t" \ - "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ - "fcadd z8.d, p5/m, z8.d, z20.d, 90 \n\t" \ - "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ - "fcadd z9.d, p5/m, z9.d, z21.d, 270 \n\t" \ - "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ - "fcadd z10.d, p5/m, z10.d, z22.d, 270 \n\t" \ - "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ - "fcadd z11.d, p5/m, z11.d, z23.d, 270 \n\t" \ - "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ - : \ - : \ - : 
"p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// TP_RECON_ACCUM -#define TP_RECON_ACCUM_A64FXd \ -asm ( \ - "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ - "fadd z6.d, p5/m, z6.d, z18.d \n\t" \ - "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ - "fadd z7.d, p5/m, z7.d, z19.d \n\t" \ - "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ - "fadd z8.d, p5/m, z8.d, z20.d \n\t" \ - "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ - "fadd z9.d, p5/m, z9.d, z21.d \n\t" \ - "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ - "fadd z10.d, p5/m, z10.d, z22.d \n\t" \ - "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ - "fadd z11.d, p5/m, z11.d, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// TM_RECON_ACCUM -#define TM_RECON_ACCUM_A64FXd \ -asm ( \ - "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ - "fsub z6.d, p5/m, z6.d, z18.d \n\t" \ - "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ - "fsub z7.d, p5/m, z7.d, z19.d \n\t" \ - "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ - "fsub z8.d, p5/m, z8.d, z20.d \n\t" \ - "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ - "fsub z9.d, p5/m, z9.d, z21.d \n\t" \ - "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ - "fsub z10.d, p5/m, z10.d, z22.d \n\t" \ - "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ - "fsub z11.d, p5/m, z11.d, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// ZERO_PSI -#define ZERO_PSI_A64FXd \ -asm ( \ - "fmov z0.d , 0 \n\t" \ - "fmov z1.d , 0 \n\t" \ - "fmov z2.d , 0 \n\t" \ - "fmov z3.d , 0 \n\t" \ - "fmov z4.d , 0 \n\t" \ - "fmov z5.d , 0 \n\t" \ - "fmov z6.d , 0 \n\t" \ - "fmov z7.d , 0 \n\t" \ - "fmov z8.d , 0 \n\t" \ - "fmov z9.d , 0 \n\t" \ - "fmov z10.d , 0 \n\t" \ - "fmov z11.d , 0 \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// PREFETCH_RESULT_L2_STORE (uses DC ZVA for cache line zeroing) -#define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXd(base) \ -{ \ -asm ( \ - "dc zva, %[fetchptr]\n\t" \ - "dc zva, %[fetchptr]\n\t" \ - "dc zva, %[fetchptr]\n\t" \ - : \ - : [fetchptr] "r" (base) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// PREFETCH_RESULT_L1_STORE (prefetch store to L1) -#define PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXd(base) \ -{ \ -asm ( \ - "prfd PSTL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ - "prfd PSTL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ - "prfd PSTL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (base) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// ADD_RESULT_INTERNAL -#define ADD_RESULT_INTERNAL_A64FXd \ -asm ( \ - "fadd z0.d, p5/m, z0.d, z12.d \n\t" \ - "fadd z1.d, p5/m, z1.d, z13.d \n\t" \ - "fadd z2.d, p5/m, z2.d, z14.d \n\t" \ - "fadd z3.d, p5/m, z3.d, z15.d \n\t" \ - "fadd z4.d, 
p5/m, z4.d, z16.d \n\t" \ - "fadd z5.d, p5/m, z5.d, z17.d \n\t" \ - "fadd z6.d, p5/m, z6.d, z18.d \n\t" \ - "fadd z7.d, p5/m, z7.d, z19.d \n\t" \ - "fadd z8.d, p5/m, z8.d, z20.d \n\t" \ - "fadd z9.d, p5/m, z9.d, z21.d \n\t" \ - "fadd z10.d, p5/m, z10.d, z22.d \n\t" \ - "fadd z11.d, p5/m, z11.d, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - diff --git a/Grid/simd/Fujitsu_A64FX_asm_single.h b/Grid/simd/Fujitsu_A64FX_asm_single.h deleted file mode 100644 index e629f617..00000000 --- a/Grid/simd/Fujitsu_A64FX_asm_single.h +++ /dev/null @@ -1,781 +0,0 @@ -/************************************************************************************* - - Grid physics library, www.github.com/paboyle/Grid - - Source file: Fujitsu_A64FX_asm_single.h - - Copyright (C) 2020 - -Author: Nils Meyer - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - - See the full license in the file "LICENSE" in the top level distribution directory -*************************************************************************************/ -/* END LEGAL */ -#define LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_A64FXf(base) -#define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXf(A) -#define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A) -#define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXf(A) -#define PREFETCH_GAUGE_L2(A) PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A) -#define PF_GAUGE(A) -#define PREFETCH_RESULT_L2_STORE(A) PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXf(A) -#define PREFETCH_RESULT_L1_STORE(A) PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXf(A) -#define PREFETCH1_CHIMU(A) PREFETCH_CHIMU_L1(A) -#define PREFETCH_CHIMU(A) PREFETCH_CHIMU_L1(A) -#define LOCK_GAUGE(A) -#define UNLOCK_GAUGE(A) -#define MASK_REGS DECLARATIONS_A64FXf -#define SAVE_RESULT(A,B) RESULT_A64FXf(A); -#define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXf(Dir) -#define MULT_2SPIN_2 MULT_2SPIN_2_A64FXf -#define LOAD_CHI(base) LOAD_CHI_A64FXf(base) -#define ZERO_PSI ZERO_PSI_A64FXf -#define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base) -#define XP_PROJ XP_PROJ_A64FXf -#define YP_PROJ YP_PROJ_A64FXf -#define ZP_PROJ ZP_PROJ_A64FXf -#define TP_PROJ TP_PROJ_A64FXf -#define XM_PROJ XM_PROJ_A64FXf -#define YM_PROJ YM_PROJ_A64FXf -#define ZM_PROJ ZM_PROJ_A64FXf -#define TM_PROJ TM_PROJ_A64FXf -#define XP_RECON XP_RECON_A64FXf -#define XM_RECON XM_RECON_A64FXf -#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXf -#define YM_RECON_ACCUM YM_RECON_ACCUM_A64FXf -#define ZM_RECON_ACCUM ZM_RECON_ACCUM_A64FXf -#define TM_RECON_ACCUM TM_RECON_ACCUM_A64FXf -#define XP_RECON_ACCUM XP_RECON_ACCUM_A64FXf -#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXf -#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXf -#define TP_RECON_ACCUM 
TP_RECON_ACCUM_A64FXf -#define PERMUTE_DIR0 0 -#define PERMUTE_DIR1 1 -#define PERMUTE_DIR2 2 -#define PERMUTE_DIR3 3 -#define PERMUTE PERMUTE_A64FXf; -#define LOAD_TABLE(Dir) if (Dir == 0) { LOAD_TABLE0; } else if (Dir == 1) { LOAD_TABLE1 } else if (Dir == 2) { LOAD_TABLE2; } else if (Dir == 3) { LOAD_TABLE3; } -#define MAYBEPERM(A,perm) if (perm) { PERMUTE; } -// DECLARATIONS -#define DECLARATIONS_A64FXf \ - uint64_t baseU; \ - const uint32_t lut[4][16] = { \ - {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}, \ - {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}, \ - {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}, \ - {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14} }; \ -asm ( \ - "ptrue p5.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -asm ( \ - "fmov z31.s , 0 \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// RESULT -#define RESULT_A64FXf(base) \ -{ \ -asm ( \ - "str z0, [%[storeptr], -6, mul vl] \n\t" \ - "str z1, [%[storeptr], -5, mul vl] \n\t" \ - "str z2, [%[storeptr], -4, mul vl] \n\t" \ - "str z3, [%[storeptr], -3, mul vl] \n\t" \ - "str z4, [%[storeptr], -2, mul vl] \n\t" \ - "str z5, [%[storeptr], -1, mul vl] \n\t" \ - "str z6, [%[storeptr], 0, mul vl] \n\t" \ - "str z7, [%[storeptr], 1, mul vl] \n\t" \ - "str z8, [%[storeptr], 2, mul vl] \n\t" \ - "str z9, [%[storeptr], 3, mul vl] \n\t" \ - "str z10, [%[storeptr], 4, mul vl] \n\t" \ - "str z11, [%[storeptr], 5, mul vl] \n\t" \ - : \ - : [storeptr] "r" (base + 2 * 3 * 64) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// PREFETCH_CHIMU_L2 (prefetch to L2) -#define PREFETCH_CHIMU_L2_INTERNAL_A64FXf(base) \ -{ \ -asm ( \ - "prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (base) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// PREFETCH_CHIMU_L1 (prefetch to L1) -#define PREFETCH_CHIMU_L1_INTERNAL_A64FXf(base) \ -{ \ -asm ( \ - "prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ - "prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ - "prfd PLDL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (base) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// PREFETCH_GAUGE_L2 (prefetch to L2) -#define PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A) \ -{ \ - const auto & ref(U[sUn](A)); baseU = (uint64_t)&ref + 3 * 3 * 64; \ -asm ( \ - "prfd PLDL2STRM, p5, [%[fetchptr], -4, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 12, mul vl] 
\n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 16, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 20, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 24, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 28, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (baseU) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// PREFETCH_GAUGE_L1 (prefetch to L1) -#define PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A) \ -{ \ - const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ -asm ( \ - "prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ - "prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ - "prfd PLDL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (baseU) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// LOAD_CHI -#define LOAD_CHI_A64FXf(base) \ -{ \ -asm ( \ - "ld1w { z12.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1w { z13.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "ld1w { z14.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ - "ld1w { z15.s }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ - "ld1w { z16.s }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ - "ld1w { z17.s }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (base) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// LOAD_CHIMU -#define LOAD_CHIMU_INTERLEAVED_A64FXf(base) \ -{ \ -asm ( \ - "ld1w { z12.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1w { z21.s }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ - "ld1w { z15.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1w { z18.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1w { z13.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1w { z22.s }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ - "ld1w { z16.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1w { z19.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "ld1w { z14.s }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ - "ld1w { z23.s }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ - "ld1w { z17.s }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ - "ld1w { z20.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (base + 2 * 3 * 64) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// LOAD_CHIMU_0213 -#define LOAD_CHIMU_0213_A64FXf \ -{ \ - const SiteSpinor & ref(in[offset]); \ -asm ( \ - "ld1w { z12.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1w { z18.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1w { z13.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1w { z19.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "ld1w { z14.s }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ - "ld1w { z20.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ - "ld1w { z15.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1w { z21.s }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ - "ld1w { z16.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1w { z22.s }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ - "ld1w { z17.s }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ - "ld1w { z23.s }, p5/z, [%[fetchptr], 
5, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (&ref[2][0]) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// LOAD_CHIMU_0312 -#define LOAD_CHIMU_0312_A64FXf \ -{ \ - const SiteSpinor & ref(in[offset]); \ -asm ( \ - "ld1w { z12.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1w { z21.s }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ - "ld1w { z13.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1w { z22.s }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ - "ld1w { z14.s }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ - "ld1w { z23.s }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ - "ld1w { z15.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1w { z18.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1w { z16.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1w { z19.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "ld1w { z17.s }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ - "ld1w { z20.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (&ref[2][0]) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// LOAD_TABLE0 -#define LOAD_TABLE0 \ -asm ( \ - "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ - : \ - : [tableptr] "r" (&lut[0]),[index] "i" (0) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// LOAD_TABLE1 -#define LOAD_TABLE1 \ -asm ( \ - "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ - : \ - : [tableptr] "r" (&lut[0]),[index] "i" (1) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// LOAD_TABLE2 -#define LOAD_TABLE2 \ -asm ( \ - "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ - : \ - : [tableptr] "r" (&lut[0]),[index] "i" (2) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// LOAD_TABLE3 -#define LOAD_TABLE3 \ -asm ( \ - "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ - : \ - : [tableptr] "r" (&lut[0]),[index] "i" (3) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// PERMUTE -#define PERMUTE_A64FXf \ -asm ( \ - "tbl z12.s, { z12.s }, z30.s \n\t" \ - "tbl z13.s, { z13.s }, z30.s \n\t" \ - "tbl z14.s, { z14.s }, z30.s \n\t" \ - "tbl z15.s, { z15.s }, z30.s \n\t" \ - "tbl z16.s, { z16.s }, z30.s \n\t" \ - "tbl z17.s, { z17.s }, z30.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// LOAD_GAUGE -#define LOAD_GAUGE(A) \ -{ \ - const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ -asm ( \ - "ld1w { z24.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1w { z25.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1w { z26.s }, p5/z, 
[%[fetchptr], 0, mul vl] \n\t" \ - "ld1w { z27.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1w { z28.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1w { z29.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (baseU + 2 * 3 * 64) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// MULT_2SPIN -#define MULT_2SPIN_1_A64FXf(A) \ -{ \ - const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ -asm ( \ - "ld1w { z24.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1w { z25.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1w { z26.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1w { z27.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1w { z28.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1w { z29.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "movprfx z18.s, p5/m, z31.s \n\t" \ - "fcmla z18.s, p5/m, z24.s, z12.s, 0 \n\t" \ - "movprfx z21.s, p5/m, z31.s \n\t" \ - "fcmla z21.s, p5/m, z24.s, z15.s, 0 \n\t" \ - "movprfx z19.s, p5/m, z31.s \n\t" \ - "fcmla z19.s, p5/m, z25.s, z12.s, 0 \n\t" \ - "movprfx z22.s, p5/m, z31.s \n\t" \ - "fcmla z22.s, p5/m, z25.s, z15.s, 0 \n\t" \ - "movprfx z20.s, p5/m, z31.s \n\t" \ - "fcmla z20.s, p5/m, z26.s, z12.s, 0 \n\t" \ - "movprfx z23.s, p5/m, z31.s \n\t" \ - "fcmla z23.s, p5/m, z26.s, z15.s, 0 \n\t" \ - "fcmla z18.s, p5/m, z24.s, z12.s, 90 \n\t" \ - "fcmla z21.s, p5/m, z24.s, z15.s, 90 \n\t" \ - "fcmla z19.s, p5/m, z25.s, z12.s, 90 \n\t" \ - "fcmla z22.s, p5/m, z25.s, z15.s, 90 \n\t" \ - "fcmla z20.s, p5/m, z26.s, z12.s, 90 \n\t" \ - "fcmla z23.s, p5/m, z26.s, z15.s, 90 \n\t" \ - "ld1w { z24.s }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ - "ld1w { z25.s }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ - "ld1w { z26.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (baseU + 2 * 3 * 64) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// MULT_2SPIN_BACKEND -#define MULT_2SPIN_2_A64FXf \ -{ \ -asm ( \ - "fcmla z18.s, p5/m, z27.s, z13.s, 0 \n\t" \ - "fcmla z21.s, p5/m, z27.s, z16.s, 0 \n\t" \ - "fcmla z19.s, p5/m, z28.s, z13.s, 0 \n\t" \ - "fcmla z22.s, p5/m, z28.s, z16.s, 0 \n\t" \ - "fcmla z20.s, p5/m, z29.s, z13.s, 0 \n\t" \ - "fcmla z23.s, p5/m, z29.s, z16.s, 0 \n\t" \ - "fcmla z18.s, p5/m, z27.s, z13.s, 90 \n\t" \ - "fcmla z21.s, p5/m, z27.s, z16.s, 90 \n\t" \ - "fcmla z19.s, p5/m, z28.s, z13.s, 90 \n\t" \ - "fcmla z22.s, p5/m, z28.s, z16.s, 90 \n\t" \ - "fcmla z20.s, p5/m, z29.s, z13.s, 90 \n\t" \ - "fcmla z23.s, p5/m, z29.s, z16.s, 90 \n\t" \ - "fcmla z18.s, p5/m, z24.s, z14.s, 0 \n\t" \ - "fcmla z21.s, p5/m, z24.s, z17.s, 0 \n\t" \ - "fcmla z19.s, p5/m, z25.s, z14.s, 0 \n\t" \ - "fcmla z22.s, p5/m, z25.s, z17.s, 0 \n\t" \ - "fcmla z20.s, p5/m, z26.s, z14.s, 0 \n\t" \ - "fcmla z23.s, p5/m, z26.s, z17.s, 0 \n\t" \ - "fcmla z18.s, p5/m, z24.s, z14.s, 90 \n\t" \ - "fcmla z21.s, p5/m, z24.s, z17.s, 90 \n\t" \ - "fcmla z19.s, p5/m, z25.s, z14.s, 90 \n\t" \ - "fcmla z22.s, p5/m, z25.s, z17.s, 90 \n\t" \ - "fcmla z20.s, p5/m, z26.s, z14.s, 90 \n\t" \ - "fcmla z23.s, p5/m, z26.s, z17.s, 90 \n\t" \ - : \ - : \ - : 
"p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// XP_PROJ -#define XP_PROJ_A64FXf \ -{ \ -asm ( \ - "fcadd z12.s, p5/m, z12.s, z21.s, 90 \n\t" \ - "fcadd z13.s, p5/m, z13.s, z22.s, 90 \n\t" \ - "fcadd z14.s, p5/m, z14.s, z23.s, 90 \n\t" \ - "fcadd z15.s, p5/m, z15.s, z18.s, 90 \n\t" \ - "fcadd z16.s, p5/m, z16.s, z19.s, 90 \n\t" \ - "fcadd z17.s, p5/m, z17.s, z20.s, 90 \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// XP_RECON -#define XP_RECON_A64FXf \ -asm ( \ - "movprfx z6.s, p5/m, z31.s \n\t" \ - "fcadd z6.s, p5/m, z6.s, z21.s, 270 \n\t" \ - "movprfx z7.s, p5/m, z31.s \n\t" \ - "fcadd z7.s, p5/m, z7.s, z22.s, 270 \n\t" \ - "movprfx z8.s, p5/m, z31.s \n\t" \ - "fcadd z8.s, p5/m, z8.s, z23.s, 270 \n\t" \ - "movprfx z9.s, p5/m, z31.s \n\t" \ - "fcadd z9.s, p5/m, z9.s, z18.s, 270 \n\t" \ - "movprfx z10.s, p5/m, z31.s \n\t" \ - "fcadd z10.s, p5/m, z10.s, z19.s, 270 \n\t" \ - "movprfx z11.s, p5/m, z31.s \n\t" \ - "fcadd z11.s, p5/m, z11.s, z20.s, 270 \n\t" \ - "mov z0.s, p5/m, z18.s \n\t" \ - "mov z1.s, p5/m, z19.s \n\t" \ - "mov z2.s, p5/m, z20.s \n\t" \ - "mov z3.s, p5/m, z21.s \n\t" \ - "mov z4.s, p5/m, z22.s \n\t" \ - "mov z5.s, p5/m, z23.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// XP_RECON_ACCUM -#define XP_RECON_ACCUM_A64FXf \ -asm ( \ - "fcadd z9.s, p5/m, z9.s, z18.s, 270 \n\t" \ - "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ - "fcadd z10.s, p5/m, z10.s, z19.s, 270 \n\t" \ - "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ - "fcadd z11.s, p5/m, z11.s, z20.s, 270 \n\t" \ - "fadd z2.s, p5/m, z2.s, z20.s \n\t" \ - "fcadd z6.s, p5/m, z6.s, z21.s, 270 \n\t" \ - "fadd z3.s, p5/m, z3.s, z21.s \n\t" \ - "fcadd z7.s, p5/m, z7.s, z22.s, 270 \n\t" \ - "fadd z4.s, p5/m, z4.s, z22.s \n\t" \ - "fcadd z8.s, p5/m, z8.s, z23.s, 270 \n\t" \ - "fadd z5.s, p5/m, z5.s, z23.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// YP_PROJ -#define YP_PROJ_A64FXf \ -{ \ -asm ( \ - "fsub z12.s, p5/m, z12.s, z21.s \n\t" \ - "fsub z13.s, p5/m, z13.s, z22.s \n\t" \ - "fsub z14.s, p5/m, z14.s, z23.s \n\t" \ - "fadd z15.s, p5/m, z15.s, z18.s \n\t" \ - "fadd z16.s, p5/m, z16.s, z19.s \n\t" \ - "fadd z17.s, p5/m, z17.s, z20.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// ZP_PROJ -#define ZP_PROJ_A64FXf \ -{ \ -asm ( \ - "fcadd z12.s, p5/m, z12.s, z18.s, 90 \n\t" \ - "fcadd z13.s, p5/m, z13.s, z19.s, 90 \n\t" \ - "fcadd z14.s, p5/m, z14.s, z20.s, 90 \n\t" \ - "fcadd z15.s, p5/m, z15.s, z21.s, 270 \n\t" \ - "fcadd z16.s, p5/m, z16.s, z22.s, 270 \n\t" \ - "fcadd z17.s, p5/m, z17.s, z23.s, 270 \n\t" \ - : \ - : \ - : 
"p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// TP_PROJ -#define TP_PROJ_A64FXf \ -{ \ -asm ( \ - "fadd z12.s, p5/m, z12.s, z18.s \n\t" \ - "fadd z13.s, p5/m, z13.s, z19.s \n\t" \ - "fadd z14.s, p5/m, z14.s, z20.s \n\t" \ - "fadd z15.s, p5/m, z15.s, z21.s \n\t" \ - "fadd z16.s, p5/m, z16.s, z22.s \n\t" \ - "fadd z17.s, p5/m, z17.s, z23.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// XM_PROJ -#define XM_PROJ_A64FXf \ -{ \ -asm ( \ - "fcadd z12.s, p5/m, z12.s, z21.s, 270 \n\t" \ - "fcadd z13.s, p5/m, z13.s, z22.s, 270 \n\t" \ - "fcadd z14.s, p5/m, z14.s, z23.s, 270 \n\t" \ - "fcadd z15.s, p5/m, z15.s, z18.s, 270 \n\t" \ - "fcadd z16.s, p5/m, z16.s, z19.s, 270 \n\t" \ - "fcadd z17.s, p5/m, z17.s, z20.s, 270 \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// XM_RECON -#define XM_RECON_A64FXf \ -asm ( \ - "movprfx z6.s, p5/m, z31.s \n\t" \ - "fcadd z6.s, p5/m, z6.s, z21.s, 90 \n\t" \ - "movprfx z7.s, p5/m, z31.s \n\t" \ - "fcadd z7.s, p5/m, z7.s, z22.s, 90 \n\t" \ - "movprfx z8.s, p5/m, z31.s \n\t" \ - "fcadd z8.s, p5/m, z8.s, z23.s, 90 \n\t" \ - "movprfx z9.s, p5/m, z31.s \n\t" \ - "fcadd z9.s, p5/m, z9.s, z18.s, 90 \n\t" \ - "movprfx z10.s, p5/m, z31.s \n\t" \ - "fcadd z10.s, p5/m, z10.s, z19.s, 90 \n\t" \ - "movprfx z11.s, p5/m, z31.s \n\t" \ - "fcadd z11.s, p5/m, z11.s, z20.s, 90 \n\t" \ - "mov z0.s, p5/m, z18.s \n\t" \ - "mov z1.s, p5/m, z19.s \n\t" \ - "mov z2.s, p5/m, z20.s \n\t" \ - "mov z3.s, p5/m, z21.s \n\t" \ - "mov z4.s, p5/m, z22.s \n\t" \ - "mov z5.s, p5/m, z23.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// YM_PROJ -#define YM_PROJ_A64FXf \ -{ \ -asm ( \ - "fadd z12.s, p5/m, z12.s, z21.s \n\t" \ - "fadd z13.s, p5/m, z13.s, z22.s \n\t" \ - "fadd z14.s, p5/m, z14.s, z23.s \n\t" \ - "fsub z15.s, p5/m, z15.s, z18.s \n\t" \ - "fsub z16.s, p5/m, z16.s, z19.s \n\t" \ - "fsub z17.s, p5/m, z17.s, z20.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// ZM_PROJ -#define ZM_PROJ_A64FXf \ -{ \ -asm ( \ - "fcadd z12.s, p5/m, z12.s, z18.s, 270 \n\t" \ - "fcadd z13.s, p5/m, z13.s, z19.s, 270 \n\t" \ - "fcadd z14.s, p5/m, z14.s, z20.s, 270 \n\t" \ - "fcadd z15.s, p5/m, z15.s, z21.s, 90 \n\t" \ - "fcadd z16.s, p5/m, z16.s, z22.s, 90 \n\t" \ - "fcadd z17.s, p5/m, z17.s, z23.s, 90 \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// TM_PROJ -#define TM_PROJ_A64FXf \ -{ \ -asm ( \ - "fsub z12.s, p5/m, z12.s, z18.s \n\t" \ - "fsub z13.s, p5/m, z13.s, z19.s \n\t" \ - "fsub z14.s, p5/m, z14.s, z20.s \n\t" \ - "fsub z15.s, p5/m, z15.s, z21.s \n\t" \ - "fsub 
z16.s, p5/m, z16.s, z22.s \n\t" \ - "fsub z17.s, p5/m, z17.s, z23.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// XM_RECON_ACCUM -#define XM_RECON_ACCUM_A64FXf \ -asm ( \ - "fcadd z9.s, p5/m, z9.s, z18.s, 90 \n\t" \ - "fcadd z10.s, p5/m, z10.s, z19.s, 90 \n\t" \ - "fcadd z11.s, p5/m, z11.s, z20.s, 90 \n\t" \ - "fcadd z6.s, p5/m, z6.s, z21.s, 90 \n\t" \ - "fcadd z7.s, p5/m, z7.s, z22.s, 90 \n\t" \ - "fcadd z8.s, p5/m, z8.s, z23.s, 90 \n\t" \ - "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ - "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ - "fadd z2.s, p5/m, z2.s, z20.s \n\t" \ - "fadd z3.s, p5/m, z3.s, z21.s \n\t" \ - "fadd z4.s, p5/m, z4.s, z22.s \n\t" \ - "fadd z5.s, p5/m, z5.s, z23.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// YP_RECON_ACCUM -#define YP_RECON_ACCUM_A64FXf \ -asm ( \ - "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ - "fsub z9.s, p5/m, z9.s, z18.s \n\t" \ - "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ - "fsub z10.s, p5/m, z10.s, z19.s \n\t" \ - "fadd z2.s, p5/m, z2.s, z20.s \n\t" \ - "fsub z11.s, p5/m, z11.s, z20.s \n\t" \ - "fadd z3.s, p5/m, z3.s, z21.s \n\t" \ - "fadd z6.s, p5/m, z6.s, z21.s \n\t" \ - "fadd z4.s, p5/m, z4.s, z22.s \n\t" \ - "fadd z7.s, p5/m, z7.s, z22.s \n\t" \ - "fadd z5.s, p5/m, z5.s, z23.s \n\t" \ - "fadd z8.s, p5/m, z8.s, z23.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// YM_RECON_ACCUM -#define YM_RECON_ACCUM_A64FXf \ -asm ( \ - "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ - "fadd z9.s, p5/m, z9.s, z18.s \n\t" \ - "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ - "fadd z10.s, p5/m, z10.s, z19.s \n\t" \ - "fadd z2.s, p5/m, z2.s, z20.s \n\t" \ - "fadd z11.s, p5/m, z11.s, z20.s \n\t" \ - "fadd z3.s, p5/m, z3.s, z21.s \n\t" \ - "fsub z6.s, p5/m, z6.s, z21.s \n\t" \ - "fadd z4.s, p5/m, z4.s, z22.s \n\t" \ - "fsub z7.s, p5/m, z7.s, z22.s \n\t" \ - "fadd z5.s, p5/m, z5.s, z23.s \n\t" \ - "fsub z8.s, p5/m, z8.s, z23.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// ZP_RECON_ACCUM -#define ZP_RECON_ACCUM_A64FXf \ -asm ( \ - "fcadd z6.s, p5/m, z6.s, z18.s, 270 \n\t" \ - "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ - "fcadd z7.s, p5/m, z7.s, z19.s, 270 \n\t" \ - "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ - "fcadd z8.s, p5/m, z8.s, z20.s, 270 \n\t" \ - "fadd z2.s, p5/m, z2.s, z20.s \n\t" \ - "fcadd z9.s, p5/m, z9.s, z21.s, 90 \n\t" \ - "fadd z3.s, p5/m, z3.s, z21.s \n\t" \ - "fcadd z10.s, p5/m, z10.s, z22.s, 90 \n\t" \ - "fadd z4.s, p5/m, z4.s, z22.s \n\t" \ - "fcadd z11.s, p5/m, z11.s, z23.s, 90 \n\t" \ - "fadd z5.s, p5/m, z5.s, z23.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// ZM_RECON_ACCUM -#define ZM_RECON_ACCUM_A64FXf \ -asm ( \ - "fcadd z6.s, p5/m, z6.s, z18.s, 90 \n\t" \ - "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ - 
"fcadd z7.s, p5/m, z7.s, z19.s, 90 \n\t" \ - "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ - "fcadd z8.s, p5/m, z8.s, z20.s, 90 \n\t" \ - "fadd z2.s, p5/m, z2.s, z20.s \n\t" \ - "fcadd z9.s, p5/m, z9.s, z21.s, 270 \n\t" \ - "fadd z3.s, p5/m, z3.s, z21.s \n\t" \ - "fcadd z10.s, p5/m, z10.s, z22.s, 270 \n\t" \ - "fadd z4.s, p5/m, z4.s, z22.s \n\t" \ - "fcadd z11.s, p5/m, z11.s, z23.s, 270 \n\t" \ - "fadd z5.s, p5/m, z5.s, z23.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// TP_RECON_ACCUM -#define TP_RECON_ACCUM_A64FXf \ -asm ( \ - "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ - "fadd z6.s, p5/m, z6.s, z18.s \n\t" \ - "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ - "fadd z7.s, p5/m, z7.s, z19.s \n\t" \ - "fadd z2.s, p5/m, z2.s, z20.s \n\t" \ - "fadd z8.s, p5/m, z8.s, z20.s \n\t" \ - "fadd z3.s, p5/m, z3.s, z21.s \n\t" \ - "fadd z9.s, p5/m, z9.s, z21.s \n\t" \ - "fadd z4.s, p5/m, z4.s, z22.s \n\t" \ - "fadd z10.s, p5/m, z10.s, z22.s \n\t" \ - "fadd z5.s, p5/m, z5.s, z23.s \n\t" \ - "fadd z11.s, p5/m, z11.s, z23.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// TM_RECON_ACCUM -#define TM_RECON_ACCUM_A64FXf \ -asm ( \ - "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ - "fsub z6.s, p5/m, z6.s, z18.s \n\t" \ - "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ - "fsub z7.s, p5/m, z7.s, z19.s \n\t" \ - "fadd z2.s, p5/m, z2.s, z20.s \n\t" \ - "fsub z8.s, p5/m, z8.s, z20.s \n\t" \ - "fadd z3.s, p5/m, z3.s, z21.s \n\t" \ - "fsub z9.s, p5/m, z9.s, z21.s \n\t" \ - "fadd z4.s, p5/m, z4.s, z22.s \n\t" \ - "fsub z10.s, p5/m, z10.s, z22.s \n\t" \ - "fadd z5.s, p5/m, z5.s, z23.s \n\t" \ - "fsub z11.s, p5/m, z11.s, z23.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// ZERO_PSI -#define ZERO_PSI_A64FXf \ -asm ( \ - "fmov z0.s , 0 \n\t" \ - "fmov z1.s , 0 \n\t" \ - "fmov z2.s , 0 \n\t" \ - "fmov z3.s , 0 \n\t" \ - "fmov z4.s , 0 \n\t" \ - "fmov z5.s , 0 \n\t" \ - "fmov z6.s , 0 \n\t" \ - "fmov z7.s , 0 \n\t" \ - "fmov z8.s , 0 \n\t" \ - "fmov z9.s , 0 \n\t" \ - "fmov z10.s , 0 \n\t" \ - "fmov z11.s , 0 \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// PREFETCH_RESULT_L2_STORE (uses DC ZVA for cache line zeroing) -#define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXf(base) \ -{ \ -asm ( \ - "dc zva, %[fetchptr]\n\t" \ - "dc zva, %[fetchptr]\n\t" \ - "dc zva, %[fetchptr]\n\t" \ - : \ - : [fetchptr] "r" (base) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// PREFETCH_RESULT_L1_STORE (prefetch store to L1) -#define PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXf(base) \ -{ \ -asm ( \ - "prfd PSTL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ - "prfd PSTL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ - "prfd PSTL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (base) \ - : 
"p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// ADD_RESULT_INTERNAL -#define ADD_RESULT_INTERNAL_A64FXf \ -asm ( \ - "fadd z0.s, p5/m, z0.s, z12.s \n\t" \ - "fadd z1.s, p5/m, z1.s, z13.s \n\t" \ - "fadd z2.s, p5/m, z2.s, z14.s \n\t" \ - "fadd z3.s, p5/m, z3.s, z15.s \n\t" \ - "fadd z4.s, p5/m, z4.s, z16.s \n\t" \ - "fadd z5.s, p5/m, z5.s, z17.s \n\t" \ - "fadd z6.s, p5/m, z6.s, z18.s \n\t" \ - "fadd z7.s, p5/m, z7.s, z19.s \n\t" \ - "fadd z8.s, p5/m, z8.s, z20.s \n\t" \ - "fadd z9.s, p5/m, z9.s, z21.s \n\t" \ - "fadd z10.s, p5/m, z10.s, z22.s \n\t" \ - "fadd z11.s, p5/m, z11.s, z23.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - From 45d49d86487427ea1e0b34c0d530d475f8e3e31a Mon Sep 17 00:00:00 2001 From: Nils Meyer Date: Sat, 19 Dec 2020 03:35:18 +0100 Subject: [PATCH 06/16] clean up --- .../implementation/WilsonKernelsAsmBodyA64FX.h | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h index 83588a7d..4e463438 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h @@ -26,9 +26,9 @@ Author: Nils Meyer Regensburg University *************************************************************************************/ /* END LEGAL */ -// GCC 10 messes up SVE instruction scheduling using -O3 only, -// using -O3 -fno-schedule-insns -fno-schedule-insns2 does wonders -// performance is better than armclang 20.2 +// GCC 10 messes up SVE instruction scheduling using -O3, but +// -O3 -fno-schedule-insns -fno-schedule-insns2 does wonders +// performance now is better than armclang 20.2 #ifdef KERNEL_DAG #define DIR0_PROJ XP_PROJ @@ -118,10 +118,6 @@ Author: Nils Meyer Regensburg University /* NB: picking PREFETCH_GAUGE_L2(Dir+4); here results in performance penalty though I expected that it would improve on performance - - if (s == 0) { \ - if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \ - } \ */ #define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ @@ -149,7 +145,7 @@ NB: picking PREFETCH_GAUGE_L2(Dir+4); here results in performance penalty if ( local || st.same_node[Dir] ) { \ MULT_2SPIN_1(Dir); \ MULT_2SPIN_2; \ - RECON; \ + RECON; \ } \ base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \ PREFETCH_CHIMU(base); \ @@ -300,7 +296,7 @@ NB: picking PREFETCH_GAUGE_L2(Dir+4); here results in performance penalty // DC ZVA test // { uint64_t basestore = (uint64_t)&out[ss]; - // PREFETCH_RESULT_L2_STORE(basestore); } + // PREFETCH_RESULT_L2_STORE(basestore); } ASM_LEG(Ym,Zm,PERMUTE_DIR2,DIR5_PROJ,DIR5_RECON); @@ -336,8 +332,8 @@ NB: picking PREFETCH_GAUGE_L2(Dir+4); here results in performance penalty // DC ZVA test //{ uint64_t basestore = (uint64_t)&out[ss]; - // PREFETCH_RESULT_L2_STORE(basestore); - //} + // PREFETCH_RESULT_L2_STORE(basestore); } + ASM_LEG(Tm,Xp,PERMUTE_DIR0,DIR7_PROJ,DIR7_RECON); From a4afc3ea2aeb23a5a5a4dece03087e6344c9986b Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 14 Jan 2021 20:44:16 -0500 Subject: [PATCH 07/16] Red black coarse space --- 
tests/solver/Test_dwf_hdcr.cc | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/tests/solver/Test_dwf_hdcr.cc b/tests/solver/Test_dwf_hdcr.cc index 8e083231..f68e99ab 100644 --- a/tests/solver/Test_dwf_hdcr.cc +++ b/tests/solver/Test_dwf_hdcr.cc @@ -222,9 +222,16 @@ int main (int argc, char ** argv) GridCartesian *Coarse4d = SpaceTimeGrid::makeFourDimGrid(clatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());; GridCartesian *Coarse5d = SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d); - GridCartesian *CoarseCoarse4d = SpaceTimeGrid::makeFourDimGrid(cclatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());; + + + GridCartesian *CoarseCoarse4d = SpaceTimeGrid::makeFourDimGrid(cclatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); GridCartesian *CoarseCoarse5d = SpaceTimeGrid::makeFiveDimGrid(1,CoarseCoarse4d); + GridRedBlackCartesian * Coarse4dRB = SpaceTimeGrid::makeFourDimRedBlackGrid(Coarse4d); + GridRedBlackCartesian * Coarse5dRB = SpaceTimeGrid::makeFiveDimRedBlackGrid(1,Coarse4d); + GridRedBlackCartesian *CoarseCoarse4dRB = SpaceTimeGrid::makeFourDimRedBlackGrid(CoarseCoarse4d); + GridRedBlackCartesian *CoarseCoarse5dRB = SpaceTimeGrid::makeFiveDimRedBlackGrid(1,CoarseCoarse4d); + std::vector seeds4({1,2,3,4}); std::vector seeds5({5,6,7,8}); std::vector cseeds({5,6,7,8}); @@ -282,8 +289,7 @@ int main (int argc, char ** argv) Gamma5R5HermitianLinearOperator HermIndefOp(Ddwf); - Level1Op LDOp(*Coarse5d,1); LDOp.CoarsenOperator(FGrid,HermIndefOp,Aggregates); - + Level1Op LDOp(*Coarse5d,*Coarse5dRB,1); LDOp.CoarsenOperator(FGrid,HermIndefOp,Aggregates); ////////////////////////////////////////////////// // Deflate the course space. Recursive multigrid? @@ -311,12 +317,11 @@ int main (int argc, char ** argv) } } - Level2Op L2Op(*CoarseCoarse5d,1); // Hermitian matrix + Level2Op L2Op(*CoarseCoarse5d,*CoarseCoarse5dRB,1); // Hermitian matrix typedef Level2Op::CoarseVector CoarseCoarseVector; HermitianLinearOperator L1LinOp(LDOp); L2Op.CoarsenOperator(Coarse5d,L1LinOp,CoarseAggregates); - std::cout< Date: Thu, 14 Jan 2021 20:46:21 -0500 Subject: [PATCH 08/16] Coarsened vector test --- Grid/qcd/QCD.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Grid/qcd/QCD.h b/Grid/qcd/QCD.h index 76d7def4..858aead7 100644 --- a/Grid/qcd/QCD.h +++ b/Grid/qcd/QCD.h @@ -80,6 +80,13 @@ template struct isSpinor { template using IfSpinor = Invoke::value,int> > ; template using IfNotSpinor = Invoke::value,int> > ; +const int CoarseIndex = 4; +template struct isCoarsened { + static constexpr bool value = (CoarseIndex<=T::TensorLevel); +}; +template using IfCoarsened = Invoke::value,int> > ; +template using IfNotCoarsened = Invoke::value,int> > ; + // ChrisK very keen to add extra space for Gparity doubling. 
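
[Editor's note] The isCoarsened/IfCoarsened trait added above follows the
existing isSpinor/IfSpinor pattern: it keys on the tensor nesting depth
(CoarseIndex <= T::TensorLevel), so overloads can be enabled for coarsened
vectors via SFINAE. Patch 09 below uses it to give the gamma_5 projectors
coarse-space overloads; reconstructed with the template brackets that the
plain-text rendering has eaten, their shape is:

    template<class vtype, int N, IfCoarsened<iVector<vtype,N> > = 0>
    accelerator_inline void spProj5p (iVector<vtype,N> &rfspin, const iVector<vtype,N> &fspin)
    {
      const int hN = N>>1;          // P_+ keeps the first half of the
      for(int s=0;s<hN;s++){        // coarse components and zeroes the rest
        rfspin(s)    = fspin(s);
        rfspin(s+hN) = Zero();
      }
    }
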
// // Also add domain wall index, in a way where Wilson operator From eaff0f3aeb05635d49e17cb6e271621040f5b7f1 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 14 Jan 2021 20:46:58 -0500 Subject: [PATCH 09/16] Gamma5 on coaree spaces --- Grid/qcd/spin/TwoSpinor.h | 179 ++++++++------------------------------ 1 file changed, 35 insertions(+), 144 deletions(-) diff --git a/Grid/qcd/spin/TwoSpinor.h b/Grid/qcd/spin/TwoSpinor.h index 924594ab..8dad0cd0 100644 --- a/Grid/qcd/spin/TwoSpinor.h +++ b/Grid/qcd/spin/TwoSpinor.h @@ -128,7 +128,6 @@ template > = 0> accelerator_inline void s } template > = 0> accelerator_inline void spProjTm (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; hspin(0)=fspin(0)-fspin(2); hspin(1)=fspin(1)-fspin(3); } @@ -138,40 +137,50 @@ template > = 0> accelerator_inline void s * 0 0 -1 0 * 0 0 0 -1 */ - template > = 0> accelerator_inline void spProj5p (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; hspin(0)=fspin(0); hspin(1)=fspin(1); } template > = 0> accelerator_inline void spProj5m (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; hspin(0)=fspin(2); hspin(1)=fspin(3); } -// template accelerator_inline void fspProj5p (iVector &rfspin,const iVector &fspin) template > = 0> accelerator_inline void spProj5p (iVector &rfspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; rfspin(0)=fspin(0); rfspin(1)=fspin(1); rfspin(2)=Zero(); rfspin(3)=Zero(); } -// template accelerator_inline void fspProj5m (iVector &rfspin,const iVector &fspin) template > = 0> accelerator_inline void spProj5m (iVector &rfspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; rfspin(0)=Zero(); rfspin(1)=Zero(); rfspin(2)=fspin(2); rfspin(3)=fspin(3); } +template > = 0> accelerator_inline void spProj5p (iVector &rfspin,const iVector &fspin) +{ + const int hN = N>>1; + for(int s=0;s > = 0> accelerator_inline void spProj5m (iVector &rfspin,const iVector &fspin) +{ + const int hN = N>>1; + for(int s=0;s > = 0> accelerator_inline void s */ template > = 0> accelerator_inline void spReconXp (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)=hspin(0); fspin(1)=hspin(1); fspin(2)=timesMinusI(hspin(1)); @@ -191,7 +199,6 @@ template > = 0> accelerator_inline void s } template > = 0> accelerator_inline void spReconXm (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)=hspin(0); fspin(1)=hspin(1); fspin(2)=timesI(hspin(1)); @@ -199,7 +206,6 @@ template > = 0> accelerator_inline void s } template > = 0> accelerator_inline void accumReconXp (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)+=hspin(0); fspin(1)+=hspin(1); fspin(2)-=timesI(hspin(1)); @@ -207,7 +213,6 @@ template > = 0> accelerator_inline void a } template > = 0> accelerator_inline void accumReconXm (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)+=hspin(0); fspin(1)+=hspin(1); fspin(2)+=timesI(hspin(1)); @@ -221,7 +226,6 @@ template > = 0> accelerator_inline void a template > = 0> accelerator_inline void spReconYp (iVector &fspin,const iVector &hspin) { - //typename 
std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)=hspin(0); fspin(1)=hspin(1); fspin(2)= hspin(1); @@ -229,7 +233,6 @@ template > = 0> accelerator_inline void s } template > = 0> accelerator_inline void spReconYm (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)=hspin(0); fspin(1)=hspin(1); fspin(2)=-hspin(1); @@ -237,7 +240,6 @@ template > = 0> accelerator_inline void s } template > = 0> accelerator_inline void accumReconYp (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)+=hspin(0); fspin(1)+=hspin(1); fspin(2)+=hspin(1); @@ -245,7 +247,6 @@ template > = 0> accelerator_inline void a } template > = 0> accelerator_inline void accumReconYm (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)+=hspin(0); fspin(1)+=hspin(1); fspin(2)-=hspin(1); @@ -260,7 +261,6 @@ template > = 0> accelerator_inline void a */ template > = 0> accelerator_inline void spReconZp (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)=hspin(0); fspin(1)=hspin(1); fspin(2)=timesMinusI(hspin(0)); @@ -268,7 +268,6 @@ template > = 0> accelerator_inline void s } template > = 0> accelerator_inline void spReconZm (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)=hspin(0); fspin(1)=hspin(1); fspin(2)= timesI(hspin(0)); @@ -276,7 +275,6 @@ template > = 0> accelerator_inline void s } template > = 0> accelerator_inline void accumReconZp (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)+=hspin(0); fspin(1)+=hspin(1); fspin(2)-=timesI(hspin(0)); @@ -284,7 +282,6 @@ template > = 0> accelerator_inline void a } template > = 0> accelerator_inline void accumReconZm (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)+=hspin(0); fspin(1)+=hspin(1); fspin(2)+=timesI(hspin(0)); @@ -298,7 +295,6 @@ template > = 0> accelerator_inline void a */ template > = 0> accelerator_inline void spReconTp (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)=hspin(0); fspin(1)=hspin(1); fspin(2)=hspin(0); @@ -306,7 +302,6 @@ template > = 0> accelerator_inline void s } template > = 0> accelerator_inline void spReconTm (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)=hspin(0); fspin(1)=hspin(1); fspin(2)=-hspin(0); @@ -314,7 +309,6 @@ template > = 0> accelerator_inline void s } template > = 0> accelerator_inline void accumReconTp (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)+=hspin(0); fspin(1)+=hspin(1); fspin(2)+=hspin(0); @@ -322,7 +316,6 @@ template > = 0> accelerator_inline void a } template > = 0> accelerator_inline void accumReconTm (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)+=hspin(0); fspin(1)+=hspin(1); fspin(2)-=hspin(0); @@ -336,7 +329,6 @@ template > = 0> accelerator_inline void a */ template > = 0> accelerator_inline void spRecon5p (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; 
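// Context for the reconstruction just below: with gamma5 = diag(+1,+1,-1,-1),
// projection P+ = (1+gamma5)/2 keeps the upper two spin components, and
// reconstruction applies (1+gamma5) = 2 P+, hence fspin = hspin + hspin (the
// doubling written as an add rather than a multiply by 2, per the comment).
// The coarse-space overloads added in this patch generalise the same split by
// treating the first N/2 components of an iVector as the "+" half and the
// last N/2 as the "-" half. A standalone sketch of that index logic on a
// plain array (toy signature; Grid's real overloads are gated by IfCoarsened
// and zero entries with Zero()):

#include <array>

template<typename T, int N>
void coarseSpProj5p(std::array<T,N> &r, const std::array<T,N> &f) {
  static_assert(N % 2 == 0, "coarse basis splits into +/- halves");
  const int hN = N >> 1;
  for (int s = 0; s < hN; s++) {
    r[s]      = f[s];  // keep the "+" half
    r[s + hN] = T();   // zero the "-" half
  }
}

template<typename T, int N>
void coarseSpProj5m(std::array<T,N> &r, const std::array<T,N> &f) {
  const int hN = N >> 1;
  for (int s = 0; s < hN; s++) {
    r[s]      = T();        // zero the "+" half
    r[s + hN] = f[s + hN];  // keep the "-" half
  }
}

// (Patch 10 in this series then builds gamma5 on coarse fields exactly as
// spProj5p(x) - spProj5m(x); see the G5C rewrite in LinalgUtils.h.)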
fspin(0)=hspin(0)+hspin(0); // add is lower latency than mul fspin(1)=hspin(1)+hspin(1); // probably no measurable diffence though fspin(2)=Zero(); @@ -344,7 +336,6 @@ template > = 0> accelerator_inline void s } template > = 0> accelerator_inline void spRecon5m (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)=Zero(); fspin(1)=Zero(); fspin(2)=hspin(0)+hspin(0); @@ -352,7 +343,6 @@ template > = 0> accelerator_inline void s } template > = 0> accelerator_inline void accumRecon5p (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)+=hspin(0)+hspin(0); fspin(1)+=hspin(1)+hspin(1); } @@ -372,7 +362,6 @@ template > = 0> accelerator_inline void a ////////// template > = 0> accelerator_inline void spProjXp (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void accumReconXp (iM }} } - - //////// // Xm //////// template accelerator_inline void spProjXm (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spProjXm(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spProjXm (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void spProjXm (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spProjXm (iMatri template accelerator_inline void spReconXm (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spReconXm(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spReconXm (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void spReconXm (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spReconXm (iMatr template accelerator_inline void accumReconXm (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; accumReconXm(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void accumReconXm (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void accumReconXm (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spProjYp (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spProjYp(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spProjYp (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void spProjYp (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spProjYp (iMatri template accelerator_inline void spReconYp (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spReconYp(hspin._internal,fspin._internal); } template > 
= 0> accelerator_inline void spReconYp (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void spReconYp (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spReconYp (iMatr template accelerator_inline void accumReconYp (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; accumReconYp(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void accumReconYp (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void accumReconYp (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spProjYm (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spProjYm(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spProjYm (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void spProjYm (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spReconYm (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spReconYm(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spReconYm (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,const iVector >::type *temp; for(int i=0;i accelerator_inline void spReconYm (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spReconYm (iMatr template accelerator_inline void accumReconYm (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; accumReconYm(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void accumReconYm (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void accumReconYm (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void accumReconYm (iM //////// template accelerator_inline void spProjZp (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spProjZp(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spProjZp (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void spProjZp (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spReconZp (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spReconZp(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spReconZp (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i 
accelerator_inline void spReconZp (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void accumReconZp (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; accumReconZp(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void accumReconZp (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void accumReconZp (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void accumReconZp (iM //////// template accelerator_inline void spProjZm (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spProjZm(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spProjZm (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void spProjZm (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spReconZm (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spReconZm(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spReconZm (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void spReconZm (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void accumReconZm (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; accumReconZm(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void accumReconZm (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void accumReconZm (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void accumReconZm (iM //////// template accelerator_inline void spProjTp (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spProjTp(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spProjTp (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void spProjTp (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spReconTp (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spReconTp(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spReconTp (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void spReconTp (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spReconTp (iMatr template 
accelerator_inline void accumReconTp (iScalar &hspin, iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; accumReconTp(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void accumReconTp (iVector &hspin, const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void accumReconTp (iMatrix &hspin, const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spProjTm (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spProjTm(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spProjTm (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void spProjTm (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spProjTm (iMatri template accelerator_inline void spReconTm (iScalar &hspin, const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spReconTm(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spReconTm (iVector &hspin, const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void spReconTm (iMatrix &hspin, const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spReconTm (iMatr template accelerator_inline void accumReconTm (iScalar &hspin, const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; accumReconTm(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void accumReconTm (iVector &hspin, const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void accumReconTm (iMatrix &hspin, const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spProj5p (iScalar &hspin,const iScalar &fspin) +template > = 0> accelerator_inline void spProj5p (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spProj5p(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spProj5p (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void spProj5p (iMatrix &hspin,const iMatrix &fspin) +template > = 0> accelerator_inline void spProj5p (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spProj5p (iMatri template accelerator_inline void spRecon5p (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spRecon5p(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spRecon5p (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void spRecon5p (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i 
accelerator_inline void spRecon5p (iMatr template accelerator_inline void accumRecon5p (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; accumRecon5p(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void accumRecon5p (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void accumRecon5p (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void accumRecon5p (iM } // four spinor projectors for chiral proj -// template accelerator_inline void fspProj5p (iScalar &hspin,const iScalar &fspin) -template accelerator_inline void spProj5p (iScalar &hspin,const iScalar &fspin) +template > = 0> accelerator_inline void spProj5p (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spProj5p(hspin._internal,fspin._internal); } -// template accelerator_inline void fspProj5p (iVector &hspin,iVector &fspin) -template > = 0> accelerator_inline void spProj5p (iVector &hspin,const iVector &fspin) +template > = 0,IfNotCoarsened > = 0> accelerator_inline void spProj5p (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void fspProj5p (iMatrix &hspin,iMatrix &fspin) -template accelerator_inline void spProj5p (iMatrix &hspin,const iMatrix &fspin) +template > = 0> accelerator_inline void spProj5p (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spProj5p (iMatrix & // 5m //////// -template accelerator_inline void spProj5m (iScalar &hspin,const iScalar &fspin) +template > = 0> accelerator_inline void spProj5m (iScalar &hspin,const iScalar &fspin) { spProj5m(hspin._internal,fspin._internal); } -template > = 0> accelerator_inline void spProj5m (iVector &hspin,const iVector &fspin) +template > = 0,IfNotCoarsened > = 0> accelerator_inline void spProj5m (iVector &hspin,const iVector &fspin) { for(int i=0;i accelerator_inline void spProj5m (iMatrix &hspin,const iMatrix &fspin) +template > = 0> accelerator_inline void spProj5m (iMatrix &hspin,const iMatrix &fspin) { for(int i=0;i accelerator_inline void spProj5m (iMatri template accelerator_inline void spRecon5m (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spRecon5m(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spRecon5m (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void spRecon5m (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void accumRecon5m (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; accumRecon5m(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void accumRecon5m (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void accumRecon5m (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i 
accelerator_inline void accumRecon5m (iM // four spinor projectors for chiral proj -// template accelerator_inline void fspProj5m (iScalar &hspin,const iScalar &fspin) -template accelerator_inline void spProj5m (iScalar &hspin,const iScalar &fspin) +template > = 0> accelerator_inline void spProj5m (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spProj5m(hspin._internal,fspin._internal); } -// template accelerator_inline void fspProj5m (iVector &hspin,iVector &fspin) -template > = 0> accelerator_inline void spProj5m (iVector &hspin,const iVector &fspin) +template > = 0,IfNotCoarsened > = 0> accelerator_inline void spProj5m (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void fspProj5m (iMatrix &hspin,iMatrix &fspin) -template accelerator_inline void spProj5m (iMatrix &hspin,const iMatrix &fspin) +template > = 0> accelerator_inline void spProj5m (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i Date: Thu, 14 Jan 2021 20:47:28 -0500 Subject: [PATCH 10/16] G5 on coarse spaces --- Grid/qcd/utils/LinalgUtils.h | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/Grid/qcd/utils/LinalgUtils.h b/Grid/qcd/utils/LinalgUtils.h index 1e016e4e..964b83d5 100644 --- a/Grid/qcd/utils/LinalgUtils.h +++ b/Grid/qcd/utils/LinalgUtils.h @@ -154,8 +154,8 @@ void axpby_ssp_pminus(Lattice &z,Coeff a,const Lattice &x,Coeff b,co accelerator_for(sss,nloop,vobj::Nsimd(),{ uint64_t ss = sss*Ls; decltype(coalescedRead(y_v[ss+sp])) tmp; - spProj5m(tmp,y_v(ss+sp)); - tmp = a*x_v(ss+s)+b*tmp; + spProj5m(tmp,y_v(ss+sp)); + tmp = a*x_v(ss+s)+b*tmp; coalescedWrite(z_v[ss+s],tmp); }); } @@ -188,7 +188,6 @@ void G5R5(Lattice &z,const Lattice &x) z.Checkerboard() = x.Checkerboard(); conformable(x,z); int Ls = grid->_rdimensions[0]; - Gamma G5(Gamma::Algebra::Gamma5); autoView( x_v, x, AcceleratorRead); autoView( z_v, z, AcceleratorWrite); uint64_t nloop = grid->oSites()/Ls; @@ -196,7 +195,13 @@ void G5R5(Lattice &z,const Lattice &x) uint64_t ss = sss*Ls; for(int s=0;s &z, const Lattice &x) z.Checkerboard() = x.Checkerboard(); conformable(x, z); - Gamma G5(Gamma::Algebra::Gamma5); - z = G5 * x; + autoView( x_v, x, AcceleratorRead); + autoView( z_v, z, AcceleratorWrite); + uint64_t nloop = grid->oSites(); + accelerator_for(ss,nloop,vobj::Nsimd(),{ + auto tmp = x_v(ss); + decltype(tmp) tmp_p; + decltype(tmp) tmp_m; + spProj5p(tmp_p,tmp); + spProj5m(tmp_m,tmp); + coalescedWrite(z_v[ss],tmp_p - tmp_m); + }); } +/* template void G5C(Lattice> &z, const Lattice> &x) { @@ -234,6 +249,7 @@ void G5C(Lattice> &z, const Lattice Date: Thu, 14 Jan 2021 20:48:08 -0500 Subject: [PATCH 11/16] Red black support on coars --- Grid/algorithms/CoarsenedMatrix.h | 60 +++++++++++++++++++------------ 1 file changed, 37 insertions(+), 23 deletions(-) diff --git a/Grid/algorithms/CoarsenedMatrix.h b/Grid/algorithms/CoarsenedMatrix.h index 66b9c169..b9594678 100644 --- a/Grid/algorithms/CoarsenedMatrix.h +++ b/Grid/algorithms/CoarsenedMatrix.h @@ -775,7 +775,26 @@ public: for(int p=0;p FineComplexField; typedef typename Fobj::scalar_type scalar_type; + std::cout << GridLogMessage<< "CoarsenMatrix "<< std::endl; + FineComplexField one(FineGrid); one=scalar_type(1.0,0.0); FineComplexField zero(FineGrid); zero=scalar_type(0.0,0.0); @@ -847,11 +868,13 @@ public: CoarseScalar 
InnerProd(Grid()); + std::cout << GridLogMessage<< "CoarsenMatrix Orthog "<< std::endl; // Orthogonalise the subblocks over the basis blockOrthogonalise(InnerProd,Subspace.subspace); // Compute the matrix elements of linop between this orthonormal // set of vectors. + std::cout << GridLogMessage<< "CoarsenMatrix masks "<< std::endl; int self_stencil=-1; for(int p=0;poSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); }); + if ( hermitian && (disp==-1) ) { + for(int pp=0;pp = * + int dirp = geom.directions[pp]; + int dispp = geom.displacements[pp]; + if ( (dirp==dir) && (dispp==1) ){ + auto sft = conjugate(Cshift(oZProj,dir,1)); + autoView( sft_v , sft , AcceleratorWrite); + autoView( A_pp , A[pp], AcceleratorWrite); + accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_pp[ss](i,j),sft_v(ss)); }); + } + } + } } } @@ -957,33 +992,12 @@ public: } if(hermitian) { std::cout << GridLogMessage << " ForceHermitian, new code "<lSites(); From 579595f547bd36775ba42ecd07f9a881e4f12e85 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 14 Jan 2021 20:48:35 -0500 Subject: [PATCH 12/16] Red black on coarse space --- tests/solver/Test_dwf_hdcr_2level.cc | 8 ++++++-- tests/solver/Test_dwf_multigrid.cc | 9 +++++++-- tests/solver/Test_hw_multigrid.cc | 4 +++- 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/tests/solver/Test_dwf_hdcr_2level.cc b/tests/solver/Test_dwf_hdcr_2level.cc index df24c9d2..4fa1e302 100644 --- a/tests/solver/Test_dwf_hdcr_2level.cc +++ b/tests/solver/Test_dwf_hdcr_2level.cc @@ -262,6 +262,8 @@ int main (int argc, char ** argv) GridCartesian *Coarse4d = SpaceTimeGrid::makeFourDimGrid(clatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());; GridCartesian *Coarse5d = SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d); + GridRedBlackCartesian * Coarse4dRB = SpaceTimeGrid::makeFourDimRedBlackGrid(Coarse4d); + GridRedBlackCartesian * Coarse5dRB = SpaceTimeGrid::makeFiveDimRedBlackGrid(1,Coarse4d); std::vector seeds4({1,2,3,4}); std::vector seeds5({5,6,7,8}); @@ -328,7 +330,7 @@ int main (int argc, char ** argv) Gamma5R5HermitianLinearOperator HermIndefOp(Ddwf); - Level1Op LDOp(*Coarse5d,1); LDOp.CoarsenOperator(FGrid,HermIndefOp,Aggregates); + Level1Op LDOp(*Coarse5d,*Coarse5dRB,1); LDOp.CoarsenOperator(FGrid,HermIndefOp,Aggregates); std::cout< CoarseCG(0.01,1000); - ConjugateGradient CoarseCG(0.02,1000);// 14.7s + ConjugateGradient CoarseCG(0.01,2000);// 14.7s + eval.resize(0); + evec.resize(0,Coarse5d); DeflatedGuesser DeflCoarseGuesser(evec,eval); NormalEquations DeflCoarseCGNE(LDOp,CoarseCG,DeflCoarseGuesser); diff --git a/tests/solver/Test_dwf_multigrid.cc b/tests/solver/Test_dwf_multigrid.cc index 9e11c160..351e10fd 100644 --- a/tests/solver/Test_dwf_multigrid.cc +++ b/tests/solver/Test_dwf_multigrid.cc @@ -370,6 +370,11 @@ int main (int argc, char ** argv) GridCartesian *CoarseCoarse4d = SpaceTimeGrid::makeFourDimGrid(cclatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());; GridCartesian *CoarseCoarse5d = SpaceTimeGrid::makeFiveDimGrid(1,CoarseCoarse4d); + GridRedBlackCartesian * Coarse4dRB = SpaceTimeGrid::makeFourDimRedBlackGrid(Coarse4d); + GridRedBlackCartesian * Coarse5dRB = SpaceTimeGrid::makeFiveDimRedBlackGrid(1,Coarse4d); + GridRedBlackCartesian *CoarseCoarse4dRB = SpaceTimeGrid::makeFourDimRedBlackGrid(CoarseCoarse4d); + GridRedBlackCartesian *CoarseCoarse5dRB = SpaceTimeGrid::makeFiveDimRedBlackGrid(1,CoarseCoarse4d); + std::vector seeds4({1,2,3,4}); std::vector seeds5({5,6,7,8}); std::vector cseeds({5,6,7,8}); 
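// The recurring pattern across these test updates: every coarse level now
// carries a red-black (checkerboarded) companion grid next to the full grid,
// built from the same four-dimensional base. Collected from the hunks above
// (clatt is the coarse lattice size these tests already compute):

GridCartesian         *Coarse4d   = SpaceTimeGrid::makeFourDimGrid(clatt,
                                      GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
GridCartesian         *Coarse5d   = SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d);
GridRedBlackCartesian *Coarse4dRB = SpaceTimeGrid::makeFourDimRedBlackGrid(Coarse4d);
GridRedBlackCartesian *Coarse5dRB = SpaceTimeGrid::makeFiveDimRedBlackGrid(1,Coarse4d);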
@@ -434,8 +439,8 @@ int main (int argc, char ** argv) std::cout< seeds({1,2,3,4}); GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds); @@ -335,7 +337,7 @@ int main (int argc, char ** argv) NonHermitianLinearOperator LinOpDwf(Ddwf); - Level1Op LDOp (*Coarse5d,0); + Level1Op LDOp (*Coarse5d,*Coarse5dRB,0); std::cout< Date: Thu, 14 Jan 2021 20:49:13 -0500 Subject: [PATCH 13/16] Red black coarse space --- tests/solver/Test_dwf_hdcr_16_rb.cc | 397 +++++ tests/solver/Test_dwf_hdcr_24_regression.cc | 477 ++++++ tests/solver/Test_dwf_hdcr_48_rb.cc | 397 +++++ tests/solver/Test_dwf_hdcr_48_regression.cc | 473 ++++++ tests/solver/Test_hw_multigrid_mixed_48.cc | 1287 ++++++++++++++++ tests/solver/Test_hw_multigrid_mixed_48_rb.cc | 1326 +++++++++++++++++ 6 files changed, 4357 insertions(+) create mode 100644 tests/solver/Test_dwf_hdcr_16_rb.cc create mode 100644 tests/solver/Test_dwf_hdcr_24_regression.cc create mode 100644 tests/solver/Test_dwf_hdcr_48_rb.cc create mode 100644 tests/solver/Test_dwf_hdcr_48_regression.cc create mode 100644 tests/solver/Test_hw_multigrid_mixed_48.cc create mode 100644 tests/solver/Test_hw_multigrid_mixed_48_rb.cc diff --git a/tests/solver/Test_dwf_hdcr_16_rb.cc b/tests/solver/Test_dwf_hdcr_16_rb.cc new file mode 100644 index 00000000..b7900b04 --- /dev/null +++ b/tests/solver/Test_dwf_hdcr_16_rb.cc @@ -0,0 +1,397 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_dwf_hdcr.cc + + Copyright (C) 2015 + +Author: Antonin Portelli +Author: Peter Boyle +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
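// With the red-black grids in hand, the coarse operators are constructed from
// the (full grid, red-black grid, hermitian flag) triple, as the hunks above
// show; the trailing 1 (0 in the non-hermitian Test_hw_multigrid case) marks
// the operator hermitian. When the flag is set, CoarsenOperator also fills
// each displacement -1 stencil element from the conjugate shift of its +1
// partner instead of projecting it independently (see the CoarsenedMatrix.h
// hunk in patch 11). Call pattern assembled from the tests:

Level1Op LDOp(*Coarse5d,*Coarse5dRB,1);
LDOp.CoarsenOperator(FGrid,HermIndefOp,Aggregates);

Level2Op L2Op(*CoarseCoarse5d,*CoarseCoarse5dRB,1);  // Hermitian matrix
L2Op.CoarsenOperator(Coarse5d,L1LinOp,CoarseAggregates);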
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include + +using namespace std; +using namespace Grid; +/* Params + * Grid: + * block1(4) + * block2(4) + * + * Subspace + * * Fine : Subspace(nbasis,hi,lo,order,first,step) -- 32, 60,0.02,500,100,100 + * * Coarse: Subspace(nbasis,hi,lo,order,first,step) -- 32, 18,0.02,500,100,100 + + * Smoother: + * * Fine: Cheby(hi, lo, order) -- 60,0.5,10 + * * Coarse: Cheby(hi, lo, order) -- 12,0.1,4 + + * Lanczos: + * CoarseCoarse IRL( Nk, Nm, Nstop, poly(lo,hi,order)) 24,36,24,0.002,4.0,61 + */ +RealD InverseApproximation(RealD x){ + return 1.0/x; +} + +template class SolverWrapper : public LinearFunction { +private: + CheckerBoardedSparseMatrixBase & _Matrix; + SchurRedBlackBase & _Solver; +public: + + ///////////////////////////////////////////////////// + // Wrap the usual normal equations trick + ///////////////////////////////////////////////////// + SolverWrapper(CheckerBoardedSparseMatrixBase &Matrix, + SchurRedBlackBase &Solver) + : _Matrix(Matrix), _Solver(Solver) {}; + + void operator() (const Field &in, Field &out){ + + _Solver(_Matrix,in,out); // Mdag M out = Mdag in + + } +}; + +template class ChebyshevSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & _SmootherMatrix; + FineOperator & _SmootherOperator; + + Chebyshev Cheby; + + ChebyshevSmoother(RealD _lo,RealD _hi,int _ord, FineOperator &SmootherOperator,Matrix &SmootherMatrix) : + _SmootherOperator(SmootherOperator), + _SmootherMatrix(SmootherMatrix), + Cheby(_lo,_hi,_ord,InverseApproximation) + {}; + + void operator() (const Field &in, Field &out) + { + Field tmp(in.Grid()); + MdagMLinearOperator MdagMOp(_SmootherMatrix); + _SmootherOperator.AdjOp(in,tmp); + Cheby(MdagMOp,tmp,out); + } +}; +template class MirsSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & SmootherMatrix; + FineOperator & SmootherOperator; + RealD tol; + RealD shift; + int maxit; + + MirsSmoother(RealD _shift,RealD _tol,int _maxit,FineOperator &_SmootherOperator,Matrix &_SmootherMatrix) : + shift(_shift),tol(_tol),maxit(_maxit), + SmootherOperator(_SmootherOperator), + SmootherMatrix(_SmootherMatrix) + {}; + + void operator() (const Field &in, Field &out) + { + ZeroGuesser Guess; + ConjugateGradient CG(tol,maxit,false); + + Field src(in.Grid()); + + ShiftedMdagMLinearOperator,Field> MdagMOp(SmootherMatrix,shift); + SmootherOperator.AdjOp(in,src); + Guess(src,out); + CG(MdagMOp,src,out); + } +}; + +template +class MultiGridPreconditioner : public LinearFunction< Lattice > { +public: + + typedef Aggregation Aggregates; + typedef CoarsenedMatrix CoarseOperator; + typedef typename Aggregation::CoarseVector CoarseVector; + typedef typename Aggregation::CoarseMatrix CoarseMatrix; + typedef typename Aggregation::FineField FineField; + typedef LinearOperatorBase FineOperator; + typedef LinearFunction FineSmoother; + + Aggregates & _Aggregates; + CoarseOperator & _CoarseOperator; + Matrix & _FineMatrix; + FineOperator & _FineOperator; + Guesser & _Guess; + FineSmoother & _Smoother; + CoarseSolver & _CoarseSolve; + + int level; void Level(int lv) {level = lv; }; + +#define GridLogLevel std::cout << GridLogMessage < block ({2,2,2,2}); + std::vector blockc ({2,2,2,2}); + const int nbasis= 32; + const int nbasisc= 32; + auto clatt = GridDefaultLatt(); + for(int d=0;d 
seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + std::vector cseeds({5,6,7,8}); + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + GridParallelRNG CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds); + LatticeFermion src(FGrid); gaussian(RNG5,src);// src=src+g5*src; + LatticeFermion result(FGrid); + LatticeGaugeField Umu(UGrid); + + FieldMetaData header; + std::string file("./ckpoint_lat.4000"); + //std::string file("./ckpoint_lat.1000"); + NerscIO::readConfiguration(Umu,header,file); + + std::cout< Subspace; + typedef CoarsenedMatrix CoarseOperator; + typedef CoarseOperator::CoarseVector CoarseVector; + typedef CoarseOperator::siteVector siteVector; + std::cout< HermDefOp(Ddwf); + + Subspace Aggregates(Coarse5d,FGrid,0); + + assert ( (nbasis & 0x1)==0); + { + int nb=nbasis/2; + LatticeFermion A(FGrid); + LatticeFermion B(FGrid); + // Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.002,1000,800,100,0.0); + // Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.02,1000,800,100,0.0); + Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.05,500,200,150,0.0);// + // Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.01,1000,100,100,0.0); // Slightly faster + + for(int n=0;n Level1Op; + typedef CoarsenedMatrix,nbasisc> Level2Op; + + Gamma5R5HermitianLinearOperator HermIndefOp(Ddwf); + + + GridRedBlackCartesian * Coarse4dRB = SpaceTimeGrid::makeFourDimRedBlackGrid(Coarse4d); + std::cout << " Making 5D coarse RB grid " <,nbasisc> CoarseSubspace; + // CoarseSubspace CoarseAggregates(CoarseCoarse5d,Coarse5d,0); + + std::cout< PosdefLdop(LDOp); + typedef Level2Op::CoarseVector CoarseCoarseVector; + CoarseVector c_src(Coarse5d); c_src=1.0; + + std::cout< , SolverWrapper > TwoLevelMG; + typedef MultiGridPreconditioner,nbasisc,Level1Op, DeflatedGuesser, NormalEquations > CoarseMG; + typedef MultiGridPreconditioner, LinearFunction > ThreeLevelMG; + + ChebyshevSmoother FineSmoother(0.5,60.0,12,HermIndefOp,Ddwf); + std::cout< CoarseZeroGuesser; + ConjugateGradient CoarseCG(0.005,1000); + // SchurDiagMooeeOperator CoarseMpcDagMpc(LDOp); + SchurRedBlackDiagMooeeSolve CoarseRBCG(CoarseCG); + SolverWrapper CoarseSolver(LDOp,CoarseRBCG); + + // NormalEquations CoarseCGNE(LDOp,CoarseCG,CoarseZeroGuesser); + TwoLevelMG TwoLevelPrecon(Aggregates, LDOp, + HermIndefOp,Ddwf, + FineSmoother, + CoarseZeroGuesser, + CoarseSolver); + TwoLevelPrecon.Level(1); + PrecGeneralisedConjugateResidual l1PGCR(1.0e-8,20,HermIndefOp,TwoLevelPrecon,16,16); + l1PGCR.Level(1); + l1PGCR(src,result); + + std::cout< pCG(1.0e-8,60000); + result=Zero(); + // pCG(HermDefOp,src,result); + + std::cout< HermOpEO(Ddwf); + // pCG(HermOpEO,src_o,result_o); + + std::cout< PM; PM(HermDefOp,src); + std::cout< cPM; cPM(PosdefLdop,c_src); + // std::cout< ccPM; ccPM(IRLHermOpL2,cc_src); + + std::cout< +Author: Peter Boyle +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
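// How Test_dwf_hdcr_16_rb wires the pieces together. Template arguments are
// restored by hand here, since the flattened diff above drops angle-bracket
// contents; treat the exact parameters as a best-effort reconstruction.

ChebyshevSmoother<LatticeFermion,DomainWallFermionR> FineSmoother(0.5,60.0,12,HermIndefOp,Ddwf);

ZeroGuesser<CoarseVector> CoarseZeroGuesser;
ConjugateGradient<CoarseVector> CoarseCG(0.005,1000);
SchurRedBlackDiagMooeeSolve<CoarseVector> CoarseRBCG(CoarseCG); // red-black CG on the coarse grid
SolverWrapper<CoarseVector> CoarseSolver(LDOp,CoarseRBCG);      // adapts it to a LinearFunction

TwoLevelMG TwoLevelPrecon(Aggregates, LDOp,        // restrict/prolong + coarse operator
                          HermIndefOp,Ddwf,        // fine operator
                          FineSmoother,            // fine-level smoother
                          CoarseZeroGuesser,
                          CoarseSolver);           // coarse-level solve
TwoLevelPrecon.Level(1);
PrecGeneralisedConjugateResidual<LatticeFermion> l1PGCR(1.0e-8,20,HermIndefOp,TwoLevelPrecon,16,16);
l1PGCR.Level(1);
l1PGCR(src,result);  // outer flexible Krylov solve, MG-preconditioned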
+ + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include + +using namespace std; +using namespace Grid; +/* Params + * Grid: + * block1(4) + * block2(4) + * + * Subspace + * * Fine : Subspace(nbasis,hi,lo,order,first,step) -- 32, 60,0.02,500,100,100 + * * Coarse: Subspace(nbasis,hi,lo,order,first,step) -- 32, 18,0.02,500,100,100 + + * Smoother: + * * Fine: Cheby(hi, lo, order) -- 60,0.5,10 + * * Coarse: Cheby(hi, lo, order) -- 12,0.1,4 + + * Lanczos: + * CoarseCoarse IRL( Nk, Nm, Nstop, poly(lo,hi,order)) 24,36,24,0.002,4.0,61 + */ +RealD InverseApproximation(RealD x){ + return 1.0/x; +} + +template class ChebyshevSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & _SmootherMatrix; + FineOperator & _SmootherOperator; + + Chebyshev Cheby; + + ChebyshevSmoother(RealD _lo,RealD _hi,int _ord, FineOperator &SmootherOperator,Matrix &SmootherMatrix) : + _SmootherOperator(SmootherOperator), + _SmootherMatrix(SmootherMatrix), + Cheby(_lo,_hi,_ord,InverseApproximation) + {}; + + void operator() (const Field &in, Field &out) + { + Field tmp(in.Grid()); + MdagMLinearOperator MdagMOp(_SmootherMatrix); + _SmootherOperator.AdjOp(in,tmp); + Cheby(MdagMOp,tmp,out); + } +}; +template class MirsSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & SmootherMatrix; + FineOperator & SmootherOperator; + RealD tol; + RealD shift; + int maxit; + + MirsSmoother(RealD _shift,RealD _tol,int _maxit,FineOperator &_SmootherOperator,Matrix &_SmootherMatrix) : + shift(_shift),tol(_tol),maxit(_maxit), + SmootherOperator(_SmootherOperator), + SmootherMatrix(_SmootherMatrix) + {}; + + void operator() (const Field &in, Field &out) + { + ZeroGuesser Guess; + ConjugateGradient CG(tol,maxit,false); + + Field src(in.Grid()); + + ShiftedMdagMLinearOperator,Field> MdagMOp(SmootherMatrix,shift); + SmootherOperator.AdjOp(in,src); + Guess(src,out); + CG(MdagMOp,src,out); + } +}; + +template +class MultiGridPreconditioner : public LinearFunction< Lattice > { +public: + + typedef Aggregation Aggregates; + typedef CoarsenedMatrix CoarseOperator; + typedef typename Aggregation::CoarseVector CoarseVector; + typedef typename Aggregation::CoarseMatrix CoarseMatrix; + typedef typename Aggregation::FineField FineField; + typedef LinearOperatorBase FineOperator; + typedef LinearFunction FineSmoother; + + Aggregates & _Aggregates; + CoarseOperator & _CoarseOperator; + Matrix & _FineMatrix; + FineOperator & _FineOperator; + Guesser & _Guess; + FineSmoother & _Smoother; + CoarseSolver & _CoarseSolve; + + int level; void Level(int lv) {level = lv; }; + +#define GridLogLevel std::cout << GridLogMessage < block ({2,2,2,2}); + std::vector blockc ({2,2,2,2}); + const int nbasis= 40; + const int nbasisc= 40; + auto clatt = GridDefaultLatt(); + for(int d=0;d seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + std::vector cseeds({5,6,7,8}); + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + GridParallelRNG CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds); + LatticeFermion src(FGrid); gaussian(RNG5,src);// src=src+g5*src; + 
LatticeFermion result(FGrid); + LatticeGaugeField Umu(UGrid); + + FieldMetaData header; + // std::string file("./ckpoint_lat.4000"); + // std::string file("./ckpoint_lat.1000"); + // NerscIO::readConfiguration(Umu,header,file); + SU::HotConfiguration(RNG4,Umu); + + std::cout< Subspace; + typedef CoarsenedMatrix CoarseOperator; + typedef CoarseOperator::CoarseVector CoarseVector; + typedef CoarseOperator::siteVector siteVector; + std::cout< HermDefOp(Ddwf); + + Subspace Aggregates(Coarse5d,FGrid,0); + + assert ( (nbasis & 0x1)==0); + { + int nb=nbasis/2; + LatticeFermion A(FGrid); + LatticeFermion B(FGrid); + // Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.002,1000,800,100,0.0); + // Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.02,1000,800,100,0.0); + Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.01,400,50,50,0.0); // Slightly faster + + for(int n=0;n Level1Op; + typedef CoarsenedMatrix,nbasisc> Level2Op; + + Gamma5R5HermitianLinearOperator HermIndefOp(Ddwf); + + + GridRedBlackCartesian * Coarse4dRB = SpaceTimeGrid::makeFourDimRedBlackGrid(Coarse4d); + std::cout << " Making 5D coarse RB grid " <,nbasisc> CoarseSubspace; + // CoarseSubspace CoarseAggregates(CoarseCoarse5d,Coarse5d,0); + + std::cout< PosdefLdop(LDOp); + /* + { + int nb=nbasisc/2; + CoarseAggregates.CreateSubspaceChebyshev(CRNG,PosdefLdop,nb,15.0,0.02,1000,800,100,0.0); + for(int n=0;noSites();site++){ + subspace_g5[site](nn) = subspace[site](nn); + subspace_g5[site](nn+nb)=-subspace[site](nn+nb); + } + } + } + } + */ + typedef Level2Op::CoarseVector CoarseCoarseVector; + /* + Level2Op L2Op(*CoarseCoarse5d,1); // Hermitian matrix + HermitianLinearOperator L1LinOp(LDOp); + L2Op.CoarsenOperator(Coarse5d,L1LinOp,CoarseAggregates); + + + std::cout< IRLHermOpL2(L2Op); + CoarseCoarseVector cc_src(CoarseCoarse5d); cc_src=1.0; + */ + /* + Chebyshev IRLChebyL2(0.001,15.0,301); + FunctionHermOp IRLOpChebyL2(IRLChebyL2,IRLHermOpL2); + PlainHermOp IRLOpL2 (IRLHermOpL2); + int cNk=24; + int cNm=36; + int cNstop=24; + ImplicitlyRestartedLanczos IRLL2(IRLOpChebyL2,IRLOpL2,cNstop,cNk,cNm,1.0e-3,20); + + int cNconv; + std::vector eval2(cNm); + std::vector evec2(cNm,CoarseCoarse5d); + IRLL2.calc(eval2,evec2,cc_src,cNconv); + + ConjugateGradient CoarseCoarseCG(0.1,1000); + DeflatedGuesser DeflCoarseCoarseGuesser(evec2,eval2); + NormalEquations DeflCoarseCoarseCGNE(L2Op,CoarseCoarseCG,DeflCoarseCoarseGuesser); + */ + + /* + std::cout< IRLHermOp(LDOp); + // Chebyshev IRLCheby(0.001,15.0,301); + Chebyshev IRLCheby(0.03,12.0,101); + FunctionHermOp IRLOpCheby(IRLCheby,IRLHermOp); + PlainHermOp IRLOp (IRLHermOp); + int Nk=64; + int Nm=128; + int Nstop=Nk; + ImplicitlyRestartedLanczos IRL(IRLOpCheby,IRLOp,Nstop,Nk,Nm,1.0e-3,20); + + int Nconv; + std::vector eval(Nm); + std::vector evec(Nm,Coarse5d); + IRL.calc(eval,evec,c_src,Nconv); + */ + CoarseVector c_src(Coarse5d); c_src=1.0; + // DeflatedGuesser DeflCoarseGuesser(evec,eval); + // NormalEquations DeflCoarseCGNE(LDOp,CoarseCG,DeflCoarseGuesser); + + std::cout< , NormalEquations > TwoLevelMG; + typedef MultiGridPreconditioner , NormalEquations > TwoLevelMG; + typedef MultiGridPreconditioner,nbasisc,Level1Op, DeflatedGuesser, NormalEquations > CoarseMG; + typedef MultiGridPreconditioner, LinearFunction > ThreeLevelMG; + + ChebyshevSmoother FineSmoother(0.25,60.0,12,HermIndefOp,Ddwf); + /* + // MultiGrid preconditioner acting on the coarse space <-> coarsecoarse space + ChebyshevSmoother CoarseSmoother(0.1,15.0,3,L1LinOp,LDOp); + + // MirsSmoother 
CoarseCGSmoother(0.1,0.1,4,L1LinOp,LDOp); + // MirsSmoother FineCGSmoother(0.0,0.01,8,HermIndefOp,Ddwf); + + CoarseMG Level2Precon (CoarseAggregates, L2Op, + L1LinOp,LDOp, + CoarseSmoother, + DeflCoarseCoarseGuesser, + DeflCoarseCoarseCGNE); + Level2Precon.Level(2); + + // PGCR Applying this solver to solve the coarse space problem + PrecGeneralisedConjugateResidual l2PGCR(0.1, 100, L1LinOp,Level2Precon,16,16); + l2PGCR.Level(2); + + // Wrap the 2nd level solver in a MultiGrid preconditioner acting on the fine space + ZeroGuesser CoarseZeroGuesser; + ThreeLevelMG ThreeLevelPrecon(Aggregates, LDOp, + HermIndefOp,Ddwf, + FineSmoother, + CoarseZeroGuesser, + l2PGCR); + ThreeLevelPrecon.Level(1); + + // Apply the fine-coarse-coarsecoarse 2 deep MG preconditioner in an outer PGCR on the fine fgrid + PrecGeneralisedConjugateResidual l1PGCR(1.0e-8,1000,HermIndefOp,ThreeLevelPrecon,16,16); + l1PGCR.Level(1); + */ + std::cout< CoarseZeroGuesser; + ConjugateGradient CoarseCG(0.01,1000); + NormalEquations CoarseCGNE(LDOp,CoarseCG,CoarseZeroGuesser); + TwoLevelMG TwoLevelPrecon(Aggregates, LDOp, + HermIndefOp,Ddwf, + FineSmoother, + CoarseZeroGuesser, + CoarseCGNE); + TwoLevelPrecon.Level(1); + PrecGeneralisedConjugateResidual l1PGCR(1.0e-8,20,HermIndefOp,TwoLevelPrecon,16,16); + l1PGCR.Level(1); + l1PGCR(src,result); + + std::cout< pCG(1.0e-8,60000); + result=Zero(); + // pCG(HermDefOp,src,result); + + std::cout< HermOpEO(Ddwf); + // pCG(HermOpEO,src_o,result_o); + + std::cout< PM; PM(HermDefOp,src); + std::cout< cPM; cPM(PosdefLdop,c_src); + // std::cout< ccPM; ccPM(IRLHermOpL2,cc_src); + + std::cout< +Author: Peter Boyle +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
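// Why the _rb variants differ from this regression test: the coarse solve
// here uses the normal equations, CG on M^dag M x = M^dag b (hermitian
// positive definite), over the full coarse volume. The red-black versions
// instead solve via the standard parity-split Schur complement,
//
//   M = [ M_ee  M_eo ]   =>  (M_oo - M_oe M_ee^{-1} M_eo) x_o
//       [ M_oe  M_oo ]          = b_o - M_oe M_ee^{-1} b_e,
//   then x_e = M_ee^{-1} (b_e - M_eo x_o),
//
// which halves the volume the Krylov solver works on and improves the
// conditioning of the system handed to CG.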
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include + +using namespace std; +using namespace Grid; +/* Params + * Grid: + * block1(4) + * block2(4) + * + * Subspace + * * Fine : Subspace(nbasis,hi,lo,order,first,step) -- 32, 60,0.02,500,100,100 + * * Coarse: Subspace(nbasis,hi,lo,order,first,step) -- 32, 18,0.02,500,100,100 + + * Smoother: + * * Fine: Cheby(hi, lo, order) -- 60,0.5,10 + * * Coarse: Cheby(hi, lo, order) -- 12,0.1,4 + + * Lanczos: + * CoarseCoarse IRL( Nk, Nm, Nstop, poly(lo,hi,order)) 24,36,24,0.002,4.0,61 + */ +RealD InverseApproximation(RealD x){ + return 1.0/x; +} + +template class SolverWrapper : public LinearFunction { +private: + CheckerBoardedSparseMatrixBase & _Matrix; + SchurRedBlackBase & _Solver; +public: + + ///////////////////////////////////////////////////// + // Wrap the usual normal equations trick + ///////////////////////////////////////////////////// + SolverWrapper(CheckerBoardedSparseMatrixBase &Matrix, + SchurRedBlackBase &Solver) + : _Matrix(Matrix), _Solver(Solver) {}; + + void operator() (const Field &in, Field &out){ + + _Solver(_Matrix,in,out); // Mdag M out = Mdag in + + } +}; + +template class ChebyshevSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & _SmootherMatrix; + FineOperator & _SmootherOperator; + + Chebyshev Cheby; + + ChebyshevSmoother(RealD _lo,RealD _hi,int _ord, FineOperator &SmootherOperator,Matrix &SmootherMatrix) : + _SmootherOperator(SmootherOperator), + _SmootherMatrix(SmootherMatrix), + Cheby(_lo,_hi,_ord,InverseApproximation) + {}; + + void operator() (const Field &in, Field &out) + { + Field tmp(in.Grid()); + MdagMLinearOperator MdagMOp(_SmootherMatrix); + _SmootherOperator.AdjOp(in,tmp); + Cheby(MdagMOp,tmp,out); + } +}; +template class MirsSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & SmootherMatrix; + FineOperator & SmootherOperator; + RealD tol; + RealD shift; + int maxit; + + MirsSmoother(RealD _shift,RealD _tol,int _maxit,FineOperator &_SmootherOperator,Matrix &_SmootherMatrix) : + shift(_shift),tol(_tol),maxit(_maxit), + SmootherOperator(_SmootherOperator), + SmootherMatrix(_SmootherMatrix) + {}; + + void operator() (const Field &in, Field &out) + { + ZeroGuesser Guess; + ConjugateGradient CG(tol,maxit,false); + + Field src(in.Grid()); + + ShiftedMdagMLinearOperator,Field> MdagMOp(SmootherMatrix,shift); + SmootherOperator.AdjOp(in,src); + Guess(src,out); + CG(MdagMOp,src,out); + } +}; + +template +class MultiGridPreconditioner : public LinearFunction< Lattice > { +public: + + typedef Aggregation Aggregates; + typedef CoarsenedMatrix CoarseOperator; + typedef typename Aggregation::CoarseVector CoarseVector; + typedef typename Aggregation::CoarseMatrix CoarseMatrix; + typedef typename Aggregation::FineField FineField; + typedef LinearOperatorBase FineOperator; + typedef LinearFunction FineSmoother; + + Aggregates & _Aggregates; + CoarseOperator & _CoarseOperator; + Matrix & _FineMatrix; + FineOperator & _FineOperator; + Guesser & _Guess; + FineSmoother & _Smoother; + CoarseSolver & _CoarseSolve; + + int level; void Level(int lv) {level = lv; }; + +#define GridLogLevel std::cout << GridLogMessage < block ({2,2,2,2}); + //std::vector block ({2,2,2,2}); + const int nbasis= 40; + const int nbasisc= 40; + auto clatt = GridDefaultLatt(); + for(int d=0;d 
seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + std::vector cseeds({5,6,7,8}); + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + GridParallelRNG CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds); + LatticeFermion src(FGrid); gaussian(RNG5,src);// src=src+g5*src; + LatticeFermion result(FGrid); + LatticeGaugeField Umu(UGrid); + + FieldMetaData header; + //std::string file("./ckpoint_lat.4000"); + std::string file("./ckpoint_lat.1000"); + NerscIO::readConfiguration(Umu,header,file); + + std::cout< Subspace; + typedef CoarsenedMatrix CoarseOperator; + typedef CoarseOperator::CoarseVector CoarseVector; + typedef CoarseOperator::siteVector siteVector; + std::cout< HermDefOp(Ddwf); + + Subspace Aggregates(Coarse5d,FGrid,0); + + assert ( (nbasis & 0x1)==0); + { + int nb=nbasis/2; + LatticeFermion A(FGrid); + LatticeFermion B(FGrid); + // Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.002,1000,800,100,0.0); + // Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.02,1000,800,100,0.0); + Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.01,1000,100,100,0.0); // Slightly faster + + for(int n=0;n Level1Op; + typedef CoarsenedMatrix,nbasisc> Level2Op; + + Gamma5R5HermitianLinearOperator HermIndefOp(Ddwf); + + + GridRedBlackCartesian * Coarse4dRB = SpaceTimeGrid::makeFourDimRedBlackGrid(Coarse4d); + GridRedBlackCartesian * Coarse5dRB = SpaceTimeGrid::makeFiveDimRedBlackGrid(1,Coarse4d); + + Level1Op LDOp(*Coarse5d,*Coarse5dRB,1); LDOp.CoarsenOperator(FGrid,HermIndefOp,Aggregates); + + ////////////////////////////////////////////////// + // Deflate the course space. Recursive multigrid? + ////////////////////////////////////////////////// + typedef Aggregation,nbasisc> CoarseSubspace; + // CoarseSubspace CoarseAggregates(CoarseCoarse5d,Coarse5d,0); + + std::cout< PosdefLdop(LDOp); + typedef Level2Op::CoarseVector CoarseCoarseVector; + CoarseVector c_src(Coarse5d); c_src=1.0; + + std::cout< , SolverWrapper > TwoLevelMG; + typedef MultiGridPreconditioner,nbasisc,Level1Op, DeflatedGuesser, NormalEquations > CoarseMG; + typedef MultiGridPreconditioner, LinearFunction > ThreeLevelMG; + + std::cout< tols({0.015}); + std::vector ords({12}); + std::vector los({0.8}); + for(int l=0;l FineSmoother(los[l],60.0,ords[o],HermIndefOp,Ddwf); + ZeroGuesser CoarseZeroGuesser; + ConjugateGradient CoarseCG(tols[t],10000); + SchurRedBlackDiagMooeeSolve CoarseRBCG(CoarseCG); + SolverWrapper CoarseSolver(LDOp,CoarseRBCG); + + TwoLevelMG TwoLevelPrecon(Aggregates, LDOp, + HermIndefOp,Ddwf, + FineSmoother, + CoarseZeroGuesser, + CoarseSolver); + TwoLevelPrecon.Level(1); + PrecGeneralisedConjugateResidual l1PGCR(1.0e-8,20,HermIndefOp,TwoLevelPrecon,16,16); + l1PGCR.Level(1); + l1PGCR(src,result); + }}} + + ConjugateGradient pCG(1.0e-8,60000); + std::cout< HermOpEO(Ddwf); + pCG(HermOpEO,src_o,result_o); + + std::cout< PM; PM(HermDefOp,src); + std::cout< cPM; cPM(PosdefLdop,c_src); + // std::cout< ccPM; ccPM(IRLHermOpL2,cc_src); + + std::cout< +Author: Peter Boyle +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
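// Test_dwf_hdcr_48_rb scans smoother and coarse-solver settings via nested
// loops; the single-element vectors below are the committed defaults, but the
// structure is a sweep over the Chebyshev window low edge, the polynomial
// order, and the coarse CG tolerance (template arguments restored by hand, as
// in the sketches above):

std::vector<RealD> tols({0.015});  // coarse CG tolerance
std::vector<int>   ords({12});     // Chebyshev smoother order
std::vector<RealD> los({0.8});     // Chebyshev window low edge (high edge 60.0)
for(int l=0;l<los.size();l++){
for(int o=0;o<ords.size();o++){
for(int t=0;t<tols.size();t++){
  ChebyshevSmoother<LatticeFermion,DomainWallFermionR> FineSmoother(los[l],60.0,ords[o],HermIndefOp,Ddwf);
  ConjugateGradient<CoarseVector> CoarseCG(tols[t],10000);
  // ... build SolverWrapper / TwoLevelPrecon / l1PGCR and solve, as above ...
}}}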
See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include + +using namespace std; +using namespace Grid; +/* Params + * Grid: + * block1(4) + * block2(4) + * + * Subspace + * * Fine : Subspace(nbasis,hi,lo,order,first,step) -- 32, 60,0.02,500,100,100 + * * Coarse: Subspace(nbasis,hi,lo,order,first,step) -- 32, 18,0.02,500,100,100 + + * Smoother: + * * Fine: Cheby(hi, lo, order) -- 60,0.5,10 + * * Coarse: Cheby(hi, lo, order) -- 12,0.1,4 + + * Lanczos: + * CoarseCoarse IRL( Nk, Nm, Nstop, poly(lo,hi,order)) 24,36,24,0.002,4.0,61 + */ +RealD InverseApproximation(RealD x){ + return 1.0/x; +} + +template class ChebyshevSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & _SmootherMatrix; + FineOperator & _SmootherOperator; + + Chebyshev Cheby; + + ChebyshevSmoother(RealD _lo,RealD _hi,int _ord, FineOperator &SmootherOperator,Matrix &SmootherMatrix) : + _SmootherOperator(SmootherOperator), + _SmootherMatrix(SmootherMatrix), + Cheby(_lo,_hi,_ord,InverseApproximation) + {}; + + void operator() (const Field &in, Field &out) + { + Field tmp(in.Grid()); + MdagMLinearOperator MdagMOp(_SmootherMatrix); + _SmootherOperator.AdjOp(in,tmp); + Cheby(MdagMOp,tmp,out); + } +}; +template class MirsSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & SmootherMatrix; + FineOperator & SmootherOperator; + RealD tol; + RealD shift; + int maxit; + + MirsSmoother(RealD _shift,RealD _tol,int _maxit,FineOperator &_SmootherOperator,Matrix &_SmootherMatrix) : + shift(_shift),tol(_tol),maxit(_maxit), + SmootherOperator(_SmootherOperator), + SmootherMatrix(_SmootherMatrix) + {}; + + void operator() (const Field &in, Field &out) + { + ZeroGuesser Guess; + ConjugateGradient CG(tol,maxit,false); + + Field src(in.Grid()); + + ShiftedMdagMLinearOperator,Field> MdagMOp(SmootherMatrix,shift); + SmootherOperator.AdjOp(in,src); + Guess(src,out); + CG(MdagMOp,src,out); + } +}; + +template +class MultiGridPreconditioner : public LinearFunction< Lattice > { +public: + + typedef Aggregation Aggregates; + typedef CoarsenedMatrix CoarseOperator; + typedef typename Aggregation::CoarseVector CoarseVector; + typedef typename Aggregation::CoarseMatrix CoarseMatrix; + typedef typename Aggregation::FineField FineField; + typedef LinearOperatorBase FineOperator; + typedef LinearFunction FineSmoother; + + Aggregates & _Aggregates; + CoarseOperator & _CoarseOperator; + Matrix & _FineMatrix; + FineOperator & _FineOperator; + Guesser & _Guess; + FineSmoother & _Smoother; + CoarseSolver & _CoarseSolve; + + int level; void Level(int lv) {level = lv; }; + +#define GridLogLevel std::cout << GridLogMessage < block ({2,2,2,2}); + std::vector blockc ({2,2,2,2}); + const int nbasis= 40; + const int nbasisc= 40; + auto clatt = GridDefaultLatt(); + for(int d=0;d seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + std::vector cseeds({5,6,7,8}); + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + GridParallelRNG CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds); + LatticeFermion 
src(FGrid); gaussian(RNG5,src);// src=src+g5*src; + LatticeFermion result(FGrid); + LatticeGaugeField Umu(UGrid); + + FieldMetaData header; + // std::string file("./ckpoint_lat.4000"); + std::string file("./ckpoint_lat.1000"); + NerscIO::readConfiguration(Umu,header,file); + + std::cout< Subspace; + typedef CoarsenedMatrix CoarseOperator; + typedef CoarseOperator::CoarseVector CoarseVector; + typedef CoarseOperator::siteVector siteVector; + std::cout< HermDefOp(Ddwf); + + Subspace Aggregates(Coarse5d,FGrid,0); + + assert ( (nbasis & 0x1)==0); + { + int nb=nbasis/2; + LatticeFermion A(FGrid); + LatticeFermion B(FGrid); + // Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.002,1000,800,100,0.0); + // Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.02,1000,800,100,0.0); + Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.01,1000,100,100,0.0); // Slightly faster + + for(int n=0;n Level1Op; + typedef CoarsenedMatrix,nbasisc> Level2Op; + + Gamma5R5HermitianLinearOperator HermIndefOp(Ddwf); + + + GridRedBlackCartesian * Coarse4dRB = SpaceTimeGrid::makeFourDimRedBlackGrid(Coarse4d); + std::cout << " Making 5D coarse RB grid " <,nbasisc> CoarseSubspace; + // CoarseSubspace CoarseAggregates(CoarseCoarse5d,Coarse5d,0); + + std::cout< PosdefLdop(LDOp); + /* + { + int nb=nbasisc/2; + CoarseAggregates.CreateSubspaceChebyshev(CRNG,PosdefLdop,nb,15.0,0.02,1000,800,100,0.0); + for(int n=0;noSites();site++){ + subspace_g5[site](nn) = subspace[site](nn); + subspace_g5[site](nn+nb)=-subspace[site](nn+nb); + } + } + } + } + */ + typedef Level2Op::CoarseVector CoarseCoarseVector; + /* + Level2Op L2Op(*CoarseCoarse5d,1); // Hermitian matrix + HermitianLinearOperator L1LinOp(LDOp); + L2Op.CoarsenOperator(Coarse5d,L1LinOp,CoarseAggregates); + + + std::cout< IRLHermOpL2(L2Op); + CoarseCoarseVector cc_src(CoarseCoarse5d); cc_src=1.0; + */ + /* + Chebyshev IRLChebyL2(0.001,15.0,301); + FunctionHermOp IRLOpChebyL2(IRLChebyL2,IRLHermOpL2); + PlainHermOp IRLOpL2 (IRLHermOpL2); + int cNk=24; + int cNm=36; + int cNstop=24; + ImplicitlyRestartedLanczos IRLL2(IRLOpChebyL2,IRLOpL2,cNstop,cNk,cNm,1.0e-3,20); + + int cNconv; + std::vector eval2(cNm); + std::vector evec2(cNm,CoarseCoarse5d); + IRLL2.calc(eval2,evec2,cc_src,cNconv); + + ConjugateGradient CoarseCoarseCG(0.1,1000); + DeflatedGuesser DeflCoarseCoarseGuesser(evec2,eval2); + NormalEquations DeflCoarseCoarseCGNE(L2Op,CoarseCoarseCG,DeflCoarseCoarseGuesser); + */ + + /* + std::cout< IRLHermOp(LDOp); + // Chebyshev IRLCheby(0.001,15.0,301); + Chebyshev IRLCheby(0.03,12.0,101); + FunctionHermOp IRLOpCheby(IRLCheby,IRLHermOp); + PlainHermOp IRLOp (IRLHermOp); + int Nk=64; + int Nm=128; + int Nstop=Nk; + ImplicitlyRestartedLanczos IRL(IRLOpCheby,IRLOp,Nstop,Nk,Nm,1.0e-3,20); + + int Nconv; + std::vector eval(Nm); + std::vector evec(Nm,Coarse5d); + IRL.calc(eval,evec,c_src,Nconv); + */ + CoarseVector c_src(Coarse5d); c_src=1.0; + // DeflatedGuesser DeflCoarseGuesser(evec,eval); + // NormalEquations DeflCoarseCGNE(LDOp,CoarseCG,DeflCoarseGuesser); + + std::cout< , NormalEquations > TwoLevelMG; + typedef MultiGridPreconditioner , NormalEquations > TwoLevelMG; + typedef MultiGridPreconditioner,nbasisc,Level1Op, DeflatedGuesser, NormalEquations > CoarseMG; + typedef MultiGridPreconditioner, LinearFunction > ThreeLevelMG; + + ChebyshevSmoother FineSmoother(0.25,60.0,12,HermIndefOp,Ddwf); + /* + // MultiGrid preconditioner acting on the coarse space <-> coarsecoarse space + ChebyshevSmoother CoarseSmoother(0.1,15.0,3,L1LinOp,LDOp); + + // 
MirsSmoother CoarseCGSmoother(0.1,0.1,4,L1LinOp,LDOp); + // MirsSmoother FineCGSmoother(0.0,0.01,8,HermIndefOp,Ddwf); + + CoarseMG Level2Precon (CoarseAggregates, L2Op, + L1LinOp,LDOp, + CoarseSmoother, + DeflCoarseCoarseGuesser, + DeflCoarseCoarseCGNE); + Level2Precon.Level(2); + + // PGCR Applying this solver to solve the coarse space problem + PrecGeneralisedConjugateResidual l2PGCR(0.1, 100, L1LinOp,Level2Precon,16,16); + l2PGCR.Level(2); + + // Wrap the 2nd level solver in a MultiGrid preconditioner acting on the fine space + ZeroGuesser CoarseZeroGuesser; + ThreeLevelMG ThreeLevelPrecon(Aggregates, LDOp, + HermIndefOp,Ddwf, + FineSmoother, + CoarseZeroGuesser, + l2PGCR); + ThreeLevelPrecon.Level(1); + + // Apply the fine-coarse-coarsecoarse 2 deep MG preconditioner in an outer PGCR on the fine fgrid + PrecGeneralisedConjugateResidual l1PGCR(1.0e-8,1000,HermIndefOp,ThreeLevelPrecon,16,16); + l1PGCR.Level(1); + */ + std::cout< CoarseZeroGuesser; + ConjugateGradient CoarseCG(0.01,1000); + NormalEquations CoarseCGNE(LDOp,CoarseCG,CoarseZeroGuesser); + TwoLevelMG TwoLevelPrecon(Aggregates, LDOp, + HermIndefOp,Ddwf, + FineSmoother, + CoarseZeroGuesser, + CoarseCGNE); + TwoLevelPrecon.Level(1); + PrecGeneralisedConjugateResidual l1PGCR(1.0e-8,20,HermIndefOp,TwoLevelPrecon,16,16); + l1PGCR.Level(1); + l1PGCR(src,result); + + std::cout< pCG(1.0e-8,60000); + result=Zero(); + // pCG(HermDefOp,src,result); + + std::cout< HermOpEO(Ddwf); + pCG(HermOpEO,src_o,result_o); + + std::cout< PM; PM(HermDefOp,src); + std::cout< cPM; cPM(PosdefLdop,c_src); + // std::cout< ccPM; ccPM(IRLHermOpL2,cc_src); + + std::cout< + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include +#include +#include + +using namespace std; +using namespace Grid; + +// TODO +// +// Coarse Grid axpby_ssp_pminus // Inherit from spProj5pm +// Coarse Grid axpby_ssp_pplus + +template +class CayleyBase : public SparseMatrixBase +{ +public: + int Ls; + // protected: + RealD mass; + RealD M5; + // Save arguments to SetCoefficientsInternal + Vector _gamma; + RealD _zolo_hi; + RealD _b; + RealD _c; + + // Cayley form Moebius (tanh and zolotarev) + Vector omega; + Vector bs; // S dependent coeffs + Vector cs; + Vector as; + // For preconditioning Cayley form + Vector bee; + Vector cee; + Vector aee; + Vector beo; + Vector ceo; + Vector aeo; + // LDU factorisation of the eeoo matrix + Vector lee; + Vector leem; + Vector uee; + Vector ueem; + Vector dee; +public: + CayleyBase(RealD _M5, RealD _mass, int _Ls, RealD b_, RealD c_) : + M5(_M5), + mass(_mass), + Ls(_Ls), + _b(b_), + _c(c_) + { + RealD eps = 1.0; + Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);// eps is ignored for higham + this->SetCoefficientsTanh(zdata,1.0,0.0); + Approx::zolotarev_free(zdata); + } + ///////////////////////////////////////////////////////// + // Replicates functionality + // Use a common base class approach + ///////////////////////////////////////////////////////// + // Tanh + void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c) + { + Vector gamma(this->Ls); + for(int s=0;sLs;s++) gamma[s] = zdata->gamma[s]; + SetCoefficientsInternal(1.0,gamma,b,c); + } + //Zolo + void SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata,RealD b,RealD c) + { + Vector gamma(this->Ls); + for(int s=0;sLs;s++) gamma[s] = zdata->gamma[s]; + SetCoefficientsInternal(zolo_hi,gamma,b,c); + } + //Zolo + void SetCoefficientsInternal(RealD zolo_hi,Vector & gamma,RealD b,RealD c) + { + int Ls=this->Ls; + + /////////////////////////////////////////////////////////// + // The Cayley coeffs (unprec) + /////////////////////////////////////////////////////////// + assert(gamma.size()==Ls); + + omega.resize(Ls); + bs.resize(Ls); + cs.resize(Ls); + as.resize(Ls); + + double bpc = b+c; + double bmc = b-c; + _b = b; + _c = c; + _gamma = gamma; // Save the parameters so we can change mass later. 
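+    // For reference, the loop below realises the standard Moebius coefficient
+    // relations (directly as coded; bpc = b+c and bmc = b-c precomputed above):
+    //   omega_s = gamma_s * zolo_hi     (reciprocal of the Chroma NEF convention)
+    //   b_s     = 0.5*( (b+c)/omega_s + (b-c) )
+    //   c_s     = 0.5*( (b+c)/omega_s - (b-c) )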
+ _zolo_hi= zolo_hi; + for(int i=0; i < Ls; i++){ + as[i] = 1.0; + omega[i] = _gamma[i]*_zolo_hi; //NB reciprocal relative to Chroma NEF code + assert(omega[i]!=Coeff_t(0.0)); + bs[i] = 0.5*(bpc/omega[i] + bmc); + cs[i] = 0.5*(bpc/omega[i] - bmc); + } + + //////////////////////////////////////////////////////// + // Constants for the preconditioned matrix Cayley form + //////////////////////////////////////////////////////// + bee.resize(Ls); + cee.resize(Ls); + beo.resize(Ls); + ceo.resize(Ls); + + for(int i=0;iM5) +1.0); + assert(bee[i]!=Coeff_t(0.0)); + cee[i]=as[i]*(1.0-cs[i]*(4.0-this->M5)); + beo[i]=as[i]*bs[i]; + ceo[i]=-as[i]*cs[i]; + } + aee.resize(Ls); + aeo.resize(Ls); + for(int i=0;i &out){assert(0);}; + virtual void DW (const Field &psi, Field &chi)=0; + virtual void DWDag (const Field &psi, Field &chi)=0; + + void M (const Field &psi, Field &chi) + { + Field Din(psi.Grid()); + Meooe5D(psi,Din); + DW(Din,chi); + axpby(chi,1.0,1.0,chi,psi); + M5D(psi,chi); + } + void Mdag (const Field &psi, Field &chi) + { + Field Din(psi.Grid()); + DWDag(psi,Din); + MeooeDag5D(Din,chi); + M5Ddag(psi,chi); + axpby (chi,1.0,1.0,chi,psi); + } + ///////////////////////////////// + // P and Pdag - might be needed + ///////////////////////////////// + void P(const Field &psi, Field &chi) + { + int Ls= this->Ls; + chi=Zero(); + for(int s=0;sLs; + chi=Zero(); + for(int s=0;sLs; + Vector diag (Ls,1.0); + Vector upper(Ls,-1.0); upper[Ls-1]=mass; + Vector lower(Ls,-1.0); lower[0] =mass; + M5D(psi,chi,chi,lower,diag,upper); + } + void M5Ddag (const Field &psi, Field &chi) + { + int Ls=this->Ls; + Vector diag(Ls,1.0); + Vector upper(Ls,-1.0); + Vector lower(Ls,-1.0); + upper[Ls-1]=-mass*upper[Ls-1]; + lower[0] =-mass*lower[0]; + M5Ddag(psi,chi,chi,lower,diag,upper); + } + void Meooe5D (const Field &psi, Field &Din) + { + int Ls=this->Ls; + Vector diag = bs; + Vector upper= cs; + Vector lower= cs; + upper[Ls-1]=-mass*upper[Ls-1]; + lower[0] =-mass*lower[0]; + M5D(psi,psi,Din,lower,diag,upper); + } + void MeooeDag5D (const Field &psi, Field &Din) + { + int Ls=this->Ls; + Vector diag =bs; + Vector upper=cs; + Vector lower=cs; + + for (int s=0;s &lower, + Vector &diag, + Vector &upper) + { + chi_i.Checkerboard()=psi_i.Checkerboard(); + GridBase *grid=psi_i.Grid(); + autoView(psi , psi_i,AcceleratorRead); + autoView(phi , phi_i,AcceleratorRead); + autoView(chi , chi_i,AcceleratorWrite); + assert(phi.Checkerboard() == psi.Checkerboard()); + + auto pdiag = &diag[0]; + auto pupper = &upper[0]; + auto plower = &lower[0]; + + int Ls =this->Ls; + + // 10 = 3 complex mult + 2 complex add + // Flops = 10.0*(Nc*Ns) *Ls*vol (/2 for red black counting) + uint64_t nloop = grid->oSites()/Ls; + + const int Nsimd = Field::vector_type::Nsimd(); + accelerator_for(sss,nloop,Nsimd,{ + uint64_t ss= sss*Ls; + typedef decltype(coalescedRead(psi[0])) spinor; + spinor tmp1, tmp2; + for(int s=0;s &lower, + Vector &diag, + Vector &upper) + { + chi_i.Checkerboard()=psi_i.Checkerboard(); + GridBase *grid=psi_i.Grid(); + autoView(psi , psi_i,AcceleratorRead); + autoView(phi , phi_i,AcceleratorRead); + autoView(chi , chi_i,AcceleratorWrite); + assert(phi.Checkerboard() == psi.Checkerboard()); + + auto pdiag = &diag[0]; + auto pupper = &upper[0]; + auto plower = &lower[0]; + + int Ls=this->Ls; + + uint64_t nloop = grid->oSites()/Ls; + const int Nsimd = Field::vector_type::Nsimd(); + accelerator_for(sss,nloop,Nsimd,{ + uint64_t ss=sss*Ls; + typedef decltype(coalescedRead(psi[0])) spinor; + spinor tmp1,tmp2; + for(int s=0;s +class 
CoarseCayleyFermion : public CayleyBase< Lattice > , ComplexD > +{ +public: + typedef iVector siteVector; + typedef Lattice CoarseComplexField; + typedef Lattice CoarseVector; + typedef Lattice > CoarseMatrix; + typedef iMatrix Cobj; + typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field + typedef Lattice FineField; + + // Similar to the CoarseOperator but add 5D support. + Geometry geom; + GridBase *Coarse5D; + GridBase *Coarse4D; + CartesianStencil Stencil; + CoarsenedMatrix &Dw; + + GridBase * Grid(void) { return Coarse5D; }; // this is all the linalg routines need to know + + CoarseCayleyFermion(GridCartesian &CoarseGrid4, + GridCartesian &CoarseGrid5, + CoarsenedMatrix &_Dw, + RealD M5, RealD mass, int Ls, RealD b, RealD c) : + CayleyBase(M5,mass,Ls,b,c), + Coarse4D(&CoarseGrid4), + Coarse5D(&CoarseGrid5), + Dw(_Dw), + geom(CoarseGrid5._ndimension), + Stencil( &CoarseGrid5,geom.npoint,Even,geom.directions,geom.displacements,0) + { + }; + +public: + void Project( CoarseVector &C ) + { + const int Nsimd = CComplex::Nsimd(); + autoView(Cv,C, AcceleratorWrite); + int Ls = this->Ls; + for(int s=0;soSites(), Nsimd, { + int sF= sU*Ls+s; + auto tmp = coalescedRead(Cv[sF]); + coalescedWrite(Cv[sF],tmp); + }); + } + } + //////////////////////////////////////////////// + // This is specific to Coarse Grid Cayley + //////////////////////////////////////////////// + virtual void Mdiag (const CoarseVector &in, CoarseVector &out) + { + std::vector allout(9,in.Grid()); + this->MdirAll(in,allout); + out = allout[8]; + } + virtual void Mdir (const CoarseVector &in, CoarseVector &out,int dir, int disp) + { + assert(0); + } + virtual void MdirAll (const CoarseVector &in, std::vector &out) + { + conformable(Coarse5D,in.Grid()); + + SimpleCompressor compressor; + + Stencil.HaloExchange(in,compressor); + typedef LatticeView Aview; + + const int Nsimd = CComplex::Nsimd(); + + // Ls loop for2D + int Ls=this->Ls; + + siteVector *CBp=Stencil.CommBuf(); + + int ptype; + int nb2=nbasis/2; + + autoView(in_v , in, AcceleratorRead); + autoView(st, Stencil, AcceleratorRead); + for(int point=0;pointoSites(), b, nbasis, Nsimd, { + + typedef decltype(coalescedRead(in_v[0])) calcVector; + typedef decltype(coalescedRead(in_v[0](0))) calcComplex; + int sU = sF/Ls; + int s = sF%Ls; + + calcComplex res = Zero(); + calcVector nbr; + int ptype; + + StencilEntry *SE=st.GetEntry(ptype,point,sF); + + if(SE->_is_local) { + nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); + } else { + nbr = coalescedRead(CBp[SE->_offset]); + } + acceleratorSynchronise(); + + for(int bb=0;bb compressor; + + Stencil.HaloExchange(in,compressor); + typedef LatticeView Aview; + + const int Nsimd = CComplex::Nsimd(); + + // Ls loop for2D + int Ls=this->Ls; + + Vector AcceleratorViewContainer; + for(int p=0;poSites(), b, nbasis, Nsimd, { + + typedef decltype(coalescedRead(in_v[0])) calcVector; + typedef decltype(coalescedRead(in_v[0](0))) calcComplex; + int sU = sF/Ls; + int s = sF%Ls; + + calcComplex res = Zero(); + + { + calcVector nbr; + int ptype; + + for(int point=0;point_is_local) { + nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); + } else { + nbr = coalescedRead(CBp[SE->_offset]); + } + acceleratorSynchronise(); + + for(int bb=0;bb Aggregates; + + void PromoteFromSubspace(Aggregates &_Aggregates,CoarseVector &C,FineField &F) + { + auto FineGrid4 = _Aggregates.FineGrid; + FineField F4(FineGrid4); + CoarseVector C4(Coarse4D); + for(int s=0;sLs;s++){ + ExtractSlice(C4,C,s,0); + 
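+      // The aggregates are four dimensional, so the 5D coarse vector is lifted
+      // slice by slice: promote this s-slice to the fine grid, then insert it
+      // back into the 5D fine field below.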
_Aggregates.PromoteFromSubspace(C4,F4); + InsertSlice(F4,F,s,0); + } + } + void ProjectToSubspace(Aggregates &_Aggregates,CoarseVector &C,FineField &F) + { + auto FineGrid4 = _Aggregates.FineGrid; + FineField F4(FineGrid4); + CoarseVector C4(Coarse4D); + for(int s=0;sLs;s++){ + ExtractSlice(F4,F,s,0); + _Aggregates.ProjectToSubspace (C4,F4); + InsertSlice(C4,C,s,0); + } + Project(C); + } + template + void Test(Aggregates &_Aggregates,GridBase *FineGrid, Ddwf &_Ddwf) + { + typedef Lattice FineField; + CoarseVector Cin(Coarse5D); + CoarseVector Cout(Coarse5D); + CoarseVector CFout(Coarse5D); + + FineField Fin(FineGrid); + FineField Fout(FineGrid); + + + std::vector seeds({1,2,3,4,5}); + GridParallelRNG RNG(Coarse5D); RNG.SeedFixedIntegers(seeds); + + gaussian(RNG,Cin); + PromoteFromSubspace(_Aggregates,Cin,Fin); + ProjectToSubspace(_Aggregates,Cin,Fin); + + std::cout << GridLogMessage<< "************ "<M(Cin,Cout); + this->Project(Cout); + std::cout << GridLogMessage<< " Cout "<Mdag(Cin,Cout); + this->Project(Cout); + std::cout << GridLogMessage<< " Cout "< Directions(void) { return geom.directions;}; + virtual std::vector Displacements(void){ return geom.displacements;}; +}; + + +template class SolverWrapper : public LinearFunction { +private: + LinearOperatorBase & _Matrix; + OperatorFunction & _Solver; + LinearFunction & _Guess; +public: + + ///////////////////////////////////////////////////// + // Wrap the usual normal equations trick + ///////////////////////////////////////////////////// + SolverWrapper(LinearOperatorBase &Matrix, + OperatorFunction &Solver, + LinearFunction &Guess) + : _Matrix(Matrix), _Solver(Solver), _Guess(Guess) {}; + + void operator() (const Field &in, Field &out){ + + _Guess(in,out); + _Solver(_Matrix,in,out); // Mdag M out = Mdag in + + } +}; + +// Must use a non-hermitian solver +template +class PVdagMLinearOperator : public LinearOperatorBase { + Matrix &_Mat; + Matrix &_PV; +public: + PVdagMLinearOperator(Matrix &Mat,Matrix &PV): _Mat(Mat),_PV(PV){}; + + virtual std::vector Directions(void) { return _Mat.Directions();}; + virtual std::vector Displacements(void){ return _Mat.Displacements();}; + + void OpDiag (const Field &in, Field &out) { + assert(0); + } + void OpDir (const Field &in, Field &out,int dir,int disp) { + assert(0); + } + void OpDirAll (const Field &in, std::vector &out){ + assert(0); + }; + void Op (const Field &in, Field &out){ + Field tmp(in.Grid()); + _Mat.M(in,tmp); + _PV.Mdag(tmp,out); + } + void AdjOp (const Field &in, Field &out){ + Field tmp(in.Grid()); + _PV.M(tmp,out); + _Mat.Mdag(in,tmp); + } + void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ + assert(0); + } + void HermOp(const Field &in, Field &out){ + assert(0); + } +}; + +RealD InverseApproximation(RealD x){ + return 1.0/x; +} + +template class ChebyshevSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & _SmootherMatrix; + FineOperator & _SmootherOperator; + + Chebyshev Cheby; + + ChebyshevSmoother(RealD _lo,RealD _hi,int _ord, FineOperator &SmootherOperator,Matrix &SmootherMatrix) : + _SmootherOperator(SmootherOperator), + _SmootherMatrix(SmootherMatrix), + Cheby(_lo,_hi,_ord,InverseApproximation) + {}; + + void operator() (const Field &in, Field &out) + { + Field tmp(in.Grid()); + MdagMLinearOperator MdagMOp(_SmootherMatrix); + _SmootherOperator.AdjOp(in,tmp); + Cheby(MdagMOp,tmp,out); + } +}; +template +class MGPreconditioner : public LinearFunction< Lattice > { +public: + + typedef Aggregation Aggregates; + 
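+  // Schematic of the two-level cycle that operator() below implements (a sketch
+  // of the flow through the members above, not the literal statement order):
+  //   out  = PreSmooth(in)             damp the high modes
+  //   Csrc = Project(in - M out)       restrict the residual
+  //   Csol = CoarseSolve(Csrc)         low-mode correction
+  //   out += Promote(Csol)             prolongate back to the fine grid
+  //   then post-smooth the remaining residual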
typedef typename Aggregation::CoarseVector CoarseVector; + typedef typename Aggregation::CoarseMatrix CoarseMatrix; + typedef typename Aggregation::FineField FineField; + typedef LinearOperatorBase FineOperator; + typedef LinearFunction FineSmoother; + typedef CoarseCayleyFermion CoarseOperator; + // typedef SparseMatrixBase CoarseOperator; + + Aggregates & _Aggregates; + FineOperator & _FineOperator; + FineSmoother & _PreSmoother; + FineSmoother & _PostSmoother; + CoarseOperator & _CoarseOperator; + CoarseSolver & _CoarseSolve; + + int level; void Level(int lv) {level = lv; }; + + MGPreconditioner(Aggregates &Agg, + FineOperator &Fine, + FineSmoother &PreSmoother, + FineSmoother &PostSmoother, + CoarseOperator &CoarseOperator_, + CoarseSolver &CoarseSolve_) + : _Aggregates(Agg), + _FineOperator(Fine), + _PreSmoother(PreSmoother), + _PostSmoother(PostSmoother), + _CoarseOperator(CoarseOperator_), + _CoarseSolve(CoarseSolve_), + level(1) { } + + virtual void operator()(const FineField &in, FineField & out) + { + auto CoarseGrid = _CoarseOperator.Grid(); + CoarseVector Csrc(CoarseGrid); + CoarseVector Csol(CoarseGrid); + FineField vec1(in.Grid()); + FineField vec2(in.Grid()); + + std::cout< +class HDCRPreconditioner : public LinearFunction< Lattice > { +public: + + typedef Aggregation Aggregates; + typedef typename Aggregation::CoarseVector CoarseVector; + typedef typename Aggregation::CoarseMatrix CoarseMatrix; + typedef typename Aggregation::FineField FineField; + typedef LinearOperatorBase FineOperator; + typedef LinearFunction FineSmoother; + //typedef CoarseCayleyFermion CoarseOperator; + typedef SparseMatrixBase CoarseOperator; + + Aggregates & _Aggregates; + FineOperator & _FineOperator; + FineSmoother & _PreSmoother; + FineSmoother & _PostSmoother; + CoarseOperator & _CoarseOperator; + CoarseSolver & _CoarseSolve; + + int level; void Level(int lv) {level = lv; }; + + HDCRPreconditioner(Aggregates &Agg, + FineOperator &Fine, + FineSmoother &PreSmoother, + FineSmoother &PostSmoother, + CoarseOperator &CoarseOperator_, + CoarseSolver &CoarseSolve_) + : _Aggregates(Agg), + _FineOperator(Fine), + _PreSmoother(PreSmoother), + _PostSmoother(PostSmoother), + _CoarseOperator(CoarseOperator_), + _CoarseSolve(CoarseSolve_), + level(1) { } + + virtual void operator()(const FineField &in, FineField & out) + { + auto CoarseGrid = _CoarseOperator.Grid(); + CoarseVector Csrc(CoarseGrid); + CoarseVector g5Csrc(CoarseGrid); + CoarseVector Csol(CoarseGrid); + FineField vec1(in.Grid()); + FineField vec2(in.Grid()); + + std::cout< block ({2,2,2,2}); // 4,2,2,2 gets worse + std::vector blockc ({1,1,1,1}); + const int nbasis= 24; + const int nbasisc= 32; // decrease, not improvement + + auto clatt = GridDefaultLatt(); + for(int d=0;d seeds({1,2,3,4}); + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds); + GridParallelRNG CRNG(Coarse4d);CRNG.SeedFixedIntegers(seeds); + + LatticeGaugeField Umu(UGrid); +#if 0 + SU3::TepidConfiguration(RNG4,Umu); + RealD M5=1.0; +#else + std::string file("./ckpoint_lat.1000"); + FieldMetaData header; + NerscIO::readConfiguration(Umu,header,file); + RealD M5=1.8; +#endif + + std::cout< Subspace; + typedef CoarsenedMatrix CoarseOperator; + typedef CoarseOperator::CoarseVector CoarseVector; + typedef CoarseOperator::siteVector siteVector; + + std::cout< MdagM_Dw(Dw_null); + + std::cout< WilsonCG(1.0e-10,40000); + LatticeFermion w_src(UGrid); w_src=1.0; + LatticeFermion w_res(UGrid); + 
WilsonCG(MdagM_Dw,w_src,w_res); + exit(0); + */ + std::cout< Level1Op4; + typedef CoarseCayleyFermion Level1Op5; + Level1Op4 c_Dw (*Coarse4d,0); + NonHermitianLinearOperator LinOpDw(Dw); + c_Dw.CoarsenOperator(UGrid,LinOpDw,Aggregates4D); // contains the M5 from Dw(-M5) + // c_Dw.Test(Aggregates4D,UGrid,LinOpDw); + + std::cout< MdagM_cDwf(c_Dwf); + + std::cout<,nbasisc> Level2Op; + typedef Aggregation,nbasisc> CoarseSubspace; + CoarseSubspace CoarseAggregates(CoarseCoarse5d,Coarse5d,0); + + std::cout< L1Hdwf(c_Dwf); + GridRedBlackCartesian * CoarseCoarse5dRB = SpaceTimeGrid::makeFiveDimRedBlackGrid(1,CoarseCoarse4d); + Level2Op cc_Dwf (*CoarseCoarse5d,*CoarseCoarse5dRB,1); // say it is hermitian + cc_Dwf.CoarsenOperator(Coarse5d,L1Hdwf,CoarseAggregates); + // cc_Dwf.Test(CoarseAggregates,Coarse5d,L1Hdwf); + + typedef Level2Op::CoarseVector CoarseCoarseVector; + + std::cout< CoarseCG(tol,MaxIt); + ConjugateGradient FineCG(tol,MaxIt); + + NonHermitianLinearOperator FineM(Ddwf); + MdagMLinearOperator FineMdagM(Ddwf); // M^\dag M + + NonHermitianLinearOperator CoarseM(c_Dwf); + MdagMLinearOperator CoarseMdagM(c_Dwf); + + NonHermitianLinearOperator CoarseCoarseM(cc_Dwf); + MdagMLinearOperator CoarseCoarseMdagM(cc_Dwf); + + + std::cout< PM; PM(MdagM_Dw,w_src); + std::cout< cPM; cPM(CoarseMdagM,c_src); + + cc_src=1.0; + PowerMethod ccPM; ccPM(CoarseCoarseMdagM,cc_src); + + std::cout< IRLHermOpL2(cc_Dwf); + Chebyshev IRLChebyL2(IRL_lo,IRL_hi,IRL_ord); + FunctionHermOp IRLOpChebyL2(IRLChebyL2,IRLHermOpL2); + PlainHermOp IRLOpL2 (IRLHermOpL2); + ImplicitlyRestartedLanczos IRLL2(IRLOpChebyL2,IRLOpL2,cNstop,cNk,cNm,1.0e-3,20); + + int cNconv; + cNm=0; + std::vector eval2(cNm); + std::vector evec2(cNm,CoarseCoarse5d); + cc_src=1.0; + // IRLL2.calc(eval2,evec2,cc_src,cNconv); + + ConjugateGradient CoarseCoarseCG(0.02,10000); + DeflatedGuesser DeflCoarseCoarseGuesser(evec2,eval2); + NormalEquations DeflCoarseCoarseCGNE(cc_Dwf,CoarseCoarseCG,DeflCoarseCoarseGuesser); + + ZeroGuesser CoarseZeroGuesser; + ZeroGuesser CoarseCoarseZeroGuesser; + + std::cout< CoarseCoarseCGNE(cc_Dwf,CoarseCoarseCG,CoarseCoarseZeroGuesser); + { +typedef HDCRPreconditioner,nbasisc,NormalEquations > CoarseMG; + typedef MGPreconditioner > ThreeLevelMG; + + // MultiGrid preconditioner acting on the coarse space <-> coarsecoarse space + ChebyshevSmoother CoarseSmoother1(0.5,22.0,12,CoarseM,c_Dwf); // 37s, 26 iter + ChebyshevSmoother CoarseSmoother2(0.5,22.0,12,CoarseM,c_Dwf); + + // ChebyshevSmoother CoarseSmoother1(0.5,22.0,7,CoarseM,c_Dwf); // 38s, 26 iter + // ChebyshevSmoother CoarseSmoother2(0.5,22.0,7,CoarseM,c_Dwf); + // ChebyshevSmoother CoarseSmoother1(0.4,22.0,7,CoarseM,c_Dwf); // 41s, 27 iter + // ChebyshevSmoother CoarseSmoother2(0.4,22.0,7,CoarseM,c_Dwf); + // ChebyshevSmoother CoarseSmoother1(0.6,22.0,6,CoarseM,c_Dwf); // 26 iter + // ChebyshevSmoother CoarseSmoother2(0.6,22.0,6,CoarseM,c_Dwf); + // ChebyshevSmoother CoarseSmoother1(0.5,22.0,5,CoarseM,c_Dwf); // 33 iter, 55s + // ChebyshevSmoother CoarseSmoother2(0.5,22.0,5,CoarseM,c_Dwf); + + + CoarseMG Level2Precon (CoarseAggregates, + CoarseM, + CoarseSmoother1, + CoarseSmoother2, + cc_Dwf, + DeflCoarseCoarseCGNE); + Level2Precon.Level(2); + + //PrecGeneralisedConjugateResidualNonHermitian L2PGCR(0.5, 100, CoarseM,Level2Precon,16,16); // 26 iter, 37s + // PGCR Applying this solver to solve the coarse space problem + // COULD BE FIXED??? 
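+  // Reading the constructor arguments as (tol, maxit, op, prec, mmax, nstep),
+  // consistent with the (1.0e-8, 1000, ...) outer solver below, tol=0.0 with
+  // maxit=1 appears to make this PGCR a single fixed-cost application of
+  // Level2Precon rather than an iterated coarse solve.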
+ PrecGeneralisedConjugateResidualNonHermitian L2PGCR(0.0, 1, CoarseM,Level2Precon,2,2); + + //PrecGeneralisedConjugateResidualNonHermitian L2PGCR(1.0, 100, CoarseM,Level2Precon,16,16); // 35 iter, 45s + //PrecGeneralisedConjugateResidualNonHermitian L2PGCR(0.6, 100, CoarseM,Level2Precon,16,16); // 26,38 (diifferene is measurement noise) + //PrecGeneralisedConjugateResidualNonHermitian L2PGCR(0.2, 100, CoarseM,Level2Precon,16,16); // 26 iter, 47s + L2PGCR.Level(2); + + // Wrap the 2nd level solver in a MultiGrid preconditioner acting on the fine space + + // ChebyshevSmoother FineSmoother1(0.5,60.0,14,FineM,Ddwf); // 26 iter, 39s + // ChebyshevSmoother FineSmoother2(0.5,60.0,14,FineM,Ddwf); + + // ChebyshevSmoother FineSmoother1(0.5,60.0,12,FineM,Ddwf); // 25 iter, 38s + // ChebyshevSmoother FineSmoother2(0.5,60.0,16,FineM,Ddwf); + + // ChebyshevSmoother FineSmoother1(0.5,60.0,12,FineM,Ddwf); // 23 iter, 39s + // ChebyshevSmoother FineSmoother2(0.5,60.0,20,FineM,Ddwf); + + // ChebyshevSmoother FineSmoother1(0.5,60.0,10,FineM,Ddwf);24 iter, 44s + // ChebyshevSmoother FineSmoother2(0.5,60.0,24,FineM,Ddwf); + + // ChebyshevSmoother FineSmoother1(0.5,60.0,12,FineM,Ddwf); // odd convergence tail at 10^-9 ish + // ChebyshevSmoother FineSmoother2(0.1,60.0,24,FineM,Ddwf); // 33 iter, waas O(10-9 by 26) + + // ChebyshevSmoother FineSmoother1(0.5,60.0,12,FineM,Ddwf); // 25 iter, 39s + // ChebyshevSmoother FineSmoother2(0.5,60.0,18,FineM,Ddwf); // + + ChebyshevSmoother FineSmoother1(0.5,60.0,16,FineM,Ddwf); + ChebyshevSmoother FineSmoother2(0.5,60.0,16,FineM,Ddwf); // + + // ChebyshevSmoother FineSmoother1(0.5,60.0,11,FineM,Ddwf); // 33 iter, 49s + // ChebyshevSmoother FineSmoother2(0.5,60.0,11,FineM,Ddwf); + // ChebyshevSmoother FineSmoother1(0.5,60.0,12,FineM,Ddwf); // 26 iter, 37s + // ChebyshevSmoother FineSmoother2(0.5,60.0,12,FineM,Ddwf); + // ChebyshevSmoother FineSmoother1(0.4,60.0,12,FineM,Ddwf); // iter 26 no change in final residual + // ChebyshevSmoother FineSmoother2(0.4,60.0,12,FineM,Ddwf); + // ChebyshevSmoother FineSmoother1(0.3,60.0,12,FineM,Ddwf); // 27 iter 39s. + // ChebyshevSmoother FineSmoother2(0.3,60.0,12,FineM,Ddwf); + // ChebyshevSmoother FineSmoother1(0.3,60.0,13,FineM,Ddwf); // 26 iter, but slower + // ChebyshevSmoother FineSmoother2(0.3,60.0,13,FineM,Ddwf); + // ChebyshevSmoother FineSmoother1(1.0,60.0,12,FineM,Ddwf); // 34 iter, slower + // ChebyshevSmoother FineSmoother2(1.0,60.0,12,FineM,Ddwf); + + ThreeLevelMG ThreeLevelPrecon(Aggregates4D, + FineM, + FineSmoother1, + FineSmoother2, + c_Dwf, + L2PGCR); + ThreeLevelPrecon.Level(1); + + PrecGeneralisedConjugateResidualNonHermitian L1PGCR(1.0e-8,1000,FineM,ThreeLevelPrecon,16,16); + L1PGCR.Level(1); + + f_res=Zero(); + L1PGCR(f_src,f_res); + } + + std::cout< + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include +#include +#include + +using namespace std; +using namespace Grid; + +// TODO +// +// Coarse Grid axpby_ssp_pminus // Inherit from spProj5pm +// Coarse Grid axpby_ssp_pplus + +template +class CayleyBase : public SparseMatrixBase +{ +public: + int Ls; + // protected: + RealD mass; + RealD M5; + // Save arguments to SetCoefficientsInternal + Vector _gamma; + RealD _zolo_hi; + RealD _b; + RealD _c; + + // Cayley form Moebius (tanh and zolotarev) + Vector omega; + Vector bs; // S dependent coeffs + Vector cs; + Vector as; + // For preconditioning Cayley form + Vector bee; + Vector cee; + Vector aee; + Vector beo; + Vector ceo; + Vector aeo; + // LDU factorisation of the eeoo matrix + Vector lee; + Vector leem; + Vector uee; + Vector ueem; + Vector dee; +public: + CayleyBase(RealD _M5, RealD _mass, int _Ls, RealD b_, RealD c_) : + M5(_M5), + mass(_mass), + Ls(_Ls), + _b(b_), + _c(c_) + { + RealD eps = 1.0; + Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);// eps is ignored for higham + this->SetCoefficientsTanh(zdata,1.0,0.0); + Approx::zolotarev_free(zdata); + } + ///////////////////////////////////////////////////////// + // Replicates functionality + // Use a common base class approach + ///////////////////////////////////////////////////////// + // Tanh + void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c) + { + Vector gamma(this->Ls); + for(int s=0;sLs;s++) gamma[s] = zdata->gamma[s]; + SetCoefficientsInternal(1.0,gamma,b,c); + } + //Zolo + void SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata,RealD b,RealD c) + { + Vector gamma(this->Ls); + for(int s=0;sLs;s++) gamma[s] = zdata->gamma[s]; + SetCoefficientsInternal(zolo_hi,gamma,b,c); + } + //Zolo + void SetCoefficientsInternal(RealD zolo_hi,Vector & gamma,RealD b,RealD c) + { + int Ls=this->Ls; + + /////////////////////////////////////////////////////////// + // The Cayley coeffs (unprec) + /////////////////////////////////////////////////////////// + assert(gamma.size()==Ls); + + omega.resize(Ls); + bs.resize(Ls); + cs.resize(Ls); + as.resize(Ls); + + double bpc = b+c; + double bmc = b-c; + _b = b; + _c = c; + _gamma = gamma; // Save the parameters so we can change mass later. 
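+    // The even-even Cayley coefficients assigned below follow from the same
+    // b_s, c_s (with as[i] = 1 here):
+    //   bee_s = b_s*(4 - M5) + 1 ,   cee_s = 1 - c_s*(4 - M5)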
+ _zolo_hi= zolo_hi; + for(int i=0; i < Ls; i++){ + as[i] = 1.0; + omega[i] = _gamma[i]*_zolo_hi; //NB reciprocal relative to Chroma NEF code + assert(omega[i]!=Coeff_t(0.0)); + bs[i] = 0.5*(bpc/omega[i] + bmc); + cs[i] = 0.5*(bpc/omega[i] - bmc); + } + + //////////////////////////////////////////////////////// + // Constants for the preconditioned matrix Cayley form + //////////////////////////////////////////////////////// + bee.resize(Ls); + cee.resize(Ls); + beo.resize(Ls); + ceo.resize(Ls); + + for(int i=0;iM5) +1.0); + assert(bee[i]!=Coeff_t(0.0)); + cee[i]=as[i]*(1.0-cs[i]*(4.0-this->M5)); + beo[i]=as[i]*bs[i]; + ceo[i]=-as[i]*cs[i]; + } + aee.resize(Ls); + aeo.resize(Ls); + for(int i=0;i &out){assert(0);}; + virtual void DW (const Field &psi, Field &chi)=0; + virtual void DWDag (const Field &psi, Field &chi)=0; + + void M (const Field &psi, Field &chi) + { + Field Din(psi.Grid()); + Meooe5D(psi,Din); + DW(Din,chi); + axpby(chi,1.0,1.0,chi,psi); + M5D(psi,chi); + } + void Mdag (const Field &psi, Field &chi) + { + Field Din(psi.Grid()); + DWDag(psi,Din); + MeooeDag5D(Din,chi); + M5Ddag(psi,chi); + axpby (chi,1.0,1.0,chi,psi); + } + ///////////////////////////////// + // P and Pdag - might be needed + ///////////////////////////////// + void P(const Field &psi, Field &chi) + { + int Ls= this->Ls; + chi=Zero(); + for(int s=0;sLs; + chi=Zero(); + for(int s=0;sLs; + Vector diag (Ls,1.0); + Vector upper(Ls,-1.0); upper[Ls-1]=mass; + Vector lower(Ls,-1.0); lower[0] =mass; + M5D(psi,chi,chi,lower,diag,upper); + } + void M5Ddag (const Field &psi, Field &chi) + { + int Ls=this->Ls; + Vector diag(Ls,1.0); + Vector upper(Ls,-1.0); + Vector lower(Ls,-1.0); + upper[Ls-1]=-mass*upper[Ls-1]; + lower[0] =-mass*lower[0]; + M5Ddag(psi,chi,chi,lower,diag,upper); + } + void Meooe5D (const Field &psi, Field &Din) + { + int Ls=this->Ls; + Vector diag = bs; + Vector upper= cs; + Vector lower= cs; + upper[Ls-1]=-mass*upper[Ls-1]; + lower[0] =-mass*lower[0]; + M5D(psi,psi,Din,lower,diag,upper); + } + void MeooeDag5D (const Field &psi, Field &Din) + { + int Ls=this->Ls; + Vector diag =bs; + Vector upper=cs; + Vector lower=cs; + + for (int s=0;s &lower, + Vector &diag, + Vector &upper) + { + chi_i.Checkerboard()=psi_i.Checkerboard(); + GridBase *grid=psi_i.Grid(); + autoView(psi , psi_i,AcceleratorRead); + autoView(phi , phi_i,AcceleratorRead); + autoView(chi , chi_i,AcceleratorWrite); + assert(phi.Checkerboard() == psi.Checkerboard()); + + auto pdiag = &diag[0]; + auto pupper = &upper[0]; + auto plower = &lower[0]; + + int Ls =this->Ls; + + // 10 = 3 complex mult + 2 complex add + // Flops = 10.0*(Nc*Ns) *Ls*vol (/2 for red black counting) + uint64_t nloop = grid->oSites()/Ls; + + const int Nsimd = Field::vector_type::Nsimd(); + accelerator_for(sss,nloop,Nsimd,{ + uint64_t ss= sss*Ls; + typedef decltype(coalescedRead(psi[0])) spinor; + spinor tmp1, tmp2; + for(int s=0;s &lower, + Vector &diag, + Vector &upper) + { + chi_i.Checkerboard()=psi_i.Checkerboard(); + GridBase *grid=psi_i.Grid(); + autoView(psi , psi_i,AcceleratorRead); + autoView(phi , phi_i,AcceleratorRead); + autoView(chi , chi_i,AcceleratorWrite); + assert(phi.Checkerboard() == psi.Checkerboard()); + + auto pdiag = &diag[0]; + auto pupper = &upper[0]; + auto plower = &lower[0]; + + int Ls=this->Ls; + + uint64_t nloop = grid->oSites()/Ls; + const int Nsimd = Field::vector_type::Nsimd(); + accelerator_for(sss,nloop,Nsimd,{ + uint64_t ss=sss*Ls; + typedef decltype(coalescedRead(psi[0])) spinor; + spinor tmp1,tmp2; + for(int s=0;s +class 
CoarseCayleyFermion : public CayleyBase< Lattice > , ComplexD > +{ +public: + typedef iVector siteVector; + typedef Lattice CoarseComplexField; + typedef Lattice CoarseVector; + typedef Lattice > CoarseMatrix; + typedef iMatrix Cobj; + typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field + typedef Lattice FineField; + + // Similar to the CoarseOperator but add 5D support. + Geometry geom; + GridBase *Coarse5D; + GridBase *Coarse4D; + CartesianStencil Stencil; + CoarsenedMatrix &Dw; + + GridBase * Grid(void) { return Coarse5D; }; // this is all the linalg routines need to know + + CoarseCayleyFermion(GridCartesian &CoarseGrid4, + GridCartesian &CoarseGrid5, + CoarsenedMatrix &_Dw, + RealD M5, RealD mass, int Ls, RealD b, RealD c) : + CayleyBase(M5,mass,Ls,b,c), + Coarse4D(&CoarseGrid4), + Coarse5D(&CoarseGrid5), + Dw(_Dw), + geom(CoarseGrid5._ndimension), + Stencil( &CoarseGrid5,geom.npoint,Even,geom.directions,geom.displacements,0) + { + }; + +public: + void Project( CoarseVector &C ) + { + const int Nsimd = CComplex::Nsimd(); + autoView(Cv,C, AcceleratorWrite); + int Ls = this->Ls; + for(int s=0;soSites(), Nsimd, { + int sF= sU*Ls+s; + auto tmp = coalescedRead(Cv[sF]); + coalescedWrite(Cv[sF],tmp); + }); + } + } + //////////////////////////////////////////////// + // This is specific to Coarse Grid Cayley + //////////////////////////////////////////////// + virtual void Mdiag (const CoarseVector &in, CoarseVector &out) + { + std::vector allout(9,in.Grid()); + this->MdirAll(in,allout); + out = allout[8]; + } + virtual void Mdir (const CoarseVector &in, CoarseVector &out,int dir, int disp) + { + assert(0); + } + virtual void MdirAll (const CoarseVector &in, std::vector &out) + { + conformable(Coarse5D,in.Grid()); + + SimpleCompressor compressor; + + Stencil.HaloExchange(in,compressor); + typedef LatticeView Aview; + + const int Nsimd = CComplex::Nsimd(); + + // Ls loop for2D + int Ls=this->Ls; + + siteVector *CBp=Stencil.CommBuf(); + + int ptype; + int nb2=nbasis/2; + + autoView(in_v , in, AcceleratorRead); + autoView(st, Stencil, AcceleratorRead); + for(int point=0;pointoSites(), b, nbasis, Nsimd, { + + typedef decltype(coalescedRead(in_v[0])) calcVector; + typedef decltype(coalescedRead(in_v[0](0))) calcComplex; + int sU = sF/Ls; + int s = sF%Ls; + + calcComplex res = Zero(); + calcVector nbr; + int ptype; + + StencilEntry *SE=st.GetEntry(ptype,point,sF); + + if(SE->_is_local) { + nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); + } else { + nbr = coalescedRead(CBp[SE->_offset]); + } + acceleratorSynchronise(); + + for(int bb=0;bb compressor; + + Stencil.HaloExchange(in,compressor); + typedef LatticeView Aview; + + const int Nsimd = CComplex::Nsimd(); + + // Ls loop for2D + int Ls=this->Ls; + + Vector AcceleratorViewContainer; + for(int p=0;poSites(), b, nbasis, Nsimd, { + + typedef decltype(coalescedRead(in_v[0])) calcVector; + typedef decltype(coalescedRead(in_v[0](0))) calcComplex; + int sU = sF/Ls; + int s = sF%Ls; + + calcComplex res = Zero(); + + { + calcVector nbr; + int ptype; + + for(int point=0;point_is_local) { + nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); + } else { + nbr = coalescedRead(CBp[SE->_offset]); + } + acceleratorSynchronise(); + + for(int bb=0;bb Aggregates; + + void PromoteFromSubspace(Aggregates &_Aggregates,CoarseVector &C,FineField &F) + { + auto FineGrid4 = _Aggregates.FineGrid; + FineField F4(FineGrid4); + CoarseVector C4(Coarse4D); + for(int s=0;sLs;s++){ + ExtractSlice(C4,C,s,0); + 
_Aggregates.PromoteFromSubspace(C4,F4); + InsertSlice(F4,F,s,0); + } + } + void ProjectToSubspace(Aggregates &_Aggregates,CoarseVector &C,FineField &F) + { + auto FineGrid4 = _Aggregates.FineGrid; + FineField F4(FineGrid4); + CoarseVector C4(Coarse4D); + for(int s=0;sLs;s++){ + ExtractSlice(F4,F,s,0); + _Aggregates.ProjectToSubspace (C4,F4); + InsertSlice(C4,C,s,0); + } + Project(C); + } + template + void Test(Aggregates &_Aggregates,GridBase *FineGrid, Ddwf &_Ddwf) + { + typedef Lattice FineField; + CoarseVector Cin(Coarse5D); + CoarseVector Cout(Coarse5D); + CoarseVector CFout(Coarse5D); + + FineField Fin(FineGrid); + FineField Fout(FineGrid); + + + std::vector seeds({1,2,3,4,5}); + GridParallelRNG RNG(Coarse5D); RNG.SeedFixedIntegers(seeds); + + gaussian(RNG,Cin); + PromoteFromSubspace(_Aggregates,Cin,Fin); + ProjectToSubspace(_Aggregates,Cin,Fin); + + std::cout << GridLogMessage<< "************ "<M(Cin,Cout); + this->Project(Cout); + std::cout << GridLogMessage<< " Cout "<Mdag(Cin,Cout); + this->Project(Cout); + std::cout << GridLogMessage<< " Cout "< Directions(void) { return geom.directions;}; + virtual std::vector Displacements(void){ return geom.displacements;}; +}; + +template class SchurSolverWrapper : public LinearFunction { +private: + CheckerBoardedSparseMatrixBase & _Matrix; + SchurRedBlackBase & _Solver; +public: + + ///////////////////////////////////////////////////// + // Wrap the usual normal equations trick + ///////////////////////////////////////////////////// + SchurSolverWrapper(CheckerBoardedSparseMatrixBase &Matrix, + SchurRedBlackBase &Solver) + : _Matrix(Matrix), _Solver(Solver) {}; + + void operator() (const Field &in, Field &out){ + + _Solver(_Matrix,in,out); // Mdag M out = Mdag in + + } +}; + +template class SolverWrapper : public LinearFunction { +private: + LinearOperatorBase & _Matrix; + OperatorFunction & _Solver; + LinearFunction & _Guess; +public: + + ///////////////////////////////////////////////////// + // Wrap the usual normal equations trick + ///////////////////////////////////////////////////// + SolverWrapper(LinearOperatorBase &Matrix, + OperatorFunction &Solver, + LinearFunction &Guess) + : _Matrix(Matrix), _Solver(Solver), _Guess(Guess) {}; + + void operator() (const Field &in, Field &out){ + + _Guess(in,out); + _Solver(_Matrix,in,out); // Mdag M out = Mdag in + + } +}; + +// Must use a non-hermitian solver +template +class PVdagMLinearOperator : public LinearOperatorBase { + Matrix &_Mat; + Matrix &_PV; +public: + PVdagMLinearOperator(Matrix &Mat,Matrix &PV): _Mat(Mat),_PV(PV){}; + + virtual std::vector Directions(void) { return _Mat.Directions();}; + virtual std::vector Displacements(void){ return _Mat.Displacements();}; + + void OpDiag (const Field &in, Field &out) { + assert(0); + } + void OpDir (const Field &in, Field &out,int dir,int disp) { + assert(0); + } + void OpDirAll (const Field &in, std::vector &out){ + assert(0); + }; + void Op (const Field &in, Field &out){ + Field tmp(in.Grid()); + _Mat.M(in,tmp); + _PV.Mdag(tmp,out); + } + void AdjOp (const Field &in, Field &out){ + Field tmp(in.Grid()); + _PV.M(tmp,out); + _Mat.Mdag(in,tmp); + } + void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ + assert(0); + } + void HermOp(const Field &in, Field &out){ + assert(0); + } +}; + +RealD InverseApproximation(RealD x){ + return 1.0/x; +} + +template class ChebyshevSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & _SmootherMatrix; + FineOperator & _SmootherOperator; + + 
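+  // Smoother action (see operator() below): tmp = Mdag in; out = Cheby(MdagM) tmp,
+  // i.e. out ~ (MdagM)^{-1} Mdag in = M^{-1} in, with the Chebyshev polynomial
+  // approximating 1/x on [_lo,_hi], so it acts as an approximate inverse on the
+  // upper part of the spectrum.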
Chebyshev Cheby; + + ChebyshevSmoother(RealD _lo,RealD _hi,int _ord, FineOperator &SmootherOperator,Matrix &SmootherMatrix) : + _SmootherOperator(SmootherOperator), + _SmootherMatrix(SmootherMatrix), + Cheby(_lo,_hi,_ord,InverseApproximation) + {}; + + void operator() (const Field &in, Field &out) + { + Field tmp(in.Grid()); + MdagMLinearOperator MdagMOp(_SmootherMatrix); + _SmootherOperator.AdjOp(in,tmp); + Cheby(MdagMOp,tmp,out); + } +}; +template +class MGPreconditioner : public LinearFunction< Lattice > { +public: + + typedef Aggregation Aggregates; + typedef typename Aggregation::CoarseVector CoarseVector; + typedef typename Aggregation::CoarseMatrix CoarseMatrix; + typedef typename Aggregation::FineField FineField; + typedef LinearOperatorBase FineOperator; + typedef LinearFunction FineSmoother; + typedef CoarseCayleyFermion CoarseOperator; + // typedef SparseMatrixBase CoarseOperator; + + Aggregates & _Aggregates; + FineOperator & _FineOperator; + FineSmoother & _PreSmoother; + FineSmoother & _PostSmoother; + CoarseOperator & _CoarseOperator; + CoarseSolver & _CoarseSolve; + + int level; void Level(int lv) {level = lv; }; + + MGPreconditioner(Aggregates &Agg, + FineOperator &Fine, + FineSmoother &PreSmoother, + FineSmoother &PostSmoother, + CoarseOperator &CoarseOperator_, + CoarseSolver &CoarseSolve_) + : _Aggregates(Agg), + _FineOperator(Fine), + _PreSmoother(PreSmoother), + _PostSmoother(PostSmoother), + _CoarseOperator(CoarseOperator_), + _CoarseSolve(CoarseSolve_), + level(1) { } + + virtual void operator()(const FineField &in, FineField & out) + { + auto CoarseGrid = _CoarseOperator.Grid(); + CoarseVector Csrc(CoarseGrid); + CoarseVector Csol(CoarseGrid); + FineField vec1(in.Grid()); + FineField vec2(in.Grid()); + + std::cout< +class HDCRPreconditioner : public LinearFunction< Lattice > { +public: + + typedef Aggregation Aggregates; + typedef typename Aggregation::CoarseVector CoarseVector; + typedef typename Aggregation::CoarseMatrix CoarseMatrix; + typedef typename Aggregation::FineField FineField; + typedef LinearOperatorBase FineOperator; + typedef LinearFunction FineSmoother; + //typedef CoarseCayleyFermion CoarseOperator; + typedef SparseMatrixBase CoarseOperator; + + Aggregates & _Aggregates; + FineOperator & _FineOperator; + FineSmoother & _PreSmoother; + FineSmoother & _PostSmoother; + CoarseOperator & _CoarseOperator; + CoarseSolver & _CoarseSolve; + + int level; void Level(int lv) {level = lv; }; + + HDCRPreconditioner(Aggregates &Agg, + FineOperator &Fine, + FineSmoother &PreSmoother, + FineSmoother &PostSmoother, + CoarseOperator &CoarseOperator_, + CoarseSolver &CoarseSolve_) + : _Aggregates(Agg), + _FineOperator(Fine), + _PreSmoother(PreSmoother), + _PostSmoother(PostSmoother), + _CoarseOperator(CoarseOperator_), + _CoarseSolve(CoarseSolve_), + level(1) { } + + virtual void operator()(const FineField &in, FineField & out) + { + auto CoarseGrid = _CoarseOperator.Grid(); + CoarseVector Csrc(CoarseGrid); + CoarseVector g5Csrc(CoarseGrid); + CoarseVector Csol(CoarseGrid); + FineField vec1(in.Grid()); + FineField vec2(in.Grid()); + + std::cout< block ({2,2,2,2}); // 4,2,2,2 gets worse + std::vector blockc ({1,1,1,1}); + const int nbasis= 24; + const int nbasisc= 40; // decrease, not improvement + + auto clatt = GridDefaultLatt(); + for(int d=0;d seeds({1,2,3,4}); + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds); + GridParallelRNG CRNG(Coarse4d);CRNG.SeedFixedIntegers(seeds); + + 
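+  // Gauge field source: the disabled "#if 0" branch would use a tepid SU(3)
+  // start with M5=1.0; as compiled, the NERSC checkpoint ./ckpoint_lat.1000
+  // is read and M5=1.8 is used.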
LatticeGaugeField Umu(UGrid); +#if 0 + SU3::TepidConfiguration(RNG4,Umu); + RealD M5=1.0; +#else + std::string file("./ckpoint_lat.1000"); + FieldMetaData header; + NerscIO::readConfiguration(Umu,header,file); + RealD M5=1.8; +#endif + + std::cout< Subspace; + typedef CoarsenedMatrix CoarseOperator; + typedef CoarseOperator::CoarseVector CoarseVector; + typedef CoarseOperator::siteVector siteVector; + + std::cout< MdagM_Dw(Dw_null); + + std::cout< WilsonCG(1.0e-10,40000); + LatticeFermion w_src(UGrid); w_src=1.0; + LatticeFermion w_res(UGrid); + WilsonCG(MdagM_Dw,w_src,w_res); + exit(0); + */ + std::cout< Level1Op4; + typedef CoarseCayleyFermion Level1Op5; + Level1Op4 c_Dw (*Coarse4d,0); + NonHermitianLinearOperator LinOpDw(Dw); + c_Dw.CoarsenOperator(UGrid,LinOpDw,Aggregates4D); // contains the M5 from Dw(-M5) + // c_Dw.Test(Aggregates4D,UGrid,LinOpDw); + + std::cout< MdagM_cDwf(c_Dwf); + + std::cout<,nbasisc> Level2Op; + typedef Aggregation,nbasisc> CoarseSubspace; + CoarseSubspace CoarseAggregates(CoarseCoarse5d,Coarse5d,0); + + std::cout< L1Hdwf(c_Dwf); + Level2Op cc_Dwf (*CoarseCoarse5d,*CoarseCoarse5dRB,1); // say it is hermitian + cc_Dwf.CoarsenOperator(Coarse5d,L1Hdwf,CoarseAggregates); + // cc_Dwf.Test(CoarseAggregates,Coarse5d,L1Hdwf); + + typedef Level2Op::CoarseVector CoarseCoarseVector; + + std::cout< CoarseCG(tol,MaxIt); + ConjugateGradient FineCG(tol,MaxIt); + + NonHermitianLinearOperator FineM(Ddwf); + MdagMLinearOperator FineMdagM(Ddwf); // M^\dag M + + NonHermitianLinearOperator CoarseM(c_Dwf); + MdagMLinearOperator CoarseMdagM(c_Dwf); + + NonHermitianLinearOperator CoarseCoarseM(cc_Dwf); + MdagMLinearOperator CoarseCoarseMdagM(cc_Dwf); + + + std::cout< PM; PM(MdagM_Dw,w_src); + std::cout< cPM; cPM(CoarseMdagM,c_src); + + cc_src=1.0; + PowerMethod ccPM; ccPM(CoarseCoarseMdagM,cc_src); + + std::cout< IRLHermOpL2(cc_Dwf); + Chebyshev IRLChebyL2(IRL_lo,IRL_hi,IRL_ord); + FunctionHermOp IRLOpChebyL2(IRLChebyL2,IRLHermOpL2); + PlainHermOp IRLOpL2 (IRLHermOpL2); + ImplicitlyRestartedLanczos IRLL2(IRLOpChebyL2,IRLOpL2,cNstop,cNk,cNm,1.0e-3,20); + + int cNconv; + cNm=0; + std::vector eval2(cNm); + std::vector evec2(cNm,CoarseCoarse5d); + cc_src=1.0; + // IRLL2.calc(eval2,evec2,cc_src,cNconv); + + std::vector tols ({0.005,0.001}); + std::vector c_los ({0.1,0.05}); + std::vector c_his ({22.0}); + std::vector f_los ({0.5,0.2}); + std::vector f_his ({60.0}); + std::vector ws ({2,3}); + std::vector c_ords ({32,24}); + std::vector f_ords ({20,16}); + + for(auto w : ws ) { + for(auto tol : tols ) { + for(auto f_ord : f_ords ) { + for(auto c_ord : c_ords ) { + for(auto c_lo : c_los ) { + for(auto c_hi : c_his ) { + for(auto f_lo : f_los ) { + for(auto f_hi : f_his ) { + ZeroGuesser CoarseZeroGuesser; + ZeroGuesser CoarseCoarseZeroGuesser; + ConjugateGradient CoarseCoarseCG(tol,10000); + ZeroGuesser CoarseCoarseGuesser; + SchurRedBlackDiagMooeeSolve CoarseCoarseRBCG(CoarseCoarseCG); + SchurSolverWrapper CoarseCoarseSolver(cc_Dwf,CoarseCoarseRBCG); + + std::cout< CoarseCoarseCGNE(cc_Dwf,CoarseCoarseCG,CoarseCoarseZeroGuesser); + { +typedef HDCRPreconditioner,nbasisc,LinearFunction > CoarseMG; + typedef MGPreconditioner > ThreeLevelMG; + + // MultiGrid preconditioner acting on the coarse space <-> coarsecoarse space + // ChebyshevSmoother CoarseSmoother1(0.5,22.0,c_ord,CoarseM,c_Dwf); // 37s, 26 iter + // ChebyshevSmoother CoarseSmoother2(0.5,22.0,c_ord,CoarseM,c_Dwf); + ChebyshevSmoother CoarseSmoother(c_lo,c_hi,c_ord,CoarseM,c_Dwf); // 37s, 26 iter + + // ChebyshevSmoother 
CoarseSmoother1(0.5,22.0,7,CoarseM,c_Dwf); // 38s, 26 iter + // ChebyshevSmoother CoarseSmoother2(0.5,22.0,7,CoarseM,c_Dwf); + // ChebyshevSmoother CoarseSmoother1(0.4,22.0,7,CoarseM,c_Dwf); // 41s, 27 iter + // ChebyshevSmoother CoarseSmoother2(0.4,22.0,7,CoarseM,c_Dwf); + // ChebyshevSmoother CoarseSmoother1(0.6,22.0,6,CoarseM,c_Dwf); // 26 iter + // ChebyshevSmoother CoarseSmoother2(0.6,22.0,6,CoarseM,c_Dwf); + // ChebyshevSmoother CoarseSmoother1(0.5,22.0,5,CoarseM,c_Dwf); // 33 iter, 55s + // ChebyshevSmoother CoarseSmoother2(0.5,22.0,5,CoarseM,c_Dwf); + + + CoarseMG Level2Precon (CoarseAggregates, + CoarseM, + CoarseSmoother, + CoarseSmoother, + cc_Dwf, + CoarseCoarseSolver); + Level2Precon.Level(2); + + //PrecGeneralisedConjugateResidualNonHermitian L2PGCR(0.5, 100, CoarseM,Level2Precon,16,16); // 26 iter, 37s + // PrecGeneralisedConjugateResidualNonHermitian L2PGCR(0.0, 1, CoarseM,Level2Precon,2,2); // 296 s, 50 iter + // PrecGeneralisedConjugateResidualNonHermitian L2PGCR(0.0, 1, CoarseM,Level2Precon,2,2); // 250 s, 37 iter + PrecGeneralisedConjugateResidualNonHermitian L2PGCR(0.0, 1, CoarseM,Level2Precon,2,2); + + //PrecGeneralisedConjugateResidualNonHermitian L2PGCR(1.0, 100, CoarseM,Level2Precon,16,16); // 35 iter, 45s + //PrecGeneralisedConjugateResidualNonHermitian L2PGCR(0.6, 100, CoarseM,Level2Precon,16,16); // 26,38 (diifferene is measurement noise) + //PrecGeneralisedConjugateResidualNonHermitian L2PGCR(0.2, 100, CoarseM,Level2Precon,16,16); // 26 iter, 47s + L2PGCR.Level(2); + + // Wrap the 2nd level solver in a MultiGrid preconditioner acting on the fine space + + // ChebyshevSmoother FineSmoother1(0.5,60.0,14,FineM,Ddwf); // 26 iter, 39s + // ChebyshevSmoother FineSmoother2(0.5,60.0,14,FineM,Ddwf); + + // ChebyshevSmoother FineSmoother1(0.5,60.0,12,FineM,Ddwf); // 25 iter, 38s + // ChebyshevSmoother FineSmoother2(0.5,60.0,16,FineM,Ddwf); + + // ChebyshevSmoother FineSmoother1(0.5,60.0,12,FineM,Ddwf); // 23 iter, 39s + // ChebyshevSmoother FineSmoother2(0.5,60.0,20,FineM,Ddwf); + + // ChebyshevSmoother FineSmoother1(0.5,60.0,10,FineM,Ddwf);24 iter, 44s + // ChebyshevSmoother FineSmoother2(0.5,60.0,24,FineM,Ddwf); + + // ChebyshevSmoother FineSmoother1(0.5,60.0,12,FineM,Ddwf); // odd convergence tail at 10^-9 ish + // ChebyshevSmoother FineSmoother2(0.1,60.0,24,FineM,Ddwf); // 33 iter, waas O(10-9 by 26) + + // ChebyshevSmoother FineSmoother1(0.5,60.0,12,FineM,Ddwf); // 25 iter, 39s + // ChebyshevSmoother FineSmoother2(0.5,60.0,18,FineM,Ddwf); // + + ChebyshevSmoother FineSmoother(f_lo,f_hi,f_ord,FineM,Ddwf); + + // ChebyshevSmoother FineSmoother1(0.5,60.0,11,FineM,Ddwf); // 33 iter, 49s + // ChebyshevSmoother FineSmoother2(0.5,60.0,11,FineM,Ddwf); + // ChebyshevSmoother FineSmoother1(0.5,60.0,12,FineM,Ddwf); // 26 iter, 37s + // ChebyshevSmoother FineSmoother2(0.5,60.0,12,FineM,Ddwf); + // ChebyshevSmoother FineSmoother1(0.4,60.0,12,FineM,Ddwf); // iter 26 no change in final residual + // ChebyshevSmoother FineSmoother2(0.4,60.0,12,FineM,Ddwf); + // ChebyshevSmoother FineSmoother1(0.3,60.0,12,FineM,Ddwf); // 27 iter 39s. 
+ // ChebyshevSmoother FineSmoother2(0.3,60.0,12,FineM,Ddwf); + // ChebyshevSmoother FineSmoother1(0.3,60.0,13,FineM,Ddwf); // 26 iter, but slower + // ChebyshevSmoother FineSmoother2(0.3,60.0,13,FineM,Ddwf); + // ChebyshevSmoother FineSmoother1(1.0,60.0,12,FineM,Ddwf); // 34 iter, slower + // ChebyshevSmoother FineSmoother2(1.0,60.0,12,FineM,Ddwf); + + ThreeLevelMG ThreeLevelPrecon(Aggregates4D, + FineM, + FineSmoother, + FineSmoother, + c_Dwf, + L2PGCR); + ThreeLevelPrecon.Level(1); + + PrecGeneralisedConjugateResidualNonHermitian L1PGCR(1.0e-8,1000,FineM,ThreeLevelPrecon,16,16); + L1PGCR.Level(1); + + f_res=Zero(); + L1PGCR(f_src,f_res); + } + }}}} + }}} + } + std::cout< Date: Thu, 14 Jan 2021 21:00:36 -0500 Subject: [PATCH 14/16] Gparity fix, and plaquette IO --- Grid/parallelIO/IldgIO.h | 22 +++--- Grid/parallelIO/MetaData.h | 34 +++----- Grid/parallelIO/NerscIO.h | 46 +++++------ Grid/parallelIO/OpenQcdIO.h | 2 +- Grid/parallelIO/OpenQcdIOChromaReference.h | 2 +- Grid/qcd/action/gauge/Gauge.cc | 38 +++++++++ Grid/qcd/action/gauge/GaugeImplementations.h | 79 +++++++++++-------- Grid/qcd/hmc/checkpointers/BaseCheckpointer.h | 3 +- Grid/qcd/hmc/checkpointers/ILDGCheckpointer.h | 5 +- .../qcd/hmc/checkpointers/NerscCheckpointer.h | 7 +- Grid/qcd/modules/Modules.h | 2 +- Grid/qcd/utils/CovariantCshift.h | 51 ++++++++++++ Grid/tensors/Tensor_Ta.h | 14 +++- tests/core/Test_reunitarise.cc | 3 +- tests/hmc/Test_hmc_EODWFRatio_Gparity.cc | 7 +- tests/hmc/Test_hmc_GparityIwasakiGauge.cc | 4 + tests/hmc/Test_hmc_GparityWilsonGauge.cc | 3 + 17 files changed, 220 insertions(+), 102 deletions(-) create mode 100644 Grid/qcd/action/gauge/Gauge.cc diff --git a/Grid/parallelIO/IldgIO.h b/Grid/parallelIO/IldgIO.h index b564371b..ef42c159 100644 --- a/Grid/parallelIO/IldgIO.h +++ b/Grid/parallelIO/IldgIO.h @@ -123,7 +123,7 @@ assert(GRID_FIELD_NORM_CALC(FieldNormMetaData_, n2ck) < 1.0e-5); //////////////////////////////////////////////////////////// // Helper to fill out metadata //////////////////////////////////////////////////////////// - template void ScidacMetaData(Lattice & field, +template void ScidacMetaData(Lattice & field, FieldMetaData &header, scidacRecord & _scidacRecord, scidacFile & _scidacFile) @@ -619,12 +619,12 @@ class IldgWriter : public ScidacWriter { // Don't require scidac records EXCEPT checksum // Use Grid MetaData object if present. //////////////////////////////////////////////////////////////// - template - void writeConfiguration(Lattice > &Umu,int sequence,std::string LFN,std::string description) + template + void writeConfiguration(Lattice &Umu,int sequence,std::string LFN,std::string description) { GridBase * grid = Umu.Grid(); - typedef Lattice > GaugeField; - typedef iLorentzColourMatrix vobj; + typedef Lattice GaugeField; + typedef vLorentzColourMatrixD vobj; typedef typename vobj::scalar_object sobj; //////////////////////////////////////// @@ -636,6 +636,9 @@ class IldgWriter : public ScidacWriter { ScidacMetaData(Umu,header,_scidacRecord,_scidacFile); + stats Stats; + Stats(Umu,header); + std::string format = header.floating_point; header.ensemble_id = description; header.ensemble_label = description; @@ -705,10 +708,10 @@ class IldgReader : public GridLimeReader { // Else use ILDG MetaData object if present. // Else use SciDAC MetaData object if present. 
//////////////////////////////////////////////////////////////// - template - void readConfiguration(Lattice > &Umu, FieldMetaData &FieldMetaData_) { + template + void readConfiguration(Lattice &Umu, FieldMetaData &FieldMetaData_) { - typedef Lattice > GaugeField; + typedef Lattice GaugeField; typedef typename GaugeField::vector_object vobj; typedef typename vobj::scalar_object sobj; @@ -921,7 +924,8 @@ class IldgReader : public GridLimeReader { if ( found_FieldMetaData || found_usqcdInfo ) { FieldMetaData checker; - GaugeStatistics(Umu,checker); + stats Stats; + Stats(Umu,checker); assert(fabs(checker.plaquette - FieldMetaData_.plaquette )<1.0e-5); assert(fabs(checker.link_trace - FieldMetaData_.link_trace)<1.0e-5); std::cout << GridLogMessage<<"Plaquette and link trace match " << std::endl; diff --git a/Grid/parallelIO/MetaData.h b/Grid/parallelIO/MetaData.h index 4c1cfbdb..d30ba523 100644 --- a/Grid/parallelIO/MetaData.h +++ b/Grid/parallelIO/MetaData.h @@ -176,29 +176,18 @@ template inline void PrepareMetaData(Lattice & field, FieldMet GridMetaData(grid,header); MachineCharacteristics(header); } -inline void GaugeStatistics(Lattice & data,FieldMetaData &header) +template +class GaugeStatistics { - // How to convert data precision etc... - header.link_trace=WilsonLoops::linkTrace(data); - header.plaquette =WilsonLoops::avgPlaquette(data); -} -inline void GaugeStatistics(Lattice & data,FieldMetaData &header) -{ - // How to convert data precision etc... - header.link_trace=WilsonLoops::linkTrace(data); - header.plaquette =WilsonLoops::avgPlaquette(data); -} -template<> inline void PrepareMetaData(Lattice & field, FieldMetaData &header) -{ - - GridBase *grid = field.Grid(); - std::string format = getFormatString(); - header.floating_point = format; - header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac - GridMetaData(grid,header); - GaugeStatistics(field,header); - MachineCharacteristics(header); -} +public: + void operator()(Lattice & data,FieldMetaData &header) + { + header.link_trace=WilsonLoops::linkTrace(data); + header.plaquette =WilsonLoops::avgPlaquette(data); + } +}; +typedef GaugeStatistics PeriodicGaugeStatistics; +typedef GaugeStatistics ConjugateGaugeStatistics; template<> inline void PrepareMetaData(Lattice & field, FieldMetaData &header) { GridBase *grid = field.Grid(); @@ -206,7 +195,6 @@ template<> inline void PrepareMetaData(Lattice GaugeField; + static inline void truncate(std::string file){ std::ofstream fout(file,std::ios::out); } @@ -129,12 +131,12 @@ public: // Now the meat: the object readers ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - template - static inline void readConfiguration(Lattice > &Umu, + template + static inline void readConfiguration(GaugeField &Umu, FieldMetaData& header, - std::string file) + std::string file, + GaugeStats GaugeStatisticsCalculator=GaugeStats()) { - typedef Lattice > GaugeField; GridBase *grid = Umu.Grid(); uint64_t offset = readHeader(file,Umu.Grid(),header); @@ -153,23 +155,23 @@ public: // munger is a function of if ( header.data_type == std::string("4D_SU3_GAUGE") ) { if ( ieee32 || ieee32big ) { - BinaryIO::readLatticeObject, LorentzColour2x3F> + BinaryIO::readLatticeObject (Umu,file,Gauge3x2munger(), offset,format, nersc_csum,scidac_csuma,scidac_csumb); } if ( ieee64 || ieee64big ) { - BinaryIO::readLatticeObject, LorentzColour2x3D> + BinaryIO::readLatticeObject (Umu,file,Gauge3x2munger(),offset,format, nersc_csum,scidac_csuma,scidac_csumb); 
diff --git a/Grid/parallelIO/NerscIO.h b/Grid/parallelIO/NerscIO.h
--- a/Grid/parallelIO/NerscIO.h
+++ b/Grid/parallelIO/NerscIO.h
@@ -40,6 +40,8 @@ using namespace Grid;
 class NerscIO : public BinaryIO {
  public:
+  typedef Lattice<vLorentzColourMatrixD> GaugeField;
+
   static inline void truncate(std::string file){
     std::ofstream fout(file,std::ios::out);
   }
@@ -129,12 +131,12 @@ public:
   // Now the meat: the object readers
   /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-  template<class vsimd>
-  static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,
+  template<class GaugeStats=PeriodicGaugeStatistics>
+  static inline void readConfiguration(GaugeField &Umu,
                                        FieldMetaData& header,
-                                       std::string file)
+                                       std::string file,
+                                       GaugeStats GaugeStatisticsCalculator=GaugeStats())
   {
-    typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
     GridBase *grid = Umu.Grid();
     uint64_t offset = readHeader(file,Umu.Grid(),header);
@@ -153,23 +155,23 @@ public:
     // munger is a function of <floating point, Real, data_type>
     if ( header.data_type == std::string("4D_SU3_GAUGE") ) {
       if ( ieee32 || ieee32big ) {
-        BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>, LorentzColour2x3F>
+        BinaryIO::readLatticeObject<vLorentzColourMatrixD, LorentzColour2x3F>
          (Umu,file,Gauge3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format,
           nersc_csum,scidac_csuma,scidac_csumb);
       }
       if ( ieee64 || ieee64big ) {
-        BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>, LorentzColour2x3D>
+        BinaryIO::readLatticeObject<vLorentzColourMatrixD, LorentzColour2x3D>
          (Umu,file,Gauge3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format,
           nersc_csum,scidac_csuma,scidac_csumb);
       }
     } else if ( header.data_type == std::string("4D_SU3_GAUGE_3x3") ) {
       if ( ieee32 || ieee32big ) {
-        BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>,LorentzColourMatrixF>
+        BinaryIO::readLatticeObject<vLorentzColourMatrixD,LorentzColourMatrixF>
          (Umu,file,GaugeSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format,
           nersc_csum,scidac_csuma,scidac_csumb);
       }
       if ( ieee64 || ieee64big ) {
-        BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>,LorentzColourMatrixD>
+        BinaryIO::readLatticeObject<vLorentzColourMatrixD,LorentzColourMatrixD>
          (Umu,file,GaugeSimpleMunger<LorentzColourMatrixD,LorentzColourMatrix>(),offset,format,
           nersc_csum,scidac_csuma,scidac_csumb);
       }
@@ -177,7 +179,7 @@ public:
       assert(0);
     }

-    GaugeStatistics(Umu,clone);
+    GaugeStats Stats; Stats(Umu,clone);

     std::cout<<GridLogMessage<<"NERSC Configuration "<<file<<" checksum "<<std::hex<<nersc_csum<<std::dec<<std::endl;
@@ -198,13 +200,11 @@ public:
-  template<class vsimd>
-  static inline void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,
+  template<class GaugeStats=PeriodicGaugeStatistics>
+  static inline void writeConfiguration(Lattice<vLorentzColourMatrixD> &Umu,
                                         std::string file,
                                         int two_row,
                                         int bits32)
   {
-    typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
-
-    typedef iLorentzColourMatrix<vsimd> vobj;
+    typedef vLorentzColourMatrixD vobj;
     typedef typename vobj::scalar_object sobj;

     FieldMetaData header;
@@ -229,7 +229,7 @@ public:
     GridMetaData(grid,header);
     assert(header.nd==4);
-    GaugeStatistics(Umu,header);
+    GaugeStats Stats; Stats(Umu,header);
     MachineCharacteristics(header);

     uint64_t offset;
@@ -238,19 +238,19 @@ public:
     header.floating_point = std::string("IEEE64BIG");
     header.data_type      = std::string("4D_SU3_GAUGE_3x3");
     GaugeSimpleUnmunger<fobj3D,sobj> munge;
-    if ( grid->IsBoss() ) {
-      truncate(file);
-      offset = writeHeader(header,file);
-    }
-    grid->Broadcast(0,(void *)&offset,sizeof(offset));
+    if ( grid->IsBoss() ) {
+      truncate(file);
+      offset = writeHeader(header,file);
+    }
+    grid->Broadcast(0,(void *)&offset,sizeof(offset));

     uint32_t nersc_csum,scidac_csuma,scidac_csumb;
     BinaryIO::writeLatticeObject<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point,
                                               nersc_csum,scidac_csuma,scidac_csumb);
     header.checksum = nersc_csum;
-    if ( grid->IsBoss() ) {
-      writeHeader(header,file);
-    }
+    if ( grid->IsBoss() ) {
+      writeHeader(header,file);
+    }

     std::cout<<GridLogMessage<<"Written NERSC Configuration on "<<file<<std::endl;
diff --git a/Grid/parallelIO/OpenQcdIO.h b/Grid/parallelIO/OpenQcdIO.h
--- a/Grid/parallelIO/OpenQcdIO.h
+++ b/Grid/parallelIO/OpenQcdIO.h
@@ -224,7 +224,7 @@ public:
   grid->Barrier();
   timer.Stop();
   std::cout << Grid::GridLogMessage << "OpenQcdIO::readConfiguration: redistribute overhead " << timer.Elapsed() << std::endl;

-  GaugeStatistics(Umu, clone);
+  PeriodicGaugeStatistics Stats; Stats(Umu, clone);

   RealD plaq_diff = fabs(clone.plaquette - header.plaquette);
diff --git a/Grid/parallelIO/OpenQcdIOChromaReference.h b/Grid/parallelIO/OpenQcdIOChromaReference.h
index bab54fe8..886536ad 100644
--- a/Grid/parallelIO/OpenQcdIOChromaReference.h
+++ b/Grid/parallelIO/OpenQcdIOChromaReference.h
@@ -208,7 +208,7 @@ public:
   FieldMetaData clone(header);

-  GaugeStatistics(Umu, clone);
+  PeriodicGaugeStatistics Stats; Stats(Umu, clone);

   RealD plaq_diff = fabs(clone.plaquette - header.plaquette);
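With NerscIO::readConfiguration and writeConfiguration now templated on a statistics policy (defaulting to PeriodicGaugeStatistics), a caller opts into conjugate statistics explicitly. A hedged usage sketch; the file name, grid pointer and direction choice are illustrative:

    #include <Grid/Grid.h>

    using namespace Grid;

    // Illustrative reader, assuming a 4D grid has already been constructed.
    void readNerscExamples(GridCartesian *UGrid)
    {
      LatticeGaugeFieldD Umu(UGrid);
      FieldMetaData header;

      // Default: plaquette/link trace verified with periodic statistics.
      NerscIO::readConfiguration(Umu, header, "ckpoint_lat.1000");

      // Charge-conjugate ensemble: fix the conjugate directions first, then
      // select the matching statistics policy explicitly.
      std::vector<int> conj_dirs(Nd, 0);
      conj_dirs[3] = 1;
      ConjugateGimplD::setDirections(conj_dirs);
      NerscIO::readConfiguration<ConjugateGaugeStatistics>(Umu, header, "ckpoint_lat.1000");
    }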
diff --git a/Grid/qcd/action/gauge/Gauge.cc b/Grid/qcd/action/gauge/Gauge.cc
new file mode 100644
index 00000000..2b5e2691
--- /dev/null
+++ b/Grid/qcd/action/gauge/Gauge.cc
@@ -0,0 +1,38 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/gauge/Gauge.cc
+
+Copyright (C) 2020
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/* END LEGAL */
+#include <Grid/qcd/action/gauge/GaugeImplementations.h>
+
+NAMESPACE_BEGIN(Grid);
+
+std::vector<int> ConjugateGaugeImplBase::_conjDirs;
+
+NAMESPACE_END(Grid);
+
diff --git a/Grid/qcd/action/gauge/GaugeImplementations.h b/Grid/qcd/action/gauge/GaugeImplementations.h
index a14aec1b..16147c77 100644
--- a/Grid/qcd/action/gauge/GaugeImplementations.h
+++ b/Grid/qcd/action/gauge/GaugeImplementations.h
@@ -59,14 +59,14 @@ public:
   }
   static inline GaugeLinkField
   CovShiftIdentityBackward(const GaugeLinkField &Link, int mu) {
-    return Cshift(adj(Link), mu, -1);
+    return PeriodicBC::CovShiftIdentityBackward(Link, mu);
   }
   static inline GaugeLinkField
   CovShiftIdentityForward(const GaugeLinkField &Link, int mu) {
-    return Link;
+    return PeriodicBC::CovShiftIdentityForward(Link,mu);
   }
   static inline GaugeLinkField ShiftStaple(const GaugeLinkField &Link, int mu) {
-    return Cshift(Link, mu, 1);
+    return PeriodicBC::ShiftStaple(Link,mu);
   }

   static inline bool isPeriodicGaugeField(void) { return true; }
@@ -74,7 +74,13 @@ public:
 // Composition with smeared link, bc's etc.. probably need multiple inheritance
 // Variable precision "S" and variable Nc
-template <class GimplTypes> class ConjugateGaugeImpl : public GimplTypes {
+class ConjugateGaugeImplBase {
+protected:
+  static std::vector<int> _conjDirs;
+};
+
+ template <class GimplTypes> class ConjugateGaugeImpl : public GimplTypes, ConjugateGaugeImplBase {
+private:
 public:
   INHERIT_GIMPL_TYPES(GimplTypes);

@@ -84,47 +90,56 @@ public:
   ////////////////////////////////////////////////////////////////////////////////////////////////////////////
   template <class covariant>
   static Lattice<covariant> CovShiftForward(const GaugeLinkField &Link, int mu,
-                                            const Lattice<covariant> &field) {
-    return ConjugateBC::CovShiftForward(Link, mu, field);
+                                            const Lattice<covariant> &field)
+  {
+    assert(_conjDirs.size() == Nd);
+    if(_conjDirs[mu])
+      return ConjugateBC::CovShiftForward(Link, mu, field);
+    else
+      return PeriodicBC::CovShiftForward(Link, mu, field);
   }
   template <class covariant>
   static Lattice<covariant> CovShiftBackward(const GaugeLinkField &Link, int mu,
-                                             const Lattice<covariant> &field) {
-    return ConjugateBC::CovShiftBackward(Link, mu, field);
+                                             const Lattice<covariant> &field)
+  {
+    assert(_conjDirs.size() == Nd);
+    if(_conjDirs[mu])
+      return ConjugateBC::CovShiftBackward(Link, mu, field);
+    else
+      return PeriodicBC::CovShiftBackward(Link, mu, field);
   }
   static inline GaugeLinkField
-  CovShiftIdentityBackward(const GaugeLinkField &Link, int mu) {
-    GridBase *grid = Link.Grid();
-    int Lmu = grid->GlobalDimensions()[mu] - 1;
-
-    Lattice<iScalar<vInteger>> coor(grid);
-    LatticeCoordinate(coor, mu);
-
-    GaugeLinkField tmp(grid);
-    tmp = adj(Link);
-    tmp = where(coor == Lmu, conjugate(tmp), tmp);
-    return Cshift(tmp, mu, -1); // moves towards positive mu
+  CovShiftIdentityBackward(const GaugeLinkField &Link, int mu)
+  {
+    assert(_conjDirs.size() == Nd);
+    if(_conjDirs[mu])
+      return ConjugateBC::CovShiftIdentityBackward(Link, mu);
+    else
+      return PeriodicBC::CovShiftIdentityBackward(Link, mu);
   }
   static inline GaugeLinkField
-  CovShiftIdentityForward(const GaugeLinkField &Link, int mu) {
-    return Link;
+  CovShiftIdentityForward(const GaugeLinkField &Link, int mu)
+  {
+    assert(_conjDirs.size() == Nd);
+    if(_conjDirs[mu])
+      return ConjugateBC::CovShiftIdentityForward(Link,mu);
+    else
+      return PeriodicBC::CovShiftIdentityForward(Link,mu);
   }
-  static inline GaugeLinkField ShiftStaple(const GaugeLinkField &Link, int mu) {
-    GridBase *grid = Link.Grid();
-    int Lmu = grid->GlobalDimensions()[mu] - 1;
-
-    Lattice<iScalar<vInteger>> coor(grid);
-    LatticeCoordinate(coor, mu);
-
-    GaugeLinkField tmp(grid);
-    tmp = Cshift(Link, mu, 1);
-    tmp = where(coor == Lmu, conjugate(tmp), tmp);
-    return tmp;
+  static inline GaugeLinkField ShiftStaple(const GaugeLinkField &Link, int mu)
+  {
+    assert(_conjDirs.size() == Nd);
+    if(_conjDirs[mu])
+      return ConjugateBC::ShiftStaple(Link,mu);
+    else
+      return PeriodicBC::ShiftStaple(Link,mu);
   }

+  static inline void setDirections(std::vector<int> &conjDirs) { _conjDirs=conjDirs; }
+  static inline std::vector<int> getDirections(void) { return _conjDirs; }
   static inline bool isPeriodicGaugeField(void) { return false; }
 };
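ConjugateGaugeImpl now chooses between conjugate and periodic covariant shifts per direction at run time, driven by the static _conjDirs vector defined in the new Gauge.cc. Since that vector is empty until set, every shift asserts that setDirections() has been called. A minimal initialisation sketch (the direction choice is illustrative):

    #include <Grid/Grid.h>
    #include <cassert>

    using namespace Grid;

    int main(int argc, char **argv)
    {
      Grid_init(&argc, &argv);

      // _conjDirs is an empty static until setDirections() runs, and every
      // covariant shift asserts _conjDirs.size() == Nd, so the directions must
      // be fixed before any conjugate-BC gauge action, Wilson loop or gauge
      // statistics is evaluated.
      std::vector<int> conj_dirs(Nd, 0);
      conj_dirs[1] = 1; // charge-conjugate BC in y only; illustrative choice
      ConjugateGimplD::setDirections(conj_dirs);

      assert(ConjugateGimplD::getDirections() == conj_dirs);

      Grid_finalize();
      return 0;
    }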
diff --git a/Grid/qcd/hmc/checkpointers/BaseCheckpointer.h b/Grid/qcd/hmc/checkpointers/BaseCheckpointer.h
index 3cd05ebc..c09fdeeb 100644
--- a/Grid/qcd/hmc/checkpointers/BaseCheckpointer.h
+++ b/Grid/qcd/hmc/checkpointers/BaseCheckpointer.h
@@ -74,7 +74,7 @@ public:
       conf_file = os.str();
     }
   }
-
+  virtual ~BaseHmcCheckpointer(){};
   void check_filename(const std::string &filename){
     std::ifstream f(filename.c_str());
     if(!f.good()){
@@ -82,7 +82,6 @@ public:
       abort();
     };
   }
-
   virtual void initialize(const CheckpointerParameters &Params) = 0;

   virtual void CheckpointRestore(int traj, typename Impl::Field &U,
diff --git a/Grid/qcd/hmc/checkpointers/ILDGCheckpointer.h b/Grid/qcd/hmc/checkpointers/ILDGCheckpointer.h
index 269caa6e..1bb8aa1a 100644
--- a/Grid/qcd/hmc/checkpointers/ILDGCheckpointer.h
+++ b/Grid/qcd/hmc/checkpointers/ILDGCheckpointer.h
@@ -45,6 +45,7 @@ private:

 public:
   INHERIT_GIMPL_TYPES(Implementation);
+  typedef GaugeStatistics<Implementation> GaugeStats;

   ILDGHmcCheckpointer(const CheckpointerParameters &Params_) { initialize(Params_); }

@@ -78,7 +79,7 @@ public:
       BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
       IldgWriter _IldgWriter(grid->IsBoss());
       _IldgWriter.open(config);
-      _IldgWriter.writeConfiguration(U, traj, config, config);
+      _IldgWriter.writeConfiguration<GaugeStats>(U, traj, config, config);
       _IldgWriter.close();

       std::cout << GridLogMessage << "Written ILDG Configuration on " << config
@@ -105,7 +106,7 @@ public:
     FieldMetaData header;
     IldgReader _IldgReader;
     _IldgReader.open(config);
-    _IldgReader.readConfiguration(U,header);  // format from the header
+    _IldgReader.readConfiguration<GaugeStats>(U,header);  // format from the header
     _IldgReader.close();

     std::cout << GridLogMessage << "Read ILDG Configuration from " << config
diff --git a/Grid/qcd/hmc/checkpointers/NerscCheckpointer.h b/Grid/qcd/hmc/checkpointers/NerscCheckpointer.h
index cfcc44d8..4534e4c4 100644
--- a/Grid/qcd/hmc/checkpointers/NerscCheckpointer.h
+++ b/Grid/qcd/hmc/checkpointers/NerscCheckpointer.h
@@ -43,7 +43,8 @@ private:

 public:
   INHERIT_GIMPL_TYPES(Gimpl);  // only for gauge configurations
-
+  typedef GaugeStatistics<Gimpl> GaugeStats;
+
   NerscHmcCheckpointer(const CheckpointerParameters &Params_) { initialize(Params_); }

   void initialize(const CheckpointerParameters &Params_) {
@@ -60,7 +61,7 @@ public:
       int precision32 = 1;
       int tworow = 0;
       NerscIO::writeRNGState(sRNG, pRNG, rng);
-      NerscIO::writeConfiguration(U, config, tworow, precision32);
+      NerscIO::writeConfiguration<GaugeStats>(U, config, tworow, precision32);
     }
   };

@@ -74,7 +75,7 @@ public:
     FieldMetaData header;
     NerscIO::readRNGState(sRNG, pRNG, header, rng);
-    NerscIO::readConfiguration(U, header, config);
+    NerscIO::readConfiguration<GaugeStats>(U, header, config);
   };
 };
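Each checkpointer now derives its GaugeStats policy from its gauge implementation, so plaquette verification on checkpoint restore matches the boundary conditions of the run without extra wiring. A hedged sketch of a G-parity HMC checkpointer; the parameter values are illustrative and the CheckpointerParameters constructor signature (config_prefix, rng_prefix, saveInterval, format) is assumed from Grid's HMC module:

    #include <Grid/Grid.h>

    using namespace Grid;

    // Sketch only: a NERSC checkpointer for a charge-conjugate (G-parity)
    // stream. Via typedef GaugeStatistics<Gimpl> GaugeStats, its reads and
    // writes are instantiated with GaugeStatistics<ConjugateGimplD>.
    void makeConjugateCheckpointer()
    {
      std::vector<int> conj_dirs(Nd, 0);
      conj_dirs[3] = 1;
      ConjugateGimplD::setDirections(conj_dirs); // required before any statistics call

      CheckpointerParameters cp("ckpoint_lat", "ckpoint_rng", 5, "IEEE64BIG");
      NerscHmcCheckpointer<ConjugateGimplD> checkpoint(cp);
    }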
diff --git a/Grid/qcd/modules/Modules.h b/Grid/qcd/modules/Modules.h
index 1c1c8889..7aa3f0ac 100644
--- a/Grid/qcd/modules/Modules.h
+++ b/Grid/qcd/modules/Modules.h
@@ -99,7 +99,7 @@ public:
   virtual Prod* getPtr() = 0;

   // add a getReference?
-
+  virtual ~HMCModuleBase(){};
   virtual void print_parameters(){};  // default to nothing
 };
diff --git a/Grid/qcd/utils/CovariantCshift.h b/Grid/qcd/utils/CovariantCshift.h
index cee1fa12..6c70706f 100644
--- a/Grid/qcd/utils/CovariantCshift.h
+++ b/Grid/qcd/utils/CovariantCshift.h
@@ -53,6 +53,24 @@ namespace PeriodicBC {
     return Cshift(tmp,mu,-1);// moves towards positive mu
   }

+  template<class gauge> Lattice<gauge>
+  CovShiftIdentityBackward(const Lattice<gauge> &Link, int mu)
+  {
+    return Cshift(adj(Link), mu, -1);
+  }
+
+  template<class gauge> Lattice<gauge>
+  CovShiftIdentityForward(const Lattice<gauge> &Link, int mu)
+  {
+    return Link;
+  }
+
+  template<class gauge> Lattice<gauge>
+  ShiftStaple(const Lattice<gauge> &Link, int mu)
+  {
+    return Cshift(Link, mu, 1);
+  }
+
   template<class gauge,class Expr,typename std::enable_if<is_lattice_expr<Expr>::value,void>::type * = nullptr>
   auto CovShiftForward(const Lattice<gauge> &Link,
                        int mu,
@@ -70,6 +88,7 @@ namespace PeriodicBC {
     return CovShiftBackward(Link,mu,arg);
   }

+
 }

@@ -139,6 +158,38 @@ namespace ConjugateBC {
   //    std::cout<<"Gparity::CovCshiftBackward mu="<<mu<<std::endl;
+  template<class gauge> Lattice<gauge>
+  CovShiftIdentityBackward(const Lattice<gauge> &Link, int mu) {
+    GridBase *grid = Link.Grid();
+    int Lmu = grid->GlobalDimensions()[mu] - 1;
+
+    Lattice<iScalar<vInteger>> coor(grid);
+    LatticeCoordinate(coor, mu);
+
+    Lattice<gauge> tmp(grid);
+    tmp = adj(Link);
+    tmp = where(coor == Lmu, conjugate(tmp), tmp);
+    return Cshift(tmp, mu, -1); // moves towards positive mu
+  }
+  template<class gauge> Lattice<gauge>
+  CovShiftIdentityForward(const Lattice<gauge> &Link, int mu) {
+    return Link;
+  }
+
+  template<class gauge> Lattice<gauge>
+  ShiftStaple(const Lattice<gauge> &Link, int mu)
+  {
+    GridBase *grid = Link.Grid();
+    int Lmu = grid->GlobalDimensions()[mu] - 1;
+
+    Lattice<iScalar<vInteger>> coor(grid);
+    LatticeCoordinate(coor, mu);
+
+    Lattice<gauge> tmp(grid);
+    tmp = Cshift(Link, mu, 1);
+    tmp = where(coor == Lmu, conjugate(tmp), tmp);
+    return tmp;
+  }
   template<class gauge,class Expr,typename std::enable_if<is_lattice_expr<Expr>::value,void>::type * = nullptr>
   auto CovShiftForward(const Lattice<gauge> &Link,
diff --git a/Grid/tensors/Tensor_Ta.h b/Grid/tensors/Tensor_Ta.h
index bbaa4a00..90e57b2b 100644
--- a/Grid/tensors/Tensor_Ta.h
+++ b/Grid/tensors/Tensor_Ta.h
@@ -117,7 +117,19 @@ accelerator_inline iMatrix<vtype,N> ProjectOnGroup(const iMatrix<vtype,N> &arg)
         ret._internal[b][c] -= pr * ret._internal[c1][c];
       }
     }
-
+  }
+
+  // Normalise last row
+  {
+    int c1 = N-1;
+    zeroit(inner);
+    for(int c2=0;c2<N;c2++)
+      inner += innerProduct(ret._internal[c1][c2],ret._internal[c1][c2]);
+
+    nrm = rsqrt(inner);
+    for(int c2=0;c2<N;c2++)
+      ret._internal[c1][c2]*= nrm;
+  }

   return ret;
 }
diff --git a/tests/core/Test_reunitarise.cc b/tests/core/Test_reunitarise.cc
--- a/tests/core/Test_reunitarise.cc
+++ b/tests/core/Test_reunitarise.cc
@@ -93,7 +93,8 @@ int main (int argc, char ** argv)
     auto element = PeekIndex<ColourIndex>(U,Nc-1,i);
     element = element * phase;
     PokeIndex<ColourIndex>(U,element,Nc-1,i);
-  }
+  }
+  U=U*0.1;
   UU=U;

   detU= Determinant(U) ;
diff --git a/tests/hmc/Test_hmc_EODWFRatio_Gparity.cc b/tests/hmc/Test_hmc_EODWFRatio_Gparity.cc
index 3434fccc..9ca0b0a0 100644
--- a/tests/hmc/Test_hmc_EODWFRatio_Gparity.cc
+++ b/tests/hmc/Test_hmc_EODWFRatio_Gparity.cc
@@ -81,6 +81,10 @@ int main(int argc, char **argv) {
   // that have a complex construction
   // standard
   RealD beta = 5.6 ;
+  const int nu = 3;
+  std::vector<int> twists(Nd,0);
+  twists[nu] = 1;
+  ConjugateGimplD::setDirections(twists);
   ConjugateWilsonGaugeActionR Waction(beta);

   const int Ls = 8;
@@ -93,9 +97,6 @@ int main(int argc, char **argv) {
   // temporarily need a gauge field
   LatticeGaugeField U(GridPtr);

-  const int nu = 3;
-  std::vector<int> twists(Nd,0);
-  twists[nu] = 1;
   FermionAction::ImplParams params;
   params.twists = twists;
   Real mass=0.04;
diff --git a/tests/hmc/Test_hmc_GparityIwasakiGauge.cc b/tests/hmc/Test_hmc_GparityIwasakiGauge.cc
index bc47b6c2..7f74d5d8 100644
--- a/tests/hmc/Test_hmc_GparityIwasakiGauge.cc
+++ b/tests/hmc/Test_hmc_GparityIwasakiGauge.cc
@@ -79,6 +79,10 @@ int main(int argc, char **argv) {
   // that have a complex construction
   // standard
   RealD beta = 2.6 ;
+  const int nu = 3;
+  std::vector<int> twists(Nd,0);
+  twists[nu] = 1;
+  ConjugateGimplD::setDirections(twists);
   ConjugateIwasakiGaugeActionR Waction(beta);

diff --git a/tests/hmc/Test_hmc_GparityWilsonGauge.cc b/tests/hmc/Test_hmc_GparityWilsonGauge.cc
index eb057181..b8c078fe 100644
--- a/tests/hmc/Test_hmc_GparityWilsonGauge.cc
+++ b/tests/hmc/Test_hmc_GparityWilsonGauge.cc
@@ -80,6 +80,9 @@ int main(int argc, char **argv) {
   // that have a complex construction
   // standard
   RealD beta = 5.6 ;
+  std::vector<int> twists(Nd,0);
+  twists[3] = 1;
+  ConjugateGimplD::setDirections(twists);
   ConjugateWilsonGaugeActionR Waction(beta);
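The Tensor_Ta.h hunk above finishes the row-wise Gram-Schmidt in ProjectOnGroup with an explicit unit-normalisation of the final row, guaranteeing the last row leaves the projection sweep with unit length. A standalone sketch of the same procedure on a plain complex matrix (self-contained, no Grid types; values illustrative):

    #include <array>
    #include <cassert>
    #include <cmath>
    #include <complex>

    using cplx = std::complex<double>;
    constexpr int N = 3;
    using Mat = std::array<std::array<cplx, N>, N>;

    // Row-wise Gram-Schmidt mirroring ProjectOnGroup: normalise row c1, then
    // project it out of the rows below it.
    void orthonormaliseRows(Mat &m)
    {
      for (int c1 = 0; c1 < N; c1++) {
        double inner = 0;
        for (int c2 = 0; c2 < N; c2++) inner += std::norm(m[c1][c2]);
        double nrm = 1.0 / std::sqrt(inner);
        for (int c2 = 0; c2 < N; c2++) m[c1][c2] *= nrm;

        for (int b = c1 + 1; b < N; b++) {
          cplx pr = 0;
          for (int c = 0; c < N; c++) pr += std::conj(m[c1][c]) * m[b][c];
          for (int c = 0; c < N; c++) m[b][c] -= pr * m[c1][c];
        }
      }
      // Normalise last row: for this scalar loop it is a safeguard (the loop
      // already normalised row N-1 up to rounding); it mirrors the explicit
      // guarantee the patch adds to the tensor code.
      double inner = 0;
      for (int c2 = 0; c2 < N; c2++) inner += std::norm(m[N-1][c2]);
      double nrm = 1.0 / std::sqrt(inner);
      for (int c2 = 0; c2 < N; c2++) m[N-1][c2] *= nrm;
    }

    int main()
    {
      Mat m = {{
        {{ cplx(1.0,0.2), cplx(0.3,0.0), cplx(0.0,0.0) }},
        {{ cplx(0.0,0.0), cplx(2.0,0.0), cplx(0.1,0.0) }},
        {{ cplx(0.5,0.0), cplx(0.0,0.0), cplx(3.0,0.4) }}
      }};
      orthonormaliseRows(m);
      // Check the rows form a unitary set: |<row_a,row_b> - delta_ab| small.
      for (int a = 0; a < N; a++)
        for (int b = 0; b < N; b++) {
          cplx d = 0;
          for (int c = 0; c < N; c++) d += std::conj(m[a][c]) * m[b][c];
          assert(std::abs(d - cplx(a == b ? 1.0 : 0.0)) < 1e-12);
        }
      return 0;
    }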
From 3c23a947cc4e22b6c01afd9eac5d5a4add9035c7 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Fri, 15 Jan 2021 09:16:02 -0500
Subject: [PATCH 15/16] Fixed test for very much non-unit det

---
 tests/core/Test_reunitarise.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/core/Test_reunitarise.cc b/tests/core/Test_reunitarise.cc
index af164a75..6644be1a 100644
--- a/tests/core/Test_reunitarise.cc
+++ b/tests/core/Test_reunitarise.cc
@@ -103,7 +103,7 @@ int main (int argc, char ** argv)
   detU= Determinant(U) ;
   detU=detU-1.0;
-  std::cout << "Determinant before screw up " << norm2(detU)<<std::endl;

Date: Tue, 19 Jan 2021 12:32:48 +0000
Subject: [PATCH 16/16] revert changes

---
 tests/solver/Test_zMADWF_prec.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/solver/Test_zMADWF_prec.cc b/tests/solver/Test_zMADWF_prec.cc
index f18e1d86..d1168764 100644
--- a/tests/solver/Test_zMADWF_prec.cc
+++ b/tests/solver/Test_zMADWF_prec.cc
@@ -52,7 +52,7 @@ struct TestParams{
   bool zmobius_inner;
   double lambda_max; //upper bound of H_T eigenvalue range required to generate zMobius approximation

-  TestParams(): load_config(false), config_file("ckpoint_lat.1000"), mass(0.01),
+  TestParams(): load_config(true), config_file("ckpoint_lat.1000"), mass(0.01),
                Ls_outer(24), b_plus_c_outer(2.0), resid_outer(1e-8),
                Ls_inner(12), b_plus_c_inner(1.0), resid_inner(1e-8), zmobius_inner(true),
                lambda_max(1.42), outer_precon("Standard"), inner_precon("Standard")
   {}
@@ -246,7 +246,7 @@ void run(const TestParams &params){
   typename RunParamsInner::SchurSolverType SchurSolver_inner(CG_inner);

   ZeroGuesser<LatticeFermionD> Guess;
-  MADWF<MobiusFermionD, ZMobiusFermionD, PVtype, typename RunParamsInner::SchurSolverType, ZeroGuesser<LatticeFermionD> > madwf(D_outer, D_inner, PV_outer, SchurSolver_inner, Guess, params.resid_outer, 10000, &update);
+  MADWF<MobiusFermionD, ZMobiusFermionD, PVtype, typename RunParamsInner::SchurSolverType, ZeroGuesser<LatticeFermionD> > madwf(D_outer, D_inner, PV_outer, SchurSolver_inner, Guess, params.resid_outer, 100, &update);

   LatticeFermionD result_MADWF(FGrid_outer);
   result_MADWF = Zero();