introduced A64FX Wilson kernels

2025-10-01 23:24:43 +01:00 · 2020-04-09 13:30:06 +02:00
parent 15238e8d5e
commit 77fa586f6c
5 changed files with 1987 additions and 0 deletions
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h
@@ -0,0 +1,660 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+
+
+    Source file: ./lib/qcd/action/fermion/WilsonKernelsAsmA64FX.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once
+
+#if defined(A64FXINTRIN)
+#pragma message("A64FX Wilson kernels intrin")
+#else
+#pragma message("A64FX Wilson kernels asm")
+#endif
+
+#if defined(A64FX)
+    ///////////////////////////////////////////////////////////
+    // If we are A64FX specialise the single precision routine
+    ///////////////////////////////////////////////////////////
+#if defined(A64FXINTRIN)
+#include <simd/Fujitsu_A64FX_intrin_single.h>
+#else
+#include <simd/Fujitsu_A64FX_asm_single.h>
+#endif
+
+
+/// Switch off the 5d vectorised code optimisations
+#undef DWFVEC5D
+
+/////////////////////////////////////////////////////////////////
+// XYZT vectorised, undag Kernel, single
+/////////////////////////////////////////////////////////////////
+#undef KERNEL_DAG
+#define INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#undef EXTERIOR
+template<> void
+WilsonKernels<WilsonImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void
+WilsonKernels<ZWilsonImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void
+WilsonKernels<WilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void
+WilsonKernels<ZWilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#define INTERIOR
+#undef EXTERIOR
+template<> void
+WilsonKernels<WilsonImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void
+WilsonKernels<ZWilsonImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void
+WilsonKernels<WilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void
+WilsonKernels<ZWilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+
+#undef INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#define EXTERIOR
+template<> void
+WilsonKernels<WilsonImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void
+WilsonKernels<ZWilsonImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void
+WilsonKernels<WilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void
+WilsonKernels<ZWilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+/////////////////////////////////////////////////////////////////
+// XYZT vectorised, dag Kernel, single
+/////////////////////////////////////////////////////////////////
+#define KERNEL_DAG
+#define INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#undef EXTERIOR
+template<> void
+WilsonKernels<WilsonImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void
+WilsonKernels<ZWilsonImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void
+WilsonKernels<WilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void
+WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#define INTERIOR
+#undef EXTERIOR
+template<> void
+WilsonKernels<WilsonImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void
+WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void
+WilsonKernels<WilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void
+WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#define EXTERIOR
+template<> void
+WilsonKernels<WilsonImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void
+WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void
+WilsonKernels<WilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void
+WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+#undef MAYBEPERM
+//#undef MULT_2SPIN
+#define MAYBEPERM(A,B)
+//#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
+
+/////////////////////////////////////////////////////////////////
+// Ls vectorised, undag Kernel, single
+/////////////////////////////////////////////////////////////////
+
+#ifdef DWFVEC5D
+
+#undef KERNEL_DAG
+#define INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#undef EXTERIOR
+template<> void
+WilsonKernels<DomainWallVec5dImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void
+WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void
+WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void
+WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#define INTERIOR
+#undef EXTERIOR
+template<> void
+WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void
+WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void
+WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void
+WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#define EXTERIOR
+#undef  MULT_2SPIN
+#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf)
+template<> void
+WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void
+WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void
+WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void
+WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+/////////////////////////////////////////////////////////////////
+// Ls vectorised, dag Kernel, single
+/////////////////////////////////////////////////////////////////
+#define KERNEL_DAG
+#define INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#undef EXTERIOR
+template<> void
+WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void
+WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void
+WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void
+WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#define INTERIOR
+#undef EXTERIOR
+template<> void
+WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void
+WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void
+WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void
+WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#define EXTERIOR
+template<> void
+WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void
+WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void
+WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void
+WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+#endif  // VEC 5D
+
+//#undef COMPLEX_SIGNS
+//#undef MAYBEPERM
+//#undef MULT_2SPIN
+
+// undefine everything
+#include <simd/Fujitsu_A64FX_undef.h>
+
+///////////////////////////////////////////////////////////
+// If we are A64FX specialise the double precision routine
+///////////////////////////////////////////////////////////
+
+#if defined(A64FXINTRIN)
+#include <simd/Fujitsu_A64FX_intrin_double.h>
+#else
+#include <simd/Fujitsu_A64FX_asm_double.h>
+#endif
+
+// KNL stuff
+//#define MAYBEPERM(A,perm) if (perm) { A ; }
+//#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
+//#define COMPLEX_SIGNS(isigns) vComplexD *isigns = &signsD[0];
+
+
+#define INTERIOR_AND_EXTERIOR
+#undef  INTERIOR
+#undef  EXTERIOR
+
+/////////////////////////////////////////////////////////////////
+// XYZT vectorised, undag Kernel, double
+/////////////////////////////////////////////////////////////////
+#undef KERNEL_DAG
+#define INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#undef EXTERIOR
+template<> void
+WilsonKernels<WilsonImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void
+WilsonKernels<ZWilsonImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void
+WilsonKernels<WilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void
+WilsonKernels<ZWilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#define INTERIOR
+#undef EXTERIOR
+template<> void
+WilsonKernels<WilsonImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void
+WilsonKernels<ZWilsonImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void
+WilsonKernels<WilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void
+WilsonKernels<ZWilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#define EXTERIOR
+template<> void
+WilsonKernels<WilsonImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void
+WilsonKernels<ZWilsonImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void
+WilsonKernels<WilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void
+WilsonKernels<ZWilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+/////////////////////////////////////////////////////////////////
+// XYZT vectorised, dag Kernel, double
+/////////////////////////////////////////////////////////////////
+#define KERNEL_DAG
+#define INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#undef EXTERIOR
+template<> void
+WilsonKernels<WilsonImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void
+WilsonKernels<ZWilsonImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void
+WilsonKernels<WilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void
+WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#define INTERIOR
+#undef EXTERIOR
+template<> void
+WilsonKernels<WilsonImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void
+WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void
+WilsonKernels<WilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void
+WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#define EXTERIOR
+template<> void
+WilsonKernels<WilsonImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void
+WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void
+WilsonKernels<WilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void
+WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+
+// KNL stuff
+//#undef MAYBEPERM
+//#undef MULT_2SPIN
+#define MAYBEPERM(A,B)
+//#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
+
+/////////////////////////////////////////////////////////////////
+// Ls vectorised, undag Kernel, double
+/////////////////////////////////////////////////////////////////
+#ifdef DWFVEC5D
+
+#undef KERNEL_DAG
+#define INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#undef EXTERIOR
+template<> void
+WilsonKernels<DomainWallVec5dImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void
+WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void
+WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void
+WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#define INTERIOR
+#undef EXTERIOR
+template<> void
+WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void
+WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void
+WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void
+WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#define EXTERIOR
+#undef  MULT_2SPIN
+#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf)
+template<> void
+WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void
+WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void
+WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void
+WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+/////////////////////////////////////////////////////////////////
+// Ls vectorised, dag Kernel, double
+/////////////////////////////////////////////////////////////////
+#define KERNEL_DAG
+#define INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#undef EXTERIOR
+template<> void
+WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void
+WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void
+WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void
+WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#define INTERIOR
+#undef EXTERIOR
+template<> void
+WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void
+WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void
+WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void
+WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+#undef INTERIOR_AND_EXTERIOR
+#undef INTERIOR
+#define EXTERIOR
+
+template<> void
+WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void
+WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+template<> void
+WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+template<> void
+WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+
+#endif  // VEC 5D
+
+// undefs
+#include <simd/Fujitsu_A64FX_undef.h>
+
+#endif //A64FX
--- a/Grid/qcd/action/fermion/instantiation/WilsonKernelsInstantiationAsm.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonKernelsInstantiationAsm.cc
@@ -37,6 +37,7 @@ directory
 ////////////////////////////////////////////////////////////////////////
 NAMESPACE_BEGIN(Grid);
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmAvx512.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmQPX.h>
 NAMESPACE_END(Grid);

--- a/Grid/simd/Fujitsu_A64FX_asm_double.h
+++ b/Grid/simd/Fujitsu_A64FX_asm_double.h
@@ -0,0 +1,691 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: XXX
+
+    Copyright (C) 2020
+
+Author: Nils Meyer <nils.meyer@ur.de>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#define LOAD_CHIMU_A64FXd(x)           LOAD_CHIMU_INTERLEAVED_A64FXd(x)  
+#define PREFETCH_CHIMU_L1(A)  
+#define PREFETCH_GAUGE_L1(A)  
+#define PREFETCH_CHIMU_L2(A)  
+#define PREFETCH_GAUGE_L2(A)  
+#define PF_GAUGE(A)  
+#define PREFETCH1_CHIMU(A)  
+#define PREFETCH_CHIMU(A)  
+#define LOCK_GAUGE(A)  
+#define UNLOCK_GAUGE(A)  
+#define MASK_REGS                      DECLARATIONS_A64FXd(A)  
+#define COMPLEX_SIGNS(A)  
+#define LOAD64(A,B)  
+#define SAVE_RESULT(A,B)               RESULT_A64FXd(A)  
+#define MULT_2SPIN_DIR_PF(A,B)         MULT_2SPIN_A64FXd(A)  
+#define MAYBEPERM(A,perm)              if (perm) { A ; }  
+#define LOAD_CHI(base)                 LOAD_CHI_A64FXd(base)  
+#define ZERO_PSI                       ZERO_PSI_A64FXd  
+#define XP_PROJMEM(base)               LOAD_CHIMU_A64FXd(base);   XP_PROJ_A64FXd  
+#define YP_PROJMEM(base)               LOAD_CHIMU_A64FXd(base);   YP_PROJ_A64FXd  
+#define ZP_PROJMEM(base)               LOAD_CHIMU_A64FXd(base);   ZP_PROJ_A64FXd  
+#define TP_PROJMEM(base)               LOAD_CHIMU_A64FXd(base);   TP_PROJ_A64FXd  
+#define XM_PROJMEM(base)               LOAD_CHIMU_A64FXd(base);   XM_PROJ_A64FXd  
+#define YM_PROJMEM(base)               LOAD_CHIMU_A64FXd(base);   YM_PROJ_A64FXd  
+#define ZM_PROJMEM(base)               LOAD_CHIMU_A64FXd(base);   ZM_PROJ_A64FXd  
+#define TM_PROJMEM(base)               LOAD_CHIMU_A64FXd(base);   TM_PROJ_A64FXd  
+#define XP_RECON                       XP_RECON_A64FXd  
+#define XM_RECON                       XM_RECON_A64FXd  
+#define YM_RECON_ACCUM                 YM_RECON_ACCUM_A64FXd  
+#define ZM_RECON_ACCUM                 ZM_RECON_ACCUM_A64FXd  
+#define TM_RECON_ACCUM                 TM_RECON_ACCUM_A64FXd  
+#define XP_RECON_ACCUM                 XP_RECON_ACCUM_A64FXd  
+#define YP_RECON_ACCUM                 YP_RECON_ACCUM_A64FXd  
+#define ZP_RECON_ACCUM                 ZP_RECON_ACCUM_A64FXd  
+#define TP_RECON_ACCUM                 TP_RECON_ACCUM_A64FXd  
+#define PERMUTE_DIR0                   PERM0_A64FXd  
+#define PERMUTE_DIR1                   PERM1_A64FXd  
+#define PERMUTE_DIR2                   PERM2_A64FXd  
+#define PERMUTE_DIR3                   PERM3_A64FXd  
+// DECLARATIONS
+#define DECLARATIONS_A64FXd(x)  \
+    const uint64_t lut[4][8] = { \
+        {4, 5, 6, 7, 0, 1, 2, 3}, \
+        {2, 3, 0, 1, 6, 7, 4, 5}, \
+        {1, 0, 3, 2, 5, 4, 7, 6}, \
+        {0, 1, 2, 4, 5, 6, 7, 8} };\
+asm ( \
+    "fmov z31.d , 0 \n\t" \
+    :  \
+    :  \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
+); 
+
+// RESULT
+#define RESULT_A64FXd(base)  \
+{ \
+asm ( \
+    "stnt1d { z0.d }, p5, [%[storeptr], -6, mul vl] \n\t" \
+    "stnt1d { z1.d }, p5, [%[storeptr], -5, mul vl] \n\t" \
+    "stnt1d { z2.d }, p5, [%[storeptr], -4, mul vl] \n\t" \
+    "stnt1d { z3.d }, p5, [%[storeptr], -3, mul vl] \n\t" \
+    "stnt1d { z4.d }, p5, [%[storeptr], -2, mul vl] \n\t" \
+    "stnt1d { z5.d }, p5, [%[storeptr], -1, mul vl] \n\t" \
+    "stnt1d { z6.d }, p5, [%[storeptr], 0, mul vl] \n\t" \
+    "stnt1d { z7.d }, p5, [%[storeptr], 1, mul vl] \n\t" \
+    "stnt1d { z8.d }, p5, [%[storeptr], 2, mul vl] \n\t" \
+    "stnt1d { z9.d }, p5, [%[storeptr], 3, mul vl] \n\t" \
+    "stnt1d { z10.d }, p5, [%[storeptr], 4, mul vl] \n\t" \
+    "stnt1d { z11.d }, p5, [%[storeptr], 5, mul vl] \n\t" \
+    :  \
+    : [storeptr] "r" (base + 2 * 3 * 64) \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
+); \
+}
+// PREFETCH_CHIMU_L2 (prefetch to L2)
+#define PREFETCH_CHIMU_L2_INTERNAL_A64FXd(base)  \
+{ \
+asm ( \
+    "prfd PLDL2STRM, p5, [%[fetchptr], 0, MUL VL] \n\t" \
+    "prfd PLDL2STRM, p5, [%[fetchptr], 4, MUL VL] \n\t" \
+    "prfd PLDL2STRM, p5, [%[fetchptr], 8, MUL VL] \n\t" \
+    :  \
+    : [fetchptr] "r" (base) \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
+); \
+}
+// PREFETCH_CHIMU_L1 (prefetch to L1)
+#define PREFETCH_CHIMU_L1_INTERNAL_A64FXd(base)  \
+{ \
+asm ( \
+    "prfd PLDL1STRM, p5, [%[fetchptr], 0, MUL VL] \n\t" \
+    "prfd PLDL1STRM, p5, [%[fetchptr], 4, MUL VL] \n\t" \
+    "prfd PLDL1STRM, p5, [%[fetchptr], 8, MUL VL] \n\t" \
+    :  \
+    : [fetchptr] "r" (base) \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
+); \
+}
+// PREFETCH_GAUGE_L2 (prefetch to L2)
+#define PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A)  \
+{ \
+    const auto & ref(U[sUn][A]); uint64_t baseU = (uint64_t)&ref[0][0]; \
+asm ( \
+    "prfd PLDL2STRM, p5, [%[fetchptr], 0, MUL VL] \n\t" \
+    "prfd PLDL2STRM, p5, [%[fetchptr], 4, MUL VL] \n\t" \
+    "prfd PLDL2STRM, p5, [%[fetchptr], 8, MUL VL] \n\t" \
+    :  \
+    : [fetchptr] "r" (baseU) \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
+); \
+}
+// PREFETCH_GAUGE_L1 (prefetch to L1)
+#define PREFETCH_GAUGE_L1_INTERNAL(A)_A64FXd  \
+{ \
+    const auto & ref(U[sU][A]); uint64_t baseU = (uint64_t)&ref[0][0]; \
+asm ( \
+    "prfd PLDL1STRM, p5, [%[fetchptr], 0, MUL VL] \n\t" \
+    "prfd PLDL1STRM, p5, [%[fetchptr], 4, MUL VL] \n\t" \
+    "prfd PLDL1STRM, p5, [%[fetchptr], 8, MUL VL] \n\t" \
+    :  \
+    : [fetchptr] "r" (baseU) \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
+); \
+}
+// LOAD_CHI
+#define LOAD_CHI_A64FXd(base)  \
+{ \
+asm ( \
+    "ld1d { z12.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \
+    "ld1d { z13.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \
+    "ld1d { z14.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \
+    "ld1d { z15.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \
+    "ld1d { z16.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \
+    "ld1d { z17.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \
+    :  \
+    : [fetchptr] "r" (base + 2 * 3 * 64) \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
+); \
+}
+// LOAD_CHIMU
+#define LOAD_CHIMU_INTERLEAVED_A64FXd(base)  \
+{ \
+asm ( \
+    "ptrue p5.d \n\t" \
+    "ld1d { z12.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \
+    "ld1d { z27.d }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \
+    "ld1d { z15.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \
+    "ld1d { z24.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \
+    "ld1d { z13.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \
+    "ld1d { z28.d }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \
+    "ld1d { z16.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \
+    "ld1d { z25.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \
+    "ld1d { z14.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \
+    "ld1d { z29.d }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \
+    "ld1d { z17.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \
+    "ld1d { z26.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \
+    :  \
+    : [fetchptr] "r" (base + 2 * 3 * 64) \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
+); \
+}
+// LOAD_CHIMU_0213
+#define LOAD_CHIMU_0213_A64FXd  \
+{ \
+    const SiteSpinor & ref(in[offset]); \
+asm ( \
+    "ptrue p5.d \n\t" \
+    "ld1d { z12.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \
+    "ld1d { z24.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \
+    "ld1d { z13.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \
+    "ld1d { z25.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \
+    "ld1d { z14.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \
+    "ld1d { z26.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \
+    "ld1d { z15.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \
+    "ld1d { z27.d }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \
+    "ld1d { z16.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \
+    "ld1d { z28.d }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \
+    "ld1d { z17.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \
+    "ld1d { z29.d }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \
+    :  \
+    : [fetchptr] "r" (&ref[2][0]) \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
+); \
+}
+// LOAD_CHIMU_0312
+#define LOAD_CHIMU_0312_A64FXd  \
+{ \
+    const SiteSpinor & ref(in[offset]); \
+asm ( \
+    "ptrue p5.d \n\t" \
+    "ld1d { z12.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \
+    "ld1d { z27.d }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \
+    "ld1d { z13.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \
+    "ld1d { z28.d }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \
+    "ld1d { z14.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \
+    "ld1d { z29.d }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \
+    "ld1d { z15.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \
+    "ld1d { z24.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \
+    "ld1d { z16.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \
+    "ld1d { z25.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \
+    "ld1d { z17.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \
+    "ld1d { z26.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \
+    :  \
+    : [fetchptr] "r" (&ref[2][0]) \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
+); \
+}
+// PERM0
+#define PERM0_A64FXd  \
+asm ( \
+    "ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
+    "tbl z12.d, { z12.d }, z30.d \n\t"  \
+    "tbl z13.d, { z13.d }, z30.d \n\t"  \
+    "tbl z14.d, { z14.d }, z30.d \n\t"  \
+    "tbl z15.d, { z15.d }, z30.d \n\t"  \
+    "tbl z16.d, { z16.d }, z30.d \n\t"  \
+    "tbl z17.d, { z17.d }, z30.d \n\t"  \
+    :  \
+    : [tableptr] "r" (&lut[0]),[index] "i" (0) \
+    : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
+); 
+
+// PERM1
+#define PERM1_A64FXd  \
+asm ( \
+    "ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
+    "tbl z12.d, { z12.d }, z30.d \n\t"  \
+    "tbl z13.d, { z13.d }, z30.d \n\t"  \
+    "tbl z14.d, { z14.d }, z30.d \n\t"  \
+    "tbl z15.d, { z15.d }, z30.d \n\t"  \
+    "tbl z16.d, { z16.d }, z30.d \n\t"  \
+    "tbl z17.d, { z17.d }, z30.d \n\t"  \
+    :  \
+    : [tableptr] "r" (&lut[0]),[index] "i" (1) \
+    : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
+); 
+
+// PERM2
+#define PERM2_A64FXd  \
+asm ( \
+    "ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
+    "tbl z12.d, { z12.d }, z30.d \n\t"  \
+    "tbl z13.d, { z13.d }, z30.d \n\t"  \
+    "tbl z14.d, { z14.d }, z30.d \n\t"  \
+    "tbl z15.d, { z15.d }, z30.d \n\t"  \
+    "tbl z16.d, { z16.d }, z30.d \n\t"  \
+    "tbl z17.d, { z17.d }, z30.d \n\t"  \
+    :  \
+    : [tableptr] "r" (&lut[0]),[index] "i" (2) \
+    : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
+); 
+
+// PERM3
+#define PERM3_A64FXd  
+
+// MULT_2SPIN
+#define MULT_2SPIN_A64FXd(A)  \
+{ \
+    const auto & ref(U[sU][A]); \
+asm ( \
+    "ld1d { z24.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \
+    "ld1d { z25.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \
+    "ld1d { z26.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \
+    "ld1d { z27.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \
+    "ld1d { z28.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \
+    "ld1d { z29.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \
+    "fmov z18.d , 0 \n\t" \
+    "fmov z21.d , 0 \n\t" \
+    "fmov z19.d , 0 \n\t" \
+    "fmov z22.d , 0 \n\t" \
+    "fmov z20.d , 0 \n\t" \
+    "fmov z23.d , 0 \n\t" \
+    "fcmla z18.d, p5/m, z24.d, z12.d, 0 \n\t" \
+    "fcmla z21.d, p5/m, z24.d, z15.d, 0 \n\t" \
+    "fcmla z19.d, p5/m, z25.d, z12.d, 0 \n\t" \
+    "fcmla z22.d, p5/m, z25.d, z15.d, 0 \n\t" \
+    "fcmla z20.d, p5/m, z26.d, z12.d, 0 \n\t" \
+    "fcmla z23.d, p5/m, z26.d, z15.d, 0 \n\t" \
+    "fcmla z18.d, p5/m, z24.d, z12.d, 90 \n\t" \
+    "fcmla z21.d, p5/m, z24.d, z15.d, 90 \n\t" \
+    "fcmla z19.d, p5/m, z25.d, z12.d, 90 \n\t" \
+    "fcmla z22.d, p5/m, z25.d, z15.d, 90 \n\t" \
+    "fcmla z20.d, p5/m, z26.d, z12.d, 90 \n\t" \
+    "fcmla z23.d, p5/m, z26.d, z15.d, 90 \n\t" \
+    "ld1d { z24.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \
+    "ld1d { z25.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \
+    "ld1d { z26.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \
+    "fcmla z18.d, p5/m, z27.d, z13.d, 0 \n\t" \
+    "fcmla z21.d, p5/m, z27.d, z16.d, 0 \n\t" \
+    "fcmla z19.d, p5/m, z28.d, z13.d, 0 \n\t" \
+    "fcmla z22.d, p5/m, z28.d, z16.d, 0 \n\t" \
+    "fcmla z20.d, p5/m, z29.d, z13.d, 0 \n\t" \
+    "fcmla z23.d, p5/m, z29.d, z16.d, 0 \n\t" \
+    "fcmla z18.d, p5/m, z27.d, z13.d, 90 \n\t" \
+    "fcmla z21.d, p5/m, z27.d, z16.d, 90 \n\t" \
+    "fcmla z19.d, p5/m, z28.d, z13.d, 90 \n\t" \
+    "fcmla z22.d, p5/m, z28.d, z16.d, 90 \n\t" \
+    "fcmla z20.d, p5/m, z29.d, z13.d, 90 \n\t" \
+    "fcmla z23.d, p5/m, z29.d, z16.d, 90 \n\t" \
+    "fcmla z18.d, p5/m, z24.d, z14.d, 0 \n\t" \
+    "fcmla z21.d, p5/m, z24.d, z17.d, 0 \n\t" \
+    "fcmla z19.d, p5/m, z25.d, z14.d, 0 \n\t" \
+    "fcmla z22.d, p5/m, z25.d, z17.d, 0 \n\t" \
+    "fcmla z20.d, p5/m, z26.d, z14.d, 0 \n\t" \
+    "fcmla z23.d, p5/m, z26.d, z17.d, 0 \n\t" \
+    "fcmla z18.d, p5/m, z24.d, z14.d, 90 \n\t" \
+    "fcmla z21.d, p5/m, z24.d, z17.d, 90 \n\t" \
+    "fcmla z19.d, p5/m, z25.d, z14.d, 90 \n\t" \
+    "fcmla z22.d, p5/m, z25.d, z17.d, 90 \n\t" \
+    "fcmla z20.d, p5/m, z26.d, z14.d, 90 \n\t" \
+    "fcmla z23.d, p5/m, z26.d, z17.d, 90 \n\t" \
+    :  \
+    : [fetchptr] "r" ((uint64_t)&ref[2][0]) \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
+); \
+}
+// XP_PROJ
+#define XP_PROJ_A64FXd  \
+{ \
+asm ( \
+    "fcadd z12.d, p5/m, z12.d, z27.d, 90 \n\t" \
+    "fcadd z13.d, p5/m, z13.d, z28.d, 90 \n\t" \
+    "fcadd z14.d, p5/m, z14.d, z29.d, 90 \n\t" \
+    "fcadd z15.d, p5/m, z15.d, z24.d, 90 \n\t" \
+    "fcadd z16.d, p5/m, z16.d, z25.d, 90 \n\t" \
+    "fcadd z17.d, p5/m, z17.d, z26.d, 90 \n\t" \
+    :  \
+    :  \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
+); \
+}
+// XP_RECON
+#define XP_RECON_A64FXd  \
+asm ( \
+    "fcadd z6.d, p5/m, z6.d, z21.d, 270 \n\t" \
+    "fcadd z7.d, p5/m, z7.d, z22.d, 270 \n\t" \
+    "fcadd z8.d, p5/m, z8.d, z23.d, 270 \n\t" \
+    "fcadd z9.d, p5/m, z9.d, z18.d, 270 \n\t" \
+    "fcadd z10.d, p5/m, z10.d, z19.d, 270 \n\t" \
+    "fcadd z11.d, p5/m, z11.d, z20.d, 270 \n\t" \
+    "mov z0.d, z18.d \n\t" \
+    "mov z1.d, z19.d \n\t" \
+    "mov z2.d, z20.d \n\t" \
+    "mov z3.d, z21.d \n\t" \
+    "mov z4.d, z22.d \n\t" \
+    "mov z5.d, z23.d \n\t" \
+    :  \
+    :  \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
+); 
+
+// XP_RECON_ACCUM
+#define XP_RECON_ACCUM_A64FXd  \
+asm ( \
+    "fcadd z9.d, p5/m, z9.d, z18.d, 270 \n\t" \
+    "fadd z0.d, p5/m, z0.d, z18.d \n\t"  \
+    "fcadd z10.d, p5/m, z10.d, z19.d, 270 \n\t" \
+    "fadd z1.d, p5/m, z1.d, z19.d \n\t"  \
+    "fcadd z11.d, p5/m, z11.d, z20.d, 270 \n\t" \
+    "fadd z2.d, p5/m, z2.d, z20.d \n\t"  \
+    "fcadd z6.d, p5/m, z6.d, z21.d, 270 \n\t" \
+    "fadd z3.d, p5/m, z3.d, z21.d \n\t"  \
+    "fcadd z7.d, p5/m, z7.d, z22.d, 270 \n\t" \
+    "fadd z4.d, p5/m, z4.d, z22.d \n\t"  \
+    "fcadd z8.d, p5/m, z8.d, z23.d, 270 \n\t" \
+    "fadd z5.d, p5/m, z5.d, z23.d \n\t"  \
+    :  \
+    :  \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
+); 
+
+// YP_PROJ
+#define YP_PROJ_A64FXd  \
+{ \
+asm ( \
+    "ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
+    "fsub z12.d, p5/m, z12.d, z27.d \n\t" \
+    "fsub z13.d, p5/m, z13.d, z28.d \n\t" \
+    "fsub z14.d, p5/m, z14.d, z29.d \n\t" \
+    "fadd z15.d, p5/m, z15.d, z24.d \n\t"  \
+    "fadd z16.d, p5/m, z16.d, z25.d \n\t"  \
+    "fadd z17.d, p5/m, z17.d, z26.d \n\t"  \
+    :  \
+    : [tableptr] "r" (&lut[0]),[index] "i" (2) \
+    : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
+); \
+}
+// ZP_PROJ
+#define ZP_PROJ_A64FXd  \
+{ \
+asm ( \
+    "ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
+    "fcadd z12.d, p5/m, z12.d, z24.d, 90 \n\t" \
+    "fcadd z13.d, p5/m, z13.d, z25.d, 90 \n\t" \
+    "fcadd z14.d, p5/m, z14.d, z26.d, 90 \n\t" \
+    "fcadd z15.d, p5/m, z15.d, z27.d, 270 \n\t" \
+    "fcadd z16.d, p5/m, z16.d, z28.d, 270 \n\t" \
+    "fcadd z17.d, p5/m, z17.d, z29.d, 270 \n\t" \
+    :  \
+    : [tableptr] "r" (&lut[0]),[index] "i" (1) \
+    : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
+); \
+}
+// TP_PROJ
+#define TP_PROJ_A64FXd  \
+{ \
+asm ( \
+    "ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
+    "fadd z12.d, p5/m, z12.d, z24.d \n\t"  \
+    "fadd z13.d, p5/m, z13.d, z25.d \n\t"  \
+    "fadd z14.d, p5/m, z14.d, z26.d \n\t"  \
+    "fadd z15.d, p5/m, z15.d, z27.d \n\t"  \
+    "fadd z16.d, p5/m, z16.d, z28.d \n\t"  \
+    "fadd z17.d, p5/m, z17.d, z29.d \n\t"  \
+    :  \
+    : [tableptr] "r" (&lut[0]),[index] "i" (0) \
+    : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
+); \
+}
+// XM_PROJ
+#define XM_PROJ_A64FXd  \
+{ \
+asm ( \
+    "fcadd z12.d, p5/m, z12.d, z27.d, 270 \n\t" \
+    "fcadd z13.d, p5/m, z13.d, z28.d, 270 \n\t" \
+    "fcadd z14.d, p5/m, z14.d, z29.d, 270 \n\t" \
+    "fcadd z15.d, p5/m, z15.d, z24.d, 270 \n\t" \
+    "fcadd z16.d, p5/m, z16.d, z25.d, 270 \n\t" \
+    "fcadd z17.d, p5/m, z17.d, z26.d, 270 \n\t" \
+    :  \
+    :  \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
+); \
+}
+// XM_RECON
+#define XM_RECON_A64FXd  \
+asm ( \
+    "fcadd z6.d, p5/m, z6.d, z21.d, 90 \n\t" \
+    "fcadd z7.d, p5/m, z7.d, z22.d, 90 \n\t" \
+    "fcadd z8.d, p5/m, z8.d, z23.d, 90 \n\t" \
+    "fcadd z9.d, p5/m, z9.d, z18.d, 90 \n\t" \
+    "fcadd z10.d, p5/m, z10.d, z19.d, 90 \n\t" \
+    "fcadd z11.d, p5/m, z11.d, z20.d, 90 \n\t" \
+    "mov z0.d, z18.d \n\t" \
+    "mov z1.d, z19.d \n\t" \
+    "mov z2.d, z20.d \n\t" \
+    "mov z3.d, z21.d \n\t" \
+    "mov z4.d, z22.d \n\t" \
+    "mov z5.d, z23.d \n\t" \
+    :  \
+    :  \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
+); 
+
+// YM_PROJ
+#define YM_PROJ_A64FXd  \
+{ \
+asm ( \
+    "ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
+    "fadd z12.d, p5/m, z12.d, z27.d \n\t"  \
+    "fadd z13.d, p5/m, z13.d, z28.d \n\t"  \
+    "fadd z14.d, p5/m, z14.d, z29.d \n\t"  \
+    "fsub z15.d, p5/m, z15.d, z24.d \n\t" \
+    "fsub z16.d, p5/m, z16.d, z25.d \n\t" \
+    "fsub z17.d, p5/m, z17.d, z26.d \n\t" \
+    :  \
+    : [tableptr] "r" (&lut[0]),[index] "i" (2) \
+    : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
+); \
+}
+// ZM_PROJ
+#define ZM_PROJ_A64FXd  \
+{ \
+asm ( \
+    "ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
+    "fcadd z12.d, p5/m, z12.d, z24.d, 270 \n\t" \
+    "fcadd z13.d, p5/m, z13.d, z25.d, 270 \n\t" \
+    "fcadd z14.d, p5/m, z14.d, z26.d, 270 \n\t" \
+    "fcadd z15.d, p5/m, z15.d, z27.d, 90 \n\t" \
+    "fcadd z16.d, p5/m, z16.d, z28.d, 90 \n\t" \
+    "fcadd z17.d, p5/m, z17.d, z29.d, 90 \n\t" \
+    :  \
+    : [tableptr] "r" (&lut[0]),[index] "i" (1) \
+    : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
+); \
+}
+// TM_PROJ
+#define TM_PROJ_A64FXd  \
+{ \
+asm ( \
+    "ld1d { z30.d }, p5/z, [%[tableptr], %[index], mul vl] \n\t" \
+    "fsub z12.d, p5/m, z12.d, z24.d \n\t" \
+    "fsub z13.d, p5/m, z13.d, z25.d \n\t" \
+    "fsub z14.d, p5/m, z14.d, z26.d \n\t" \
+    "fsub z15.d, p5/m, z15.d, z27.d \n\t" \
+    "fsub z16.d, p5/m, z16.d, z28.d \n\t" \
+    "fsub z17.d, p5/m, z17.d, z29.d \n\t" \
+    :  \
+    : [tableptr] "r" (&lut[0]),[index] "i" (0) \
+    : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
+); \
+}
+// XM_RECON_ACCUM
+#define XM_RECON_ACCUM_A64FXd  \
+asm ( \
+    "fcadd z9.d, p5/m, z9.d, z18.d, 90 \n\t" \
+    "fcadd z10.d, p5/m, z10.d, z19.d, 90 \n\t" \
+    "fcadd z11.d, p5/m, z11.d, z20.d, 90 \n\t" \
+    "fcadd z6.d, p5/m, z6.d, z21.d, 90 \n\t" \
+    "fcadd z7.d, p5/m, z7.d, z22.d, 90 \n\t" \
+    "fcadd z8.d, p5/m, z8.d, z23.d, 90 \n\t" \
+    "mov z0.d, z18.d \n\t" \
+    "mov z1.d, z19.d \n\t" \
+    "mov z2.d, z20.d \n\t" \
+    "mov z3.d, z21.d \n\t" \
+    "mov z4.d, z22.d \n\t" \
+    "mov z5.d, z23.d \n\t" \
+    :  \
+    :  \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
+); 
+
+// YP_RECON_ACCUM
+#define YP_RECON_ACCUM_A64FXd  \
+asm ( \
+    "fadd z0.d, p5/m, z0.d, z18.d \n\t"  \
+    "fsub z9.d, p5/m, z9.d, z18.d \n\t" \
+    "fadd z1.d, p5/m, z1.d, z19.d \n\t"  \
+    "fsub z10.d, p5/m, z10.d, z19.d \n\t" \
+    "fadd z2.d, p5/m, z2.d, z20.d \n\t"  \
+    "fsub z11.d, p5/m, z11.d, z20.d \n\t" \
+    "fadd z3.d, p5/m, z3.d, z21.d \n\t"  \
+    "fadd z6.d, p5/m, z6.d, z21.d \n\t"  \
+    "fadd z4.d, p5/m, z4.d, z22.d \n\t"  \
+    "fadd z7.d, p5/m, z7.d, z22.d \n\t"  \
+    "fadd z5.d, p5/m, z5.d, z23.d \n\t"  \
+    "fadd z8.d, p5/m, z8.d, z23.d \n\t"  \
+    :  \
+    :  \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
+); 
+
+// YM_RECON_ACCUM
+#define YM_RECON_ACCUM_A64FXd  \
+asm ( \
+    "fadd z0.d, p5/m, z0.d, z18.d \n\t"  \
+    "fadd z9.d, p5/m, z9.d, z18.d \n\t"  \
+    "fadd z1.d, p5/m, z1.d, z19.d \n\t"  \
+    "fadd z10.d, p5/m, z10.d, z19.d \n\t"  \
+    "fadd z2.d, p5/m, z2.d, z20.d \n\t"  \
+    "fadd z11.d, p5/m, z11.d, z20.d \n\t"  \
+    "fadd z3.d, p5/m, z3.d, z21.d \n\t"  \
+    "fsub z6.d, p5/m, z6.d, z21.d \n\t" \
+    "fadd z4.d, p5/m, z4.d, z22.d \n\t"  \
+    "fsub z7.d, p5/m, z7.d, z22.d \n\t" \
+    "fadd z5.d, p5/m, z5.d, z23.d \n\t"  \
+    "fsub z8.d, p5/m, z8.d, z23.d \n\t" \
+    :  \
+    :  \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
+); 
+
+// ZP_RECON_ACCUM
+#define ZP_RECON_ACCUM_A64FXd  \
+asm ( \
+    "fcadd z6.d, p5/m, z6.d, z18.d, 270 \n\t" \
+    "fadd z0.d, p5/m, z0.d, z18.d \n\t"  \
+    "fcadd z7.d, p5/m, z7.d, z19.d, 270 \n\t" \
+    "fadd z1.d, p5/m, z1.d, z19.d \n\t"  \
+    "fcadd z8.d, p5/m, z8.d, z20.d, 270 \n\t" \
+    "fadd z2.d, p5/m, z2.d, z20.d \n\t"  \
+    "fcadd z9.d, p5/m, z9.d, z21.d, 90 \n\t" \
+    "fadd z3.d, p5/m, z3.d, z21.d \n\t"  \
+    "fcadd z10.d, p5/m, z10.d, z22.d, 90 \n\t" \
+    "fadd z4.d, p5/m, z4.d, z22.d \n\t"  \
+    "fcadd z11.d, p5/m, z11.d, z23.d, 90 \n\t" \
+    "fadd z5.d, p5/m, z5.d, z23.d \n\t"  \
+    :  \
+    :  \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
+); 
+
+// ZM_RECON_ACCUM
+#define ZM_RECON_ACCUM_A64FXd  \
+asm ( \
+    "fcadd z6.d, p5/m, z6.d, z18.d, 90 \n\t" \
+    "fadd z0.d, p5/m, z0.d, z18.d \n\t"  \
+    "fcadd z7.d, p5/m, z7.d, z19.d, 90 \n\t" \
+    "fadd z1.d, p5/m, z1.d, z19.d \n\t"  \
+    "fcadd z8.d, p5/m, z8.d, z20.d, 90 \n\t" \
+    "fadd z2.d, p5/m, z2.d, z20.d \n\t"  \
+    "fcadd z9.d, p5/m, z9.d, z21.d, 270 \n\t" \
+    "fadd z3.d, p5/m, z3.d, z21.d \n\t"  \
+    "fcadd z10.d, p5/m, z10.d, z22.d, 270 \n\t" \
+    "fadd z4.d, p5/m, z4.d, z22.d \n\t"  \
+    "fcadd z11.d, p5/m, z11.d, z23.d, 270 \n\t" \
+    "fadd z5.d, p5/m, z5.d, z23.d \n\t"  \
+    :  \
+    :  \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
+); 
+
+// TP_RECON_ACCUM
+#define TP_RECON_ACCUM_A64FXd  \
+asm ( \
+    "ptrue p5.d \n\t" \
+    "fadd z0.d, p5/m, z0.d, z18.d \n\t"  \
+    "fadd z6.d, p5/m, z6.d, z18.d \n\t"  \
+    "fadd z1.d, p5/m, z1.d, z19.d \n\t"  \
+    "fadd z7.d, p5/m, z7.d, z19.d \n\t"  \
+    "fadd z2.d, p5/m, z2.d, z20.d \n\t"  \
+    "fadd z8.d, p5/m, z8.d, z20.d \n\t"  \
+    "fadd z3.d, p5/m, z3.d, z21.d \n\t"  \
+    "fadd z9.d, p5/m, z9.d, z21.d \n\t"  \
+    "fadd z4.d, p5/m, z4.d, z22.d \n\t"  \
+    "fadd z10.d, p5/m, z10.d, z22.d \n\t"  \
+    "fadd z5.d, p5/m, z5.d, z23.d \n\t"  \
+    "fadd z11.d, p5/m, z11.d, z23.d \n\t"  \
+    :  \
+    :  \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
+); 
+
+// TM_RECON_ACCUM
+#define TM_RECON_ACCUM_A64FXd  \
+asm ( \
+    "fadd z0.d, p5/m, z0.d, z18.d \n\t"  \
+    "fsub z6.d, p5/m, z6.d, z18.d \n\t" \
+    "fadd z1.d, p5/m, z1.d, z19.d \n\t"  \
+    "fsub z7.d, p5/m, z7.d, z19.d \n\t" \
+    "fadd z2.d, p5/m, z2.d, z20.d \n\t"  \
+    "fsub z8.d, p5/m, z8.d, z20.d \n\t" \
+    "fadd z3.d, p5/m, z3.d, z21.d \n\t"  \
+    "fsub z9.d, p5/m, z9.d, z21.d \n\t" \
+    "fadd z4.d, p5/m, z4.d, z22.d \n\t"  \
+    "fsub z10.d, p5/m, z10.d, z22.d \n\t" \
+    "fadd z5.d, p5/m, z5.d, z23.d \n\t"  \
+    "fsub z11.d, p5/m, z11.d, z23.d \n\t" \
+    :  \
+    :  \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
+); 
+
+// ZERO_PSI
+#define ZERO_PSI_A64FXd  \
+asm ( \
+    "ptrue p5.d \n\t" \
+    "fmov z0.d , 0 \n\t" \
+    "fmov z1.d , 0 \n\t" \
+    "fmov z2.d , 0 \n\t" \
+    "fmov z3.d , 0 \n\t" \
+    "fmov z4.d , 0 \n\t" \
+    "fmov z5.d , 0 \n\t" \
+    "fmov z6.d , 0 \n\t" \
+    "fmov z7.d , 0 \n\t" \
+    "fmov z8.d , 0 \n\t" \
+    "fmov z9.d , 0 \n\t" \
+    "fmov z10.d , 0 \n\t" \
+    "fmov z11.d , 0 \n\t" \
+    :  \
+    :  \
+    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
+); 
+
--- a/Grid/simd/Fujitsu_A64FX_intrin_double.h
+++ b/Grid/simd/Fujitsu_A64FX_intrin_double.h
@@ -0,0 +1,567 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: XXX
+
+    Copyright (C) 2020
+
+Author: Nils Meyer <nils.meyer@ur.de>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#define LOAD_CHIMU_A64FXd(x)           LOAD_CHIMU_INTERLEAVED_A64FXd(x)  
+#define PREFETCH_CHIMU_L1(A)  
+#define PREFETCH_GAUGE_L1(A)  
+#define PREFETCH_CHIMU_L2(A)  
+#define PREFETCH_GAUGE_L2(A)  
+#define PF_GAUGE(A)  
+#define PREFETCH1_CHIMU(A)  
+#define PREFETCH_CHIMU(A)  
+#define LOCK_GAUGE(A)  
+#define UNLOCK_GAUGE(A)  
+#define MASK_REGS                      DECLARATIONS_A64FXd(A)  
+#define COMPLEX_SIGNS(A)  
+#define LOAD64(A,B)  
+#define SAVE_RESULT(A,B)               RESULT_A64FXd(A)  
+#define MULT_2SPIN_DIR_PF(A,B)         MULT_2SPIN_A64FXd(A)  
+#define MAYBEPERM(A,perm)              if (perm) { A ; }  
+#define LOAD_CHI(base)                 LOAD_CHI_A64FXd(base)  
+#define ZERO_PSI                       ZERO_PSI_A64FXd  
+#define XP_PROJMEM(base)               LOAD_CHIMU_A64FXd(base);   XP_PROJ_A64FXd  
+#define YP_PROJMEM(base)               LOAD_CHIMU_A64FXd(base);   YP_PROJ_A64FXd  
+#define ZP_PROJMEM(base)               LOAD_CHIMU_A64FXd(base);   ZP_PROJ_A64FXd  
+#define TP_PROJMEM(base)               LOAD_CHIMU_A64FXd(base);   TP_PROJ_A64FXd  
+#define XM_PROJMEM(base)               LOAD_CHIMU_A64FXd(base);   XM_PROJ_A64FXd  
+#define YM_PROJMEM(base)               LOAD_CHIMU_A64FXd(base);   YM_PROJ_A64FXd  
+#define ZM_PROJMEM(base)               LOAD_CHIMU_A64FXd(base);   ZM_PROJ_A64FXd  
+#define TM_PROJMEM(base)               LOAD_CHIMU_A64FXd(base);   TM_PROJ_A64FXd  
+#define XP_RECON                       XP_RECON_A64FXd  
+#define XM_RECON                       XM_RECON_A64FXd  
+#define YM_RECON_ACCUM                 YM_RECON_ACCUM_A64FXd  
+#define ZM_RECON_ACCUM                 ZM_RECON_ACCUM_A64FXd  
+#define TM_RECON_ACCUM                 TM_RECON_ACCUM_A64FXd  
+#define XP_RECON_ACCUM                 XP_RECON_ACCUM_A64FXd  
+#define YP_RECON_ACCUM                 YP_RECON_ACCUM_A64FXd  
+#define ZP_RECON_ACCUM                 ZP_RECON_ACCUM_A64FXd  
+#define TP_RECON_ACCUM                 TP_RECON_ACCUM_A64FXd  
+#define PERMUTE_DIR0                   PERM0_A64FXd  
+#define PERMUTE_DIR1                   PERM1_A64FXd  
+#define PERMUTE_DIR2                   PERM2_A64FXd  
+#define PERMUTE_DIR3                   PERM3_A64FXd  
+// DECLARATIONS
+#define DECLARATIONS_A64FXd(x)  \
+    const uint64_t lut[4][8] = { \
+        {4, 5, 6, 7, 0, 1, 2, 3}, \
+        {2, 3, 0, 1, 6, 7, 4, 5}, \
+        {1, 0, 3, 2, 5, 4, 7, 6}, \
+        {0, 1, 2, 4, 5, 6, 7, 8} };\
+    svfloat64_t result_00;        \
+    svfloat64_t result_01;        \
+    svfloat64_t result_02;        \
+    svfloat64_t result_10;        \
+    svfloat64_t result_11;        \
+    svfloat64_t result_12;        \
+    svfloat64_t result_20;        \
+    svfloat64_t result_21;        \
+    svfloat64_t result_22;        \
+    svfloat64_t result_30;        \
+    svfloat64_t result_31;        \
+    svfloat64_t result_32;        \
+    svfloat64_t Chi_00;        \
+    svfloat64_t Chi_01;        \
+    svfloat64_t Chi_02;        \
+    svfloat64_t Chi_10;        \
+    svfloat64_t Chi_11;        \
+    svfloat64_t Chi_12;        \
+    svfloat64_t UChi_00;        \
+    svfloat64_t UChi_01;        \
+    svfloat64_t UChi_02;        \
+    svfloat64_t UChi_10;        \
+    svfloat64_t UChi_11;        \
+    svfloat64_t UChi_12;        \
+    svfloat64_t U_00;        \
+    svfloat64_t U_10;        \
+    svfloat64_t U_20;        \
+    svfloat64_t U_01;        \
+    svfloat64_t U_11;        \
+    svfloat64_t U_21;        \
+    svbool_t pg1;        \
+    pg1 = svptrue_b64();        \
+    svuint64_t table0; \
+    svfloat64_t zero0;        \
+    zero0 = __svzero(zero0); 
+
+#define Chimu_00 Chi_00  
+#define Chimu_01 Chi_01  
+#define Chimu_02 Chi_02  
+#define Chimu_10 Chi_10  
+#define Chimu_11 Chi_11  
+#define Chimu_12 Chi_12  
+#define Chimu_20 U_00  
+#define Chimu_21 U_10  
+#define Chimu_22 U_20  
+#define Chimu_30 U_01  
+#define Chimu_31 U_11  
+#define Chimu_32 U_21  
+// RESULT
+#define RESULT_A64FXd(base)  \
+{ \
+    svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64), result_00);  \
+    svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64), result_01);  \
+    svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64), result_02);  \
+    svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64), result_10);  \
+    svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64), result_11);  \
+    svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64), result_12);  \
+    svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64), result_20);  \
+    svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64), result_21);  \
+    svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64), result_22);  \
+    svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64), result_30);  \
+    svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64), result_31);  \
+    svstnt1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64), result_32);  \
+}
+// PREFETCH_CHIMU_L2 (prefetch to L2)
+#define PREFETCH_CHIMU_L2_INTERNAL_A64FXd(base)  \
+{ \
+    svprfd(pg1, (int64_t*)(base + 0), SV_PLDL2STRM); \
+    svprfd(pg1, (int64_t*)(base + 256), SV_PLDL2STRM); \
+    svprfd(pg1, (int64_t*)(base + 512), SV_PLDL2STRM); \
+}
+// PREFETCH_CHIMU_L1 (prefetch to L1)
+#define PREFETCH_CHIMU_L1_INTERNAL_A64FXd(base)  \
+{ \
+    svprfd(pg1, (int64_t*)(base + 0), SV_PLDL1STRM); \
+    svprfd(pg1, (int64_t*)(base + 256), SV_PLDL1STRM); \
+    svprfd(pg1, (int64_t*)(base + 512), SV_PLDL1STRM); \
+}
+// PREFETCH_GAUGE_L2 (prefetch to L2)
+#define PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A)  \
+{ \
+    const auto & ref(U[sUn][A]); uint64_t baseU = (uint64_t)&ref[0][0]; \
+    svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL2STRM); \
+    svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL2STRM); \
+    svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL2STRM); \
+}
+// PREFETCH_GAUGE_L1 (prefetch to L1)
+#define PREFETCH_GAUGE_L1_INTERNAL(A)_A64FXd  \
+{ \
+    const auto & ref(U[sU][A]); uint64_t baseU = (uint64_t)&ref[0][0]; \
+    svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL1STRM); \
+    svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL1STRM); \
+    svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL1STRM); \
+}
+// LOAD_CHI
+#define LOAD_CHI_A64FXd(base)  \
+{ \
+    Chi_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64));  \
+    Chi_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64));  \
+    Chi_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64));  \
+    Chi_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64));  \
+    Chi_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64));  \
+    Chi_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64));  \
+}
+// LOAD_CHIMU
+#define LOAD_CHIMU_INTERLEAVED_A64FXd(base)  \
+{ \
+    Chimu_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64));  \
+    Chimu_30 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64));  \
+    Chimu_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64));  \
+    Chimu_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64));  \
+    Chimu_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64));  \
+    Chimu_31 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64));  \
+    Chimu_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64));  \
+    Chimu_21 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64));  \
+    Chimu_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64));  \
+    Chimu_32 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64));  \
+    Chimu_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64));  \
+    Chimu_22 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64));  \
+}
+// LOAD_CHIMU_0213
+#define LOAD_CHIMU_0213_A64FXd  \
+{ \
+    const SiteSpinor & ref(in[offset]); \
+    Chimu_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64));  \
+    Chimu_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64));  \
+    Chimu_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64));  \
+    Chimu_21 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64));  \
+    Chimu_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64));  \
+    Chimu_22 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64));  \
+    Chimu_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64));  \
+    Chimu_30 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64));  \
+    Chimu_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64));  \
+    Chimu_31 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64));  \
+    Chimu_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64));  \
+    Chimu_32 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64));  \
+}
+// LOAD_CHIMU_0312
+#define LOAD_CHIMU_0312_A64FXd  \
+{ \
+    const SiteSpinor & ref(in[offset]); \
+    Chimu_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64));  \
+    Chimu_30 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64));  \
+    Chimu_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64));  \
+    Chimu_31 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64));  \
+    Chimu_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64));  \
+    Chimu_32 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64));  \
+    Chimu_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64));  \
+    Chimu_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64));  \
+    Chimu_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64));  \
+    Chimu_21 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64));  \
+    Chimu_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64));  \
+    Chimu_22 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64));  \
+}
+// PERM0
+#define PERM0_A64FXd  \
+    table0 = svld1(pg1, (uint64_t*)&lut[0]);  \
+    Chi_00 = svtbl(Chi_00, table0);    \
+    Chi_01 = svtbl(Chi_01, table0);    \
+    Chi_02 = svtbl(Chi_02, table0);    \
+    Chi_10 = svtbl(Chi_10, table0);    \
+    Chi_11 = svtbl(Chi_11, table0);    \
+    Chi_12 = svtbl(Chi_12, table0);    
+
+// PERM1
+#define PERM1_A64FXd  \
+    table0 = svld1(pg1, (uint64_t*)&lut[1]);  \
+    Chi_00 = svtbl(Chi_00, table0);    \
+    Chi_01 = svtbl(Chi_01, table0);    \
+    Chi_02 = svtbl(Chi_02, table0);    \
+    Chi_10 = svtbl(Chi_10, table0);    \
+    Chi_11 = svtbl(Chi_11, table0);    \
+    Chi_12 = svtbl(Chi_12, table0);    
+
+// PERM2
+#define PERM2_A64FXd  \
+    table0 = svld1(pg1, (uint64_t*)&lut[2]);  \
+    Chi_00 = svtbl(Chi_00, table0);    \
+    Chi_01 = svtbl(Chi_01, table0);    \
+    Chi_02 = svtbl(Chi_02, table0);    \
+    Chi_10 = svtbl(Chi_10, table0);    \
+    Chi_11 = svtbl(Chi_11, table0);    \
+    Chi_12 = svtbl(Chi_12, table0);    
+
+// PERM3
+#define PERM3_A64FXd  
+
+// MULT_2SPIN
+#define MULT_2SPIN_A64FXd(A)  \
+{ \
+    const auto & ref(U[sU][A]); \
+    U_00 = svld1(pg1, (float64_t*)((uint64_t)&ref[2][0] + -6 * 64));  \
+    U_10 = svld1(pg1, (float64_t*)((uint64_t)&ref[2][0] + -3 * 64));  \
+    U_20 = svld1(pg1, (float64_t*)((uint64_t)&ref[2][0] + 0 * 64));  \
+    U_01 = svld1(pg1, (float64_t*)((uint64_t)&ref[2][0] + -5 * 64));  \
+    U_11 = svld1(pg1, (float64_t*)((uint64_t)&ref[2][0] + -2 * 64));  \
+    U_21 = svld1(pg1, (float64_t*)((uint64_t)&ref[2][0] + 1 * 64));  \
+    UChi_00 = __svzero(UChi_00); \
+    UChi_10 = __svzero(UChi_10); \
+    UChi_01 = __svzero(UChi_01); \
+    UChi_11 = __svzero(UChi_11); \
+    UChi_02 = __svzero(UChi_02); \
+    UChi_12 = __svzero(UChi_12); \
+    UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_00, 0); \
+    UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_10, 0); \
+    UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_00, 0); \
+    UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_10, 0); \
+    UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_00, 0); \
+    UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_10, 0); \
+    UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_00, 90); \
+    UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_10, 90); \
+    UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_00, 90); \
+    UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_10, 90); \
+    UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_00, 90); \
+    UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_10, 90); \
+    U_00 = svld1(pg1, (float64_t*)((uint64_t)&ref[2][0] + -4 * 64));  \
+    U_10 = svld1(pg1, (float64_t*)((uint64_t)&ref[2][0] + -1 * 64));  \
+    U_20 = svld1(pg1, (float64_t*)((uint64_t)&ref[2][0] + 2 * 64));  \
+    UChi_00 = svcmla_x(pg1, UChi_00, U_01, Chi_01, 0); \
+    UChi_10 = svcmla_x(pg1, UChi_10, U_01, Chi_11, 0); \
+    UChi_01 = svcmla_x(pg1, UChi_01, U_11, Chi_01, 0); \
+    UChi_11 = svcmla_x(pg1, UChi_11, U_11, Chi_11, 0); \
+    UChi_02 = svcmla_x(pg1, UChi_02, U_21, Chi_01, 0); \
+    UChi_12 = svcmla_x(pg1, UChi_12, U_21, Chi_11, 0); \
+    UChi_00 = svcmla_x(pg1, UChi_00, U_01, Chi_01, 90); \
+    UChi_10 = svcmla_x(pg1, UChi_10, U_01, Chi_11, 90); \
+    UChi_01 = svcmla_x(pg1, UChi_01, U_11, Chi_01, 90); \
+    UChi_11 = svcmla_x(pg1, UChi_11, U_11, Chi_11, 90); \
+    UChi_02 = svcmla_x(pg1, UChi_02, U_21, Chi_01, 90); \
+    UChi_12 = svcmla_x(pg1, UChi_12, U_21, Chi_11, 90); \
+    UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_02, 0); \
+    UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_12, 0); \
+    UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_02, 0); \
+    UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_12, 0); \
+    UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_02, 0); \
+    UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_12, 0); \
+    UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_02, 90); \
+    UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_12, 90); \
+    UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_02, 90); \
+    UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_12, 90); \
+    UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_02, 90); \
+    UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_12, 90); \
+}
+// XP_PROJ
+#define XP_PROJ_A64FXd  \
+{ \
+    Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_30, 90);   \
+    Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_31, 90);   \
+    Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_32, 90);   \
+    Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_20, 90);   \
+    Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_21, 90);   \
+    Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_22, 90);   \
+}
+// XP_RECON
+#define XP_RECON_A64FXd  \
+    result_20 = svcadd_x(pg1, result_20, UChi_10, 270);   \
+    result_21 = svcadd_x(pg1, result_21, UChi_11, 270);   \
+    result_22 = svcadd_x(pg1, result_22, UChi_12, 270);   \
+    result_30 = svcadd_x(pg1, result_30, UChi_00, 270);   \
+    result_31 = svcadd_x(pg1, result_31, UChi_01, 270);   \
+    result_32 = svcadd_x(pg1, result_32, UChi_02, 270);   \
+    result_00 = UChi_00;        \
+    result_01 = UChi_01;        \
+    result_02 = UChi_02;        \
+    result_10 = UChi_10;        \
+    result_11 = UChi_11;        \
+    result_12 = UChi_12;        
+
+// XP_RECON_ACCUM
+#define XP_RECON_ACCUM_A64FXd  \
+    result_30 = svcadd_x(pg1, result_30, UChi_00, 270);   \
+    result_00 = svadd_x(pg1, result_00, UChi_00); \
+    result_31 = svcadd_x(pg1, result_31, UChi_01, 270);   \
+    result_01 = svadd_x(pg1, result_01, UChi_01); \
+    result_32 = svcadd_x(pg1, result_32, UChi_02, 270);   \
+    result_02 = svadd_x(pg1, result_02, UChi_02); \
+    result_20 = svcadd_x(pg1, result_20, UChi_10, 270);   \
+    result_10 = svadd_x(pg1, result_10, UChi_10); \
+    result_21 = svcadd_x(pg1, result_21, UChi_11, 270);   \
+    result_11 = svadd_x(pg1, result_11, UChi_11); \
+    result_22 = svcadd_x(pg1, result_22, UChi_12, 270);   \
+    result_12 = svadd_x(pg1, result_12, UChi_12); 
+
+// YP_PROJ
+#define YP_PROJ_A64FXd  \
+{ \
+    table0 = svld1(pg1, (uint64_t*)&lut[2]);  \
+    Chi_00 = svsub_x(pg1, Chimu_00, Chimu_30);  \
+    Chi_01 = svsub_x(pg1, Chimu_01, Chimu_31);  \
+    Chi_02 = svsub_x(pg1, Chimu_02, Chimu_32);  \
+    Chi_10 = svadd_x(pg1, Chimu_10, Chimu_20);  \
+    Chi_11 = svadd_x(pg1, Chimu_11, Chimu_21);  \
+    Chi_12 = svadd_x(pg1, Chimu_12, Chimu_22);  \
+}
+// ZP_PROJ
+#define ZP_PROJ_A64FXd  \
+{ \
+    table0 = svld1(pg1, (uint64_t*)&lut[1]);  \
+    Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_20, 90);   \
+    Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_21, 90);   \
+    Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_22, 90);   \
+    Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_30, 270);   \
+    Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_31, 270);   \
+    Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_32, 270);   \
+}
+// TP_PROJ
+#define TP_PROJ_A64FXd  \
+{ \
+    table0 = svld1(pg1, (uint64_t*)&lut[0]);  \
+    Chi_00 = svadd_x(pg1, Chimu_00, Chimu_20);  \
+    Chi_01 = svadd_x(pg1, Chimu_01, Chimu_21);  \
+    Chi_02 = svadd_x(pg1, Chimu_02, Chimu_22);  \
+    Chi_10 = svadd_x(pg1, Chimu_10, Chimu_30);  \
+    Chi_11 = svadd_x(pg1, Chimu_11, Chimu_31);  \
+    Chi_12 = svadd_x(pg1, Chimu_12, Chimu_32);  \
+}
+// XM_PROJ
+#define XM_PROJ_A64FXd  \
+{ \
+    Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_30, 270);   \
+    Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_31, 270);   \
+    Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_32, 270);   \
+    Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_20, 270);   \
+    Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_21, 270);   \
+    Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_22, 270);   \
+}
+// XM_RECON
+#define XM_RECON_A64FXd  \
+    result_20 = svcadd_x(pg1, result_20, UChi_10, 90);   \
+    result_21 = svcadd_x(pg1, result_21, UChi_11, 90);   \
+    result_22 = svcadd_x(pg1, result_22, UChi_12, 90);   \
+    result_30 = svcadd_x(pg1, result_30, UChi_00, 90);   \
+    result_31 = svcadd_x(pg1, result_31, UChi_01, 90);   \
+    result_32 = svcadd_x(pg1, result_32, UChi_02, 90);   \
+    result_00 = UChi_00;        \
+    result_01 = UChi_01;        \
+    result_02 = UChi_02;        \
+    result_10 = UChi_10;        \
+    result_11 = UChi_11;        \
+    result_12 = UChi_12;        
+
+// YM_PROJ
+#define YM_PROJ_A64FXd  \
+{ \
+    table0 = svld1(pg1, (uint64_t*)&lut[2]);  \
+    Chi_00 = svadd_x(pg1, Chimu_00, Chimu_30);  \
+    Chi_01 = svadd_x(pg1, Chimu_01, Chimu_31);  \
+    Chi_02 = svadd_x(pg1, Chimu_02, Chimu_32);  \
+    Chi_10 = svsub_x(pg1, Chimu_10, Chimu_20);  \
+    Chi_11 = svsub_x(pg1, Chimu_11, Chimu_21);  \
+    Chi_12 = svsub_x(pg1, Chimu_12, Chimu_22);  \
+}
+// ZM_PROJ
+#define ZM_PROJ_A64FXd  \
+{ \
+    table0 = svld1(pg1, (uint64_t*)&lut[1]);  \
+    Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_20, 270);   \
+    Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_21, 270);   \
+    Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_22, 270);   \
+    Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_30, 90);   \
+    Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_31, 90);   \
+    Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_32, 90);   \
+}
+// TM_PROJ
+#define TM_PROJ_A64FXd  \
+{ \
+    table0 = svld1(pg1, (uint64_t*)&lut[0]);  \
+    Chi_00 = svsub_x(pg1, Chimu_00, Chimu_20);  \
+    Chi_01 = svsub_x(pg1, Chimu_01, Chimu_21);  \
+    Chi_02 = svsub_x(pg1, Chimu_02, Chimu_22);  \
+    Chi_10 = svsub_x(pg1, Chimu_10, Chimu_30);  \
+    Chi_11 = svsub_x(pg1, Chimu_11, Chimu_31);  \
+    Chi_12 = svsub_x(pg1, Chimu_12, Chimu_32);  \
+}
+// XM_RECON_ACCUM
+#define XM_RECON_ACCUM_A64FXd  \
+    result_30 = svcadd_x(pg1, result_30, UChi_00, 90);   \
+    result_31 = svcadd_x(pg1, result_31, UChi_01, 90);   \
+    result_32 = svcadd_x(pg1, result_32, UChi_02, 90);   \
+    result_20 = svcadd_x(pg1, result_20, UChi_10, 90);   \
+    result_21 = svcadd_x(pg1, result_21, UChi_11, 90);   \
+    result_22 = svcadd_x(pg1, result_22, UChi_12, 90);   \
+    result_00 = UChi_00;        \
+    result_01 = UChi_01;        \
+    result_02 = UChi_02;        \
+    result_10 = UChi_10;        \
+    result_11 = UChi_11;        \
+    result_12 = UChi_12;        
+
+// YP_RECON_ACCUM
+#define YP_RECON_ACCUM_A64FXd  \
+    result_00 = svadd_x(pg1, result_00, UChi_00); \
+    result_30 = svsub_x(pg1, result_30, UChi_00); \
+    result_01 = svadd_x(pg1, result_01, UChi_01); \
+    result_31 = svsub_x(pg1, result_31, UChi_01); \
+    result_02 = svadd_x(pg1, result_02, UChi_02); \
+    result_32 = svsub_x(pg1, result_32, UChi_02); \
+    result_10 = svadd_x(pg1, result_10, UChi_10); \
+    result_20 = svadd_x(pg1, result_20, UChi_10); \
+    result_11 = svadd_x(pg1, result_11, UChi_11); \
+    result_21 = svadd_x(pg1, result_21, UChi_11); \
+    result_12 = svadd_x(pg1, result_12, UChi_12); \
+    result_22 = svadd_x(pg1, result_22, UChi_12); 
+
+// YM_RECON_ACCUM
+#define YM_RECON_ACCUM_A64FXd  \
+    result_00 = svadd_x(pg1, result_00, UChi_00); \
+    result_30 = svadd_x(pg1, result_30, UChi_00); \
+    result_01 = svadd_x(pg1, result_01, UChi_01); \
+    result_31 = svadd_x(pg1, result_31, UChi_01); \
+    result_02 = svadd_x(pg1, result_02, UChi_02); \
+    result_32 = svadd_x(pg1, result_32, UChi_02); \
+    result_10 = svadd_x(pg1, result_10, UChi_10); \
+    result_20 = svsub_x(pg1, result_20, UChi_10); \
+    result_11 = svadd_x(pg1, result_11, UChi_11); \
+    result_21 = svsub_x(pg1, result_21, UChi_11); \
+    result_12 = svadd_x(pg1, result_12, UChi_12); \
+    result_22 = svsub_x(pg1, result_22, UChi_12); 
+
+// ZP_RECON_ACCUM
+#define ZP_RECON_ACCUM_A64FXd  \
+    result_20 = svcadd_x(pg1, result_20, UChi_00, 270);   \
+    result_00 = svadd_x(pg1, result_00, UChi_00); \
+    result_21 = svcadd_x(pg1, result_21, UChi_01, 270);   \
+    result_01 = svadd_x(pg1, result_01, UChi_01); \
+    result_22 = svcadd_x(pg1, result_22, UChi_02, 270);   \
+    result_02 = svadd_x(pg1, result_02, UChi_02); \
+    result_30 = svcadd_x(pg1, result_30, UChi_10, 90);   \
+    result_10 = svadd_x(pg1, result_10, UChi_10); \
+    result_31 = svcadd_x(pg1, result_31, UChi_11, 90);   \
+    result_11 = svadd_x(pg1, result_11, UChi_11); \
+    result_32 = svcadd_x(pg1, result_32, UChi_12, 90);   \
+    result_12 = svadd_x(pg1, result_12, UChi_12); 
+
+// ZM_RECON_ACCUM
+#define ZM_RECON_ACCUM_A64FXd  \
+    result_20 = svcadd_x(pg1, result_20, UChi_00, 90);   \
+    result_00 = svadd_x(pg1, result_00, UChi_00); \
+    result_21 = svcadd_x(pg1, result_21, UChi_01, 90);   \
+    result_01 = svadd_x(pg1, result_01, UChi_01); \
+    result_22 = svcadd_x(pg1, result_22, UChi_02, 90);   \
+    result_02 = svadd_x(pg1, result_02, UChi_02); \
+    result_30 = svcadd_x(pg1, result_30, UChi_10, 270);   \
+    result_10 = svadd_x(pg1, result_10, UChi_10); \
+    result_31 = svcadd_x(pg1, result_31, UChi_11, 270);   \
+    result_11 = svadd_x(pg1, result_11, UChi_11); \
+    result_32 = svcadd_x(pg1, result_32, UChi_12, 270);   \
+    result_12 = svadd_x(pg1, result_12, UChi_12); 
+
+// TP_RECON_ACCUM
+#define TP_RECON_ACCUM_A64FXd  \
+    result_00 = svadd_x(pg1, result_00, UChi_00); \
+    result_20 = svadd_x(pg1, result_20, UChi_00); \
+    result_01 = svadd_x(pg1, result_01, UChi_01); \
+    result_21 = svadd_x(pg1, result_21, UChi_01); \
+    result_02 = svadd_x(pg1, result_02, UChi_02); \
+    result_22 = svadd_x(pg1, result_22, UChi_02); \
+    result_10 = svadd_x(pg1, result_10, UChi_10); \
+    result_30 = svadd_x(pg1, result_30, UChi_10); \
+    result_11 = svadd_x(pg1, result_11, UChi_11); \
+    result_31 = svadd_x(pg1, result_31, UChi_11); \
+    result_12 = svadd_x(pg1, result_12, UChi_12); \
+    result_32 = svadd_x(pg1, result_32, UChi_12); 
+
+// TM_RECON_ACCUM
+#define TM_RECON_ACCUM_A64FXd  \
+    result_00 = svadd_x(pg1, result_00, UChi_00); \
+    result_20 = svsub_x(pg1, result_20, UChi_00); \
+    result_01 = svadd_x(pg1, result_01, UChi_01); \
+    result_21 = svsub_x(pg1, result_21, UChi_01); \
+    result_02 = svadd_x(pg1, result_02, UChi_02); \
+    result_22 = svsub_x(pg1, result_22, UChi_02); \
+    result_10 = svadd_x(pg1, result_10, UChi_10); \
+    result_30 = svsub_x(pg1, result_30, UChi_10); \
+    result_11 = svadd_x(pg1, result_11, UChi_11); \
+    result_31 = svsub_x(pg1, result_31, UChi_11); \
+    result_12 = svadd_x(pg1, result_12, UChi_12); \
+    result_32 = svsub_x(pg1, result_32, UChi_12); 
+
+// ZERO_PSI
+#define ZERO_PSI_A64FXd  \
+    result_00 = __svzero(result_00); \
+    result_01 = __svzero(result_01); \
+    result_02 = __svzero(result_02); \
+    result_10 = __svzero(result_10); \
+    result_11 = __svzero(result_11); \
+    result_12 = __svzero(result_12); \
+    result_20 = __svzero(result_20); \
+    result_21 = __svzero(result_21); \
+    result_22 = __svzero(result_22); \
+    result_30 = __svzero(result_30); \
+    result_31 = __svzero(result_31); \
+    result_32 = __svzero(result_32); 
+
--- a/Grid/simd/Fujitsu_A64FX_undef.h
+++ b/Grid/simd/Fujitsu_A64FX_undef.h
@@ -0,0 +1,68 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: Fujitsu_A64FX_undef.h
+
+    Copyright (C) 2020
+
+Author: Nils Meyer <nils.meyer@ur.de>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#undef LOAD_CHIMU_A64FXd
+#undef LOAD_CHIMU_A64FXf
+#undef PREFETCH_CHIMU_L1
+#undef PREFETCH_GAUGE_L1
+#undef PREFETCH_CHIMU_L2
+#undef PREFETCH_GAUGE_L2
+#undef PF_GAUGE
+#undef PREFETCH1_CHIMU
+#undef PREFETCH_CHIMU
+#undef LOCK_GAUGE
+#undef UNLOCK_GAUGE
+#undef MASK_REGS
+#undef COMPLEX_SIGNS
+#undef LOAD64
+#undef SAVE_RESULT
+#undef MULT_2SPIN_DIR_PF
+#undef MAYBEPERM
+#undef LOAD_CHI
+#undef ZERO_PSI
+#undef XP_PROJMEM
+#undef YP_PROJMEM
+#undef ZP_PROJMEM
+#undef TP_PROJMEM
+#undef XM_PROJMEM
+#undef YM_PROJMEM
+#undef ZM_PROJMEM
+#undef TM_PROJMEM
+#undef XP_RECON
+#undef XM_RECON
+#undef YM_RECON_ACCUM
+#undef ZM_RECON_ACCUM
+#undef TM_RECON_ACCUM
+#undef XP_RECON_ACCUM
+#undef YP_RECON_ACCUM
+#undef ZP_RECON_ACCUM
+#undef TP_RECON_ACCUM
+#undef PERMUTE_DIR0
+#undef PERMUTE_DIR1
+#undef PERMUTE_DIR2
+#undef PERMUTE_DIR3