Now with prefetching (pf); best results so far come from intrinsics combined with prefetching

2026-03-01 10:06:12 +00:00 · 2020-04-12 22:06:14 +02:00
parent 113f277b6a
commit 581392f2f2
7 changed files with 414 additions and 751 deletions
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h
@@ -37,12 +37,17 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 // undefine everything
 #include <simd/Fujitsu_A64FX_undef.h>

+#define WILSONKERNELSASMBODYA64FX // when defined, the AsmDhopSite* specialisations below include WilsonKernelsAsmBodyA64FX.h instead of WilsonKernelsAsmBody.h
+#pragma message("invoking A64FX Dslash: WilsonKernelsAsmBodyA64FX.h") // compile-time notice only
+
    ///////////////////////////////////////////////////////////
    // If we are A64FX specialise the single precision routine
    ///////////////////////////////////////////////////////////
 #if defined(DSLASHINTRIN)
+#pragma message ("invoking A64FX Dslash: intrin")
 #include <simd/Fujitsu_A64FX_intrin_single.h>
 #else
+#pragma message ("invoking A64FX Dslash: asm")
 #include <simd/Fujitsu_A64FX_asm_single.h>
 #endif

@@ -59,12 +64,20 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 template<> void
 WilsonKernels<WilsonImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#if defined (WILSONKERNELSASMBODYA64FX)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+#else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#endif

 template<> void
 WilsonKernels<ZWilsonImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#if defined (WILSONKERNELSASMBODYA64FX)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+#else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#endif

 template<> void
 WilsonKernels<WilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
@@ -74,7 +87,11 @@ WilsonKernels<WilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView
 template<> void
 WilsonKernels<ZWilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#if defined (WILSONKERNELSASMBODYA64FX)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+#else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#endif

 #undef INTERIOR_AND_EXTERIOR
 #define INTERIOR
@@ -82,23 +99,38 @@ WilsonKernels<ZWilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView
 template<> void
 WilsonKernels<WilsonImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#if defined (WILSONKERNELSASMBODYA64FX)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+#else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#endif

 template<> void
 WilsonKernels<ZWilsonImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#if defined (WILSONKERNELSASMBODYA64FX)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+#else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#endif

 template<> void
 WilsonKernels<WilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#if defined (WILSONKERNELSASMBODYA64FX)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+#else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#endif

 template<> void
 WilsonKernels<ZWilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#if defined (WILSONKERNELSASMBODYA64FX)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+#else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-
+#endif

 #undef INTERIOR_AND_EXTERIOR
 #undef INTERIOR
@@ -106,22 +138,39 @@ WilsonKernels<ZWilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldV
 template<> void
 WilsonKernels<WilsonImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#if defined (WILSONKERNELSASMBODYA64FX)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+#else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#endif

 template<> void
 WilsonKernels<ZWilsonImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#if defined (WILSONKERNELSASMBODYA64FX)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+#else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#endif

 template<> void
 WilsonKernels<WilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#if defined (WILSONKERNELSASMBODYA64FX)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+#else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#endif

 template<> void
 WilsonKernels<ZWilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#if defined (WILSONKERNELSASMBODYA64FX)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+#else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#endif
+

 /////////////////////////////////////////////////////////////////
 // XYZT vectorised, dag Kernel, single
@@ -133,22 +182,38 @@ WilsonKernels<ZWilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldV
 template<> void
 WilsonKernels<WilsonImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#if defined (WILSONKERNELSASMBODYA64FX)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+#else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#endif

 template<> void
 WilsonKernels<ZWilsonImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#if defined (WILSONKERNELSASMBODYA64FX)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+#else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#endif

 template<> void
 WilsonKernels<WilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#if defined (WILSONKERNELSASMBODYA64FX)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+#else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#endif

 template<> void
 WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#if defined (WILSONKERNELSASMBODYA64FX)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+#else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#endif

 #undef INTERIOR_AND_EXTERIOR
 #define INTERIOR
@@ -156,22 +221,38 @@ WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldV
 template<> void
 WilsonKernels<WilsonImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#if defined (WILSONKERNELSASMBODYA64FX)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+#else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#endif

 template<> void
 WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#if defined (WILSONKERNELSASMBODYA64FX)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+#else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#endif

 template<> void
 WilsonKernels<WilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#if defined (WILSONKERNELSASMBODYA64FX)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+#else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#endif

 template<> void
 WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#if defined (WILSONKERNELSASMBODYA64FX)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+#else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#endif

 #undef INTERIOR_AND_EXTERIOR
 #undef INTERIOR
@@ -179,22 +260,38 @@ WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFie
 template<> void
 WilsonKernels<WilsonImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#if defined (WILSONKERNELSASMBODYA64FX)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+#else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#endif

 template<> void
 WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#if defined (WILSONKERNELSASMBODYA64FX)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+#else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#endif

 template<> void
 WilsonKernels<WilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#if defined (WILSONKERNELSASMBODYA64FX)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+#else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#endif

 template<> void
 WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+#if defined (WILSONKERNELSASMBODYA64FX)
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
+#else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#endif

 //#undef MAYBEPERM
 //#undef MULT_2SPIN
@@ -348,7 +445,7 @@ WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteDagExt(StencilView &st, Double
 //#undef MAYBEPERM
 //#undef MULT_2SPIN

-// undefine 
+// undefine
 #include <simd/Fujitsu_A64FX_undef.h>

 ///////////////////////////////////////////////////////////
@@ -361,7 +458,7 @@ WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteDagExt(StencilView &st, Double
 #include <simd/Fujitsu_A64FX_asm_double.h>
 #endif

-// former KNL 
+// former KNL
 //#define MAYBEPERM(A,perm) if (perm) { A ; }
 //#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
 //#define COMPLEX_SIGNS(isigns) vComplexD *isigns = &signsD[0];
@@ -654,6 +751,7 @@ WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteDagExt(StencilView &st, Double
 #endif  // VEC 5D

 // undefs
+#undef WILSONKERNELSASMBODYA64FX
 #include <simd/Fujitsu_A64FX_undef.h>

 #endif //A64FX
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h
@@ -0,0 +1,225 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: WilsonKernelsAsmBodyA64FX.h
+
+    Copyright (C) 2020
+
+Author: Nils Meyer <nils.meyer@ur.de>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifdef KERNEL_DAG // Leg-to-macro table: DIRn belongs to the n-th stencil leg of the kernel body; dagger kernels use the P macros for legs 0-3 and the M macros for legs 4-7
+#define DIR0_PROJMEM(base) XP_PROJMEM(base);
+#define DIR1_PROJMEM(base) YP_PROJMEM(base);
+#define DIR2_PROJMEM(base) ZP_PROJMEM(base);
+#define DIR3_PROJMEM(base) TP_PROJMEM(base);
+#define DIR4_PROJMEM(base) XM_PROJMEM(base);
+#define DIR5_PROJMEM(base) YM_PROJMEM(base);
+#define DIR6_PROJMEM(base) ZM_PROJMEM(base);
+#define DIR7_PROJMEM(base) TM_PROJMEM(base);
+#define DIR0_RECON   XP_RECON // leg 0 alone uses the plain *_RECON form; legs 1-7 use *_RECON_ACCUM
+#define DIR1_RECON   YP_RECON_ACCUM
+#define DIR2_RECON   ZP_RECON_ACCUM
+#define DIR3_RECON   TP_RECON_ACCUM
+#define DIR4_RECON   XM_RECON_ACCUM
+#define DIR5_RECON   YM_RECON_ACCUM
+#define DIR6_RECON   ZM_RECON_ACCUM
+#define DIR7_RECON   TM_RECON_ACCUM
+#else // non-dagger kernels: the same table with P and M exchanged
+#define DIR0_PROJMEM(base) XM_PROJMEM(base);
+#define DIR1_PROJMEM(base) YM_PROJMEM(base);
+#define DIR2_PROJMEM(base) ZM_PROJMEM(base);
+#define DIR3_PROJMEM(base) TM_PROJMEM(base);
+#define DIR4_PROJMEM(base) XP_PROJMEM(base);
+#define DIR5_PROJMEM(base) YP_PROJMEM(base);
+#define DIR6_PROJMEM(base) ZP_PROJMEM(base);
+#define DIR7_PROJMEM(base) TP_PROJMEM(base);
+#define DIR0_RECON   XM_RECON // leg 0 alone uses the plain *_RECON form; legs 1-7 use *_RECON_ACCUM
+#define DIR1_RECON   YM_RECON_ACCUM
+#define DIR2_RECON   ZM_RECON_ACCUM
+#define DIR3_RECON   TM_RECON_ACCUM
+#define DIR4_RECON   XP_RECON_ACCUM
+#define DIR5_RECON   YP_RECON_ACCUM
+#define DIR6_RECON   ZP_RECON_ACCUM
+#define DIR7_RECON   TP_RECON_ACCUM
+#endif // KERNEL_DAG
+
+////////////////////////////////////////////////////////////////////////////////
+// Comms then compute kernel: every leg is multiplied and reconstructed, whether it is local or not
+////////////////////////////////////////////////////////////////////////////////
+#ifdef INTERIOR_AND_EXTERIOR
+
+#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)			\
+      basep = st.GetPFInfo(nent,plocal); nent++; /* handed to MULT_2SPIN_DIR_PF below as its second argument */			\
+      if ( local ) { /* base, local and perm already describe Dir: they were set by the previous leg or by ASM_LEG_XP */							\
+	LOAD64(%r10,isigns);						\
+	PROJ(base);							\
+	MAYBEPERM(PERMUTE_DIR,perm);					\
+      } else { /* non-local leg: LOAD_CHI from base, no projection and no permute */								\
+	LOAD_CHI(base);							\
+      }									\
+      MULT_2SPIN_DIR_PF(Dir,basep);					\
+      base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; /* look-ahead: from here on base, local and perm describe NxtDir */	\
+      PREFETCH_CHIMU(base);						\
+      LOAD64(%r10,isigns);						\
+      RECON;								\
+
+#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)			\
+  base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; /* first leg: no previous look-ahead, so fetch the entry for Dir here */		\
+  PF_GAUGE(Xp);								\
+  PREFETCH1_CHIMU(base);						\
+  ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)
+
+#define RESULT(base,basep) SAVE_RESULT(base,basep); // this variant forwards to SAVE_RESULT; the EXTERIOR variant uses ADD_RESULT
+
+#endif // INTERIOR_AND_EXTERIOR
+
+////////////////////////////////////////////////////////////////////////////////
+// Pre comms kernel: only local or same_node legs are computed -- prefetch like normal because it is mostly right
+////////////////////////////////////////////////////////////////////////////////
+#ifdef INTERIOR
+
+#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)			\
+      basep = st.GetPFInfo(nent,plocal); nent++; /* handed to MULT_2SPIN_DIR_PF below as its second argument */			\
+      if ( local ) { /* base, local and perm already describe Dir: they were set by the previous leg or by ASM_LEG_XP */							\
+	LOAD64(%r10,isigns);						\
+	PROJ(base);							\
+	MAYBEPERM(PERMUTE_DIR,perm);					\
+      }else if ( st.same_node[Dir] ) {LOAD_CHI(base);}			\
+      if ( local || st.same_node[Dir] ) { /* must be tested BEFORE GetInfo(NxtDir), which overwrites local and perm */				\
+	base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; /* look-ahead for the next leg, issued ahead of the multiply */	\
+	MULT_2SPIN_DIR_PF(Dir,basep);					\
+	PREFETCH_CHIMU(base);						\
+	LOAD64(%r10,isigns);						\
+	RECON;								\
+      } else { base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; PREFETCH_CHIMU(base); } /* leg skipped here: still advance to NxtDir and prefetch it */
+
+#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)			\
+  base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; /* first leg: no previous look-ahead, so fetch the entry for Dir here */		\
+  PF_GAUGE(Xp);								\
+  PREFETCH1_CHIMU(base);						\
+  { ZERO_PSI; }	/* ZERO_PSI is issued up front because leg 0 may be skipped in this pass */							\
+  ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)
+
+#define RESULT(base,basep) SAVE_RESULT(base,basep); // this variant forwards to SAVE_RESULT; the EXTERIOR variant uses ADD_RESULT
+
+#endif // INTERIOR
+////////////////////////////////////////////////////////////////////////////////
+// Post comms kernel: only legs that are neither local nor same_node are computed; nmu counts them
+////////////////////////////////////////////////////////////////////////////////
+#ifdef EXTERIOR
+
+
+#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)			\
+  base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; /* no look-ahead: each leg fetches its own entry; NxtDir, PERMUTE_DIR and PROJ are unused here */		\
+  if((!local)&&(!st.same_node[Dir]) ) {					\
+    LOAD_CHI(base);							\
+    MULT_2SPIN_DIR_PF(Dir,base); /* base is passed as the second argument too; basep is not computed per leg in this variant */					\
+    LOAD64(%r10,isigns);						\
+    RECON;								\
+    nmu++;								\
+  }
+
+#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)			\
+  nmu=0; /* restart the count of contributing legs for this (site, s) */								\
+  { ZERO_PSI;}								\
+  base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++;		\
+  if((!local)&&(!st.same_node[Dir]) ) {					\
+    LOAD_CHI(base);							\
+    MULT_2SPIN_DIR_PF(Dir,base);					\
+    LOAD64(%r10,isigns);						\
+    RECON;								\
+    nmu++;								\
+  }
+
+#define RESULT(base,basep) if (nmu){ ADD_RESULT(base,base);} // ADD_RESULT runs only when nmu is non-zero; base is passed for both arguments and basep is ignored
+
+#endif // EXTERIOR
+{ // Function body shared by every AsmDhopSite* specialisation that includes this file; reads the parameters st, U, ss, ssU, Ls, Ns, in, out
+  int nmu; // assigned and read only by the EXTERIOR macro variants
+  int local,perm, ptype; // filled in by st.GetInfo for the leg being looked up
+  uint64_t base; // value returned by st.GetInfo; reused at the end for the address of out[ss]
+  uint64_t basep; // value returned by st.GetPFInfo
+  const uint64_t plocal =(uint64_t) & in[0];
+
+  COMPLEX_SIGNS(isigns);
+  MASK_REGS;
+  int nmax=U.oSites();
+  for(int site=0;site<Ns;site++) { // Ns consecutive outer sites, starting at ssU
+#ifndef EXTERIOR
+    //    int sU =lo.Reorder(ssU);
+    int sU =ssU;
+    int ssn=ssU+1;     if(ssn>=nmax) ssn=0; // following outer site, wrapping to 0 at nmax
+    //    int sUn=lo.Reorder(ssn);
+    int sUn=ssn;
+    LOCK_GAUGE(0);
+#else
+    int sU =ssU;
+    int ssn=ssU+1;     if(ssn>=nmax) ssn=0; // following outer site, wrapping to 0 at nmax
+    int sUn=ssn;
+#endif
+    for(int s=0;s<Ls;s++) {
+      ss =sU*Ls+s; // overwrites the ss parameter with the index for this (site, s)
+      ssn=sUn*Ls+s;
+      int  ent=ss*8;// 2*Ndim stencil entries per index; ent walks the entries of ss
+      int nent=ssn*8; // nent walks the entries of the following site and is only passed to st.GetPFInfo
+
+   ASM_LEG_XP(Xp,Yp,PERMUTE_DIR3,DIR0_PROJMEM,DIR0_RECON);
+      ASM_LEG(Yp,Zp,PERMUTE_DIR2,DIR1_PROJMEM,DIR1_RECON);
+      ASM_LEG(Zp,Tp,PERMUTE_DIR1,DIR2_PROJMEM,DIR2_RECON);
+      ASM_LEG(Tp,Xm,PERMUTE_DIR0,DIR3_PROJMEM,DIR3_RECON);
+
+      ASM_LEG(Xm,Ym,PERMUTE_DIR3,DIR4_PROJMEM,DIR4_RECON);
+      ASM_LEG(Ym,Zm,PERMUTE_DIR2,DIR5_PROJMEM,DIR5_RECON);
+      ASM_LEG(Zm,Tm,PERMUTE_DIR1,DIR6_PROJMEM,DIR6_RECON);
+      ASM_LEG(Tm,Xp,PERMUTE_DIR0,DIR7_PROJMEM,DIR7_RECON);
+
+#ifdef EXTERIOR
+      if (nmu==0) break; // no leg contributed at this s: leave the s loop without storing anything
+      //      if (nmu!=0) std::cout << "EXT "<<sU<<std::endl;
+#endif
+      base = (uint64_t) &out[ss];
+      basep= st.GetPFInfo(nent,plocal); nent++;
+      RESULT(base,basep);
+    }
+    ssU++;
+    UNLOCK_GAUGE(0); // NOTE(review): also reached in EXTERIOR builds, where LOCK_GAUGE(0) is not issued -- confirm this is intended
+  }
+}
+
+#undef DIR0_PROJMEM // drop every helper macro defined above so this file can be included again under different KERNEL_DAG / INTERIOR / EXTERIOR settings
+#undef DIR1_PROJMEM
+#undef DIR2_PROJMEM
+#undef DIR3_PROJMEM
+#undef DIR4_PROJMEM
+#undef DIR5_PROJMEM
+#undef DIR6_PROJMEM
+#undef DIR7_PROJMEM
+#undef DIR0_RECON
+#undef DIR1_RECON
+#undef DIR2_RECON
+#undef DIR3_RECON
+#undef DIR4_RECON
+#undef DIR5_RECON
+#undef DIR6_RECON
+#undef DIR7_RECON
+#undef ASM_LEG
+#undef ASM_LEG_XP
+#undef RESULT
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h.orig
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h.orig
@@ -1,684 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/qcd/action/fermion/WilsonKernelsHand.cc
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-
-#pragma once
-
-#include <Grid/qcd/action/fermion/FermionCore.h>
-
-
-#undef LOAD_CHIMU  
-#undef LOAD_CHI 
-#undef MULT_2SPIN
-#undef PERMUTE_DIR
-#undef XP_PROJ  
-#undef YP_PROJ  
-#undef ZP_PROJ  
-#undef TP_PROJ  
-#undef XM_PROJ  
-#undef YM_PROJ  
-#undef ZM_PROJ  
-#undef TM_PROJ  
-#undef XP_RECON 
-#undef XP_RECON_ACCUM 
-#undef XM_RECON 
-#undef XM_RECON_ACCUM 
-#undef YP_RECON_ACCUM 
-#undef YM_RECON_ACCUM 
-#undef ZP_RECON_ACCUM 
-#undef ZM_RECON_ACCUM 
-#undef TP_RECON_ACCUM 
-#undef TM_RECON_ACCUM 
-#undef ZERO_RESULT				 
-#undef Chimu_00
-#undef Chimu_01
-#undef Chimu_02
-#undef Chimu_10
-#undef Chimu_11
-#undef Chimu_12
-#undef Chimu_20
-#undef Chimu_21
-#undef Chimu_22
-#undef Chimu_30
-#undef Chimu_31
-#undef Chimu_32
-#undef HAND_STENCIL_LEG
-#undef HAND_STENCIL_LEG_INT
-#undef HAND_STENCIL_LEG_EXT
-#undef HAND_RESULT
-#undef HAND_RESULT_INT
-#undef HAND_RESULT_EXT
-
-#define REGISTER
-
-#define LOAD_CHIMU \
-  {const SiteSpinor & ref (in[offset]);	\
-    Chimu_00=ref()(0)(0);\
-    Chimu_01=ref()(0)(1);\
-    Chimu_02=ref()(0)(2);\
-    Chimu_10=ref()(1)(0);\
-    Chimu_11=ref()(1)(1);\
-    Chimu_12=ref()(1)(2);\
-    Chimu_20=ref()(2)(0);\
-    Chimu_21=ref()(2)(1);\
-    Chimu_22=ref()(2)(2);\
-    Chimu_30=ref()(3)(0);\
-    Chimu_31=ref()(3)(1);\
-    Chimu_32=ref()(3)(2);}
-
-#define LOAD_CHI\
-  {const SiteHalfSpinor &ref(buf[offset]);	\
-    Chi_00 = ref()(0)(0);\
-    Chi_01 = ref()(0)(1);\
-    Chi_02 = ref()(0)(2);\
-    Chi_10 = ref()(1)(0);\
-    Chi_11 = ref()(1)(1);\
-    Chi_12 = ref()(1)(2);}
-
-// To splat or not to splat depends on the implementation
-#define MULT_2SPIN(A)\
-  {auto & ref(U[sU](A));			\
-   Impl::loadLinkElement(U_00,ref()(0,0));	\
-   Impl::loadLinkElement(U_10,ref()(1,0));	\
-   Impl::loadLinkElement(U_20,ref()(2,0));	\
-   Impl::loadLinkElement(U_01,ref()(0,1));	\
-   Impl::loadLinkElement(U_11,ref()(1,1));	\
-   Impl::loadLinkElement(U_21,ref()(2,1));	\
-    UChi_00 = U_00*Chi_00;\
-    UChi_10 = U_00*Chi_10;\
-    UChi_01 = U_10*Chi_00;\
-    UChi_11 = U_10*Chi_10;\
-    UChi_02 = U_20*Chi_00;\
-    UChi_12 = U_20*Chi_10;\
-    UChi_00+= U_01*Chi_01;\
-    UChi_10+= U_01*Chi_11;\
-    UChi_01+= U_11*Chi_01;\
-    UChi_11+= U_11*Chi_11;\
-    UChi_02+= U_21*Chi_01;\
-    UChi_12+= U_21*Chi_11;\
-    Impl::loadLinkElement(U_00,ref()(0,2));	\
-    Impl::loadLinkElement(U_10,ref()(1,2));	\
-    Impl::loadLinkElement(U_20,ref()(2,2));	\
-    UChi_00+= U_00*Chi_02;\
-    UChi_10+= U_00*Chi_12;\
-    UChi_01+= U_10*Chi_02;\
-    UChi_11+= U_10*Chi_12;\
-    UChi_02+= U_20*Chi_02;\
-    UChi_12+= U_20*Chi_12;}
-
-
-#define PERMUTE_DIR(dir)			\
-      permute##dir(Chi_00,Chi_00);\
-      permute##dir(Chi_01,Chi_01);\
-      permute##dir(Chi_02,Chi_02);\
-      permute##dir(Chi_10,Chi_10);\
-      permute##dir(Chi_11,Chi_11);\
-      permute##dir(Chi_12,Chi_12);
-
-//      hspin(0)=fspin(0)+timesI(fspin(3));
-//      hspin(1)=fspin(1)+timesI(fspin(2));
-#define XP_PROJ \
-    Chi_00 = Chimu_00+timesI(Chimu_30);\
-    Chi_01 = Chimu_01+timesI(Chimu_31);\
-    Chi_02 = Chimu_02+timesI(Chimu_32);\
-    Chi_10 = Chimu_10+timesI(Chimu_20);\
-    Chi_11 = Chimu_11+timesI(Chimu_21);\
-    Chi_12 = Chimu_12+timesI(Chimu_22);
-
-#define YP_PROJ \
-    Chi_00 = Chimu_00-Chimu_30;\
-    Chi_01 = Chimu_01-Chimu_31;\
-    Chi_02 = Chimu_02-Chimu_32;\
-    Chi_10 = Chimu_10+Chimu_20;\
-    Chi_11 = Chimu_11+Chimu_21;\
-    Chi_12 = Chimu_12+Chimu_22;
-
-#define ZP_PROJ \
-  Chi_00 = Chimu_00+timesI(Chimu_20);		\
-  Chi_01 = Chimu_01+timesI(Chimu_21);		\
-  Chi_02 = Chimu_02+timesI(Chimu_22);		\
-  Chi_10 = Chimu_10-timesI(Chimu_30);		\
-  Chi_11 = Chimu_11-timesI(Chimu_31);		\
-  Chi_12 = Chimu_12-timesI(Chimu_32);
-
-#define TP_PROJ \
-  Chi_00 = Chimu_00+Chimu_20;		\
-  Chi_01 = Chimu_01+Chimu_21;		\
-  Chi_02 = Chimu_02+Chimu_22;		\
-  Chi_10 = Chimu_10+Chimu_30;		\
-  Chi_11 = Chimu_11+Chimu_31;		\
-  Chi_12 = Chimu_12+Chimu_32;
-
-
-//      hspin(0)=fspin(0)-timesI(fspin(3));
-//      hspin(1)=fspin(1)-timesI(fspin(2));
-#define XM_PROJ \
-    Chi_00 = Chimu_00-timesI(Chimu_30);\
-    Chi_01 = Chimu_01-timesI(Chimu_31);\
-    Chi_02 = Chimu_02-timesI(Chimu_32);\
-    Chi_10 = Chimu_10-timesI(Chimu_20);\
-    Chi_11 = Chimu_11-timesI(Chimu_21);\
-    Chi_12 = Chimu_12-timesI(Chimu_22);
-
-#define YM_PROJ \
-    Chi_00 = Chimu_00+Chimu_30;\
-    Chi_01 = Chimu_01+Chimu_31;\
-    Chi_02 = Chimu_02+Chimu_32;\
-    Chi_10 = Chimu_10-Chimu_20;\
-    Chi_11 = Chimu_11-Chimu_21;\
-    Chi_12 = Chimu_12-Chimu_22;
-
-#define ZM_PROJ \
-  Chi_00 = Chimu_00-timesI(Chimu_20);		\
-  Chi_01 = Chimu_01-timesI(Chimu_21);		\
-  Chi_02 = Chimu_02-timesI(Chimu_22);		\
-  Chi_10 = Chimu_10+timesI(Chimu_30);		\
-  Chi_11 = Chimu_11+timesI(Chimu_31);		\
-  Chi_12 = Chimu_12+timesI(Chimu_32);
-
-#define TM_PROJ \
-  Chi_00 = Chimu_00-Chimu_20;		\
-  Chi_01 = Chimu_01-Chimu_21;		\
-  Chi_02 = Chimu_02-Chimu_22;		\
-  Chi_10 = Chimu_10-Chimu_30;		\
-  Chi_11 = Chimu_11-Chimu_31;		\
-  Chi_12 = Chimu_12-Chimu_32;
-
-//      fspin(0)=hspin(0);
-//      fspin(1)=hspin(1);
-//      fspin(2)=timesMinusI(hspin(1));
-//      fspin(3)=timesMinusI(hspin(0));
-#define XP_RECON\
-  result_00 = UChi_00;\
-  result_01 = UChi_01;\
-  result_02 = UChi_02;\
-  result_10 = UChi_10;\
-  result_11 = UChi_11;\
-  result_12 = UChi_12;\
-  result_20 = timesMinusI(UChi_10);\
-  result_21 = timesMinusI(UChi_11);\
-  result_22 = timesMinusI(UChi_12);\
-  result_30 = timesMinusI(UChi_00);\
-  result_31 = timesMinusI(UChi_01);\
-  result_32 = timesMinusI(UChi_02);
-
-#define XP_RECON_ACCUM\
-  result_00+=UChi_00;\
-  result_01+=UChi_01;\
-  result_02+=UChi_02;\
-  result_10+=UChi_10;\
-  result_11+=UChi_11;\
-  result_12+=UChi_12;\
-  result_20-=timesI(UChi_10);\
-  result_21-=timesI(UChi_11);\
-  result_22-=timesI(UChi_12);\
-  result_30-=timesI(UChi_00);\
-  result_31-=timesI(UChi_01);\
-  result_32-=timesI(UChi_02);
-
-#define XM_RECON\
-  result_00 = UChi_00;\
-  result_01 = UChi_01;\
-  result_02 = UChi_02;\
-  result_10 = UChi_10;\
-  result_11 = UChi_11;\
-  result_12 = UChi_12;\
-  result_20 = timesI(UChi_10);\
-  result_21 = timesI(UChi_11);\
-  result_22 = timesI(UChi_12);\
-  result_30 = timesI(UChi_00);\
-  result_31 = timesI(UChi_01);\
-  result_32 = timesI(UChi_02);
-
-#define XM_RECON_ACCUM\
-  result_00+= UChi_00;\
-  result_01+= UChi_01;\
-  result_02+= UChi_02;\
-  result_10+= UChi_10;\
-  result_11+= UChi_11;\
-  result_12+= UChi_12;\
-  result_20+= timesI(UChi_10);\
-  result_21+= timesI(UChi_11);\
-  result_22+= timesI(UChi_12);\
-  result_30+= timesI(UChi_00);\
-  result_31+= timesI(UChi_01);\
-  result_32+= timesI(UChi_02);
-
-#define YP_RECON_ACCUM\
-  result_00+= UChi_00;\
-  result_01+= UChi_01;\
-  result_02+= UChi_02;\
-  result_10+= UChi_10;\
-  result_11+= UChi_11;\
-  result_12+= UChi_12;\
-  result_20+= UChi_10;\
-  result_21+= UChi_11;\
-  result_22+= UChi_12;\
-  result_30-= UChi_00;\
-  result_31-= UChi_01;\
-  result_32-= UChi_02;
-
-#define YM_RECON_ACCUM\
-  result_00+= UChi_00;\
-  result_01+= UChi_01;\
-  result_02+= UChi_02;\
-  result_10+= UChi_10;\
-  result_11+= UChi_11;\
-  result_12+= UChi_12;\
-  result_20-= UChi_10;\
-  result_21-= UChi_11;\
-  result_22-= UChi_12;\
-  result_30+= UChi_00;\
-  result_31+= UChi_01;\
-  result_32+= UChi_02;
-
-#define ZP_RECON_ACCUM\
-  result_00+= UChi_00;\
-  result_01+= UChi_01;\
-  result_02+= UChi_02;\
-  result_10+= UChi_10;\
-  result_11+= UChi_11;\
-  result_12+= UChi_12;\
-  result_20-= timesI(UChi_00);			\
-  result_21-= timesI(UChi_01);			\
-  result_22-= timesI(UChi_02);			\
-  result_30+= timesI(UChi_10);			\
-  result_31+= timesI(UChi_11);			\
-  result_32+= timesI(UChi_12);
-
-#define ZM_RECON_ACCUM\
-  result_00+= UChi_00;\
-  result_01+= UChi_01;\
-  result_02+= UChi_02;\
-  result_10+= UChi_10;\
-  result_11+= UChi_11;\
-  result_12+= UChi_12;\
-  result_20+= timesI(UChi_00);			\
-  result_21+= timesI(UChi_01);			\
-  result_22+= timesI(UChi_02);			\
-  result_30-= timesI(UChi_10);			\
-  result_31-= timesI(UChi_11);			\
-  result_32-= timesI(UChi_12);
-
-#define TP_RECON_ACCUM\
-  result_00+= UChi_00;\
-  result_01+= UChi_01;\
-  result_02+= UChi_02;\
-  result_10+= UChi_10;\
-  result_11+= UChi_11;\
-  result_12+= UChi_12;\
-  result_20+= UChi_00;			\
-  result_21+= UChi_01;			\
-  result_22+= UChi_02;			\
-  result_30+= UChi_10;			\
-  result_31+= UChi_11;			\
-  result_32+= UChi_12;
-
-#define TM_RECON_ACCUM\
-  result_00+= UChi_00;\
-  result_01+= UChi_01;\
-  result_02+= UChi_02;\
-  result_10+= UChi_10;\
-  result_11+= UChi_11;\
-  result_12+= UChi_12;\
-  result_20-= UChi_00;	\
-  result_21-= UChi_01;	\
-  result_22-= UChi_02;	\
-  result_30-= UChi_10;	\
-  result_31-= UChi_11;	\
-  result_32-= UChi_12;
-
-#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON)	\
-  SE=st.GetEntry(ptype,DIR,ss);			\
-  offset = SE->_offset;				\
-  local  = SE->_is_local;			\
-  perm   = SE->_permute;			\
-  if ( local ) {				\
-    LOAD_CHIMU;					\
-    PROJ;					\
-    if ( perm) {				\
-      PERMUTE_DIR(PERM);			\
-    }						\
-  } else {					\
-    LOAD_CHI;					\
-  }						\
-  MULT_2SPIN(DIR);				\
-  RECON;					
-
-#define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON)	\
-  SE=st.GetEntry(ptype,DIR,ss);			\
-  offset = SE->_offset;				\
-  local  = SE->_is_local;			\
-  perm   = SE->_permute;			\
-  if ( local ) {				\
-    LOAD_CHIMU;					\
-    PROJ;					\
-    if ( perm) {				\
-      PERMUTE_DIR(PERM);			\
-    }						\
-  } else if ( st.same_node[DIR] ) {		\
-    LOAD_CHI;					\
-  }						\
-  if (local || st.same_node[DIR] ) {		\
-    MULT_2SPIN(DIR);				\
-    RECON;					\
-  }
-
-#define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON)	\
-  SE=st.GetEntry(ptype,DIR,ss);			\
-  offset = SE->_offset;				\
-  if((!SE->_is_local)&&(!st.same_node[DIR]) ) {	\
-    LOAD_CHI;					\
-    MULT_2SPIN(DIR);				\
-    RECON;					\
-    nmu++;					\
-  }
-
-#define HAND_RESULT(ss)				\
-  {						\
-    SiteSpinor & ref (out[ss]);		\
-    vstream(ref()(0)(0),result_00);		\
-    vstream(ref()(0)(1),result_01);		\
-    vstream(ref()(0)(2),result_02);		\
-    vstream(ref()(1)(0),result_10);		\
-    vstream(ref()(1)(1),result_11);		\
-    vstream(ref()(1)(2),result_12);		\
-    vstream(ref()(2)(0),result_20);		\
-    vstream(ref()(2)(1),result_21);		\
-    vstream(ref()(2)(2),result_22);		\
-    vstream(ref()(3)(0),result_30);		\
-    vstream(ref()(3)(1),result_31);		\
-    vstream(ref()(3)(2),result_32);		\
-  }
-
-#define HAND_RESULT_EXT(ss)			\
-  if (nmu){					\
-    SiteSpinor & ref (out[ss]);		\
-    ref()(0)(0)+=result_00;		\
-    ref()(0)(1)+=result_01;		\
-    ref()(0)(2)+=result_02;		\
-    ref()(1)(0)+=result_10;		\
-    ref()(1)(1)+=result_11;		\
-    ref()(1)(2)+=result_12;		\
-    ref()(2)(0)+=result_20;		\
-    ref()(2)(1)+=result_21;		\
-    ref()(2)(2)+=result_22;		\
-    ref()(3)(0)+=result_30;		\
-    ref()(3)(1)+=result_31;		\
-    ref()(3)(2)+=result_32;		\
-  }
-
-
-#define HAND_DECLARATIONS(a)			\
-  Simd result_00;				\
-  Simd result_01;				\
-  Simd result_02;				\
-  Simd result_10;				\
-  Simd result_11;				\
-  Simd result_12;				\
-  Simd result_20;				\
-  Simd result_21;				\
-  Simd result_22;				\
-  Simd result_30;				\
-  Simd result_31;				\
-  Simd result_32;				\
-  Simd Chi_00;					\
-  Simd Chi_01;					\
-  Simd Chi_02;					\
-  Simd Chi_10;					\
-  Simd Chi_11;					\
-  Simd Chi_12;					\
-  Simd UChi_00;					\
-  Simd UChi_01;					\
-  Simd UChi_02;					\
-  Simd UChi_10;					\
-  Simd UChi_11;					\
-  Simd UChi_12;					\
-  Simd U_00;					\
-  Simd U_10;					\
-  Simd U_20;					\
-  Simd U_01;					\
-  Simd U_11;					\
-  Simd U_21;
-
-#define ZERO_RESULT				\
-  result_00=Zero();				\
-  result_01=Zero();				\
-  result_02=Zero();				\
-  result_10=Zero();				\
-  result_11=Zero();				\
-  result_12=Zero();				\
-  result_20=Zero();				\
-  result_21=Zero();				\
-  result_22=Zero();				\
-  result_30=Zero();				\
-  result_31=Zero();				\
-  result_32=Zero();			
-
-#define Chimu_00 Chi_00
-#define Chimu_01 Chi_01
-#define Chimu_02 Chi_02
-#define Chimu_10 Chi_10
-#define Chimu_11 Chi_11
-#define Chimu_12 Chi_12
-#define Chimu_20 UChi_00
-#define Chimu_21 UChi_01
-#define Chimu_22 UChi_02
-#define Chimu_30 UChi_10
-#define Chimu_31 UChi_11
-#define Chimu_32 UChi_12
-
-NAMESPACE_BEGIN(Grid);
-
-template<class Impl> void 
-WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
-				  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
-{
-// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-
-  HAND_DECLARATIONS(ignore);
-
-  int offset,local,perm, ptype;
-  StencilEntry *SE;
-
-  HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON);
-  HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM);
-  HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
-  HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM);
-  HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM);
-  HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM);
-  HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
-  HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM);
-  HAND_RESULT(ss);
-}
-
-template<class Impl>
-void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
-{
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-
-  HAND_DECLARATIONS(ignore);
-
-  StencilEntry *SE;
-  int offset,local,perm, ptype;
-  
-  HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON);
-  HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM);
-  HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
-  HAND_STENCIL_LEG(TP_PROJ,0,Tp,TP_RECON_ACCUM);
-  HAND_STENCIL_LEG(XM_PROJ,3,Xm,XM_RECON_ACCUM);
-  HAND_STENCIL_LEG(YM_PROJ,2,Ym,YM_RECON_ACCUM);
-  HAND_STENCIL_LEG(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
-  HAND_STENCIL_LEG(TM_PROJ,0,Tm,TM_RECON_ACCUM);
-  HAND_RESULT(ss);
-}
-
-template<class Impl> void 
-WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
-					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
-{
-// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-
-  HAND_DECLARATIONS(ignore);
-
-  int offset,local,perm, ptype;
-  StencilEntry *SE;
-  ZERO_RESULT;
-  HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM);
-  HAND_STENCIL_LEG_INT(YM_PROJ,2,Yp,YM_RECON_ACCUM);
-  HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
-  HAND_STENCIL_LEG_INT(TM_PROJ,0,Tp,TM_RECON_ACCUM);
-  HAND_STENCIL_LEG_INT(XP_PROJ,3,Xm,XP_RECON_ACCUM);
-  HAND_STENCIL_LEG_INT(YP_PROJ,2,Ym,YP_RECON_ACCUM);
-  HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
-  HAND_STENCIL_LEG_INT(TP_PROJ,0,Tm,TP_RECON_ACCUM);
-  HAND_RESULT(ss);
-}
-
-template<class Impl>
-void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-						  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
-{
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-
-  HAND_DECLARATIONS(ignore);
-
-  StencilEntry *SE;
-  int offset,local,perm, ptype;
-  ZERO_RESULT;
-  HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
-  HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM);
-  HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
-  HAND_STENCIL_LEG_INT(TP_PROJ,0,Tp,TP_RECON_ACCUM);
-  HAND_STENCIL_LEG_INT(XM_PROJ,3,Xm,XM_RECON_ACCUM);
-  HAND_STENCIL_LEG_INT(YM_PROJ,2,Ym,YM_RECON_ACCUM);
-  HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
-  HAND_STENCIL_LEG_INT(TM_PROJ,0,Tm,TM_RECON_ACCUM);
-  HAND_RESULT(ss);
-}
-
-template<class Impl> void 
-WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
-					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
-{
-// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-
-  HAND_DECLARATIONS(ignore);
-
-  int offset, ptype;
-  StencilEntry *SE;
-  int nmu=0;
-  ZERO_RESULT;
-  HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xp,XM_RECON_ACCUM);
-  HAND_STENCIL_LEG_EXT(YM_PROJ,2,Yp,YM_RECON_ACCUM);
-  HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
-  HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tp,TM_RECON_ACCUM);
-  HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xm,XP_RECON_ACCUM);
-  HAND_STENCIL_LEG_EXT(YP_PROJ,2,Ym,YP_RECON_ACCUM);
-  HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
-  HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tm,TP_RECON_ACCUM);
-  HAND_RESULT_EXT(ss);
-}
-
-template<class Impl>
-void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-						  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
-{
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-
-  HAND_DECLARATIONS(ignore);
-
-  StencilEntry *SE;
-  int offset, ptype;
-  int nmu=0;
-  ZERO_RESULT;
-  HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
-  HAND_STENCIL_LEG_EXT(YP_PROJ,2,Yp,YP_RECON_ACCUM);
-  HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
-  HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tp,TP_RECON_ACCUM);
-  HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xm,XM_RECON_ACCUM);
-  HAND_STENCIL_LEG_EXT(YM_PROJ,2,Ym,YM_RECON_ACCUM);
-  HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
-  HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tm,TM_RECON_ACCUM);
-  HAND_RESULT_EXT(ss);
-}
-
-////////////// Wilson ; uses this implementation /////////////////////
-
-NAMESPACE_END(Grid);
-#undef LOAD_CHIMU  
-#undef LOAD_CHI 
-#undef MULT_2SPIN
-#undef PERMUTE_DIR
-#undef XP_PROJ  
-#undef YP_PROJ  
-#undef ZP_PROJ  
-#undef TP_PROJ  
-#undef XM_PROJ  
-#undef YM_PROJ  
-#undef ZM_PROJ  
-#undef TM_PROJ  
-#undef XP_RECON 
-#undef XP_RECON_ACCUM 
-#undef XM_RECON 
-#undef XM_RECON_ACCUM 
-#undef YP_RECON_ACCUM 
-#undef YM_RECON_ACCUM 
-#undef ZP_RECON_ACCUM 
-#undef ZM_RECON_ACCUM 
-#undef TP_RECON_ACCUM 
-#undef TM_RECON_ACCUM 
-#undef ZERO_RESULT				 
-#undef Chimu_00
-#undef Chimu_01
-#undef Chimu_02
-#undef Chimu_10
-#undef Chimu_11
-#undef Chimu_12
-#undef Chimu_20
-#undef Chimu_21
-#undef Chimu_22
-#undef Chimu_30
-#undef Chimu_31
-#undef Chimu_32
-#undef HAND_STENCIL_LEG
-#undef HAND_STENCIL_LEG_INT
-#undef HAND_STENCIL_LEG_EXT
-#undef HAND_RESULT
-#undef HAND_RESULT_INT
-#undef HAND_RESULT_EXT
--- a/Grid/simd/Fujitsu_A64FX_asm_double.h
+++ b/Grid/simd/Fujitsu_A64FX_asm_double.h
@@ -26,20 +26,20 @@ Author: Nils Meyer <nils.meyer@ur.de>
 *************************************************************************************/
 /*  END LEGAL */
 #define LOAD_CHIMU_A64FXd(x)           LOAD_CHIMU_INTERLEAVED_A64FXd(x)  
-#define PREFETCH_CHIMU_L1(A)  
-#define PREFETCH_GAUGE_L1(A)  
-#define PREFETCH_CHIMU_L2(A)  
-#define PREFETCH_GAUGE_L2(A)  
+#define PREFETCH_CHIMU_L1(A)           PREFETCH_CHIMU_L1_INTERNAL_A64FXd(A)  
+#define PREFETCH_GAUGE_L1(A)           PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A)  
+#define PREFETCH_CHIMU_L2(A)           PREFETCH_CHIMU_L2_INTERNAL_A64FXd(A)  
+#define PREFETCH_GAUGE_L2(A)           PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A)  
 #define PF_GAUGE(A)  
-#define PREFETCH1_CHIMU(A)  
-#define PREFETCH_CHIMU(A)  
+#define PREFETCH1_CHIMU(A)             PREFETCH_CHIMU_L1(A)  
+#define PREFETCH_CHIMU(A)              PREFETCH_CHIMU_L1(A)  
 #define LOCK_GAUGE(A)  
 #define UNLOCK_GAUGE(A)  
 #define MASK_REGS                      DECLARATIONS_A64FXd  
 #define COMPLEX_SIGNS(A)  
 #define LOAD64(A,B)  
 #define SAVE_RESULT(A,B)               RESULT_A64FXd(A)  
-#define MULT_2SPIN_DIR_PF(A,B)         MULT_2SPIN_A64FXd(A)  
+#define MULT_2SPIN_DIR_PF(A,B)         { PREFETCH_GAUGE_L1(A); PREFETCH_CHIMU_L2(B); MULT_2SPIN_A64FXd(A); if (((A) == 0) || ((A) == 4)) { PREFETCH_GAUGE_L2(A); } }  // braced so the prefetches, multiply and conditional L2 gauge prefetch expand as one statement
 #define MAYBEPERM(A,perm)              { A ; }  
 #define LOAD_CHI(base)                 LOAD_CHI_A64FXd(base)  
 #define ZERO_PSI  
@@ -105,9 +105,9 @@ asm ( \
 #define PREFETCH_CHIMU_L2_INTERNAL_A64FXd(base)  \
 { \
 asm ( \
-    "prfd PLDL2STRM, p5, [%[fetchptr], 0, MUL VL] \n\t" \
-    "prfd PLDL2STRM, p5, [%[fetchptr], 4, MUL VL] \n\t" \
-    "prfd PLDL2STRM, p5, [%[fetchptr], 8, MUL VL] \n\t" \
+    "prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
+    "prfd PLDL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
+    "prfd PLDL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
    :  \
    : [fetchptr] "r" (base) \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
@@ -117,9 +117,9 @@ asm ( \
 #define PREFETCH_CHIMU_L1_INTERNAL_A64FXd(base)  \
 { \
 asm ( \
-    "prfd PLDL1STRM, p5, [%[fetchptr], 0, MUL VL] \n\t" \
-    "prfd PLDL1STRM, p5, [%[fetchptr], 4, MUL VL] \n\t" \
-    "prfd PLDL1STRM, p5, [%[fetchptr], 8, MUL VL] \n\t" \
+    "prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
+    "prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
+    "prfd PLDL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
    :  \
    : [fetchptr] "r" (base) \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
@@ -128,24 +128,30 @@ asm ( \
 // PREFETCH_GAUGE_L2 (prefetch to L2)
 #define PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A)  \
 { \
-    const auto & ref(U[sUn][A]); uint64_t baseU = (uint64_t)&ref[0][0]; \
+    const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \
 asm ( \
-    "prfd PLDL2STRM, p5, [%[fetchptr], 0, MUL VL] \n\t" \
-    "prfd PLDL2STRM, p5, [%[fetchptr], 4, MUL VL] \n\t" \
-    "prfd PLDL2STRM, p5, [%[fetchptr], 8, MUL VL] \n\t" \
+    "prfd PLDL2STRM, p5, [%[fetchptr], -4, mul vl] \n\t" \
+    "prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
+    "prfd PLDL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
+    "prfd PLDL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
+    "prfd PLDL2STRM, p5, [%[fetchptr], 12, mul vl] \n\t" \
+    "prfd PLDL2STRM, p5, [%[fetchptr], 16, mul vl] \n\t" \
+    "prfd PLDL2STRM, p5, [%[fetchptr], 20, mul vl] \n\t" \
+    "prfd PLDL2STRM, p5, [%[fetchptr], 24, mul vl] \n\t" \
+    "prfd PLDL2STRM, p5, [%[fetchptr], 28, mul vl] \n\t" \
    :  \
    : [fetchptr] "r" (baseU) \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
 ); \
 }
 // PREFETCH_GAUGE_L1 (prefetch to L1)
-#define PREFETCH_GAUGE_L1_INTERNAL(A)_A64FXd  \
+#define PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A)  \
 { \
-    const auto & ref(U[sU][A]); uint64_t baseU = (uint64_t)&ref[0][0]; \
+    const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
 asm ( \
-    "prfd PLDL1STRM, p5, [%[fetchptr], 0, MUL VL] \n\t" \
-    "prfd PLDL1STRM, p5, [%[fetchptr], 4, MUL VL] \n\t" \
-    "prfd PLDL1STRM, p5, [%[fetchptr], 8, MUL VL] \n\t" \
+    "prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
+    "prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
+    "prfd PLDL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
    :  \
    : [fetchptr] "r" (baseU) \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
--- a/Grid/simd/Fujitsu_A64FX_asm_single.h
+++ b/Grid/simd/Fujitsu_A64FX_asm_single.h
@@ -26,20 +26,20 @@ Author: Nils Meyer <nils.meyer@ur.de>
 *************************************************************************************/
 /*  END LEGAL */
 #define LOAD_CHIMU_A64FXf(x)           LOAD_CHIMU_INTERLEAVED_A64FXf(x)  
-#define PREFETCH_CHIMU_L1(A)  
-#define PREFETCH_GAUGE_L1(A)  
-#define PREFETCH_CHIMU_L2(A)  
-#define PREFETCH_GAUGE_L2(A)  
+#define PREFETCH_CHIMU_L1(A)           PREFETCH_CHIMU_L1_INTERNAL_A64FXf(A)  
+#define PREFETCH_GAUGE_L1(A)           PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A)  
+#define PREFETCH_CHIMU_L2(A)           PREFETCH_CHIMU_L2_INTERNAL_A64FXf(A)  
+#define PREFETCH_GAUGE_L2(A)           PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A)  
 #define PF_GAUGE(A)  
-#define PREFETCH1_CHIMU(A)  
-#define PREFETCH_CHIMU(A)  
+#define PREFETCH1_CHIMU(A)             PREFETCH_CHIMU_L1(A)  
+#define PREFETCH_CHIMU(A)              PREFETCH_CHIMU_L1(A)  
 #define LOCK_GAUGE(A)  
 #define UNLOCK_GAUGE(A)  
 #define MASK_REGS                      DECLARATIONS_A64FXf  
 #define COMPLEX_SIGNS(A)  
 #define LOAD64(A,B)  
 #define SAVE_RESULT(A,B)               RESULT_A64FXf(A)  
-#define MULT_2SPIN_DIR_PF(A,B)         MULT_2SPIN_A64FXf(A)  
+#define MULT_2SPIN_DIR_PF(A,B)         { PREFETCH_GAUGE_L1(A); PREFETCH_CHIMU_L2(B); MULT_2SPIN_A64FXf(A); if (((A) == 0) || ((A) == 4)) { PREFETCH_GAUGE_L2(A); } }  // braced so the prefetches, multiply and conditional L2 gauge prefetch expand as one statement
 #define MAYBEPERM(A,perm)              { A ; }  
 #define LOAD_CHI(base)                 LOAD_CHI_A64FXf(base)  
 #define ZERO_PSI  
@@ -105,9 +105,9 @@ asm ( \
 #define PREFETCH_CHIMU_L2_INTERNAL_A64FXf(base)  \
 { \
 asm ( \
-    "prfd PLDL2STRM, p5, [%[fetchptr], 0, MUL VL] \n\t" \
-    "prfd PLDL2STRM, p5, [%[fetchptr], 4, MUL VL] \n\t" \
-    "prfd PLDL2STRM, p5, [%[fetchptr], 8, MUL VL] \n\t" \
+    "prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
+    "prfd PLDL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
+    "prfd PLDL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
    :  \
    : [fetchptr] "r" (base) \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
@@ -117,9 +117,9 @@ asm ( \
 #define PREFETCH_CHIMU_L1_INTERNAL_A64FXf(base)  \
 { \
 asm ( \
-    "prfd PLDL1STRM, p5, [%[fetchptr], 0, MUL VL] \n\t" \
-    "prfd PLDL1STRM, p5, [%[fetchptr], 4, MUL VL] \n\t" \
-    "prfd PLDL1STRM, p5, [%[fetchptr], 8, MUL VL] \n\t" \
+    "prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
+    "prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
+    "prfd PLDL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
    :  \
    : [fetchptr] "r" (base) \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
@@ -128,24 +128,30 @@ asm ( \
 // PREFETCH_GAUGE_L2 (prefetch to L2)
 #define PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A)  \
 { \
-    const auto & ref(U[sUn][A]); uint64_t baseU = (uint64_t)&ref[0][0]; \
+    const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \
 asm ( \
-    "prfd PLDL2STRM, p5, [%[fetchptr], 0, MUL VL] \n\t" \
-    "prfd PLDL2STRM, p5, [%[fetchptr], 4, MUL VL] \n\t" \
-    "prfd PLDL2STRM, p5, [%[fetchptr], 8, MUL VL] \n\t" \
+    "prfd PLDL2STRM, p5, [%[fetchptr], -4, mul vl] \n\t" \
+    "prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
+    "prfd PLDL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
+    "prfd PLDL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
+    "prfd PLDL2STRM, p5, [%[fetchptr], 12, mul vl] \n\t" \
+    "prfd PLDL2STRM, p5, [%[fetchptr], 16, mul vl] \n\t" \
+    "prfd PLDL2STRM, p5, [%[fetchptr], 20, mul vl] \n\t" \
+    "prfd PLDL2STRM, p5, [%[fetchptr], 24, mul vl] \n\t" \
+    "prfd PLDL2STRM, p5, [%[fetchptr], 28, mul vl] \n\t" \
    :  \
    : [fetchptr] "r" (baseU) \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
 ); \
 }
 // PREFETCH_GAUGE_L1 (prefetch to L1)
-#define PREFETCH_GAUGE_L1_INTERNAL(A)_A64FXf  \
+#define PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A)  \
 { \
-    const auto & ref(U[sU][A]); uint64_t baseU = (uint64_t)&ref[0][0]; \
+    const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
 asm ( \
-    "prfd PLDL1STRM, p5, [%[fetchptr], 0, MUL VL] \n\t" \
-    "prfd PLDL1STRM, p5, [%[fetchptr], 4, MUL VL] \n\t" \
-    "prfd PLDL1STRM, p5, [%[fetchptr], 8, MUL VL] \n\t" \
+    "prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
+    "prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
+    "prfd PLDL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
    :  \
    : [fetchptr] "r" (baseU) \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
--- a/Grid/simd/Fujitsu_A64FX_intrin_double.h
+++ b/Grid/simd/Fujitsu_A64FX_intrin_double.h
@@ -26,20 +26,20 @@ Author: Nils Meyer <nils.meyer@ur.de>
 *************************************************************************************/
 /*  END LEGAL */
 #define LOAD_CHIMU_A64FXd(x)           LOAD_CHIMU_INTERLEAVED_A64FXd(x)  
-#define PREFETCH_CHIMU_L1(A)  
-#define PREFETCH_GAUGE_L1(A)  
-#define PREFETCH_CHIMU_L2(A)  
-#define PREFETCH_GAUGE_L2(A)  
+#define PREFETCH_CHIMU_L1(A)           PREFETCH_CHIMU_L1_INTERNAL_A64FXd(A)  
+#define PREFETCH_GAUGE_L1(A)           PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A)  
+#define PREFETCH_CHIMU_L2(A)           PREFETCH_CHIMU_L2_INTERNAL_A64FXd(A)  
+#define PREFETCH_GAUGE_L2(A)           PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A)  
 #define PF_GAUGE(A)  
-#define PREFETCH1_CHIMU(A)  
-#define PREFETCH_CHIMU(A)  
+#define PREFETCH1_CHIMU(A)             PREFETCH_CHIMU_L1(A)  
+#define PREFETCH_CHIMU(A)              PREFETCH_CHIMU_L1(A)  
 #define LOCK_GAUGE(A)  
 #define UNLOCK_GAUGE(A)  
 #define MASK_REGS                      DECLARATIONS_A64FXd  
 #define COMPLEX_SIGNS(A)  
 #define LOAD64(A,B)  
 #define SAVE_RESULT(A,B)               RESULT_A64FXd(A)  
-#define MULT_2SPIN_DIR_PF(A,B)         MULT_2SPIN_A64FXd(A)  
+#define MULT_2SPIN_DIR_PF(A,B)         { PREFETCH_GAUGE_L1(A); PREFETCH_CHIMU_L2(B); MULT_2SPIN_A64FXd(A); if (((A) == 0) || ((A) == 4)) { PREFETCH_GAUGE_L2(A); } }  // braced so the prefetches, multiply and conditional L2 gauge prefetch expand as one statement
 #define MAYBEPERM(A,perm)              { A ; }  
 #define LOAD_CHI(base)                 LOAD_CHI_A64FXd(base)  
 #define ZERO_PSI  
@@ -154,15 +154,21 @@ Author: Nils Meyer <nils.meyer@ur.de>
 // PREFETCH_GAUGE_L2 (prefetch to L2)
 #define PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A)  \
 { \
-    const auto & ref(U[sUn][A]); uint64_t baseU = (uint64_t)&ref[0][0]; \
+    const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \
+    svprfd(pg1, (int64_t*)(baseU + -256), SV_PLDL2STRM); \
    svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL2STRM); \
    svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL2STRM); \
    svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL2STRM); \
+    svprfd(pg1, (int64_t*)(baseU + 768), SV_PLDL2STRM); \
+    svprfd(pg1, (int64_t*)(baseU + 1024), SV_PLDL2STRM); \
+    svprfd(pg1, (int64_t*)(baseU + 1280), SV_PLDL2STRM); \
+    svprfd(pg1, (int64_t*)(baseU + 1536), SV_PLDL2STRM); \
+    svprfd(pg1, (int64_t*)(baseU + 1792), SV_PLDL2STRM); \
 }
 // PREFETCH_GAUGE_L1 (prefetch to L1)
-#define PREFETCH_GAUGE_L1_INTERNAL(A)_A64FXd  \
+#define PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A)  \
 { \
-    const auto & ref(U[sU][A]); uint64_t baseU = (uint64_t)&ref[0][0]; \
+    const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
    svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL1STRM); \
    svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL1STRM); \
    svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL1STRM); \
--- a/Grid/simd/Fujitsu_A64FX_intrin_single.h
+++ b/Grid/simd/Fujitsu_A64FX_intrin_single.h
@@ -26,20 +26,20 @@ Author: Nils Meyer <nils.meyer@ur.de>
 *************************************************************************************/
 /*  END LEGAL */
 #define LOAD_CHIMU_A64FXf(x)           LOAD_CHIMU_INTERLEAVED_A64FXf(x)  
-#define PREFETCH_CHIMU_L1(A)  
-#define PREFETCH_GAUGE_L1(A)  
-#define PREFETCH_CHIMU_L2(A)  
-#define PREFETCH_GAUGE_L2(A)  
+#define PREFETCH_CHIMU_L1(A)           PREFETCH_CHIMU_L1_INTERNAL_A64FXf(A)  
+#define PREFETCH_GAUGE_L1(A)           PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A)  
+#define PREFETCH_CHIMU_L2(A)           PREFETCH_CHIMU_L2_INTERNAL_A64FXf(A)  
+#define PREFETCH_GAUGE_L2(A)           PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A)  
 #define PF_GAUGE(A)  
-#define PREFETCH1_CHIMU(A)  
-#define PREFETCH_CHIMU(A)  
+#define PREFETCH1_CHIMU(A)             PREFETCH_CHIMU_L1(A)  
+#define PREFETCH_CHIMU(A)              PREFETCH_CHIMU_L1(A)  
 #define LOCK_GAUGE(A)  
 #define UNLOCK_GAUGE(A)  
 #define MASK_REGS                      DECLARATIONS_A64FXf  
 #define COMPLEX_SIGNS(A)  
 #define LOAD64(A,B)  
 #define SAVE_RESULT(A,B)               RESULT_A64FXf(A)  
-#define MULT_2SPIN_DIR_PF(A,B)         MULT_2SPIN_A64FXf(A)  
+#define MULT_2SPIN_DIR_PF(A,B)         { PREFETCH_GAUGE_L1(A); PREFETCH_CHIMU_L2(B); MULT_2SPIN_A64FXf(A); if (((A) == 0) || ((A) == 4)) { PREFETCH_GAUGE_L2(A); } }  // braced so the prefetches, multiply and conditional L2 gauge prefetch expand as one statement
 #define MAYBEPERM(A,perm)              { A ; }  
 #define LOAD_CHI(base)                 LOAD_CHI_A64FXf(base)  
 #define ZERO_PSI  
@@ -154,15 +154,21 @@ Author: Nils Meyer <nils.meyer@ur.de>
 // PREFETCH_GAUGE_L2 (prefetch to L2)
 #define PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A)  \
 { \
-    const auto & ref(U[sUn][A]); uint64_t baseU = (uint64_t)&ref[0][0]; \
+    const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \
+    svprfd(pg1, (int64_t*)(baseU + -256), SV_PLDL2STRM); \
    svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL2STRM); \
    svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL2STRM); \
    svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL2STRM); \
+    svprfd(pg1, (int64_t*)(baseU + 768), SV_PLDL2STRM); \
+    svprfd(pg1, (int64_t*)(baseU + 1024), SV_PLDL2STRM); \
+    svprfd(pg1, (int64_t*)(baseU + 1280), SV_PLDL2STRM); \
+    svprfd(pg1, (int64_t*)(baseU + 1536), SV_PLDL2STRM); \
+    svprfd(pg1, (int64_t*)(baseU + 1792), SV_PLDL2STRM); \
 }
 // PREFETCH_GAUGE_L1 (prefetch to L1)
-#define PREFETCH_GAUGE_L1_INTERNAL(A)_A64FXf  \
+#define PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A)  \
 { \
-    const auto & ref(U[sU][A]); uint64_t baseU = (uint64_t)&ref[0][0]; \
+    const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
    svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL1STRM); \
    svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL1STRM); \
    svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL1STRM); \