From 873519e96046acfd0844a7d07d540d989a7a6204 Mon Sep 17 00:00:00 2001 From: Michael Marshall <43034299+mmphys@users.noreply.github.com> Date: Mon, 14 Dec 2020 16:06:10 +0000 Subject: [PATCH 01/16] Enable existing conserved current code for CUDA (compiles OK for CUDA 10.1). Add option to Test_cayley_mres to load a configuration --- .../implementation/CayleyFermion5DImplementation.h | 4 ++-- tests/debug/Test_cayley_mres.cc | 14 ++++++++++++-- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h index b3fbe096..f11e9c44 100644 --- a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h @@ -642,7 +642,7 @@ void CayleyFermion5D::ContractConservedCurrent( PropagatorField &q_in_1, Current curr_type, unsigned int mu) { -#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) +#if (!defined(GRID_HIP)) Gamma::Algebra Gmu [] = { Gamma::Algebra::GammaX, Gamma::Algebra::GammaY, @@ -826,7 +826,7 @@ void CayleyFermion5D::SeqConservedCurrent(PropagatorField &q_in, } #endif -#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) +#if (!defined(GRID_HIP)) int tshift = (mu == Nd-1) ? 1 : 0; //////////////////////////////////////////////// // GENERAL CAYLEY CASE diff --git a/tests/debug/Test_cayley_mres.cc b/tests/debug/Test_cayley_mres.cc index 2e56fa81..5282c756 100644 --- a/tests/debug/Test_cayley_mres.cc +++ b/tests/debug/Test_cayley_mres.cc @@ -108,8 +108,18 @@ int main (int argc, char ** argv) GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); LatticeGaugeField Umu(UGrid); - SU::ColdConfiguration(Umu); - // SU::HotConfiguration(RNG4,Umu); + if( argc > 1 && argv[1][0] != '-' ) + { + std::cout<::ColdConfiguration(Umu); + // SU::HotConfiguration(RNG4,Umu); + } RealD mass=0.3; RealD M5 =1.0; From 4dd9e39e0d465e7cad3aef001dc0edf5e65b0ea6 Mon Sep 17 00:00:00 2001 From: Nils Meyer Date: Sat, 19 Dec 2020 00:54:31 +0100 Subject: [PATCH 02/16] up to +36% performance gain for dslash/dwf on QPACE 4 using GCC 10.1.1 --- .../implementation/WilsonKernelsAsmA64FX.h | 268 +- .../WilsonKernelsAsmBodyA64FX.h | 105 +- Grid/simd/Fujitsu_A64FX_asm_double.h | 148 +- Grid/simd/Fujitsu_A64FX_asm_single.h | 148 +- Grid/simd/Fujitsu_A64FX_intrin_double.h | 160 +- Grid/simd/Fujitsu_A64FX_intrin_single.h | 160 +- Grid/simd/Fujitsu_A64FX_undef.h | 1 + Grid/simd/gridverter.py | 2377 ----------------- 8 files changed, 447 insertions(+), 2920 deletions(-) delete mode 100755 Grid/simd/gridverter.py diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h index 2e587dfa..ffec05a0 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h @@ -38,9 +38,6 @@ Author: Nils Meyer Regensburg University // undefine everything related to kernels #include -// enable A64FX body -#define WILSONKERNELSASMBODYA64FX -//#pragma message("A64FX Dslash: WilsonKernelsAsmBodyA64FX.h") /////////////////////////////////////////////////////////// // If we are A64FX specialise the single precision routine @@ -63,119 +60,89 @@ Author: Nils Meyer Regensburg University #define INTERIOR_AND_EXTERIOR #undef INTERIOR #undef EXTERIOR + +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void 
WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif + #undef INTERIOR_AND_EXTERIOR #define INTERIOR #undef EXTERIOR + +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif + #undef INTERIOR_AND_EXTERIOR #undef INTERIOR #define EXTERIOR + +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void 
WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif + ///////////////////////////////////////////////////////////////// @@ -185,119 +152,89 @@ WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldV #define INTERIOR_AND_EXTERIOR #undef INTERIOR #undef EXTERIOR + +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif + #undef INTERIOR_AND_EXTERIOR #define INTERIOR #undef EXTERIOR + +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) 
-#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif + #undef INTERIOR_AND_EXTERIOR #undef INTERIOR #define EXTERIOR + +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif + // undefine @@ -330,119 +267,89 @@ WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFie #define INTERIOR_AND_EXTERIOR #undef INTERIOR #undef EXTERIOR + +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif + #undef INTERIOR_AND_EXTERIOR #define INTERIOR #undef EXTERIOR + +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void 
WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif + #undef INTERIOR_AND_EXTERIOR #undef INTERIOR #define EXTERIOR + +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif + ///////////////////////////////////////////////////////////////// // XYZT vectorised, dag Kernel, double @@ -451,124 +358,93 @@ WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldV #define INTERIOR_AND_EXTERIOR #undef INTERIOR #undef EXTERIOR + +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, 
FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif + #undef INTERIOR_AND_EXTERIOR #define INTERIOR #undef EXTERIOR + +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif + #undef INTERIOR_AND_EXTERIOR #undef INTERIOR #define EXTERIOR + +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif + // undefs -#undef WILSONKERNELSASMBODYA64FX #include #endif //A64FXASM diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h 
index 406e5c25..83588a7d 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h @@ -25,6 +25,11 @@ Author: Nils Meyer Regensburg University See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ /* END LEGAL */ + +// GCC 10 messes up SVE instruction scheduling using -O3 only, +// using -O3 -fno-schedule-insns -fno-schedule-insns2 does wonders +// performance is better than armclang 20.2 + #ifdef KERNEL_DAG #define DIR0_PROJ XP_PROJ #define DIR1_PROJ YP_PROJ @@ -97,7 +102,7 @@ Author: Nils Meyer Regensburg University PROJ; \ MAYBEPERM(PERMUTE_DIR,perm); \ } else { \ - LOAD_CHI(base); \ + LOAD_CHI(base); \ } \ base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \ MULT_2SPIN_1(Dir); \ @@ -110,6 +115,15 @@ Author: Nils Meyer Regensburg University } \ RECON; \ +/* +NB: picking PREFETCH_GAUGE_L2(Dir+4); here results in performance penalty + though I expected that it would improve on performance + + if (s == 0) { \ + if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \ + } \ +*/ + #define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ PREFETCH1_CHIMU(base); \ @@ -126,73 +140,63 @@ Author: Nils Meyer Regensburg University #define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ basep = st.GetPFInfo(nent,plocal); nent++; \ - if ( local ) { \ - LOAD_CHIMU(base); \ - LOAD_TABLE(PERMUTE_DIR); \ - PROJ; \ - MAYBEPERM(PERMUTE_DIR,perm); \ - }else if ( st.same_node[Dir] ) {LOAD_CHI(base);} \ - base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \ - if ( local || st.same_node[Dir] ) { \ - MULT_2SPIN_1(Dir); \ - PREFETCH_CHIMU(base); \ - /* PREFETCH_GAUGE_L1(NxtDir); */ \ - MULT_2SPIN_2; \ - if (s == 0) { \ - if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \ - } \ - RECON; \ - PREFETCH_CHIMU_L2(basep); \ - } else { PREFETCH_CHIMU(base); } \ + if ( local ) { \ + LOAD_CHIMU(base); \ + LOAD_TABLE(PERMUTE_DIR); \ + PROJ; \ + MAYBEPERM(PERMUTE_DIR,perm); \ + }else if ( st.same_node[Dir] ) {LOAD_CHI(base);} \ + if ( local || st.same_node[Dir] ) { \ + MULT_2SPIN_1(Dir); \ + MULT_2SPIN_2; \ + RECON; \ + } \ + base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \ + PREFETCH_CHIMU(base); \ + PREFETCH_CHIMU_L2(basep); \ #define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ PREFETCH1_CHIMU(base); \ + { ZERO_PSI; } \ ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) #define RESULT(base,basep) SAVE_RESULT(base,basep); #endif + //////////////////////////////////////////////////////////////////////////////// // Post comms kernel //////////////////////////////////////////////////////////////////////////////// #ifdef EXTERIOR - #define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ - base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ - if((!local)&&(!st.same_node[Dir]) ) { \ - LOAD_CHI(base); \ + base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ + if((!local)&&(!st.same_node[Dir]) ) { \ + LOAD_CHI(base); \ MULT_2SPIN_1(Dir); \ - PREFETCH_CHIMU(base); \ - /* PREFETCH_GAUGE_L1(NxtDir); */ \ MULT_2SPIN_2; \ - if (s == 0) { \ - if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \ - } \ - RECON; \ - nmu++; \ + RECON; \ + nmu++; \ } -#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ - nmu=0; \ - base = 
st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++;\ - if((!local)&&(!st.same_node[Dir]) ) { \ - LOAD_CHI(base); \ +#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ + nmu=0; \ + { ZERO_PSI;} \ + base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ + if((!local)&&(!st.same_node[Dir]) ) { \ + LOAD_CHI(base); \ MULT_2SPIN_1(Dir); \ - PREFETCH_CHIMU(base); \ - /* PREFETCH_GAUGE_L1(NxtDir); */ \ MULT_2SPIN_2; \ - if (s == 0) { \ - if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \ - } \ - RECON; \ - nmu++; \ + RECON; \ + nmu++; \ } #define RESULT(base,basep) if (nmu){ ADD_RESULT(base,base);} #endif + + { int nmu; int local,perm, ptype; @@ -209,7 +213,6 @@ Author: Nils Meyer Regensburg University int ssn=ssU+1; if(ssn>=nmax) ssn=0; // int sUn=lo.Reorder(ssn); int sUn=ssn; - LOCK_GAUGE(0); #else int sU =ssU; int ssn=ssU+1; if(ssn>=nmax) ssn=0; @@ -295,6 +298,11 @@ Author: Nils Meyer Regensburg University std::cout << "----------------------------------------------------" << std::endl; #endif + // DC ZVA test + // { uint64_t basestore = (uint64_t)&out[ss]; + // PREFETCH_RESULT_L2_STORE(basestore); } + + ASM_LEG(Ym,Zm,PERMUTE_DIR2,DIR5_PROJ,DIR5_RECON); #ifdef SHOW @@ -308,6 +316,11 @@ Author: Nils Meyer Regensburg University std::cout << "----------------------------------------------------" << std::endl; #endif + // DC ZVA test + //{ uint64_t basestore = (uint64_t)&out[ss]; + // PREFETCH_RESULT_L2_STORE(basestore); } + + ASM_LEG(Zm,Tm,PERMUTE_DIR1,DIR6_PROJ,DIR6_RECON); #ifdef SHOW @@ -321,6 +334,11 @@ Author: Nils Meyer Regensburg University std::cout << "----------------------------------------------------" << std::endl; #endif + // DC ZVA test + //{ uint64_t basestore = (uint64_t)&out[ss]; + // PREFETCH_RESULT_L2_STORE(basestore); + //} + ASM_LEG(Tm,Xp,PERMUTE_DIR0,DIR7_PROJ,DIR7_RECON); #ifdef SHOW @@ -341,6 +359,7 @@ Author: Nils Meyer Regensburg University base = (uint64_t) &out[ss]; basep= st.GetPFInfo(nent,plocal); ent++; basep = (uint64_t) &out[ssn]; + //PREFETCH_RESULT_L1_STORE(base); RESULT(base,basep); #ifdef SHOW diff --git a/Grid/simd/Fujitsu_A64FX_asm_double.h b/Grid/simd/Fujitsu_A64FX_asm_double.h index 76c556d7..bbc4efe7 100644 --- a/Grid/simd/Fujitsu_A64FX_asm_double.h +++ b/Grid/simd/Fujitsu_A64FX_asm_double.h @@ -38,10 +38,11 @@ Author: Nils Meyer #define LOCK_GAUGE(A) #define UNLOCK_GAUGE(A) #define MASK_REGS DECLARATIONS_A64FXd -#define SAVE_RESULT(A,B) RESULT_A64FXd(A); PREFETCH_RESULT_L2_STORE(B) +#define SAVE_RESULT(A,B) RESULT_A64FXd(A); #define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXd(Dir) #define MULT_2SPIN_2 MULT_2SPIN_2_A64FXd #define LOAD_CHI(base) LOAD_CHI_A64FXd(base) +#define ZERO_PSI ZERO_PSI_A64FXd #define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base) #define XP_PROJ XP_PROJ_A64FXd #define YP_PROJ YP_PROJ_A64FXd @@ -70,11 +71,18 @@ Author: Nils Meyer #define MAYBEPERM(Dir,perm) if (Dir != 3) { if (perm) { PERMUTE; } } // DECLARATIONS #define DECLARATIONS_A64FXd \ + uint64_t baseU; \ const uint64_t lut[4][8] = { \ {4, 5, 6, 7, 0, 1, 2, 3}, \ {2, 3, 0, 1, 6, 7, 4, 5}, \ {1, 0, 3, 2, 5, 4, 7, 6}, \ {0, 1, 2, 4, 5, 6, 7, 8} };\ +asm ( \ + "ptrue p5.d \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ asm ( \ "fmov z31.d , 0 \n\t" \ : \ @@ -130,7 +138,7 @@ asm ( \ // PREFETCH_GAUGE_L2 (prefetch to L2) #define 
PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A) \ { \ - const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \ + const auto & ref(U[sUn](A)); baseU = (uint64_t)&ref + 3 * 3 * 64; \ asm ( \ "prfd PLDL2STRM, p5, [%[fetchptr], -4, mul vl] \n\t" \ "prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ @@ -149,7 +157,7 @@ asm ( \ // PREFETCH_GAUGE_L1 (prefetch to L1) #define PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A) \ { \ - const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ + const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ asm ( \ "prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ "prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ @@ -163,12 +171,12 @@ asm ( \ #define LOAD_CHI_A64FXd(base) \ { \ asm ( \ - "ldr z12, [%[fetchptr], 0, mul vl] \n\t" \ - "ldr z13, [%[fetchptr], 1, mul vl] \n\t" \ - "ldr z14, [%[fetchptr], 2, mul vl] \n\t" \ - "ldr z15, [%[fetchptr], 3, mul vl] \n\t" \ - "ldr z16, [%[fetchptr], 4, mul vl] \n\t" \ - "ldr z17, [%[fetchptr], 5, mul vl] \n\t" \ + "ld1d { z12.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1d { z13.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1d { z14.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ + "ld1d { z15.d }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ + "ld1d { z16.d }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ + "ld1d { z17.d }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ : \ : [fetchptr] "r" (base) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -178,19 +186,18 @@ asm ( \ #define LOAD_CHIMU_INTERLEAVED_A64FXd(base) \ { \ asm ( \ - "ptrue p5.d \n\t" \ - "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \ - "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \ - "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \ - "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \ - "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \ - "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \ - "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \ - "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \ - "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \ - "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \ - "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \ - "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \ + "ld1d { z12.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1d { z21.d }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ + "ld1d { z15.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1d { z18.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1d { z13.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1d { z22.d }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ + "ld1d { z16.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1d { z19.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1d { z14.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ + "ld1d { z23.d }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ + "ld1d { z17.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ + "ld1d { z20.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ : \ : [fetchptr] "r" (base + 2 * 3 * 64) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -201,19 +208,18 @@ asm ( \ { \ const SiteSpinor & ref(in[offset]); \ asm ( \ - "ptrue p5.d \n\t" \ - "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \ - "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \ - "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \ - "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \ - "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \ - "ldr z20, [%[fetchptr], 2, 
mul vl] \n\t" \ - "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \ - "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \ - "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \ - "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \ - "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \ - "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \ + "ld1d { z12.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1d { z18.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1d { z13.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1d { z19.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1d { z14.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ + "ld1d { z20.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ + "ld1d { z15.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1d { z21.d }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ + "ld1d { z16.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1d { z22.d }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ + "ld1d { z17.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ + "ld1d { z23.d }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ : \ : [fetchptr] "r" (&ref[2][0]) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -224,19 +230,18 @@ asm ( \ { \ const SiteSpinor & ref(in[offset]); \ asm ( \ - "ptrue p5.d \n\t" \ - "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \ - "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \ - "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \ - "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \ - "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \ - "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \ - "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \ - "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \ - "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \ - "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \ - "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \ - "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \ + "ld1d { z12.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1d { z21.d }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ + "ld1d { z13.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1d { z22.d }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ + "ld1d { z14.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ + "ld1d { z23.d }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ + "ld1d { z15.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1d { z18.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1d { z16.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1d { z19.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1d { z17.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ + "ld1d { z20.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ : \ : [fetchptr] "r" (&ref[2][0]) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -293,17 +298,16 @@ asm ( \ ); // LOAD_GAUGE -#define LOAD_GAUGE \ - const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ +#define LOAD_GAUGE(A) \ { \ + const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ asm ( \ - "ptrue p5.d \n\t" \ - "ldr z24, [%[fetchptr], -6, mul vl] \n\t" \ - "ldr z25, [%[fetchptr], -3, mul vl] \n\t" \ - "ldr z26, [%[fetchptr], 0, mul vl] \n\t" \ - "ldr z27, [%[fetchptr], -5, mul vl] \n\t" \ - "ldr z28, [%[fetchptr], -2, mul vl] \n\t" \ - "ldr z29, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1d { z24.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1d { z25.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1d { z26.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1d { z27.d }, p5/z, 
[%[fetchptr], -5, mul vl] \n\t" \ + "ld1d { z28.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1d { z29.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ : \ : [fetchptr] "r" (baseU + 2 * 3 * 64) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -312,14 +316,14 @@ asm ( \ // MULT_2SPIN #define MULT_2SPIN_1_A64FXd(A) \ { \ - const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ + const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ asm ( \ - "ldr z24, [%[fetchptr], -6, mul vl] \n\t" \ - "ldr z25, [%[fetchptr], -3, mul vl] \n\t" \ - "ldr z26, [%[fetchptr], 0, mul vl] \n\t" \ - "ldr z27, [%[fetchptr], -5, mul vl] \n\t" \ - "ldr z28, [%[fetchptr], -2, mul vl] \n\t" \ - "ldr z29, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1d { z24.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1d { z25.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1d { z26.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1d { z27.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1d { z28.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1d { z29.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ "movprfx z18.d, p5/m, z31.d \n\t" \ "fcmla z18.d, p5/m, z24.d, z12.d, 0 \n\t" \ "movprfx z21.d, p5/m, z31.d \n\t" \ @@ -338,9 +342,9 @@ asm ( \ "fcmla z22.d, p5/m, z25.d, z15.d, 90 \n\t" \ "fcmla z20.d, p5/m, z26.d, z12.d, 90 \n\t" \ "fcmla z23.d, p5/m, z26.d, z15.d, 90 \n\t" \ - "ldr z24, [%[fetchptr], -4, mul vl] \n\t" \ - "ldr z25, [%[fetchptr], -1, mul vl] \n\t" \ - "ldr z26, [%[fetchptr], 2, mul vl] \n\t" \ + "ld1d { z24.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ + "ld1d { z25.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ + "ld1d { z26.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ : \ : [fetchptr] "r" (baseU + 2 * 3 * 64) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -560,7 +564,6 @@ asm ( \ #define TM_PROJ_A64FXd \ { \ asm ( \ - "ptrue p5.d \n\t" \ "fsub z12.d, p5/m, z12.d, z18.d \n\t" \ "fsub z13.d, p5/m, z13.d, z19.d \n\t" \ "fsub z14.d, p5/m, z14.d, z20.d \n\t" \ @@ -715,7 +718,6 @@ asm ( \ // ZERO_PSI #define ZERO_PSI_A64FXd \ asm ( \ - "ptrue p5.d \n\t" \ "fmov z0.d , 0 \n\t" \ "fmov z1.d , 0 \n\t" \ "fmov z2.d , 0 \n\t" \ @@ -733,13 +735,13 @@ asm ( \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ ); -// PREFETCH_RESULT_L2_STORE (prefetch store to L2) +// PREFETCH_RESULT_L2_STORE (uses DC ZVA for cache line zeroing) #define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXd(base) \ { \ asm ( \ - "prfd PSTL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ - "prfd PSTL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ - "prfd PSTL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ + "dc zva, %[fetchptr]\n\t" \ + "dc zva, %[fetchptr]\n\t" \ + "dc zva, %[fetchptr]\n\t" \ : \ : [fetchptr] "r" (base) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ diff --git a/Grid/simd/Fujitsu_A64FX_asm_single.h b/Grid/simd/Fujitsu_A64FX_asm_single.h index d809f83b..e629f617 100644 --- a/Grid/simd/Fujitsu_A64FX_asm_single.h +++ 
b/Grid/simd/Fujitsu_A64FX_asm_single.h @@ -38,10 +38,11 @@ Author: Nils Meyer #define LOCK_GAUGE(A) #define UNLOCK_GAUGE(A) #define MASK_REGS DECLARATIONS_A64FXf -#define SAVE_RESULT(A,B) RESULT_A64FXf(A); PREFETCH_RESULT_L2_STORE(B) +#define SAVE_RESULT(A,B) RESULT_A64FXf(A); #define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXf(Dir) #define MULT_2SPIN_2 MULT_2SPIN_2_A64FXf #define LOAD_CHI(base) LOAD_CHI_A64FXf(base) +#define ZERO_PSI ZERO_PSI_A64FXf #define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base) #define XP_PROJ XP_PROJ_A64FXf #define YP_PROJ YP_PROJ_A64FXf @@ -70,11 +71,18 @@ Author: Nils Meyer #define MAYBEPERM(A,perm) if (perm) { PERMUTE; } // DECLARATIONS #define DECLARATIONS_A64FXf \ + uint64_t baseU; \ const uint32_t lut[4][16] = { \ {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}, \ {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}, \ {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}, \ {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14} }; \ +asm ( \ + "ptrue p5.s \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ asm ( \ "fmov z31.s , 0 \n\t" \ : \ @@ -130,7 +138,7 @@ asm ( \ // PREFETCH_GAUGE_L2 (prefetch to L2) #define PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A) \ { \ - const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \ + const auto & ref(U[sUn](A)); baseU = (uint64_t)&ref + 3 * 3 * 64; \ asm ( \ "prfd PLDL2STRM, p5, [%[fetchptr], -4, mul vl] \n\t" \ "prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ @@ -149,7 +157,7 @@ asm ( \ // PREFETCH_GAUGE_L1 (prefetch to L1) #define PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A) \ { \ - const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ + const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ asm ( \ "prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ "prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ @@ -163,12 +171,12 @@ asm ( \ #define LOAD_CHI_A64FXf(base) \ { \ asm ( \ - "ldr z12, [%[fetchptr], 0, mul vl] \n\t" \ - "ldr z13, [%[fetchptr], 1, mul vl] \n\t" \ - "ldr z14, [%[fetchptr], 2, mul vl] \n\t" \ - "ldr z15, [%[fetchptr], 3, mul vl] \n\t" \ - "ldr z16, [%[fetchptr], 4, mul vl] \n\t" \ - "ldr z17, [%[fetchptr], 5, mul vl] \n\t" \ + "ld1w { z12.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1w { z13.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1w { z14.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ + "ld1w { z15.s }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ + "ld1w { z16.s }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ + "ld1w { z17.s }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ : \ : [fetchptr] "r" (base) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -178,19 +186,18 @@ asm ( \ #define LOAD_CHIMU_INTERLEAVED_A64FXf(base) \ { \ asm ( \ - "ptrue p5.s \n\t" \ - "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \ - "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \ - "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \ - "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \ - "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \ - "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \ - "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \ - "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \ - "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \ - "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \ - "ldr z17, 
[%[fetchptr], -1, mul vl] \n\t" \ - "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \ + "ld1w { z12.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1w { z21.s }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ + "ld1w { z15.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1w { z18.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1w { z13.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1w { z22.s }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ + "ld1w { z16.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1w { z19.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1w { z14.s }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ + "ld1w { z23.s }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ + "ld1w { z17.s }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ + "ld1w { z20.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ : \ : [fetchptr] "r" (base + 2 * 3 * 64) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -201,19 +208,18 @@ asm ( \ { \ const SiteSpinor & ref(in[offset]); \ asm ( \ - "ptrue p5.s \n\t" \ - "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \ - "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \ - "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \ - "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \ - "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \ - "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \ - "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \ - "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \ - "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \ - "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \ - "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \ - "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \ + "ld1w { z12.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1w { z18.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1w { z13.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1w { z19.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1w { z14.s }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ + "ld1w { z20.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ + "ld1w { z15.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1w { z21.s }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ + "ld1w { z16.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1w { z22.s }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ + "ld1w { z17.s }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ + "ld1w { z23.s }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ : \ : [fetchptr] "r" (&ref[2][0]) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -224,19 +230,18 @@ asm ( \ { \ const SiteSpinor & ref(in[offset]); \ asm ( \ - "ptrue p5.s \n\t" \ - "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \ - "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \ - "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \ - "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \ - "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \ - "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \ - "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \ - "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \ - "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \ - "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \ - "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \ - "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \ + "ld1w { z12.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1w { z21.s }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ + "ld1w { z13.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1w { z22.s }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ + "ld1w { z14.s }, p5/z, [%[fetchptr], -4, mul vl] 
\n\t" \ + "ld1w { z23.s }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ + "ld1w { z15.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1w { z18.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1w { z16.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1w { z19.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1w { z17.s }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ + "ld1w { z20.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ : \ : [fetchptr] "r" (&ref[2][0]) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -293,17 +298,16 @@ asm ( \ ); // LOAD_GAUGE -#define LOAD_GAUGE \ - const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ +#define LOAD_GAUGE(A) \ { \ + const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ asm ( \ - "ptrue p5.s \n\t" \ - "ldr z24, [%[fetchptr], -6, mul vl] \n\t" \ - "ldr z25, [%[fetchptr], -3, mul vl] \n\t" \ - "ldr z26, [%[fetchptr], 0, mul vl] \n\t" \ - "ldr z27, [%[fetchptr], -5, mul vl] \n\t" \ - "ldr z28, [%[fetchptr], -2, mul vl] \n\t" \ - "ldr z29, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1w { z24.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1w { z25.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1w { z26.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1w { z27.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1w { z28.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1w { z29.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ : \ : [fetchptr] "r" (baseU + 2 * 3 * 64) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -312,14 +316,14 @@ asm ( \ // MULT_2SPIN #define MULT_2SPIN_1_A64FXf(A) \ { \ - const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ + const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ asm ( \ - "ldr z24, [%[fetchptr], -6, mul vl] \n\t" \ - "ldr z25, [%[fetchptr], -3, mul vl] \n\t" \ - "ldr z26, [%[fetchptr], 0, mul vl] \n\t" \ - "ldr z27, [%[fetchptr], -5, mul vl] \n\t" \ - "ldr z28, [%[fetchptr], -2, mul vl] \n\t" \ - "ldr z29, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1w { z24.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1w { z25.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1w { z26.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1w { z27.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1w { z28.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1w { z29.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ "movprfx z18.s, p5/m, z31.s \n\t" \ "fcmla z18.s, p5/m, z24.s, z12.s, 0 \n\t" \ "movprfx z21.s, p5/m, z31.s \n\t" \ @@ -338,9 +342,9 @@ asm ( \ "fcmla z22.s, p5/m, z25.s, z15.s, 90 \n\t" \ "fcmla z20.s, p5/m, z26.s, z12.s, 90 \n\t" \ "fcmla z23.s, p5/m, z26.s, z15.s, 90 \n\t" \ - "ldr z24, [%[fetchptr], -4, mul vl] \n\t" \ - "ldr z25, [%[fetchptr], -1, mul vl] \n\t" \ - "ldr z26, [%[fetchptr], 2, mul vl] \n\t" \ + "ld1w { z24.s }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ + "ld1w { z25.s }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ + "ld1w { z26.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ : \ : [fetchptr] "r" (baseU + 2 * 3 * 64) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -560,7 +564,6 @@ asm ( \ #define TM_PROJ_A64FXf \ { \ asm ( \ - "ptrue p5.s \n\t" \ "fsub 
z12.s, p5/m, z12.s, z18.s \n\t" \ "fsub z13.s, p5/m, z13.s, z19.s \n\t" \ "fsub z14.s, p5/m, z14.s, z20.s \n\t" \ @@ -715,7 +718,6 @@ asm ( \ // ZERO_PSI #define ZERO_PSI_A64FXf \ asm ( \ - "ptrue p5.s \n\t" \ "fmov z0.s , 0 \n\t" \ "fmov z1.s , 0 \n\t" \ "fmov z2.s , 0 \n\t" \ @@ -733,13 +735,13 @@ asm ( \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ ); -// PREFETCH_RESULT_L2_STORE (prefetch store to L2) +// PREFETCH_RESULT_L2_STORE (uses DC ZVA for cache line zeroing) #define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXf(base) \ { \ asm ( \ - "prfd PSTL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ - "prfd PSTL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ - "prfd PSTL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ + "dc zva, %[fetchptr]\n\t" \ + "dc zva, %[fetchptr]\n\t" \ + "dc zva, %[fetchptr]\n\t" \ : \ : [fetchptr] "r" (base) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ diff --git a/Grid/simd/Fujitsu_A64FX_intrin_double.h b/Grid/simd/Fujitsu_A64FX_intrin_double.h index 232610f2..361246fc 100644 --- a/Grid/simd/Fujitsu_A64FX_intrin_double.h +++ b/Grid/simd/Fujitsu_A64FX_intrin_double.h @@ -38,10 +38,11 @@ Author: Nils Meyer #define LOCK_GAUGE(A) #define UNLOCK_GAUGE(A) #define MASK_REGS DECLARATIONS_A64FXd -#define SAVE_RESULT(A,B) RESULT_A64FXd(A); PREFETCH_RESULT_L2_STORE(B) +#define SAVE_RESULT(A,B) RESULT_A64FXd(A); #define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXd(Dir) #define MULT_2SPIN_2 MULT_2SPIN_2_A64FXd #define LOAD_CHI(base) LOAD_CHI_A64FXd(base) +#define ZERO_PSI ZERO_PSI_A64FXd #define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base) #define XP_PROJ XP_PROJ_A64FXd #define YP_PROJ YP_PROJ_A64FXd @@ -70,6 +71,7 @@ Author: Nils Meyer #define MAYBEPERM(Dir,perm) if (Dir != 3) { if (perm) { PERMUTE; } } // DECLARATIONS #define DECLARATIONS_A64FXd \ + uint64_t baseU; \ const uint64_t lut[4][8] = { \ {4, 5, 6, 7, 0, 1, 2, 3}, \ {2, 3, 0, 1, 6, 7, 4, 5}, \ @@ -126,18 +128,18 @@ Author: Nils Meyer // RESULT #define RESULT_A64FXd(base) \ { \ - svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64), result_00); \ - svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64), result_01); \ - svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64), result_02); \ - svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64), result_10); \ - svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64), result_11); \ - svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64), result_12); \ - svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64), result_20); \ - svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64), result_21); \ - svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64), result_22); \ - svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64), result_30); \ - svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64), result_31); \ - svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64), result_32); \ + svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-6), result_00); \ + svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-5), result_01); \ + svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-4), result_02); \ + svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-3), result_10); \ + svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-2), result_11); \ + 
svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-1), result_12); \ + svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(0), result_20); \ + svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(1), result_21); \ + svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(2), result_22); \ + svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(3), result_30); \ + svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(4), result_31); \ + svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(5), result_32); \ } // PREFETCH_CHIMU_L2 (prefetch to L2) #define PREFETCH_CHIMU_L2_INTERNAL_A64FXd(base) \ @@ -156,7 +158,7 @@ Author: Nils Meyer // PREFETCH_GAUGE_L2 (prefetch to L2) #define PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A) \ { \ - const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \ + const auto & ref(U[sUn](A)); baseU = (uint64_t)&ref + 3 * 3 * 64; \ svprfd(pg1, (int64_t*)(baseU + -256), SV_PLDL2STRM); \ svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL2STRM); \ svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL2STRM); \ @@ -170,7 +172,7 @@ Author: Nils Meyer // PREFETCH_GAUGE_L1 (prefetch to L1) #define PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A) \ { \ - const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ + const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL1STRM); \ svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL1STRM); \ svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL1STRM); \ @@ -178,62 +180,62 @@ Author: Nils Meyer // LOAD_CHI #define LOAD_CHI_A64FXd(base) \ { \ - Chi_00 = svld1(pg1, (float64_t*)(base + 0 * 64)); \ - Chi_01 = svld1(pg1, (float64_t*)(base + 1 * 64)); \ - Chi_02 = svld1(pg1, (float64_t*)(base + 2 * 64)); \ - Chi_10 = svld1(pg1, (float64_t*)(base + 3 * 64)); \ - Chi_11 = svld1(pg1, (float64_t*)(base + 4 * 64)); \ - Chi_12 = svld1(pg1, (float64_t*)(base + 5 * 64)); \ + Chi_00 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(0)); \ + Chi_01 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(1)); \ + Chi_02 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(2)); \ + Chi_10 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(3)); \ + Chi_11 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(4)); \ + Chi_12 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(5)); \ } // LOAD_CHIMU #define LOAD_CHIMU_INTERLEAVED_A64FXd(base) \ { \ - Chimu_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64)); \ - Chimu_30 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64)); \ - Chimu_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64)); \ - Chimu_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64)); \ - Chimu_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64)); \ - Chimu_31 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64)); \ - Chimu_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64)); \ - Chimu_21 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64)); \ - Chimu_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64)); \ - Chimu_32 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64)); \ - Chimu_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64)); \ - Chimu_22 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64)); \ + Chimu_00 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-6)); \ + Chimu_30 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(3)); \ + Chimu_10 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-3)); \ + Chimu_20 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(0)); \ + Chimu_01 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 
64), (int64_t)(-5)); \ + Chimu_31 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(4)); \ + Chimu_11 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-2)); \ + Chimu_21 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(1)); \ + Chimu_02 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-4)); \ + Chimu_32 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(5)); \ + Chimu_12 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-1)); \ + Chimu_22 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(2)); \ } // LOAD_CHIMU_0213 #define LOAD_CHIMU_0213_A64FXd \ { \ const SiteSpinor & ref(in[offset]); \ - Chimu_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64)); \ - Chimu_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64)); \ - Chimu_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64)); \ - Chimu_21 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64)); \ - Chimu_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64)); \ - Chimu_22 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64)); \ - Chimu_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64)); \ - Chimu_30 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64)); \ - Chimu_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64)); \ - Chimu_31 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64)); \ - Chimu_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64)); \ - Chimu_32 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64)); \ + Chimu_00 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-6)); \ + Chimu_20 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(0)); \ + Chimu_01 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-5)); \ + Chimu_21 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(1)); \ + Chimu_02 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-4)); \ + Chimu_22 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(2)); \ + Chimu_10 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-3)); \ + Chimu_30 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(3)); \ + Chimu_11 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-2)); \ + Chimu_31 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(4)); \ + Chimu_12 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-1)); \ + Chimu_32 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(5)); \ } // LOAD_CHIMU_0312 #define LOAD_CHIMU_0312_A64FXd \ { \ const SiteSpinor & ref(in[offset]); \ - Chimu_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64)); \ - Chimu_30 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64)); \ - Chimu_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64)); \ - Chimu_31 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64)); \ - Chimu_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64)); \ - Chimu_32 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64)); \ - Chimu_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64)); \ - Chimu_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64)); \ - Chimu_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64)); \ - Chimu_21 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64)); \ - Chimu_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64)); \ - Chimu_22 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64)); \ + Chimu_00 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-6)); \ + Chimu_30 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(3)); \ + 
Chimu_01 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-5)); \ + Chimu_31 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(4)); \ + Chimu_02 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-4)); \ + Chimu_32 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(5)); \ + Chimu_10 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-3)); \ + Chimu_20 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(0)); \ + Chimu_11 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-2)); \ + Chimu_21 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(1)); \ + Chimu_12 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-1)); \ + Chimu_22 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(2)); \ } // LOAD_TABLE0 #define LOAD_TABLE0 \ @@ -261,26 +263,26 @@ Author: Nils Meyer Chi_12 = svtbl(Chi_12, table0); // LOAD_GAUGE -#define LOAD_GAUGE \ - const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ +#define LOAD_GAUGE(A) \ { \ - U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \ - U_10 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -3 * 64)); \ - U_20 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 0 * 64)); \ - U_01 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -5 * 64)); \ - U_11 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -2 * 64)); \ - U_21 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \ + const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ + U_00 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-6)); \ + U_10 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-3)); \ + U_20 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(0)); \ + U_01 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-5)); \ + U_11 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-2)); \ + U_21 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(1)); \ } // MULT_2SPIN #define MULT_2SPIN_1_A64FXd(A) \ { \ - const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ - U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \ - U_10 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -3 * 64)); \ - U_20 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 0 * 64)); \ - U_01 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -5 * 64)); \ - U_11 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -2 * 64)); \ - U_21 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \ + const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ + U_00 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-6)); \ + U_10 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-3)); \ + U_20 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(0)); \ + U_01 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-5)); \ + U_11 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-2)); \ + U_21 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(1)); \ UChi_00 = svcmla_x(pg1, zero0, U_00, Chi_00, 0); \ UChi_10 = svcmla_x(pg1, zero0, U_00, Chi_10, 0); \ UChi_01 = svcmla_x(pg1, zero0, U_10, Chi_00, 0); \ @@ -293,9 +295,9 @@ Author: Nils Meyer UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_10, 90); \ UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_00, 90); \ UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_10, 90); \ - U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -4 * 64)); \ - U_10 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -1 * 64)); \ - U_20 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 2 * 64)); \ + U_00 = 
svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-4)); \ + U_10 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-1)); \ + U_20 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(2)); \ } // MULT_2SPIN_BACKEND #define MULT_2SPIN_2_A64FXd \ @@ -570,12 +572,12 @@ Author: Nils Meyer result_31 = svdup_f64(0.); \ result_32 = svdup_f64(0.); -// PREFETCH_RESULT_L2_STORE (prefetch store to L2) +// PREFETCH_RESULT_L2_STORE (uses DC ZVA for cache line zeroing) #define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXd(base) \ { \ - svprfd(pg1, (int64_t*)(base + 0), SV_PSTL2STRM); \ - svprfd(pg1, (int64_t*)(base + 256), SV_PSTL2STRM); \ - svprfd(pg1, (int64_t*)(base + 512), SV_PSTL2STRM); \ + asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 0) : "memory" ); \ + asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 1) : "memory" ); \ + asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 2) : "memory" ); \ } // PREFETCH_RESULT_L1_STORE (prefetch store to L1) #define PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXd(base) \ diff --git a/Grid/simd/Fujitsu_A64FX_intrin_single.h b/Grid/simd/Fujitsu_A64FX_intrin_single.h index 180e5f4f..30273b6e 100644 --- a/Grid/simd/Fujitsu_A64FX_intrin_single.h +++ b/Grid/simd/Fujitsu_A64FX_intrin_single.h @@ -38,10 +38,11 @@ Author: Nils Meyer #define LOCK_GAUGE(A) #define UNLOCK_GAUGE(A) #define MASK_REGS DECLARATIONS_A64FXf -#define SAVE_RESULT(A,B) RESULT_A64FXf(A); PREFETCH_RESULT_L2_STORE(B) +#define SAVE_RESULT(A,B) RESULT_A64FXf(A); #define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXf(Dir) #define MULT_2SPIN_2 MULT_2SPIN_2_A64FXf #define LOAD_CHI(base) LOAD_CHI_A64FXf(base) +#define ZERO_PSI ZERO_PSI_A64FXf #define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base) #define XP_PROJ XP_PROJ_A64FXf #define YP_PROJ YP_PROJ_A64FXf @@ -70,6 +71,7 @@ Author: Nils Meyer #define MAYBEPERM(A,perm) if (perm) { PERMUTE; } // DECLARATIONS #define DECLARATIONS_A64FXf \ + uint64_t baseU; \ const uint32_t lut[4][16] = { \ {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}, \ {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}, \ @@ -126,18 +128,18 @@ Author: Nils Meyer // RESULT #define RESULT_A64FXf(base) \ { \ - svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64), result_00); \ - svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64), result_01); \ - svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64), result_02); \ - svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64), result_10); \ - svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64), result_11); \ - svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64), result_12); \ - svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64), result_20); \ - svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64), result_21); \ - svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64), result_22); \ - svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64), result_30); \ - svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64), result_31); \ - svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64), result_32); \ + svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-6), result_00); \ + svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-5), result_01); \ + svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-4), result_02); \ + svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-3), result_10); \ + svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-2), result_11); \ + svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-1), 
result_12); \ + svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(0), result_20); \ + svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(1), result_21); \ + svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(2), result_22); \ + svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(3), result_30); \ + svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(4), result_31); \ + svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(5), result_32); \ } // PREFETCH_CHIMU_L2 (prefetch to L2) #define PREFETCH_CHIMU_L2_INTERNAL_A64FXf(base) \ @@ -156,7 +158,7 @@ Author: Nils Meyer // PREFETCH_GAUGE_L2 (prefetch to L2) #define PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A) \ { \ - const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \ + const auto & ref(U[sUn](A)); baseU = (uint64_t)&ref + 3 * 3 * 64; \ svprfd(pg1, (int64_t*)(baseU + -256), SV_PLDL2STRM); \ svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL2STRM); \ svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL2STRM); \ @@ -170,7 +172,7 @@ Author: Nils Meyer // PREFETCH_GAUGE_L1 (prefetch to L1) #define PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A) \ { \ - const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ + const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL1STRM); \ svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL1STRM); \ svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL1STRM); \ @@ -178,62 +180,62 @@ Author: Nils Meyer // LOAD_CHI #define LOAD_CHI_A64FXf(base) \ { \ - Chi_00 = svld1(pg1, (float32_t*)(base + 0 * 64)); \ - Chi_01 = svld1(pg1, (float32_t*)(base + 1 * 64)); \ - Chi_02 = svld1(pg1, (float32_t*)(base + 2 * 64)); \ - Chi_10 = svld1(pg1, (float32_t*)(base + 3 * 64)); \ - Chi_11 = svld1(pg1, (float32_t*)(base + 4 * 64)); \ - Chi_12 = svld1(pg1, (float32_t*)(base + 5 * 64)); \ + Chi_00 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(0)); \ + Chi_01 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(1)); \ + Chi_02 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(2)); \ + Chi_10 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(3)); \ + Chi_11 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(4)); \ + Chi_12 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(5)); \ } // LOAD_CHIMU #define LOAD_CHIMU_INTERLEAVED_A64FXf(base) \ { \ - Chimu_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64)); \ - Chimu_30 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64)); \ - Chimu_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64)); \ - Chimu_20 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64)); \ - Chimu_01 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64)); \ - Chimu_31 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64)); \ - Chimu_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64)); \ - Chimu_21 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64)); \ - Chimu_02 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64)); \ - Chimu_32 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64)); \ - Chimu_12 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64)); \ - Chimu_22 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64)); \ + Chimu_00 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-6)); \ + Chimu_30 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(3)); \ + Chimu_10 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-3)); \ + Chimu_20 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(0)); \ + Chimu_01 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-5)); \ + Chimu_31 = svld1_vnum(pg1, 
(float32_t*)(base + 2 * 3 * 64), (int64_t)(4)); \ + Chimu_11 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-2)); \ + Chimu_21 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(1)); \ + Chimu_02 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-4)); \ + Chimu_32 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(5)); \ + Chimu_12 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-1)); \ + Chimu_22 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(2)); \ } // LOAD_CHIMU_0213 #define LOAD_CHIMU_0213_A64FXf \ { \ const SiteSpinor & ref(in[offset]); \ - Chimu_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64)); \ - Chimu_20 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64)); \ - Chimu_01 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64)); \ - Chimu_21 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64)); \ - Chimu_02 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64)); \ - Chimu_22 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64)); \ - Chimu_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64)); \ - Chimu_30 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64)); \ - Chimu_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64)); \ - Chimu_31 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64)); \ - Chimu_12 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64)); \ - Chimu_32 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64)); \ + Chimu_00 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-6)); \ + Chimu_20 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(0)); \ + Chimu_01 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-5)); \ + Chimu_21 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(1)); \ + Chimu_02 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-4)); \ + Chimu_22 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(2)); \ + Chimu_10 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-3)); \ + Chimu_30 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(3)); \ + Chimu_11 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-2)); \ + Chimu_31 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(4)); \ + Chimu_12 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-1)); \ + Chimu_32 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(5)); \ } // LOAD_CHIMU_0312 #define LOAD_CHIMU_0312_A64FXf \ { \ const SiteSpinor & ref(in[offset]); \ - Chimu_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64)); \ - Chimu_30 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64)); \ - Chimu_01 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64)); \ - Chimu_31 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64)); \ - Chimu_02 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64)); \ - Chimu_32 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64)); \ - Chimu_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64)); \ - Chimu_20 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64)); \ - Chimu_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64)); \ - Chimu_21 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64)); \ - Chimu_12 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64)); \ - Chimu_22 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64)); \ + Chimu_00 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-6)); \ + Chimu_30 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(3)); \ + Chimu_01 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 
* 64), (int64_t)(-5)); \ + Chimu_31 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(4)); \ + Chimu_02 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-4)); \ + Chimu_32 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(5)); \ + Chimu_10 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-3)); \ + Chimu_20 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(0)); \ + Chimu_11 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-2)); \ + Chimu_21 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(1)); \ + Chimu_12 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-1)); \ + Chimu_22 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(2)); \ } // LOAD_TABLE0 #define LOAD_TABLE0 \ @@ -261,26 +263,26 @@ Author: Nils Meyer Chi_12 = svtbl(Chi_12, table0); // LOAD_GAUGE -#define LOAD_GAUGE \ - const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ +#define LOAD_GAUGE(A) \ { \ - U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \ - U_10 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -3 * 64)); \ - U_20 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 0 * 64)); \ - U_01 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -5 * 64)); \ - U_11 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -2 * 64)); \ - U_21 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \ + const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ + U_00 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-6)); \ + U_10 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-3)); \ + U_20 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(0)); \ + U_01 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-5)); \ + U_11 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-2)); \ + U_21 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(1)); \ } // MULT_2SPIN #define MULT_2SPIN_1_A64FXf(A) \ { \ - const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ - U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \ - U_10 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -3 * 64)); \ - U_20 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 0 * 64)); \ - U_01 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -5 * 64)); \ - U_11 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -2 * 64)); \ - U_21 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \ + const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ + U_00 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-6)); \ + U_10 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-3)); \ + U_20 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(0)); \ + U_01 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-5)); \ + U_11 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-2)); \ + U_21 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(1)); \ UChi_00 = svcmla_x(pg1, zero0, U_00, Chi_00, 0); \ UChi_10 = svcmla_x(pg1, zero0, U_00, Chi_10, 0); \ UChi_01 = svcmla_x(pg1, zero0, U_10, Chi_00, 0); \ @@ -293,9 +295,9 @@ Author: Nils Meyer UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_10, 90); \ UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_00, 90); \ UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_10, 90); \ - U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -4 * 64)); \ - U_10 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -1 * 64)); \ - U_20 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 2 * 64)); \ + U_00 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), 
(int64_t)(-4)); \ + U_10 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-1)); \ + U_20 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(2)); \ } // MULT_2SPIN_BACKEND #define MULT_2SPIN_2_A64FXf \ @@ -570,12 +572,12 @@ Author: Nils Meyer result_31 = svdup_f32(0.); \ result_32 = svdup_f32(0.); -// PREFETCH_RESULT_L2_STORE (prefetch store to L2) +// PREFETCH_RESULT_L2_STORE (uses DC ZVA for cache line zeroing) #define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXf(base) \ { \ - svprfd(pg1, (int64_t*)(base + 0), SV_PSTL2STRM); \ - svprfd(pg1, (int64_t*)(base + 256), SV_PSTL2STRM); \ - svprfd(pg1, (int64_t*)(base + 512), SV_PSTL2STRM); \ + asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 0) : "memory" ); \ + asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 1) : "memory" ); \ + asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 2) : "memory" ); \ } // PREFETCH_RESULT_L1_STORE (prefetch store to L1) #define PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXf(base) \ diff --git a/Grid/simd/Fujitsu_A64FX_undef.h b/Grid/simd/Fujitsu_A64FX_undef.h index 81eec37a..51762a60 100644 --- a/Grid/simd/Fujitsu_A64FX_undef.h +++ b/Grid/simd/Fujitsu_A64FX_undef.h @@ -46,6 +46,7 @@ Author: Nils Meyer #undef MULT_2SPIN_2 #undef MAYBEPERM #undef LOAD_CHI +#undef ZERO_PSI #undef XP_PROJ #undef YP_PROJ #undef ZP_PROJ diff --git a/Grid/simd/gridverter.py b/Grid/simd/gridverter.py deleted file mode 100755 index f00a5019..00000000 --- a/Grid/simd/gridverter.py +++ /dev/null @@ -1,2377 +0,0 @@ -#!/usr/bin/python3 - -import re -import argparse -import sys - -# Grid for A64FX -# -# * should align std::vector to (multiples of) cache block size = 256 bytes - -# place benchmark runtime in cycles here ! -measured_cycles = 690 #1500 #775 #1500 - - -# command line parser -parser = argparse.ArgumentParser(description="Dslash generator.") -parser.add_argument("--single", action="store_true", default="False") -parser.add_argument("--double", action="store_true", default="True") -parser.add_argument("--debug", action="store_true", default="False") -parser.add_argument("--gridbench", action="store_true", default="False") -args = parser.parse_args() - -print(args) - -ASM_LOAD_CHIMU = True # load chimu -ASM_LOAD_GAUGE = True # load gauge -ASM_LOAD_TABLE = True # load table -ASM_STORE = True # store result - -# Disable all loads and stores in asm for benchmarking purposes -#DISABLE_ASM_LOAD_STORE = True -DISABLE_ASM_LOAD_STORE = False - -if DISABLE_ASM_LOAD_STORE: - ASM_LOAD_CHIMU = True # load chimu - ASM_LOAD_GAUGE = True # load gauge - ASM_LOAD_TABLE = True # load table - ASM_STORE = False # store result - -# Alternative implementation using PROJ specific loads works, -# but be careful with predication - -ALTERNATIVE_LOADS = False -#ALTERNATIVE_LOADS = not ALTERNATIVE_LOADS # True - -# Alternative register mapping, -# must use with my_wilson4.h and my_wilson4pf.h - -ALTERNATIVE_REGISTER_MAPPING = False -#ALTERNATIVE_REGISTER_MAPPING = not ALTERNATIVE_REGISTER_MAPPING - -if ALTERNATIVE_REGISTER_MAPPING == True: - ALTERNATIVE_LOADS = False - -# use movprfx -MOVPRFX = False -MOVPRFX = not MOVPRFX - - -PREFETCH = False -PREFETCH = not PREFETCH # True - -PRECISION = 'double' # DP by default -PRECSUFFIX = 'A64FXd' -if args.single == True: - PRECISION = 'single' - PRECSUFFIX = 'A64FXf' - -_DEBUG = False #True # insert debugging output -if args.debug == True: - _DEBUG = True - -GRIDBENCH = False -if args.gridbench == True: - GRIDBENCH = True - -print("PRECISION = ", PRECISION) 
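
A note on the load/store rewrite running through the hunks above: svld1(pg1, ptr + k * 64) and svld1_vnum(pg1, ptr, k) address the same 64 bytes only because the A64FX vector length is 512 bits, but the _vnum form lets GCC pick the immediate-offset addressing mode of ld1d/st1d ([xN, #k, mul vl]) instead of materialising every address in a general-purpose register first. The immediate reaches -8 to +7 vector lengths, which the -6..+5 indices used here stay inside. A minimal sketch of the equivalence, assuming VL = 512 bits and -march=armv8.2-a+sve (the helper names and base argument are illustrative, not from the patch):

    #include <arm_sve.h>

    // Old form: the byte offset is folded into the pointer, so the compiler
    // computes base + 128 into a scratch register before the load.
    svfloat64_t load_offset(const float64_t *base, svbool_t pg) {
      return svld1(pg, (const float64_t *)((const uint8_t *)base + 2 * 64));
    }

    // New form: with VL = 512 bits this reads the same bytes, but maps
    // directly to "ld1d { z0.d }, p0/z, [x0, #2, mul vl]".
    svfloat64_t load_vnum(const float64_t *base, svbool_t pg) {
      return svld1_vnum_f64(pg, base, 2);
    }
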
-print("DEBUG = ", _DEBUG) -print("ALTERNATIVE_LOADS = ", ALTERNATIVE_LOADS) -print("ALTERNATIVE_REGISTER_MAPPING = ", ALTERNATIVE_REGISTER_MAPPING) -print("MOVPRFX = ", MOVPRFX) -print("DISABLE_ASM_LOAD_STORE = ", DISABLE_ASM_LOAD_STORE) -print("GRIDBENCH = ", GRIDBENCH) - -print("") - -#sys.exit(0) - - -#_DEBUG = True # insert debugging output - -FETCH_BASE_PTR_COLOR_OFFSET = 2 # offset for scalar plus signed immediate addressing -STORE_BASE_PTR_COLOR_OFFSET = 2 - -# 64-bit gp register usage !!! armclang 20.0 complains about the register choice !!! -# table address: x30 -# data address: x29 -# store address: x28 -# debug address: r8 - -# Max performance of complex FMA using FCMLA instruction -# is 25% peak. -# -# Issue latency of FCMLA is 2 cycles. -# Need 2 FCMLA instructions for complex FMA. -# Complete complex FMA takes 4 cycles. -# Peak throughput is 4 * 8 Flops DP = 32 Flops DP in 4 cycles. -# A64FX FMA throughput is 4 * 8 * 2 * 2 = 132 Flops DP in 4 cycles. -# -> 25% peak FMA -# -# In: 3x 512 bits = 192 bytes -# Out: 1x 512 bits = 64 bytes -# Tot: 4x 512 bits = 256 bytes -# -# 256 bytes * 2.2 GHz = 563.2 GB/s (base 10), 524 GB/s (base 2) - -OPT = """ -* interleave prefetching and compute in MULT_2SPIN -* could test storing U's in MULT_2SPIN to L1d for cache line update -* structure reordering: MAYBEPERM after MULT_2SPIN ? -""" - -filename = 'XXX' -LEGAL = """/************************************************************************************* - - Grid physics library, www.github.com/paboyle/Grid - - Source file: {} - - Copyright (C) 2020 - -Author: Nils Meyer - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
- - See the full license in the file "LICENSE" in the top level distribution directory -*************************************************************************************/ -/* END LEGAL */ -""" - -class Register: - - def __init__(self, variable, asmreg='X', predication=False): - global d - x = 'Y' - if predication == False: - x = asmreg # + d['asmsuffix'] - else: - x = asmreg - self.asmreg = x - self.asmregwithsuffix = asmreg + d['asmsuffix'] - self.asmregbyte = asmreg + '.b' - self.name = variable - self.asmname = variable - self.asmnamebyte = variable + '.b' - self.predication = predication - - d['registers'] += 1 - - def define(self, statement): - global d - d['C'] += F'#define {self.name} {statement}' - #d['A'] += F'#define {self.name} {statement}' - - def declare(self, predication=False): - global d - - if self.predication == False: - d['C'] += F' Simd {self.name}; \\\n' - - predtype = 'svfloat64_t' - if PRECISION == 'single': - predtype = 'svfloat32_t' - - d['I'] += F' {predtype} {self.name}; \\\n' - else: - d['I'] += F' svbool_t {self.name}; \\\n' - #d['A'] += F'#define {self.name} {self.asmreg} \n' - - def loadpredication(self, target='A'): - global d - if (target == 'A'): - d['A'] += F' "ptrue {self.asmregwithsuffix} \\n\\t" \\\n' - d['asmclobber'].append(F'"{self.asmreg}"') - - def loadtable(self, t): - global d - d['load'] += d['factor'] - gpr = d['asmtableptr'] - - cast = 'uint64_t' - #asm_opcode = 'ld1d' - #if PRECISION == 'single': - # asm_opcode = 'ld1w' - # cast = 'uint32_t' - asm_opcode = 'ldr' - if PRECISION == 'single': - asm_opcode = 'ldr' - cast = 'uint32_t' - - d['I'] += F' {self.name} = svld1(pg1, ({cast}*)&lut[{t}]); \\\n' - - # using immediate index break-out works - if asm_opcode == 'ldr': - # ldr version - d['A'] += F' "{asm_opcode} {self.asmreg}, [%[tableptr], %[index], mul vl] \\n\\t" \\\n' - else: - # ld1 version - d['A'] += F' "{asm_opcode} {{ {self.asmregwithsuffix} }}, {pg1.asmreg}/z, [%[tableptr], %[index], mul vl] \\n\\t" \\\n' - - d['asminput'].append(F'[tableptr] "r" (&lut[0])') - d['asminput'].append(F'[index] "i" ({t})') - d['asmclobber'].append(F'"memory"') - d['asmclobber'].append(F'"cc"') - - def load(self, address, target='ALL', cast='float64_t', colors=3, offset=FETCH_BASE_PTR_COLOR_OFFSET): - global d - d['load'] += d['factor'] - indices = re.findall(r'\d+', address) - index = (int(indices[0]) - offset) * colors + int(indices[1]) - - #asm_opcode = 'ld1d' - #if PRECISION == 'single': - #asm_opcode = 'ld1w' - # cast = 'float32_t' - - asm_opcode = 'ldr' - if PRECISION == 'single': - asm_opcode = 'ldr' - cast = 'float32_t' - - gpr = d['asmfetchbaseptr'] - intrinfetchbase = d['intrinfetchbase'] - if (target in ['ALL', 'C']): - d['C'] += F' {self.name} = {address}; \\\n' - if (target in ['ALL', 'I']): -# d['I'] += F' {self.name} = svldnt1(pg1, ({cast}*)({intrinfetchbase} + {index} * 64)); \\\n' - d['I'] += F' {self.name} = svld1(pg1, ({cast}*)({intrinfetchbase} + {index} * 64)); \\\n' - if (target in ['ALL', 'A']): - if asm_opcode == 'ldr': - d['A'] += F' "{asm_opcode} {self.asmreg}, [%[fetchptr], {index}, mul vl] \\n\\t" \\\n' - else: - d['A'] += F' "{asm_opcode} {{ {self.asmregwithsuffix} }}, {pg1.asmreg}/z, [%[fetchptr], {index}, mul vl] \\n\\t" \\\n' - - def store(self, address, cast='float64_t', colors=3, offset=STORE_BASE_PTR_COLOR_OFFSET): - global d - d['store'] += d['factor'] - indices = re.findall(r'\d+', address) - index = (int(indices[0]) - offset) * colors + int(indices[1]) - - #asm_opcode = 'stnt1d' - #if PRECISION == 'single': - # 
asm_opcode = 'stnt1w' - # cast = 'float32_t' - asm_opcode = 'str' - if PRECISION == 'single': - asm_opcode = 'str' - cast = 'float32_t' - - intrinstorebase = d['intrinstorebase'] - - d['C'] += F' {address} = {self.name}; \\\n' - #d['I'] += F' svstnt1(pg1, ({cast}*)({intrinstorebase} + {index} * 64), {self.name}); \\\n' - d['I'] += F' svst1(pg1, ({cast}*)({intrinstorebase} + {index} * 64), {self.name}); \\\n' - if asm_opcode == 'str': - d['A'] += F' "{asm_opcode} {self.asmreg}, [%[storeptr], {index}, mul vl] \\n\\t" \\\n' - else: - d['A'] += F' "{asm_opcode} {{ {self.asmregwithsuffix} }}, {pg1.asmreg}, [%[storeptr], {index}, mul vl] \\n\\t" \\\n' - - def movestr(self, str): - global d - #d['move'] += d['factor'] - d['I'] += F' {self.name} = {str}; \\\n' - - def move(self, op1): - global d - d['move'] += d['factor'] - d['C'] += F' {self.name} = {op1.name}; \\\n' - d['I'] += F' {self.name} = {op1.name}; \\\n' - d['A'] += F' "mov {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix} \\n\\t" \\\n' - - # a = a + b , a = b + c - def add(self, op1, op2=None): - global d - d['add'] += d['factor'] - if op2 is None: - d['C'] += F' {self.name} = {self.name} + {op1.name}; \\\n' - d['I'] += F' {self.name} = svadd_x(pg1, {self.name}, {op1.name}); \\\n' - d['A'] += F' "fadd {self.asmregwithsuffix}, {pg1.asmreg}/m, {self.asmregwithsuffix}, {op1.asmregwithsuffix} \\n\\t" \\\n' - else: - d['C'] += F' {self.name} = {op1.name} + {op2.name}; \\\n' - d['I'] += F' {self.name} = svadd_x(pg1, {op1.name}, {op2.name}); \\\n' - d['A'] += F' "fadd {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix} \\n\\t" \\\n' - - # a = a -b , a = b - c - def sub(self, op1, op2=None): - global d - d['sub'] += d['factor'] - if op2 is None: - d['C'] += F' {self.name} = {self.name} - {op1.name}; \\\n' - d['I'] += F' {self.name} = svsub_x(pg1, {self.name}, {op1.name}); \\\n' - d['A'] += F' "fsub {self.asmregwithsuffix}, {pg1.asmreg}/m, {self.asmregwithsuffix}, {op1.asmregwithsuffix} \\n\\t" \\\n' - else: - d['C'] += F' {self.name} = {op1.name} - {op2.name}; \\\n' - d['I'] += F' {self.name} = svsub_x(pg1, {op1.name}, {op2.name}); \\\n' - d['A'] += F' "fsub {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix} \\n\\t" \\\n' - - # a = a * b , a = b * c - def mul(self, op1, op2): - global d - d['mul'] += 2 * d['factor'] - d['C'] += F' {self.name} = {op1.name} * {op2.name}; \\\n' - d['I'] += F' {self.name} = __svzero({self.name}); \\\n' - d['I'] += F' {self.name} = svcmla_x(pg1, {self.name}, {op1.name}, {op2.name}, 0); \\\n' - d['I'] += F' {self.name} = svcmla_x(pg1, {self.name}, {op1.name}, {op2.name}, 90); \\\n' - d['A'] += F' "mov {self.asmregwithsuffix} , 0 \\n\\t" \\\n' - d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 0 \\n\\t" \\\n' - d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 90 \\n\\t" \\\n' - - def mul0(self, op1, op2, op3=None, constructive=False): - global d - d['mul'] += d['factor'] - - # no movprfx intrinsics support - if constructive == True: - d['movprfx'] += d['factor'] - d['I'] += F' {self.name} = svcmla_x(pg1, {op1.name}, {op2.name}, {op3.name}, 0); \\\n' - d['A'] += F' "movprfx {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix} \\n\\t" \\\n' - d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op2.asmregwithsuffix}, {op3.asmregwithsuffix}, 0 \\n\\t" \\\n' - else: - d['C'] += F' {self.name} = 
{op1.name} * {op2.name}; \\\n' - d['I'] += F' {self.name} = svcmla_x(pg1, {self.name}, {op1.name}, {op2.name}, 0); \\\n' - d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 0 \\n\\t" \\\n' - - def mul1(self, op1, op2): - global d - d['mul'] += d['factor'] - d['I'] += F' {self.name} = svcmla_x(pg1, {self.name}, {op1.name}, {op2.name}, 90); \\\n' - d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 90 \\n\\t" \\\n' - - def mac(self, op1, op2): - global d - d['mac'] += 2 * d['factor'] - d['C'] += F' {self.name} = {self.name} + {op1.name} * {op2.name}; \\\n' - d['I'] += F' {self.name} = svcmla_x(pg1, {self.name}, {op1.name}, {op2.name}, 0); \\\n' - d['I'] += F' {self.name} = svcmla_x(pg1, {self.name}, {op1.name}, {op2.name}, 90); \\\n' - d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 0 \\n\\t" \\\n' - d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 90 \\n\\t" \\\n' - - def mac0(self, op1, op2): - global d - d['mac'] += d['factor'] - d['C'] += F' {self.name} = {self.name} + {op1.name} * {op2.name}; \\\n' - d['I'] += F' {self.name} = svcmla_x(pg1, {self.name}, {op1.name}, {op2.name}, 0); \\\n' - d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 0 \\n\\t" \\\n' - - def mac1(self, op1, op2): - global d - d['mac'] += d['factor'] - d['I'] += F' {self.name} = svcmla_x(pg1, {self.name}, {op1.name}, {op2.name}, 90); \\\n' - d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 90 \\n\\t" \\\n' - - def zero(self, zeroreg=False): - d['zero'] += d['factor'] - d['C'] += F' {self.name} = 0; \\\n' - #d['I'] += F' {self.name} = __svzero({self.name}); \\\n' only armclang - - if PRECISION == 'double': - d['I'] += F' {self.name} = svdup_f64(0.); \\\n' - else: - d['I'] += F' {self.name} = svdup_f32(0.); \\\n' - - if zeroreg == True: - d['A'] += F' "fmov {self.asmregwithsuffix} , 0 \\n\\t" \\\n' - else: - #using mov z, zero0 issue 1c, FLA, latency 6c - #d['A'] += F' "mov {self.asmregwithsuffix} , {zero0.asmregwithsuffix} \\n\\t" \\\n' - - #using mov z, 0 issue 1c, FLA, latency 6c - d['A'] += F' "fmov {self.asmregwithsuffix} , 0 \\n\\t" \\\n' - - #using xor z, z, z issue 0.5c, FL*, latency 4c - #d['A'] += F' "eor {self.asmregwithsuffix}, {pg1.asmreg}/m, {self.asmregwithsuffix}, {self.asmregwithsuffix} \\n\\t" \\\n' - - #using and z, z, zero0 issue 0.5c, FL*, latency 4c - #d['A'] += F' "and {self.asmregwithsuffix}, {self.asmregwithsuffix} , {zero0.asmregwithsuffix} \\n\\t" \\\n' - - #using sub z, z, z issue 0.5c, FL*, latency 9c - #d['A'] += F' "sub {self.asmregwithsuffix}, {self.asmregwithsuffix}, {self.asmregwithsuffix} \\n\\t" \\\n' - - # without table - def timesI(self, op1, tempreg=None, tablereg=None): - global d - d['timesI'] += d['factor'] - d['C'] += F' {self.name} = timesI({op1.name}); \\\n' - # correct if DEBUG enabled, wrong if DEBUG disabled; no idea what's causing this - #table.load('table2', target='I', cast='uint64_t') - #d['I'] += F' {self.name} = svtbl({op1.name}, {tablereg.name}); \\\n' - #d['I'] += F' {self.name} = svneg_x(pg2, {self.name}); \\\n' - # timesI using trn tested, works but tbl should be faster - d['I'] += F' {tempreg.name} = svtrn2({op1.name}, {op1.name}); \\\n' - d['I'] += F' {tempreg.name} = svneg_x(pg1, {tempreg.name}); \\\n' - 
d['I'] += F' {self.name} = svtrn1({tempreg.name}, {op1.name}); \\\n' - d['A'] += F' "trn2 {tempreg.asmregwithsuffix}, {op1.asmregwithsuffix}, {op1.asmregwithsuffix} \\n\\t" \\\n' - d['A'] += F' "fneg {tempreg.asmregwithsuffix}, {pg1.asmreg}/m, {tempreg.asmregwithsuffix} \\n\\t" \\\n' - d['A'] += F' "trn1 {self.asmregwithsuffix}, {tempreg.asmregwithsuffix}, {op1.asmregwithsuffix} \\n\\t" \\\n' - - def addTimesI(self, op1, op2=None, constructive=False): - global d - d['addTimesI'] += d['factor'] - - if op2 is None: - d['C'] += F' {self.name} = {self.name} + timesI({op1.name}); \\\n' - else: - d['C'] += F' {self.name} = {op1.name} + timesI({op2.name}); \\\n' - - # no movprfx intrinsics support - if constructive == True: - d['movprfx'] += d['factor'] - d['I'] += F' {self.name} = svcadd_x(pg1, {op1.name}, {op2.name}, 90); \\\n' - d['A'] += F' "movprfx {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix} \\n\\t" \\\n' - d['A'] += F' "fcadd {self.asmregwithsuffix}, {pg1.asmreg}/m, {self.asmregwithsuffix}, {op2.asmregwithsuffix}, 90 \\n\\t" \\\n' - else: - if op2 is None: - d['C'] += F' {self.name} = {self.name} + timesI({op1.name}); \\\n' - d['I'] += F' {self.name} = svcadd_x(pg1, {self.name}, {op1.name}, 90); \\\n' - d['A'] += F' "fcadd {self.asmregwithsuffix}, {pg1.asmreg}/m, {self.asmregwithsuffix}, {op1.asmregwithsuffix}, 90 \\n\\t" \\\n' - else: - d['C'] += F' {self.name} = {op1.name} + timesI({op2.name}); \\\n' - d['I'] += F' {self.name} = svcadd_x(pg1, {op1.name}, {op2.name}, 90); \\\n' - d['A'] += F' "fcadd {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 90 \\n\\t" \\\n' - - def subTimesI(self, op1, op2=None, constructive=False): - global d - d['subTimesI'] += d['factor'] - - # no movprfx intrinsics support - if constructive == True: - d['movprfx'] += d['factor'] - d['I'] += F' {self.name} = svcadd_x(pg1, {op1.name}, {op2.name}, 270); \\\n' - d['A'] += F' "movprfx {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix} \\n\\t" \\\n' - d['A'] += F' "fcadd {self.asmregwithsuffix}, {pg1.asmreg}/m, {self.asmregwithsuffix}, {op2.asmregwithsuffix}, 270 \\n\\t" \\\n' - else: - if op2 is None: - d['C'] += F' {self.name} = {self.name} - timesI({op1.name}); \\\n' - d['I'] += F' {self.name} = svcadd_x(pg1, {self.name}, {op1.name}, 270); \\\n' - d['A'] += F' "fcadd {self.asmregwithsuffix}, {pg1.asmreg}/m, {self.asmregwithsuffix}, {op1.asmregwithsuffix}, 270 \\n\\t" \\\n' - else: - d['C'] += F' {self.name} = {op1.name} - timesI({op2.name}); \\\n' - d['I'] += F' {self.name} = svcadd_x(pg1, {op1.name}, {op2.name}, 270); \\\n' - d['A'] += F' "fcadd {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 270 \\n\\t" \\\n' - - # timesMinusI is not used, def is probably wrong !!!! 
OPTIMIZATION with table - def timesMinusI(self, op1): - global d - d['timesMinusI'] += d['factor'] - d['C'] += F' {self.name} = timesMinusI({self.name}); \\\n' - d['I'] += F' {self.name} = svtrn1({op1.name}, {op1.name}); \\\n' - d['I'] += F' {self.name} = svneg_x(pg1, {self.name}); \\\n' - d['I'] += F' {self.name} = svtrn1({op1.name}, {self.name}); \\\n' - - def permute(self, dir, tablereg=None): - global d - d['permutes'] += d['factor'] - - d['C'] += F' permute{dir}({self.name}, {self.name}); \\\n' - - d['I'] += F' {self.name} = svtbl({self.name}, {tablereg.name}); \\\n' - d['A'] += F' "tbl {self.asmregwithsuffix}, {{ {self.asmregwithsuffix} }}, {tablereg.asmregwithsuffix} \\n\\t" \\\n' - - # if dir == 0: - # d['I'] += F' {self.name} = svext({self.name}, {self.name}, 4); \\\n' - # # this might not work, see intrinsics assembly - # # d['A'] += F' ext {self.name}, {self.name}, {self.name}, #4 \\\n' - # # use registers directly - # d['A'] += F' "ext {self.asmregbyte}, {self.asmregbyte}, {self.asmregbyte}, 32 \\n\\t" \\\n' - # - # elif dir in [1, 2]: - # d['I'] += F' {self.name} = svtbl({self.name}, {tablereg.name}); \\\n' - # d['A'] += F' "tbl {self.asmregwithsuffix}, {{ {self.asmregwithsuffix} }}, {tablereg.asmregwithsuffix} \\n\\t" \\\n' - - def debug(self): - global d - typecast = d['cfloat'] - gpr = d['asmdebugptr'] - vregs = d['asmclobberlist'] - if (d['debug'] == True): - d['C'] += F'std::cout << "{self.name} -- " << {self.name} << std::endl; \\\n' - - d['I'] += F'svst1(pg1, ({typecast}*)&debugreg.v, {self.name}); \\\n' - d['I'] += F'std::cout << "{self.name} -- " << debugreg << std::endl; \\\n' - #d['I'] += F'std::cout << "{self.name} -- " << {self.name} << std::endl; \\\n' - - d['A'] += F'asm ( \\\n' - d['A'] += F' " DMB SY \\n\\t " " DSB SY \\n\\t " " ISB SY \\n\\t " \\\n' # memory barrier - d['A'] += F' "str {self.asmreg}, [%[ptr]] \\n\\t" \\\n' - d['A'] += F' " DMB SY \\n\\t " " DSB SY \\n\\t " " ISB SY \\n\\t " \\\n' # memory barrier - d['A'] += F' : "=m" (debugreg.v) \\\n' - d['A'] += F' : [ptr] "r" (&debugreg.v) \\\n' - d['A'] += F' : "p5", "cc", "memory" \\\n' - d['A'] += F'); \\\n' - d['A'] += F'std::cout << "{self.name} -- " << debugreg << std::endl; \\\n' - # this form of addressing is not valid! 
- #d['A'] += F' "str {self.asmreg}, %[ptr] \\n\\t" \\\n' -# end Register - -def define(s, target='ALL'): - x = F'#define {s} \n' - global d - if (target in ['ALL', 'C']): - d['C'] += x - if (target in ['ALL', 'I']): - d['I'] += x - if (target in ['ALL', 'A']): - d['A'] += x - -def definemultiline(s): - x = F'#define {s} \\\n' - global d - d['C'] += x - d['I'] += x - d['A'] += x - -def write(s, target='ALL'): - x = F'{s}\n' - global d - if (target in ['ALL', 'C']): - d['C'] += x - if (target in ['ALL', 'I']): - d['I'] += x - if (target in ['ALL', 'A']): - d['A'] += x - -def curlyopen(): - write(F'{{ \\') - -def curlyclose(): - write(F'}}') - -def newline(target='ALL'): - global d - - if target == 'A': - if d['A'][-2:] == '\\\n': - d['A'] = d['A'][:-2] + '\n\n' - else: - if d['C'][-2:] == '\\\n': - d['C'] = d['C'][:-2] + '\n\n' - if d['I'][-2:] == '\\\n': - d['I'] = d['I'][:-2] + '\n\n' - if d['A'][-2:] == '\\\n': - d['A'] = d['A'][:-2] + '\n\n' - -# load the base pointer for fetches -def fetch_base_ptr(address, target='A'): - global d - #d['load'] += d['factor'] - - # DEBUG - #colors=3 - #indices = re.findall(r'\d+', address) - #index = (int(indices[0]) - FETCH_BASE_PTR_COLOR_OFFSET) * colors + int(indices[1]) - #print(F'{address} (base)') - - vregs = d['asmclobberlist'] - if target == 'A': - d['asminput'].append(F'[fetchptr] "r" ({address})') - d['asmclobber'].extend(vregs) - d['asmclobber'].append(F'"memory"') - d['asmclobber'].append(F'"cc"') - if target == 'I': - #print("intrinfetchbase = ", address) - d['intrinfetchbase'] = address - -# load the base pointer for stores -def store_base_ptr(address, target='A'): - global d - #d['load'] += d['factor'] - gpr = d['asmstorebaseptr'] - vregs = d['asmclobberlist'] - if target == 'A': - d['asminput'].append(F'[storeptr] "r" ({address})') - d['asmclobber'].extend(vregs) - d['asmclobber'].append(F'"memory"') - d['asmclobber'].append(F'"cc"') - if target == 'I': - d['intrinstorebase'] = address - -def prefetch_L1(address, offset): - global d - multiplier = 4 # offset in CL, have to multiply by 4 - policy = "PLDL1STRM" # weak - #policy = "PLDL1KEEP" # strong - - d['I'] += F' svprfd(pg1, (int64_t*)({address} + {offset * multiplier * 64}), SV_{policy}); \\\n' - d['A'] += F' "prfd {policy}, {pg1.asmreg}, [%[fetchptr], {offset * multiplier}, mul vl] \\n\\t" \\\n' - -def prefetch_L2(address, offset): - global d - multiplier = 4 # offset in CL, have to multiply by 4 - policy = "PLDL2STRM" # weak - #policy = "PLDL2KEEP" # strong - - d['I'] += F' svprfd(pg1, (int64_t*)({address} + {offset * multiplier * 64}), SV_{policy}); \\\n' - d['A'] += F' "prfd {policy}, {pg1.asmreg}, [%[fetchptr], {offset * multiplier}, mul vl] \\n\\t" \\\n' - #d['A'] += - -def prefetch_L2_store(address, offset): - global d - multiplier = 4 # offset in CL, have to multiply by 4 - policy = "PSTL2STRM" # weak - #policy = "PSTL2KEEP" # strong - - d['I'] += F' svprfd(pg1, (int64_t*)({address} + {offset * multiplier * 64}), SV_{policy}); \\\n' - d['A'] += F' "prfd {policy}, {pg1.asmreg}, [%[fetchptr], {offset * multiplier}, mul vl] \\n\\t" \\\n' - -def prefetch_L1_store(address, offset): - global d - multiplier = 4 # offset in CL, have to multiply by 4 - policy = "PSTL1STRM" # weak - #policy = "PSTL2KEEP" # strong - - d['I'] += F' svprfd(pg1, (int64_t*)({address} + {offset * multiplier * 64}), SV_{policy}); \\\n' - d['A'] += F' "prfd {policy}, {pg1.asmreg}, [%[fetchptr], {offset * multiplier}, mul vl] \\n\\t" \\\n' - - -def asmopen(): - #write('asm volatile ( \\', target='A') - write('asm 
( \\', target='A') - - # DEBUG - #write(F' " DMB SY \\n\\t " " DSB SY \\n\\t " " ISB SY \\n\\t " \\', target='A') # memory barrier - #write('asm volatile ( \\', target='A') - -def asmclose(): - global d - - #print(d['asminput']) - - asmin = d['asminput'] - asmin_s = '' - if len(asmin) > 0: - asmin = list(dict.fromkeys(asmin)) # remove duplicates - #print(asmin) - for el in asmin: - asmin_s += el + ',' - asmin_s = asmin_s[:-1] - #print("-> ", asmin_s) - - d['asminput'] = [] - - asmout = d['asmoutput'] - asmout_s = '' - if len(asmout) > 0: - asmout = list(dict.fromkeys(asmout)) # remove duplicates - for el in asmout: - asmout_s += el + ',' - asmout_s = asmout_s[:-1] - - d['asmoutput'] = [] - - # DEBUG put all regs into clobber by default - d['asmclobber'].extend(d['asmclobberlist']) - - asmclobber = d['asmclobber'] - asmclobber_s = '' - #print(asmclobber) - if len(asmclobber) > 0: - asmclobber = list(dict.fromkeys(asmclobber)) # remove duplicates - for el in asmclobber: - asmclobber_s += el + ',' - asmclobber_s = asmclobber_s[:-1] - - d['asmclobber'] = [] - - # DEBUG - #write(F' " DMB SY \\n\\t " " DSB SY \\n\\t " " ISB SY \\n\\t " \\', target='A') # memory barrier - - - write(F' : {asmout_s} \\', target='A') - write(F' : {asmin_s} \\', target='A') - write(F' : {asmclobber_s} \\', target='A') - write('); \\', target='A') - -# -------------------------------------------------------------------------------- - -# string of vector registers to be used in clobber list -#clobberlist = ['"p0"'] -clobberlist = ['"p5"'] -clobberlist.append('"cc"') -for i in range(0, 32): - clobberlist.append(F'"z{i}"') - -d = { -'debug': _DEBUG, -'C': '', -'I': '', -'A': '', -'asmsuffix': '.d', # double precision by default -'cfloat': 'float64_t', -'registers': 0, -'load': 0, -'store': 0, -'move': 0, -'movprfx': 0, -'zero': 0, -'add': 0, -'sub': 0, -'mul': 0, -'mac': 0, -'permutes': 0, -'neg': 0, -'addTimesI': 0, -'subTimesI': 0, -'timesI': 0, -'timesMinusI': 0, -'flops': 0, -'factor': 1, # multiplicity -'asmtableptr': 'x30', -'asmfetchbaseptr': 'x29', -'asmstorebaseptr': 'x28', -'asmdebugptr': 'r12', -'asminput': [], -'asmoutput': [], -'asmclobber': [], -'asmclobberlist': clobberlist, -'intrinfetchbase': '', -'intrinstorebase': '', -'cycles_LOAD_CHIMU': 0, -'cycles_PROJ': 0, -'cycles_PERM': 0, -'cycles_MULT_2SPIN': 0, -'cycles_RECON': 0, -'cycles_RESULT': 0, -'cycles_ZERO_PSI': 0, -'cycles_PREFETCH_L1': 0, -'cycles_PREFETCH_L2': 0 -} - -if PRECISION == 'single': - d['asmsuffix'] = '.s' - d['cfloat'] = 'float32_t' - -# -------------------------------------------------------------------------------- -# Grid -# -------------------------------------------------------------------------------- - -# Variables / Registers -result_00 = Register('result_00', asmreg='z0') -result_01 = Register('result_01', asmreg='z1') -result_02 = Register('result_02', asmreg='z2') -result_10 = Register('result_10', asmreg='z3') -result_11 = Register('result_11', asmreg='z4') -result_12 = Register('result_12', asmreg='z5') -result_20 = Register('result_20', asmreg='z6') -result_21 = Register('result_21', asmreg='z7') -result_22 = Register('result_22', asmreg='z8') -result_30 = Register('result_30', asmreg='z9') -result_31 = Register('result_31', asmreg='z10') -result_32 = Register('result_32', asmreg='z11') # 12 Regs -Chi_00 = Register('Chi_00', asmreg='z12') -Chi_01 = Register('Chi_01', asmreg='z13') -Chi_02 = Register('Chi_02', asmreg='z14') -Chi_10 = Register('Chi_10', asmreg='z15') -Chi_11 = Register('Chi_11', asmreg='z16') -Chi_12 = 
Register('Chi_12', asmreg='z17') # 6 -UChi_00 = Register('UChi_00', asmreg='z18') -UChi_01 = Register('UChi_01', asmreg='z19') -UChi_02 = Register('UChi_02', asmreg='z20') -UChi_10 = Register('UChi_10', asmreg='z21') -UChi_11 = Register('UChi_11', asmreg='z22') -UChi_12 = Register('UChi_12', asmreg='z23') # 6 -U_00 = Register('U_00', asmreg='z24') -U_10 = Register('U_10', asmreg='z25') -U_20 = Register('U_20', asmreg='z26') -U_01 = Register('U_01', asmreg='z27') -U_11 = Register('U_11', asmreg='z28') -U_21 = Register('U_21', asmreg='z29') # 6 -> 30 Registers - -table0 = Register('table0', asmreg='z30') -zero0 = Register('zero0', asmreg='z31') # 2 -> 32 Registers -# can't overload temp1 / table due to type mismatch using intrinsics :( -# typecasting SVE intrinsics variables is not allowed - -pg1 = Register('pg1', predication=True, asmreg='p5') -#pg2 = Register('pg2', predication=True, asmreg='p1') - -# Overloaded with Chi_* and UChi_* -Chimu_00 = Register('Chimu_00', asmreg=Chi_00.asmreg) -Chimu_01 = Register('Chimu_01', asmreg=Chi_01.asmreg) -Chimu_02 = Register('Chimu_02', asmreg=Chi_02.asmreg) -Chimu_10 = Register('Chimu_10', asmreg=Chi_10.asmreg) -Chimu_11 = Register('Chimu_11', asmreg=Chi_11.asmreg) -Chimu_12 = Register('Chimu_12', asmreg=Chi_12.asmreg) -if ALTERNATIVE_REGISTER_MAPPING == False: - Chimu_20 = Register('Chimu_20', asmreg=UChi_00.asmreg) - Chimu_21 = Register('Chimu_21', asmreg=UChi_01.asmreg) - Chimu_22 = Register('Chimu_22', asmreg=UChi_02.asmreg) - Chimu_30 = Register('Chimu_30', asmreg=UChi_10.asmreg) - Chimu_31 = Register('Chimu_31', asmreg=UChi_11.asmreg) - Chimu_32 = Register('Chimu_32', asmreg=UChi_12.asmreg) # 12 Registers -else: # wilson4.h - Chimu_20 = Register('Chimu_20', asmreg=U_00.asmreg) - Chimu_21 = Register('Chimu_21', asmreg=U_10.asmreg) - Chimu_22 = Register('Chimu_22', asmreg=U_20.asmreg) - Chimu_30 = Register('Chimu_30', asmreg=U_01.asmreg) - Chimu_31 = Register('Chimu_31', asmreg=U_11.asmreg) - Chimu_32 = Register('Chimu_32', asmreg=U_21.asmreg) - -# debugging output -def debugall(msg=None, group='ALL'): - global d - if (d['debug'] == False): - return - write(F'std::cout << std::endl << "DEBUG -- {msg}" << std::endl; \\') - if (group in ['ALL', 'result']): - result_00.debug() - result_01.debug() - result_02.debug() - result_10.debug() - result_11.debug() - result_12.debug() - result_20.debug() - result_21.debug() - result_22.debug() - result_30.debug() - result_31.debug() - result_32.debug() - if (group in ['ALL', 'Chi']): - Chi_00.debug() - Chi_01.debug() - Chi_02.debug() - Chi_10.debug() - Chi_11.debug() - Chi_12.debug() - if (group in ['ALL', 'UChi']): - UChi_00.debug() - UChi_01.debug() - UChi_02.debug() - UChi_10.debug() - UChi_11.debug() - UChi_12.debug() - if (group in ['ALL', 'U']): - U_00.debug() - U_10.debug() - U_20.debug() - U_01.debug() - U_11.debug() - U_21.debug() - if (group in ['ALL', 'Chimu']): - Chimu_00.debug() - Chimu_01.debug() - Chimu_02.debug() - Chimu_10.debug() - Chimu_11.debug() - Chimu_12.debug() - Chimu_20.debug() - Chimu_21.debug() - Chimu_22.debug() - Chimu_30.debug() - Chimu_31.debug() - Chimu_32.debug() - -# -------------------------------------------------------------------------------- -# Output -# -------------------------------------------------------------------------------- - -if ALTERNATIVE_LOADS == True: - define(F'LOAD_CHIMU_0213_PLUG LOAD_CHIMU_0213_{PRECSUFFIX}') - define(F'LOAD_CHIMU_0312_PLUG LOAD_CHIMU_0312_{PRECSUFFIX}') - define(F'LOAD_CHIMU(x)') -else: - #define(F'LOAD_CHIMU_{PRECSUFFIX}(x) 
LOAD_CHIMU_INTERLEAVED_{PRECSUFFIX}(x)') - define(F'LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_{PRECSUFFIX}(base)') - -if PREFETCH: - define(F'PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_{PRECSUFFIX}(A)') - define(F'PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_{PRECSUFFIX}(A)') - define(F'PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_{PRECSUFFIX}(A)') - define(F'PREFETCH_GAUGE_L2(A) PREFETCH_GAUGE_L2_INTERNAL_{PRECSUFFIX}(A)') - define(F'PF_GAUGE(A)') - define(F'PREFETCH_RESULT_L2_STORE(A) PREFETCH_RESULT_L2_STORE_INTERNAL_{PRECSUFFIX}(A)') - define(F'PREFETCH_RESULT_L1_STORE(A) PREFETCH_RESULT_L1_STORE_INTERNAL_{PRECSUFFIX}(A)') - define(F'PREFETCH1_CHIMU(A) PREFETCH_CHIMU_L1(A)') -# define(F'PREFETCH1_CHIMU(A)') - define(F'PREFETCH_CHIMU(A) PREFETCH_CHIMU_L1(A)') -# define(F'PREFETCH_CHIMU(A)') -else: - define(F'PREFETCH_CHIMU_L1(A)') - define(F'PREFETCH_GAUGE_L1(A)') - define(F'PREFETCH_CHIMU_L2(A)') - define(F'PREFETCH_GAUGE_L2(A)') - define(F'PF_GAUGE(A)') - define(F'PREFETCH1_CHIMU(A)') - define(F'PREFETCH_CHIMU(A)') - define(F'PREFETCH_RESULT_L2_STORE(A)') - -# standard defines -define(F'LOCK_GAUGE(A)') -define(F'UNLOCK_GAUGE(A)') -define(F'MASK_REGS DECLARATIONS_{PRECSUFFIX}') -define(F'SAVE_RESULT(A,B) RESULT_{PRECSUFFIX}(A); PREFETCH_RESULT_L2_STORE(B)') -define(F'MULT_2SPIN_1(Dir) MULT_2SPIN_1_{PRECSUFFIX}(Dir)') -define(F'MULT_2SPIN_2 MULT_2SPIN_2_{PRECSUFFIX}') -define(F'LOAD_CHI(base) LOAD_CHI_{PRECSUFFIX}(base)') -# don't need zero psi, everything is done in recons -#define(F'ZERO_PSI ZERO_PSI_{PRECSUFFIX}') -define(F'ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_{PRECSUFFIX}; RESULT_{PRECSUFFIX}(base)') -# loads projections -define(F'XP_PROJ XP_PROJ_{PRECSUFFIX}') -define(F'YP_PROJ YP_PROJ_{PRECSUFFIX}') -define(F'ZP_PROJ ZP_PROJ_{PRECSUFFIX}') -define(F'TP_PROJ TP_PROJ_{PRECSUFFIX}') -define(F'XM_PROJ XM_PROJ_{PRECSUFFIX}') -define(F'YM_PROJ YM_PROJ_{PRECSUFFIX}') -define(F'ZM_PROJ ZM_PROJ_{PRECSUFFIX}') -define(F'TM_PROJ TM_PROJ_{PRECSUFFIX}') -# recons -define(F'XP_RECON XP_RECON_{PRECSUFFIX}') -define(F'XM_RECON XM_RECON_{PRECSUFFIX}') -define(F'XM_RECON_ACCUM XM_RECON_ACCUM_{PRECSUFFIX}') -define(F'YM_RECON_ACCUM YM_RECON_ACCUM_{PRECSUFFIX}') -define(F'ZM_RECON_ACCUM ZM_RECON_ACCUM_{PRECSUFFIX}') -define(F'TM_RECON_ACCUM TM_RECON_ACCUM_{PRECSUFFIX}') -define(F'XP_RECON_ACCUM XP_RECON_ACCUM_{PRECSUFFIX}') -define(F'YP_RECON_ACCUM YP_RECON_ACCUM_{PRECSUFFIX}') -define(F'ZP_RECON_ACCUM ZP_RECON_ACCUM_{PRECSUFFIX}') -define(F'TP_RECON_ACCUM TP_RECON_ACCUM_{PRECSUFFIX}') -# new permutes -define(F'PERMUTE_DIR0 0') -define(F'PERMUTE_DIR1 1') -define(F'PERMUTE_DIR2 2') -define(F'PERMUTE_DIR3 3') -define(F'PERMUTE PERMUTE_{PRECSUFFIX};') -# load table -#define(F'MAYBEPERM(A,perm) if (perm) {{ A ; }}') -if PRECISION == 'double': - define(F'LOAD_TABLE(Dir) if (Dir == 0) {{ LOAD_TABLE0; }} else if (Dir == 1) {{ LOAD_TABLE1; }} else if (Dir == 2) {{ LOAD_TABLE2; }}') - define(F'MAYBEPERM(Dir,perm) if (Dir != 3) {{ if (perm) {{ PERMUTE; }} }}') -else: - define(F'LOAD_TABLE(Dir) if (Dir == 0) {{ LOAD_TABLE0; }} else if (Dir == 1) {{ LOAD_TABLE1 }} else if (Dir == 2) {{ LOAD_TABLE2; }} else if (Dir == 3) {{ LOAD_TABLE3; }}') - define(F'MAYBEPERM(A,perm) if (perm) {{ PERMUTE; }}') - - - -write('// DECLARATIONS') -definemultiline(F'DECLARATIONS_{PRECSUFFIX}') -# debugging register -if d['debug'] == True: - write(' Simd debugreg; \\') -# perm tables -if PRECISION == 'double': - write(' const uint64_t lut[4][8] = { \\') - write(' {4, 5, 6, 7, 0, 1, 2, 3}, \\') #0 = 
swap register halves - write(' {2, 3, 0, 1, 6, 7, 4, 5}, \\') #1 = swap halves of halves - write(' {1, 0, 3, 2, 5, 4, 7, 6}, \\') #2 = swap re/im - write(' {0, 1, 2, 4, 5, 6, 7, 8} };\\') #3 = identity -else: - write(' const uint32_t lut[4][16] = { \\') - write(' {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}, \\') #0 = swap register halves - write(' {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}, \\') #1 = swap halves of halves - write(' {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}, \\') #2 = swap halves of halves of halves - write(' {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14} }; \\') #3 = swap re/im - -#newline(target='A') -result_00.declare() -result_01.declare() -result_02.declare() -result_10.declare() -result_11.declare() -result_12.declare() -result_20.declare() -result_21.declare() -result_22.declare() -result_30.declare() -result_31.declare() -result_32.declare() # 12 -Chi_00.declare() -Chi_01.declare() -Chi_02.declare() -Chi_10.declare() -Chi_11.declare() -Chi_12.declare() # 6 -UChi_00.declare() -UChi_01.declare() -UChi_02.declare() -UChi_10.declare() -UChi_11.declare() -UChi_12.declare() # 6 -U_00.declare() -U_10.declare() -U_20.declare() -U_01.declare() -U_11.declare() -U_21.declare() # 6 -> 30 regs - -# all predications true -pg1.declare() -if PRECISION == 'double': - pg1.movestr('svptrue_b64()') -else: - pg1.movestr('svptrue_b32()') - -# tables -if PRECISION == 'double': - write(' svuint64_t table0; \\', target='I') # -> 31 regs -else: - write(' svuint32_t table0; \\', target='I') # -> 31 regs - -zero0.declare() - -# zero register -asmopen() -zero0.zero(zeroreg=True) -asmclose() -newline() - -define('Chimu_00 Chi_00', target='I') -define('Chimu_01 Chi_01', target='I') -define('Chimu_02 Chi_02', target='I') -define('Chimu_10 Chi_10', target='I') -define('Chimu_11 Chi_11', target='I') -define('Chimu_12 Chi_12', target='I') -if ALTERNATIVE_REGISTER_MAPPING == False: - define('Chimu_20 UChi_00', target='I') - define('Chimu_21 UChi_01', target='I') - define('Chimu_22 UChi_02', target='I') - define('Chimu_30 UChi_10', target='I') - define('Chimu_31 UChi_11', target='I') - define('Chimu_32 UChi_12', target='I') -else: # wilson4.h - define('Chimu_20 U_00', target='I') - define('Chimu_21 U_10', target='I') - define('Chimu_22 U_20', target='I') - define('Chimu_30 U_01', target='I') - define('Chimu_31 U_11', target='I') - define('Chimu_32 U_21', target='I') -newline() - - -d['cycles_RESULT'] += 12 -write('// RESULT') -definemultiline(F'RESULT_{PRECSUFFIX}(base)') -if ASM_STORE: - curlyopen() - #write(' SiteSpinor & ref(out[ss]); \\') - asmopen() - #pg1.loadpredication() - #store_base_ptr("&ref[0][0]") - #store_base_ptr(F"&ref[{STORE_BASE_PTR_COLOR_OFFSET}][0]") - store_base_ptr(F"base + {STORE_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='I') - store_base_ptr(F"base + {STORE_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='A') - result_00.store("ref[0][0]") - result_01.store("ref[0][1]") - result_02.store("ref[0][2]") - result_10.store("ref[1][0]") - result_11.store("ref[1][1]") - result_12.store("ref[1][2]") - result_20.store("ref[2][0]") - result_21.store("ref[2][1]") - result_22.store("ref[2][2]") - result_30.store("ref[3][0]") - result_31.store("ref[3][1]") - result_32.store("ref[3][2]") - asmclose() - debugall('RESULT', group='result') - curlyclose() -newline() - -# prefetch spinors from memory into L2 cache -d['factor'] = 0 -d['cycles_PREFETCH_L2'] += 0 * d['factor'] -write('// PREFETCH_CHIMU_L2 (prefetch to L2)') 
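
The generator being deleted here still emits SV_PSTL2STRM store prefetches (prefetch_L2_store above), while the hand-edited headers earlier in this patch replace the PREFETCH_RESULT_L2_STORE internals with dc zva. That instruction zeroes and allocates a whole write block in cache rather than fetching it from memory, so the full-line stores of the result spinor skip the read-for-ownership; the macros step by 256 bytes, matching the A64FX cache line, and the result spinor is 12 vectors x 64 B = 768 B, hence exactly three dc zva per site. This is only safe while RESULT_A64FXd/f really overwrite all 768 bytes; note that the single-precision SAVE_RESULT above no longer invokes PREFETCH_RESULT_L2_STORE at all. A minimal sketch for querying the ZVA block size at runtime, not part of the patch:

    #include <cstdint>
    #include <cstdio>

    // DCZID_EL0 is readable at EL0: bits [3:0] hold log2 of the DC ZVA
    // block size in 4-byte words; bit 4 set means DC ZVA is prohibited.
    int main() {
      uint64_t dczid;
      asm volatile("mrs %0, dczid_el0" : "=r"(dczid));
      if (dczid & (1ull << 4))
        std::puts("DC ZVA prohibited");
      else
        std::printf("DC ZVA block: %llu bytes\n",
                    (unsigned long long)(4ull << (dczid & 0xfull)));
      return 0;
    }
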
-definemultiline(F'PREFETCH_CHIMU_L2_INTERNAL_{PRECSUFFIX}(base)') -curlyopen() -fetch_base_ptr(F"base") -asmopen() -#pg1.loadpredication() -#fetch_base_ptr(F"&ref[{FETCH_BASE_PTR_COLOR_OFFSET}][0]") -fetch_base_ptr(F"base", target='A') -prefetch_L2(F"base", 0) -prefetch_L2(F"base", 1) -prefetch_L2(F"base", 2) -asmclose() -curlyclose() -newline() - -# prefetch spinors from memory into L1 cache -d['factor'] = 0 -d['cycles_PREFETCH_L1'] += 0 * d['factor'] -write('// PREFETCH_CHIMU_L1 (prefetch to L1)') -definemultiline(F'PREFETCH_CHIMU_L1_INTERNAL_{PRECSUFFIX}(base)') -curlyopen() -fetch_base_ptr(F"base") -asmopen() -#pg1.loadpredication() -fetch_base_ptr(F"base", target='A') -prefetch_L1(F"base", 0) -prefetch_L1(F"base", 1) -prefetch_L1(F"base", 2) -asmclose() -curlyclose() -newline() - -# prefetch gauge from memory into L2 cache -d['factor'] = 0 -d['cycles_PREFETCH_L2'] += 0 * d['factor'] -write('// PREFETCH_GAUGE_L2 (prefetch to L2)') -definemultiline(F'PREFETCH_GAUGE_L2_INTERNAL_{PRECSUFFIX}(A)') -curlyopen() -if GRIDBENCH: # referencing differs in Grid and GridBench - write(' const auto & ref(U[sUn][A]); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \\') -else: - write(' const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \\') -asmopen() -#pg1.loadpredication() -#fetch_base_ptr(F"&ref[{FETCH_BASE_PTR_COLOR_OFFSET}][0]") -fetch_base_ptr(F"baseU", target='A') -prefetch_L2(F"baseU", -1) -prefetch_L2(F"baseU", 0) -prefetch_L2(F"baseU", 1) -prefetch_L2(F"baseU", 2) -prefetch_L2(F"baseU", 3) -prefetch_L2(F"baseU", 4) -prefetch_L2(F"baseU", 5) -prefetch_L2(F"baseU", 6) -prefetch_L2(F"baseU", 7) -#prefetch_L2(F"baseU", 8) -asmclose() -curlyclose() -newline() - -# prefetch gauge from memory into L1 cache -d['factor'] = 0 -d['cycles_PREFETCH_L1'] += 0 * d['factor'] -write('// PREFETCH_GAUGE_L1 (prefetch to L1)') -definemultiline(F'PREFETCH_GAUGE_L1_INTERNAL_{PRECSUFFIX}(A)') -curlyopen() -if GRIDBENCH: # referencing differs in Grid and GridBench - write(' const auto & ref(U[sU][A]); uint64_t baseU = (uint64_t)&ref; \\') -else: - write(' const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \\') -asmopen() -#pg1.loadpredication() -#fetch_base_ptr(F"&ref[{FETCH_BASE_PTR_COLOR_OFFSET}][0]") -fetch_base_ptr(F"baseU", target='A') -prefetch_L1(F"baseU", 0) -prefetch_L1(F"baseU", 1) -prefetch_L1(F"baseU", 2) -asmclose() -curlyclose() -newline() - -d['factor'] = 0 -write('// LOAD_CHI') -definemultiline(F'LOAD_CHI_{PRECSUFFIX}(base)') -if ASM_LOAD_CHIMU: - curlyopen() - #write(' const SiteSpinor & ref(in[offset]); \\') - asmopen() - #fetch_base_ptr(F"base + {FETCH_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='I') - #fetch_base_ptr(F"base + {FETCH_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='A') - fetch_base_ptr(F"base", target='I') - fetch_base_ptr(F"base", target='A') - - Chi_00.load("ref[0][0]", offset=0) - Chi_01.load("ref[0][1]", offset=0) - Chi_02.load("ref[0][2]", offset=0) - Chi_10.load("ref[1][0]", offset=0) - Chi_11.load("ref[1][1]", offset=0) - Chi_12.load("ref[1][2]", offset=0) - asmclose() - debugall('LOAD_CHI', group='Chi') - curlyclose() -newline() - - - -d['factor'] = 8 -# 12 loads = 12 issues, load latency = 8+1 cycles -# (not perfectly clear to me from docs) -d['cycles_LOAD_CHIMU'] += 11 * d['factor'] -write('// LOAD_CHIMU') -definemultiline(F'LOAD_CHIMU_INTERLEAVED_{PRECSUFFIX}(base)') -if ASM_LOAD_CHIMU: - curlyopen() - #write(' const SiteSpinor & ref(in[offset]); \\') - asmopen() - pg1.loadpredication() - #fetch_base_ptr("&ref[0][0]") - 
#fetch_base_ptr(F"&ref[{FETCH_BASE_PTR_COLOR_OFFSET}][0]") - fetch_base_ptr(F"base + {FETCH_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='I') - fetch_base_ptr(F"base + {FETCH_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='A') - # Chimu_00.load("ref[0][0]") - # Chimu_01.load("ref[0][1]") - # Chimu_02.load("ref[0][2]") - # Chimu_10.load("ref[1][0]") - # Chimu_11.load("ref[1][1]") - # Chimu_12.load("ref[1][2]") - # Chimu_20.load("ref[2][0]") - # Chimu_21.load("ref[2][1]") - # Chimu_22.load("ref[2][2]") - # Chimu_30.load("ref[3][0]") - # Chimu_31.load("ref[3][1]") - # Chimu_32.load("ref[3][2]") - - Chimu_00.load("ref[0][0]") # minimum penalty for all directions - Chimu_30.load("ref[3][0]") - Chimu_10.load("ref[1][0]") - Chimu_20.load("ref[2][0]") - - Chimu_01.load("ref[0][1]") - Chimu_31.load("ref[3][1]") - Chimu_11.load("ref[1][1]") - Chimu_21.load("ref[2][1]") - - Chimu_02.load("ref[0][2]") - Chimu_32.load("ref[3][2]") - Chimu_12.load("ref[1][2]") - Chimu_22.load("ref[2][2]") - asmclose() - debugall('LOAD_CHIMU', group='Chimu') - curlyclose() -newline() - -# alternative load chimu: dirac order 0213 -# placed into asm (...) -d['factor'] = 0 -d['cycles_LOAD_CHIMU'] += 11 * d['factor'] -write('// LOAD_CHIMU_0213') -definemultiline(F'LOAD_CHIMU_0213_{PRECSUFFIX}') -if ASM_LOAD_CHIMU: - curlyopen() - write(' const SiteSpinor & ref(in[offset]); \\') - asmopen() - pg1.loadpredication() - fetch_base_ptr(F"&ref[{FETCH_BASE_PTR_COLOR_OFFSET}][0]") - Chimu_00.load("ref[0][0]") # reordered - Chimu_20.load("ref[2][0]") - - Chimu_01.load("ref[0][1]") - Chimu_21.load("ref[2][1]") - - Chimu_02.load("ref[0][2]") - Chimu_22.load("ref[2][2]") - - Chimu_10.load("ref[1][0]") - Chimu_30.load("ref[3][0]") - - Chimu_11.load("ref[1][1]") - Chimu_31.load("ref[3][1]") - - Chimu_12.load("ref[1][2]") - Chimu_32.load("ref[3][2]") - asmclose() - debugall('LOAD_CHIMU_0213', group='Chimu') - curlyclose() -newline() - -# alternative load chimu: dirac order 0312 -# placed into asm (...) 
-d['factor'] = 0 -d['cycles_LOAD_CHIMU'] += 11 * d['factor'] -write('// LOAD_CHIMU_0312') -definemultiline(F'LOAD_CHIMU_0312_{PRECSUFFIX}') -if ASM_LOAD_CHIMU: - curlyopen() - write(' const SiteSpinor & ref(in[offset]); \\') - asmopen() - pg1.loadpredication() - fetch_base_ptr(F"&ref[{FETCH_BASE_PTR_COLOR_OFFSET}][0]") - Chimu_00.load("ref[0][0]") # reordered - Chimu_30.load("ref[3][0]") - - Chimu_01.load("ref[0][1]") - Chimu_31.load("ref[3][1]") - - Chimu_02.load("ref[0][2]") - Chimu_32.load("ref[3][2]") - - Chimu_10.load("ref[1][0]") - Chimu_20.load("ref[2][0]") - - Chimu_11.load("ref[1][1]") - Chimu_21.load("ref[2][1]") - - Chimu_12.load("ref[1][2]") - Chimu_22.load("ref[2][2]") - asmclose() - debugall('LOAD_CHIMU_0312', group='Chimu') - curlyclose() -newline() - -d['factor'] = 2 -d['cycles_PERM'] += 1 * d['factor'] -write('// LOAD_TABLE0') -definemultiline(F'LOAD_TABLE0') -asmopen() -table0.loadtable(0) -asmclose() -newline() - -d['factor'] = 2 -d['cycles_PERM'] += 1 * d['factor'] -write('// LOAD_TABLE1') -definemultiline(F'LOAD_TABLE1') -asmopen() -table0.loadtable(1) -asmclose() -newline() - -d['factor'] = 2 -d['cycles_PERM'] += 1 * d['factor'] -write('// LOAD_TABLE2') -definemultiline(F'LOAD_TABLE2') -asmopen() -table0.loadtable(2) -asmclose() -newline() - -d['factor'] = 0 -d['cycles_PERM'] += 1 * d['factor'] -write('// LOAD_TABLE3') -definemultiline(F'LOAD_TABLE3') -asmopen() -table0.loadtable(3) -asmclose() -newline() - -d['factor'] = 2 # factor is 2 -d['cycles_PERM'] += 6 * d['factor'] -write('// PERMUTE') -definemultiline(F'PERMUTE_{PRECSUFFIX}') -debugall('PERM PRE', group='Chi') -asmopen() -#table0.loadtable(2) -Chi_00.permute(2, table0) -Chi_01.permute(2, table0) -Chi_02.permute(2, table0) -Chi_10.permute(2, table0) -Chi_11.permute(2, table0) -Chi_12.permute(2, table0) -asmclose() -debugall('PERM POST', group='Chi') -newline() - -write('// LOAD_GAUGE') -definemultiline(F'LOAD_GAUGE') -if GRIDBENCH: # referencing differs in Grid and GridBench - write(' const auto & ref(U[sU][A]); uint64_t baseU = (uint64_t)&ref; \\') -else: - write(' const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \\') -curlyopen() -asmopen() -pg1.loadpredication() -fetch_base_ptr(F"baseU + {FETCH_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='I') -if ASM_LOAD_GAUGE: - fetch_base_ptr(F"baseU + {FETCH_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='A') - U_00.load("ref[0][0]") - U_10.load("ref[1][0]") - U_20.load("ref[2][0]") - U_01.load("ref[0][1]") - U_11.load("ref[1][1]") - U_21.load("ref[2][1]") -asmclose() -curlyclose() -newline() - -d['factor'] = 8 # MULT_2SPIN executes 1 time per direction = 8 times total -# assume all U loads are hidden -# FCMLA issue latency = 2 cycles -# measurement: latency = 16 cycles if FULLY pipelined !? 
-# spec says 6+6+9 cycles -# 6 rounds of FCMLA, each with 6 FCMLA -> 21 - 6*2 = 9 -d['cycles_MULT_2SPIN'] += 6 * 21 * d['factor'] -write('// MULT_2SPIN') -definemultiline(F'MULT_2SPIN_1_{PRECSUFFIX}(A)') -curlyopen() -#write(' const auto & ref(U[sU][A]); \\') -if GRIDBENCH: # referencing differs in Grid and GridBench - write(' const auto & ref(U[sU][A]); uint64_t baseU = (uint64_t)&ref; \\') -else: - write(' const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \\') -asmopen() -#pg1.loadpredication() -#fetch_base_ptr("&ref[0][0]") -fetch_base_ptr(F"baseU + {FETCH_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='I') -fetch_base_ptr(F"baseU + {FETCH_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='A') -#fetch_base_ptr(F"(uint64_t)&ref[{FETCH_BASE_PTR_COLOR_OFFSET}][0]", target='I') -#fetch_base_ptr(F"(uint64_t)&ref[{FETCH_BASE_PTR_COLOR_OFFSET}][0]", target='A') -#fetch_base_ptr(F"&ref[0][{FETCH_BASE_PTR_COLOR_OFFSET}]") -if ASM_LOAD_GAUGE: - U_00.load("ref[0][0]") - U_10.load("ref[1][0]") - U_20.load("ref[2][0]") - U_01.load("ref[0][1]") - U_11.load("ref[1][1]") - U_21.load("ref[2][1]") - -if MOVPRFX == False: - UChi_00.zero() # implementation specific - UChi_10.zero() - UChi_01.zero() - UChi_11.zero() - UChi_02.zero() - UChi_12.zero() - - # round 1 - UChi_00.mul0(U_00, Chi_00) # FCMLA latency is 6+6+9 cycles - UChi_10.mul0(U_00, Chi_10) - UChi_01.mul0(U_10, Chi_00) - UChi_11.mul0(U_10, Chi_10) - UChi_02.mul0(U_20, Chi_00) - UChi_12.mul0(U_20, Chi_10) -else: - # round 1 - UChi_00.mul0(zero0, U_00, Chi_00, constructive=True) # FCMLA latency is 6+6+9 cycles - UChi_10.mul0(zero0, U_00, Chi_10, constructive=True) - UChi_01.mul0(zero0, U_10, Chi_00, constructive=True) - UChi_11.mul0(zero0, U_10, Chi_10, constructive=True) - UChi_02.mul0(zero0, U_20, Chi_00, constructive=True) - UChi_12.mul0(zero0, U_20, Chi_10, constructive=True) - -# round 2 -UChi_00.mul1(U_00, Chi_00) -UChi_10.mul1(U_00, Chi_10) -UChi_01.mul1(U_10, Chi_00) -UChi_11.mul1(U_10, Chi_10) -UChi_02.mul1(U_20, Chi_00) -UChi_12.mul1(U_20, Chi_10) # Chi_00 and Chi_10 available from here - -if ASM_LOAD_GAUGE: - U_00.load("ref[0][2]") # U_00, U_10, U_20 overloaded - U_10.load("ref[1][2]") # early load - U_20.load("ref[2][2]") # A --> -asmclose() -debugall('MULT_2SPIN_1', group='UChi') -curlyclose() -newline() - -write('// MULT_2SPIN_BACKEND') -definemultiline(F'MULT_2SPIN_2_{PRECSUFFIX}') -curlyopen() -asmopen() -# round 3 -UChi_00.mac0(U_01, Chi_01) # armclang separates fcmla(..., 0) and -UChi_10.mac0(U_01, Chi_11) # fcmla(..., 90) -UChi_01.mac0(U_11, Chi_01) # autonomously using intrinsics -UChi_11.mac0(U_11, Chi_11) -UChi_02.mac0(U_21, Chi_01) -UChi_12.mac0(U_21, Chi_11) -# round 4 -UChi_00.mac1(U_01, Chi_01) -UChi_10.mac1(U_01, Chi_11) -UChi_01.mac1(U_11, Chi_01) -UChi_11.mac1(U_11, Chi_11) -UChi_02.mac1(U_21, Chi_01) -UChi_12.mac1(U_21, Chi_11) -# round 5 -UChi_00.mac0(U_00, Chi_02) # <-- A -UChi_10.mac0(U_00, Chi_12) -UChi_01.mac0(U_10, Chi_02) -UChi_11.mac0(U_10, Chi_12) -UChi_02.mac0(U_20, Chi_02) -UChi_12.mac0(U_20, Chi_12) -# round 6 -UChi_00.mac1(U_00, Chi_02) -UChi_10.mac1(U_00, Chi_12) -UChi_01.mac1(U_10, Chi_02) -UChi_11.mac1(U_10, Chi_12) -UChi_02.mac1(U_20, Chi_02) -UChi_12.mac1(U_20, Chi_12) -asmclose() -debugall('MULT_2SPIN_2', group='UChi') -curlyclose() -newline() - - -#// hspin(0)=fspin(0)+timesI(fspin(3)); -#// hspin(1)=fspin(1)+timesI(fspin(2)); -d['factor'] = 1 -# FCADD issue latency = 1, latency is 6+9 -d['cycles_PROJ'] += 15 * d['factor'] -write('// XP_PROJ') -definemultiline(F'XP_PROJ_{PRECSUFFIX}') -if 
ALTERNATIVE_LOADS == True: - write(' LOAD_CHIMU_0312_PLUG \\') -curlyopen() -asmopen() -#pg1.loadpredication() -Chi_00.addTimesI(Chimu_00, Chimu_30) -Chi_01.addTimesI(Chimu_01, Chimu_31) -Chi_02.addTimesI(Chimu_02, Chimu_32) -Chi_10.addTimesI(Chimu_10, Chimu_20) -Chi_11.addTimesI(Chimu_11, Chimu_21) -Chi_12.addTimesI(Chimu_12, Chimu_22) -asmclose() -debugall('XP_PROJ', group='Chi') -curlyclose() -newline() - -#// fspin(0)=hspin(0); -#// fspin(1)=hspin(1); -#// fspin(2)=timesMinusI(hspin(1)); -#// fspin(3)=timesMinusI(hspin(0)); -# does not occur in GridBench -d['factor'] = 0 -d['cycles_RECON'] += 15 * d['factor'] -write('// XP_RECON') -definemultiline(F'XP_RECON_{PRECSUFFIX}') -asmopen() -#pg1.loadpredication() -if MOVPRFX == False: - result_20.zero() - result_21.zero() - result_22.zero() - result_30.zero() - result_31.zero() - result_32.zero() - - result_20.subTimesI(UChi_10) - result_21.subTimesI(UChi_11) - result_22.subTimesI(UChi_12) - result_30.subTimesI(UChi_00) - result_31.subTimesI(UChi_01) - result_32.subTimesI(UChi_02) -else: - result_20.subTimesI(zero0, UChi_10, constructive=True) - result_21.subTimesI(zero0, UChi_11, constructive=True) - result_22.subTimesI(zero0, UChi_12, constructive=True) - result_30.subTimesI(zero0, UChi_00, constructive=True) - result_31.subTimesI(zero0, UChi_01, constructive=True) - result_32.subTimesI(zero0, UChi_02, constructive=True) - -result_00.move(UChi_00) # don't reorder ! -result_01.move(UChi_01) -result_02.move(UChi_02) -result_10.move(UChi_10) -result_11.move(UChi_11) -result_12.move(UChi_12) - -# result_00.add(UChi_00) # faster than move? -# result_01.add(UChi_01) -# result_02.add(UChi_02) -# result_10.add(UChi_10) -# result_11.add(UChi_11) -# result_12.add(UChi_12) -asmclose() -debugall('XP_RECON', group='result') -newline() - - -d['factor'] = 1 -# FCADD issue latency = 1, latency is 6+9 -d['cycles_RECON'] += 15 * d['factor'] -write('// XP_RECON_ACCUM') -definemultiline(F'XP_RECON_ACCUM_{PRECSUFFIX}') -asmopen() -#pg1.loadpredication() -# result_20.subTimesI(UChi_10) -# result_21.subTimesI(UChi_11) -# result_22.subTimesI(UChi_12) -# result_30.subTimesI(UChi_00) -# result_31.subTimesI(UChi_01) -# result_32.subTimesI(UChi_02) -# -# result_00.add(UChi_00) # reordered -# result_01.add(UChi_01) -# result_02.add(UChi_02) -# result_10.add(UChi_10) -# result_11.add(UChi_11) -# result_12.add(UChi_12) - -result_30.subTimesI(UChi_00) # reordered -result_00.add(UChi_00) - -result_31.subTimesI(UChi_01) -result_01.add(UChi_01) - -result_32.subTimesI(UChi_02) -result_02.add(UChi_02) - -result_20.subTimesI(UChi_10) -result_10.add(UChi_10) - -result_21.subTimesI(UChi_11) -result_11.add(UChi_11) - -result_22.subTimesI(UChi_12) -result_12.add(UChi_12) -asmclose() -debugall('XP_RECON_ACCUM', group='result') -newline() - -d['factor'] = 1 -# add/sub issue latency = 1, latency is 9 -d['cycles_PROJ'] += 9 * d['factor'] -write('// YP_PROJ') -definemultiline(F'YP_PROJ_{PRECSUFFIX}') -if ALTERNATIVE_LOADS == True: - write(' LOAD_CHIMU_0312_PLUG \\') -curlyopen() -asmopen() -#pg1.loadpredication() -Chi_00.sub(Chimu_00, Chimu_30) -Chi_01.sub(Chimu_01, Chimu_31) -Chi_02.sub(Chimu_02, Chimu_32) -Chi_10.add(Chimu_10, Chimu_20) -Chi_11.add(Chimu_11, Chimu_21) -Chi_12.add(Chimu_12, Chimu_22) -asmclose() -debugall('YP_PROJ', group='Chi') -curlyclose() -newline() - -d['factor'] = 1 -# FCADD issue latency = 1, latency is 6+9 -d['cycles_PROJ'] += 15 * d['factor'] -write('// ZP_PROJ') -definemultiline(F'ZP_PROJ_{PRECSUFFIX}') -if ALTERNATIVE_LOADS == True: - write(' 
LOAD_CHIMU_0213_PLUG \\') -curlyopen() -asmopen() -#pg1.loadpredication() -Chi_00.addTimesI(Chimu_00, Chimu_20) -Chi_01.addTimesI(Chimu_01, Chimu_21) -Chi_02.addTimesI(Chimu_02, Chimu_22) -Chi_10.subTimesI(Chimu_10, Chimu_30) -Chi_11.subTimesI(Chimu_11, Chimu_31) -Chi_12.subTimesI(Chimu_12, Chimu_32) -asmclose() -debugall('ZP_PROJ', group='Chi') -curlyclose() -newline() - -d['factor'] = 1 -# add/sub issue latency = 1, latency is 9 -d['cycles_PROJ'] += 9 * d['factor'] -write('// TP_PROJ') -definemultiline(F'TP_PROJ_{PRECSUFFIX}') -if ALTERNATIVE_LOADS == True: - write(' LOAD_CHIMU_0213_PLUG \\') -curlyopen() -asmopen() -#pg1.loadpredication() -Chi_00.add(Chimu_00, Chimu_20) -Chi_01.add(Chimu_01, Chimu_21) -Chi_02.add(Chimu_02, Chimu_22) -Chi_10.add(Chimu_10, Chimu_30) -Chi_11.add(Chimu_11, Chimu_31) -Chi_12.add(Chimu_12, Chimu_32) -asmclose() -debugall('TP_PROJ', group='Chi') -curlyclose() -newline() - -#// hspin(0)=fspin(0)-timesI(fspin(3)); -#// hspin(1)=fspin(1)-timesI(fspin(2)); - -d['factor'] = 1 -# FCADD issue latency = 1, latency is 6+9 -d['cycles_PROJ'] += 15 * d['factor'] -write('// XM_PROJ') -definemultiline(F'XM_PROJ_{PRECSUFFIX}') -if ALTERNATIVE_LOADS == True: - write(' LOAD_CHIMU_0312_PLUG \\') -curlyopen() -asmopen() -#pg1.loadpredication() -Chi_00.subTimesI(Chimu_00, Chimu_30) -Chi_01.subTimesI(Chimu_01, Chimu_31) -Chi_02.subTimesI(Chimu_02, Chimu_32) -Chi_10.subTimesI(Chimu_10, Chimu_20) -Chi_11.subTimesI(Chimu_11, Chimu_21) -Chi_12.subTimesI(Chimu_12, Chimu_22) -asmclose() -debugall('XM_PROJ sub', group='Chi') -curlyclose() -newline() - -d['factor'] = 1 -d['cycles_RECON'] += 15 * d['factor'] -write('// XM_RECON') -definemultiline(F'XM_RECON_{PRECSUFFIX}') -asmopen() -#pg1.loadpredication() - -# only necessary if not zeroed before -if MOVPRFX == False: - result_20.zero() - result_21.zero() - result_22.zero() - result_30.zero() - result_31.zero() - result_32.zero() - - result_20.addTimesI(UChi_10) # <-- - result_21.addTimesI(UChi_11) - result_22.addTimesI(UChi_12) - result_30.addTimesI(UChi_00) - result_31.addTimesI(UChi_01) - result_32.addTimesI(UChi_02) -else: - result_20.addTimesI(zero0, UChi_10, constructive=True) # <-- - result_21.addTimesI(zero0, UChi_11, constructive=True) - result_22.addTimesI(zero0, UChi_12, constructive=True) - result_30.addTimesI(zero0, UChi_00, constructive=True) - result_31.addTimesI(zero0, UChi_01, constructive=True) - result_32.addTimesI(zero0, UChi_02, constructive=True) - -result_00.move(UChi_00) -result_01.move(UChi_01) -result_02.move(UChi_02) -result_10.move(UChi_10) -result_11.move(UChi_11) -result_12.move(UChi_12) -asmclose() -debugall('XM_RECON result', group='result') -newline() - -d['factor'] = 1 -# add/sub issue latency = 1, latency is 9 -d['cycles_PROJ'] += 9 * d['factor'] -write('// YM_PROJ') -definemultiline(F'YM_PROJ_{PRECSUFFIX}') -if ALTERNATIVE_LOADS == True: - write(' LOAD_CHIMU_0312_PLUG \\') -curlyopen() -asmopen() -#pg1.loadpredication() -Chi_00.add(Chimu_00, Chimu_30) -Chi_01.add(Chimu_01, Chimu_31) -Chi_02.add(Chimu_02, Chimu_32) -Chi_10.sub(Chimu_10, Chimu_20) -Chi_11.sub(Chimu_11, Chimu_21) -Chi_12.sub(Chimu_12, Chimu_22) -asmclose() -debugall('YM_PROJ', group='Chi') -curlyclose() -newline() - -d['factor'] = 1 -# FCADD issue latency = 1, latency is 6+9 -d['cycles_PROJ'] += 15 * d['factor'] -write('// ZM_PROJ') -definemultiline(F'ZM_PROJ_{PRECSUFFIX}') -if ALTERNATIVE_LOADS == True: - write(' LOAD_CHIMU_0213_PLUG \\') -curlyopen() -asmopen() -#pg1.loadpredication() -Chi_00.subTimesI(Chimu_00, Chimu_20) 
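#// editor's note, filling in the comment pattern used for XP/XM above:
#// hspin(0)=fspin(0)-timesI(fspin(2));
#// hspin(1)=fspin(1)+timesI(fspin(3));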
-Chi_01.subTimesI(Chimu_01, Chimu_21) -Chi_02.subTimesI(Chimu_02, Chimu_22) -Chi_10.addTimesI(Chimu_10, Chimu_30) -Chi_11.addTimesI(Chimu_11, Chimu_31) -Chi_12.addTimesI(Chimu_12, Chimu_32) -asmclose() -debugall('ZM_PROJ', group='Chi') -curlyclose() -newline() - -d['factor'] = 1 -# add/sub issue latency = 1, latency is 9 -d['cycles_PROJ'] += 9 * d['factor'] -write('// TM_PROJ') -definemultiline(F'TM_PROJ_{PRECSUFFIX}') -if ALTERNATIVE_LOADS == True: - write(' LOAD_CHIMU_0213_PLUG \\') -curlyopen() -asmopen() -pg1.loadpredication() -Chi_00.sub(Chimu_00, Chimu_20) -Chi_01.sub(Chimu_01, Chimu_21) -Chi_02.sub(Chimu_02, Chimu_22) -Chi_10.sub(Chimu_10, Chimu_30) -Chi_11.sub(Chimu_11, Chimu_31) -Chi_12.sub(Chimu_12, Chimu_32) -asmclose() -debugall('TM_PROJ', group='Chi') -curlyclose() -newline() - -# does not occur in GridBench -d['factor'] = 0 -# add/sub issue latency = 1, latency is 9 -d['cycles_RECON'] += 15 * d['factor'] -write('// XM_RECON_ACCUM') -definemultiline(F'XM_RECON_ACCUM_{PRECSUFFIX}') -asmopen() -# result_20.addTimesI(UChi_10) -# result_21.addTimesI(UChi_11) -# result_22.addTimesI(UChi_12) -# result_30.addTimesI(UChi_00) -# result_31.addTimesI(UChi_01) -# result_32.addTimesI(UChi_02) -# -# # result_00.move(UChi_00) -# # result_01.move(UChi_01) -# # result_02.move(UChi_02) -# # result_10.move(UChi_10) -# # result_11.move(UChi_11) -# # result_12.move(UChi_12) -# -# # faster than move ? -# result_00.add(UChi_00) -# result_01.add(UChi_01) -# result_02.add(UChi_02) -# result_10.add(UChi_10) -# result_11.add(UChi_11) -# result_12.add(UChi_12) - -result_30.addTimesI(UChi_00) # reordered -result_31.addTimesI(UChi_01) -result_32.addTimesI(UChi_02) - -result_20.addTimesI(UChi_10) -result_21.addTimesI(UChi_11) -result_22.addTimesI(UChi_12) - -result_00.add(UChi_00) -result_01.add(UChi_01) -result_02.add(UChi_02) -result_10.add(UChi_10) -result_11.add(UChi_11) -result_12.add(UChi_12) -asmclose() -debugall('XM_RECON_ACCUM', group='result') -newline() - - - -d['factor'] = 1 -d['cycles_RECON'] += 9 * d['factor'] -write('// YP_RECON_ACCUM') -definemultiline(F'YP_RECON_ACCUM_{PRECSUFFIX}') -asmopen() -#pg1.loadpredication() -# result_00.add(UChi_00) -# result_01.add(UChi_01) -# result_02.add(UChi_02) -# result_10.add(UChi_10) -# result_11.add(UChi_11) -# result_12.add(UChi_12) -# result_20.add(UChi_10) -# result_21.add(UChi_11) -# result_22.add(UChi_12) -# result_30.sub(UChi_00) -# result_31.sub(UChi_01) -# result_32.sub(UChi_02) - -result_00.add(UChi_00) # reordered -result_30.sub(UChi_00) - -result_01.add(UChi_01) -result_31.sub(UChi_01) - -result_02.add(UChi_02) -result_32.sub(UChi_02) - -result_10.add(UChi_10) -result_20.add(UChi_10) - -result_11.add(UChi_11) -result_21.add(UChi_11) - -result_12.add(UChi_12) -result_22.add(UChi_12) -asmclose() -debugall('YP_RECON_ACCUM', group='result') -newline() - -d['factor'] = 1 -d['cycles_RECON'] += 9 * d['factor'] -write('// YM_RECON_ACCUM') -definemultiline(F'YM_RECON_ACCUM_{PRECSUFFIX}') -asmopen() -#pg1.loadpredication() -# result_00.add(UChi_00) -# result_01.add(UChi_01) -# result_02.add(UChi_02) -# result_10.add(UChi_10) -# result_11.add(UChi_11) -# result_12.add(UChi_12) -# result_20.sub(UChi_10) -# result_21.sub(UChi_11) -# result_22.sub(UChi_12) -# result_30.add(UChi_00) -# result_31.add(UChi_01) -# result_32.add(UChi_02) - -result_00.add(UChi_00) # reordered -result_30.add(UChi_00) - -result_01.add(UChi_01) -result_31.add(UChi_01) - -result_02.add(UChi_02) -result_32.add(UChi_02) - -result_10.add(UChi_10) -result_20.sub(UChi_10) - 
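# editor's note on the "reordered" interleave above and below: alternating
# each accumulate with the matching subtract keeps adjacent instructions
# independent, which plausibly helps the two FL pipes dual-issue (cf. the
# static FLA/FLB slot accounting at the end of this script).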
-result_11.add(UChi_11) -result_21.sub(UChi_11) - -result_12.add(UChi_12) -result_22.sub(UChi_12) -asmclose() -debugall('YM_RECON_ACCUM', group='result') -newline() - -d['factor'] = 1 -d['cycles_RECON'] += 15 * d['factor'] -write('// ZP_RECON_ACCUM') -definemultiline(F'ZP_RECON_ACCUM_{PRECSUFFIX}') -asmopen() -#pg1.loadpredication() -# result_20.subTimesI(UChi_00) -# result_21.subTimesI(UChi_01) -# result_22.subTimesI(UChi_02) -# result_30.addTimesI(UChi_10) -# result_31.addTimesI(UChi_11) -# result_32.addTimesI(UChi_12) -# -# result_00.add(UChi_00) -# result_01.add(UChi_01) -# result_02.add(UChi_02) -# result_10.add(UChi_10) -# result_11.add(UChi_11) -# result_12.add(UChi_12) -result_20.subTimesI(UChi_00) # reordered -result_00.add(UChi_00) - -result_21.subTimesI(UChi_01) -result_01.add(UChi_01) - -result_22.subTimesI(UChi_02) -result_02.add(UChi_02) - -result_30.addTimesI(UChi_10) -result_10.add(UChi_10) - -result_31.addTimesI(UChi_11) -result_11.add(UChi_11) - -result_32.addTimesI(UChi_12) -result_12.add(UChi_12) -asmclose() -debugall('ZP_RECON_ACCUM', group='result') -newline() - -d['factor'] = 1 -d['cycles_RECON'] += 15 * d['factor'] -write('// ZM_RECON_ACCUM') -definemultiline(F'ZM_RECON_ACCUM_{PRECSUFFIX}') -asmopen() -#pg1.loadpredication() -# result_20.addTimesI(UChi_00) -# result_21.addTimesI(UChi_01) -# result_22.addTimesI(UChi_02) -# result_30.subTimesI(UChi_10) -# result_31.subTimesI(UChi_11) -# result_32.subTimesI(UChi_12) -# -# result_00.add(UChi_00) -# result_01.add(UChi_01) -# result_02.add(UChi_02) -# result_10.add(UChi_10) -# result_11.add(UChi_11) -# result_12.add(UChi_12) -result_20.addTimesI(UChi_00) # reordered -result_00.add(UChi_00) - -result_21.addTimesI(UChi_01) -result_01.add(UChi_01) - -result_22.addTimesI(UChi_02) -result_02.add(UChi_02) - -result_30.subTimesI(UChi_10) -result_10.add(UChi_10) - -result_31.subTimesI(UChi_11) -result_11.add(UChi_11) - -result_32.subTimesI(UChi_12) -result_12.add(UChi_12) -asmclose() -debugall('ZM_RECON_ACCUM', group='result') -newline() - -d['factor'] = 1 -d['cycles_RECON'] += 9 * d['factor'] -write('// TP_RECON_ACCUM') -definemultiline(F'TP_RECON_ACCUM_{PRECSUFFIX}') -asmopen() -#pg1.loadpredication() -# result_00.add(UChi_00) -# result_01.add(UChi_01) -# result_02.add(UChi_02) -# result_10.add(UChi_10) -# result_11.add(UChi_11) -# result_12.add(UChi_12) -# result_20.add(UChi_00) -# result_21.add(UChi_01) -# result_22.add(UChi_02) -# result_30.add(UChi_10) -# result_31.add(UChi_11) -# result_32.add(UChi_12) - -result_00.add(UChi_00) # reordered -result_20.add(UChi_00) - -result_01.add(UChi_01) -result_21.add(UChi_01) - -result_02.add(UChi_02) -result_22.add(UChi_02) - -result_10.add(UChi_10) -result_30.add(UChi_10) - -result_11.add(UChi_11) -result_31.add(UChi_11) - -result_12.add(UChi_12) -result_32.add(UChi_12) -asmclose() -debugall('TP_RECON_ACCUM', group='result') -newline() - -d['factor'] = 1 -d['cycles_RECON'] += 9 * d['factor'] -write('// TM_RECON_ACCUM') -definemultiline(F'TM_RECON_ACCUM_{PRECSUFFIX}') -asmopen() -#pg1.loadpredication() -# result_00.add(UChi_00) -# result_01.add(UChi_01) -# result_02.add(UChi_02) -# result_10.add(UChi_10) -# result_11.add(UChi_11) -# result_12.add(UChi_12) -# result_20.sub(UChi_00) -# result_21.sub(UChi_01) -# result_22.sub(UChi_02) -# result_30.sub(UChi_10) -# result_31.sub(UChi_11) -# result_32.sub(UChi_12) - -result_00.add(UChi_00) # reordered -result_20.sub(UChi_00) - -result_01.add(UChi_01) -result_21.sub(UChi_01) - -result_02.add(UChi_02) -result_22.sub(UChi_02) - 
-result_10.add(UChi_10) -result_30.sub(UChi_10) - -result_11.add(UChi_11) -result_31.sub(UChi_11) - -result_12.add(UChi_12) -result_32.sub(UChi_12) -asmclose() -debugall('TM_RECON_ACCUM', group='result') -newline() - -d['factor'] = 0 -# have 12 instructions -# picking dual issue versions -d['cycles_ZERO_PSI'] += 6 * d['factor'] -write('// ZERO_PSI') -definemultiline(F'ZERO_PSI_{PRECSUFFIX}') -asmopen() -pg1.loadpredication() -result_00.zero() -result_01.zero() -result_02.zero() -result_10.zero() -result_11.zero() -result_12.zero() -result_20.zero() -result_21.zero() -result_22.zero() -result_30.zero() -result_31.zero() -result_32.zero() -asmclose() -#debugall('ZERO_PSI', group='result') -newline() - -# prefetch store spinors to L2 cache -d['factor'] = 0 -d['cycles_PREFETCH_L2'] += 0 * d['factor'] -write('// PREFETCH_RESULT_L2_STORE (prefetch store to L2)') -definemultiline(F'PREFETCH_RESULT_L2_STORE_INTERNAL_{PRECSUFFIX}(base)') -curlyopen() -fetch_base_ptr(F"base") -asmopen() -fetch_base_ptr(F"base", target='A') -prefetch_L2_store(F"base", 0) -prefetch_L2_store(F"base", 1) -prefetch_L2_store(F"base", 2) -asmclose() -curlyclose() -newline() - -# prefetch store spinors to L1 cache -d['factor'] = 0 -d['cycles_PREFETCH_L1'] += 0 * d['factor'] -write('// PREFETCH_RESULT_L1_STORE (prefetch store to L1)') -definemultiline(F'PREFETCH_RESULT_L1_STORE_INTERNAL_{PRECSUFFIX}(base)') -curlyopen() -fetch_base_ptr(F"base") -asmopen() -fetch_base_ptr(F"base", target='A') -prefetch_L1_store(F"base", 0) -prefetch_L1_store(F"base", 1) -prefetch_L1_store(F"base", 2) -asmclose() -curlyclose() -newline() - - -d['factor'] = 0 -write('// ADD_RESULT_INTERNAL') -definemultiline(F'ADD_RESULT_INTERNAL_{PRECSUFFIX}') -asmopen() -result_00.add(Chimu_00) -result_01.add(Chimu_01) -result_02.add(Chimu_02) -result_10.add(Chimu_10) -result_11.add(Chimu_11) -result_12.add(Chimu_12) -result_20.add(Chimu_20) -result_21.add(Chimu_21) -result_22.add(Chimu_22) -result_30.add(Chimu_30) -result_31.add(Chimu_31) -result_32.add(Chimu_32) -asmclose() -#debugall('ZERO_PSI', group='result') -newline() - -# -------------------------------------------------------------------------------- - -# C -f = open('w.h', 'w') -f.write(d['C']) -f.close() - -# intrin -f = open('wi.h', 'w') -f.write(d['I']) -f.close() - -filename = '' -if PRECISION == 'double': - filename = "Fujitsu_A64FX_intrin_double.h" -else: - filename = "Fujitsu_A64FX_intrin_single.h" -f = open(filename, 'w') -f.write(LEGAL.format(filename)) -f.write(d['I']) -f.close() - - -# asm -f = open('wa.h', 'w') -f.write(d['A']) -f.close() - -filename = '' -if PRECISION == 'double': - filename = "Fujitsu_A64FX_asm_double.h" -else: - filename = "Fujitsu_A64FX_asm_single.h" -f = open(filename, 'w') -f.write(LEGAL.format(filename)) -f.write(d['A']) -f.close() - - -# arithmetics instruction count, mul/mac = 2 instructions each -d['acount'] = d['add'] + d['sub'] + \ - d['mul'] + d['mac'] + d['addTimesI'] + d['subTimesI'] - -# permutations -d['permutes'] += 2*d['timesI'] + 1*d['timesMinusI'] -d['neg'] = 1*d['timesI'] + 1*d['timesMinusI'] - -# instruction count, mul/mac = 2 instructions each, +/- *i = 3 instructions each -d['icount'] = d['load'] + d['store'] + d['move'] + d['add'] + d['sub'] + \ - d['mul'] + d['mac'] + d['permutes'] + d['neg'] + \ - d['addTimesI'] + d['subTimesI'] + d['zero'] + d['movprfx'] - -# flops -d['flops'] = 4*d['mac'] + 3*d['mul'] + d['add'] + d['sub'] + \ - d['addTimesI'] + d['subTimesI'] - - - - - -print('Statistics') -print('') -print('Type Occurences Total / Arith 
instructions') -print('-------------------------------------------------------------------') -print('Variables {:4d}'.format(d['registers'])) -print('') -print('load {:4d}'.format(d['load'])) -print('store {:4d}'.format(d['store'])) -print('move {:4d}'.format(d['move'])) -print('movprfx {:4d}'.format(d['movprfx'])) -print('zero {:4d}'.format(d['zero'])) -print('negate {:4d}'.format(d['neg'])) - - -print('add {:4d} {:0.2f} / {:0.2f}'.\ - format(d['add'], d['add'] / d['icount'], d['add'] / d['acount'])) -print('sub {:4d} {:0.2f} / {:0.2f}'.\ - format(d['sub'], d['sub'] / d['icount'], d['sub'] / d['acount'])) -print('mul {:4d} {:0.2f} / {:0.2f}'.\ - format(d['mul'], 2*d['mul'] / d['icount'], 2*d['mul'] / d['acount'])) -print('mac {:4d} {:0.2f} / {:0.2f}'.\ - format(d['mac'], 2*d['mac'] / d['icount'], 2*d['mac'] / d['acount'])) -print('addTimesI {:4d} {:0.2f} / {:0.2f}'.\ - format(d['addTimesI'], 2*d['addTimesI'] / d['icount'], 2*d['addTimesI'] / d['acount'])) -print('subTimesI {:4d} {:0.2f} / {:0.2f}'.\ - format(d['subTimesI'], 2*d['subTimesI'] / d['icount'], 2*d['subTimesI'] / d['acount'])) - -print('timesI {:4d}'.format(d['timesI'])) -print('timesMinusI {:4d}'.format(d['timesMinusI'])) -print('permutes {:4d} {:0.2f}'.\ - format(d['permutes'], d['permutes'] / d['icount'])) -print('') -print('flops {:4d}'.format(d['flops'])) -print('instruction count {:4d}'.format(d['icount'])) -print('arith. instruction count {:4d} {:0.2f}'.\ - format(d['acount'], d['acount'] / d['icount'])) - - -# ---- static pipeline resources consumption ---- -FLA = 0 -FLA += 2 * d['mac'] + 2 * d['mul'] -FLA += 1 * d['addTimesI'] + 1 * d['subTimesI'] -FLA += 1 * d['move'] -FLA += 1 * d['permutes'] -FLA += 1 * d['store'] -FLA += 1 * d['zero'] - -FLB = 0 -FLB += 1 * d['addTimesI'] + 1 * d['subTimesI'] - -FLAB = 0 -FLAB += 1 * d['mac'] + 1 * d['mul'] -FLAB += 1 * d['add'] + 1 * d['sub'] -FLAB += 1 * d['neg'] + 1 * d['movprfx'] -#FLAB += 1 * d['zero'] - - -FL_slots = 2 * d['icount'] -FL_micro_ops = FLA + FLB + FLAB - -print('') -print('------------------------------------------------------------------') -print('') -print('Static FL slot usage') -print('') -print(' FLA {:4d}'.format(FLA)) -print(' FLB {:4d}'.format(FLB)) -print(' FLA/B {:4d}'.format(FLAB)) - -print('') -print('Static FL slot efficiency') -print('') -print(' Total FL slots {:4d}'.format(FL_slots)) -print(' FL slots occupied {:4d}'.format(FL_micro_ops)) -print(' FL slot efficiency {:0.2f}'.format(FL_micro_ops / FL_slots)) - -cycles_total = d['cycles_ZERO_PSI'] + d['cycles_LOAD_CHIMU'] + \ - d['cycles_PROJ'] + d['cycles_PERM'] + d['cycles_MULT_2SPIN'] + \ - d['cycles_RECON'] + d['cycles_RESULT'] -cycles_total_hidden = d['cycles_ZERO_PSI'] + \ - d['cycles_PROJ'] + d['cycles_MULT_2SPIN'] + \ - d['cycles_RECON'] - -# ---- dynamic estimate ---- - -print('') -print('Dynamic cycles estimate (incl. 
latencies)') -print('') -print(' ZERO_PSI {:4d}'.format(d['cycles_ZERO_PSI'])) -print(' LOAD_CHIMU {:4d}'.format(d['cycles_LOAD_CHIMU'])) -print(' PROJ {:4d}'.format(d['cycles_PROJ'])) -print(' PERM {:4d}'.format(d['cycles_PERM'])) -print(' MULT_2SPIN {:4d}'.format(d['cycles_MULT_2SPIN'])) -print(' RECON {:4d}'.format(d['cycles_RECON'])) -print(' STORE {:4d}'.format(d['cycles_RESULT'])) -print('') -print(' Sum {:4d}'.format(cycles_total)) -print('') -print(' Sum* {:4d}'.format(cycles_total_hidden)) -print(' Total FL slots* {:4d}'.format(cycles_total_hidden * 2)) -print(' FL slots occupied* {:4d}'.format(FL_micro_ops)) -print(' FL slot efficiency* {:0.2f}'.format(FL_micro_ops / (2*cycles_total_hidden))) -print('') -print(' *load/store/PERM hidden') - -estimated_cycles = cycles_total_hidden -# Estimate percent peak DP; dual issue, fma -pp = 100 * 4 * d['flops'] / (2*2*8*estimated_cycles) -print('') -print('Model prediction') -print('') -print(' Cycles* {:4d}'.format(estimated_cycles)) -print(' Percent peak* {:4.1f} %'.format(pp)) - -# estimated RF throughput in GB/s @ 2.2 GHz -tp10 = (d['load'] + d['store']) * 64 * 2.2 / estimated_cycles -tp2 = (d['load'] + d['store']) * 64 * 1000.**3 * 2.2 / 1024.**3 / estimated_cycles -print('') -print(' Estimated RF throughput* {:4.1f} GB/s'.\ - format(tp10)) -print(' Estimated RF throughput* {:4.1f} GiB/s'.\ - format(tp2)) - -# ---- dynamic pipeline resources consumption ---- - -runtime = measured_cycles # runtime in cycles -pp_runtime = 100 * 4 * d['flops'] / (2*2*8*runtime) -runtime_FL_slots = 2 * runtime -delta = runtime - estimated_cycles - - -print('') -print('------------------------------------------------------------------') -print('') -print('Dynamic runtime analysis (cycles from measurements)') -print('') -print(' Cycles {:4d}'.format(runtime)) -print(' Percent peak {:4.1f} %'.format(pp_runtime)) -print(' Deviation from estimate {:4d} {:4.2f} %'.\ - format(delta, 100. 
* abs(delta/runtime))) -print(' Deviation per direction {:4.1f}'.format(delta/8)) - -# estimated RF throughput in GB/s @ 2.2 GHz -tp10_rt = (d['load'] + d['store']) * 64 * 2.2 / runtime -tp2_rt = (d['load'] + d['store']) * 64 * 1000.**3 * 2.2 / 1024.**3 / runtime -print('') -print(' RF throughput {:4.1f} GB/s'.\ - format(tp10_rt)) -print(' RF throughput {:4.1f} GiB/s'.\ - format(tp2_rt)) -print('') -print(' Total FL slots {:4d}'.format(runtime_FL_slots)) -print(' FL slots occupied {:4d}'.format(FL_micro_ops)) -print(' FL slot efficiency {:0.2f}'.format(FL_micro_ops / runtime_FL_slots)) -print('') From 909acd55cd36c4b567cab30d311aab6b8674288d Mon Sep 17 00:00:00 2001 From: Nils Meyer Date: Sat, 19 Dec 2020 02:00:22 +0100 Subject: [PATCH 03/16] vnum variant for prefetches --- Grid/simd/Fujitsu_A64FX_intrin_double.h | 36 ++++++++++++------------- Grid/simd/Fujitsu_A64FX_intrin_single.h | 36 ++++++++++++------------- 2 files changed, 36 insertions(+), 36 deletions(-) diff --git a/Grid/simd/Fujitsu_A64FX_intrin_double.h b/Grid/simd/Fujitsu_A64FX_intrin_double.h index 361246fc..f195e3c5 100644 --- a/Grid/simd/Fujitsu_A64FX_intrin_double.h +++ b/Grid/simd/Fujitsu_A64FX_intrin_double.h @@ -144,38 +144,38 @@ Author: Nils Meyer // PREFETCH_CHIMU_L2 (prefetch to L2) #define PREFETCH_CHIMU_L2_INTERNAL_A64FXd(base) \ { \ - svprfd(pg1, (int64_t*)(base + 0), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(base + 256), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(base + 512), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)0), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)4), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)8), SV_PLDL2STRM); \ } // PREFETCH_CHIMU_L1 (prefetch to L1) #define PREFETCH_CHIMU_L1_INTERNAL_A64FXd(base) \ { \ - svprfd(pg1, (int64_t*)(base + 0), SV_PLDL1STRM); \ - svprfd(pg1, (int64_t*)(base + 256), SV_PLDL1STRM); \ - svprfd(pg1, (int64_t*)(base + 512), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)0), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)4), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)8), SV_PLDL1STRM); \ } // PREFETCH_GAUGE_L2 (prefetch to L2) #define PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A) \ { \ const auto & ref(U[sUn](A)); baseU = (uint64_t)&ref + 3 * 3 * 64; \ - svprfd(pg1, (int64_t*)(baseU + -256), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 768), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 1024), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 1280), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 1536), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 1792), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)-4), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)0), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)4), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)8), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)12), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)16), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)20), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)24), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)28), SV_PLDL2STRM); \ } // PREFETCH_GAUGE_L1 (prefetch to L1) #define PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A) \ { \ const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ - svprfd(pg1, 
(int64_t*)(baseU + 0), SV_PLDL1STRM); \ - svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL1STRM); \ - svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)0), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)4), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)8), SV_PLDL1STRM); \ } // LOAD_CHI #define LOAD_CHI_A64FXd(base) \ diff --git a/Grid/simd/Fujitsu_A64FX_intrin_single.h b/Grid/simd/Fujitsu_A64FX_intrin_single.h index 30273b6e..0b874f02 100644 --- a/Grid/simd/Fujitsu_A64FX_intrin_single.h +++ b/Grid/simd/Fujitsu_A64FX_intrin_single.h @@ -144,38 +144,38 @@ Author: Nils Meyer // PREFETCH_CHIMU_L2 (prefetch to L2) #define PREFETCH_CHIMU_L2_INTERNAL_A64FXf(base) \ { \ - svprfd(pg1, (int64_t*)(base + 0), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(base + 256), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(base + 512), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(0), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(4), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(8), SV_PLDL2STRM); \ } // PREFETCH_CHIMU_L1 (prefetch to L1) #define PREFETCH_CHIMU_L1_INTERNAL_A64FXf(base) \ { \ - svprfd(pg1, (int64_t*)(base + 0), SV_PLDL1STRM); \ - svprfd(pg1, (int64_t*)(base + 256), SV_PLDL1STRM); \ - svprfd(pg1, (int64_t*)(base + 512), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(0), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(4), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(8), SV_PLDL1STRM); \ } // PREFETCH_GAUGE_L2 (prefetch to L2) #define PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A) \ { \ const auto & ref(U[sUn](A)); baseU = (uint64_t)&ref + 3 * 3 * 64; \ - svprfd(pg1, (int64_t*)(baseU + -256), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 768), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 1024), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 1280), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 1536), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 1792), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(-4), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(0), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(4), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(8), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(12), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(16), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(20), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(24), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(28), SV_PLDL2STRM); \ } // PREFETCH_GAUGE_L1 (prefetch to L1) #define PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A) \ { \ const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ - svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL1STRM); \ - svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL1STRM); \ - svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(0), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(4), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(8), SV_PLDL1STRM); \ } // LOAD_CHI #define LOAD_CHI_A64FXf(base) \ From 4b882e8056b2c9dd6dceab2729104e5e615835ae Mon Sep 17 00:00:00 2001 From: Nils Meyer Date: Sat, 19 Dec 2020 03:09:20 +0100 Subject: [PATCH 04/16] fixed lost bracket --- 
Grid/simd/Fujitsu_A64FX_intrin_double.h | 36 ++++++++++++------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/Grid/simd/Fujitsu_A64FX_intrin_double.h b/Grid/simd/Fujitsu_A64FX_intrin_double.h index f195e3c5..b645c365 100644 --- a/Grid/simd/Fujitsu_A64FX_intrin_double.h +++ b/Grid/simd/Fujitsu_A64FX_intrin_double.h @@ -144,38 +144,38 @@ Author: Nils Meyer // PREFETCH_CHIMU_L2 (prefetch to L2) #define PREFETCH_CHIMU_L2_INTERNAL_A64FXd(base) \ { \ - svprfd_vnum(pg1, (void*)(base), (int64_t)0), SV_PLDL2STRM); \ - svprfd_vnum(pg1, (void*)(base), (int64_t)4), SV_PLDL2STRM); \ - svprfd_vnum(pg1, (void*)(base), (int64_t)8), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(0), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(4), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(8), SV_PLDL2STRM); \ } // PREFETCH_CHIMU_L1 (prefetch to L1) #define PREFETCH_CHIMU_L1_INTERNAL_A64FXd(base) \ { \ - svprfd_vnum(pg1, (void*)(base), (int64_t)0), SV_PLDL1STRM); \ - svprfd_vnum(pg1, (void*)(base), (int64_t)4), SV_PLDL1STRM); \ - svprfd_vnum(pg1, (void*)(base), (int64_t)8), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(0), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(4), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(8), SV_PLDL1STRM); \ } // PREFETCH_GAUGE_L2 (prefetch to L2) #define PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A) \ { \ const auto & ref(U[sUn](A)); baseU = (uint64_t)&ref + 3 * 3 * 64; \ - svprfd_vnum(pg1, (void*)(baseU), (int64_t)-4), SV_PLDL2STRM); \ - svprfd_vnum(pg1, (void*)(baseU), (int64_t)0), SV_PLDL2STRM); \ - svprfd_vnum(pg1, (void*)(baseU), (int64_t)4), SV_PLDL2STRM); \ - svprfd_vnum(pg1, (void*)(baseU), (int64_t)8), SV_PLDL2STRM); \ - svprfd_vnum(pg1, (void*)(baseU), (int64_t)12), SV_PLDL2STRM); \ - svprfd_vnum(pg1, (void*)(baseU), (int64_t)16), SV_PLDL2STRM); \ - svprfd_vnum(pg1, (void*)(baseU), (int64_t)20), SV_PLDL2STRM); \ - svprfd_vnum(pg1, (void*)(baseU), (int64_t)24), SV_PLDL2STRM); \ - svprfd_vnum(pg1, (void*)(baseU), (int64_t)28), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(-4), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(0), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(4), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(8), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(12), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(16), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(20), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(24), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(28), SV_PLDL2STRM); \ } // PREFETCH_GAUGE_L1 (prefetch to L1) #define PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A) \ { \ const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ - svprfd_vnum(pg1, (void*)(baseU), (int64_t)0), SV_PLDL1STRM); \ - svprfd_vnum(pg1, (void*)(baseU), (int64_t)4), SV_PLDL1STRM); \ - svprfd_vnum(pg1, (void*)(baseU), (int64_t)8), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(0), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(4), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(8), SV_PLDL1STRM); \ } // LOAD_CHI #define LOAD_CHI_A64FXd(base) \ From 6013183361d88fe7179b4fcf6b8321c0621b09ba Mon Sep 17 00:00:00 2001 From: Nils Meyer Date: Sat, 19 Dec 2020 03:25:01 +0100 Subject: [PATCH 05/16] removed Asm impls --- Grid/simd/Fujitsu_A64FX_asm_double.h | 781 --------------------------- Grid/simd/Fujitsu_A64FX_asm_single.h 
| 781 --------------------------- 2 files changed, 1562 deletions(-) delete mode 100644 Grid/simd/Fujitsu_A64FX_asm_double.h delete mode 100644 Grid/simd/Fujitsu_A64FX_asm_single.h diff --git a/Grid/simd/Fujitsu_A64FX_asm_double.h b/Grid/simd/Fujitsu_A64FX_asm_double.h deleted file mode 100644 index bbc4efe7..00000000 --- a/Grid/simd/Fujitsu_A64FX_asm_double.h +++ /dev/null @@ -1,781 +0,0 @@ -/************************************************************************************* - - Grid physics library, www.github.com/paboyle/Grid - - Source file: Fujitsu_A64FX_asm_double.h - - Copyright (C) 2020 - -Author: Nils Meyer - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - - See the full license in the file "LICENSE" in the top level distribution directory -*************************************************************************************/ -/* END LEGAL */ -#define LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_A64FXd(base) -#define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXd(A) -#define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A) -#define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXd(A) -#define PREFETCH_GAUGE_L2(A) PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A) -#define PF_GAUGE(A) -#define PREFETCH_RESULT_L2_STORE(A) PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXd(A) -#define PREFETCH_RESULT_L1_STORE(A) PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXd(A) -#define PREFETCH1_CHIMU(A) PREFETCH_CHIMU_L1(A) -#define PREFETCH_CHIMU(A) PREFETCH_CHIMU_L1(A) -#define LOCK_GAUGE(A) -#define UNLOCK_GAUGE(A) -#define MASK_REGS DECLARATIONS_A64FXd -#define SAVE_RESULT(A,B) RESULT_A64FXd(A); -#define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXd(Dir) -#define MULT_2SPIN_2 MULT_2SPIN_2_A64FXd -#define LOAD_CHI(base) LOAD_CHI_A64FXd(base) -#define ZERO_PSI ZERO_PSI_A64FXd -#define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base) -#define XP_PROJ XP_PROJ_A64FXd -#define YP_PROJ YP_PROJ_A64FXd -#define ZP_PROJ ZP_PROJ_A64FXd -#define TP_PROJ TP_PROJ_A64FXd -#define XM_PROJ XM_PROJ_A64FXd -#define YM_PROJ YM_PROJ_A64FXd -#define ZM_PROJ ZM_PROJ_A64FXd -#define TM_PROJ TM_PROJ_A64FXd -#define XP_RECON XP_RECON_A64FXd -#define XM_RECON XM_RECON_A64FXd -#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXd -#define YM_RECON_ACCUM YM_RECON_ACCUM_A64FXd -#define ZM_RECON_ACCUM ZM_RECON_ACCUM_A64FXd -#define TM_RECON_ACCUM TM_RECON_ACCUM_A64FXd -#define XP_RECON_ACCUM XP_RECON_ACCUM_A64FXd -#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXd -#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXd -#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXd -#define PERMUTE_DIR0 0 -#define PERMUTE_DIR1 1 -#define PERMUTE_DIR2 2 -#define PERMUTE_DIR3 3 -#define PERMUTE PERMUTE_A64FXd; -#define LOAD_TABLE(Dir) if (Dir == 0) { LOAD_TABLE0; } else if (Dir == 1) { LOAD_TABLE1; } else if (Dir == 2) { LOAD_TABLE2; } -#define MAYBEPERM(Dir,perm) if (Dir != 3) { if (perm) { PERMUTE; } } 
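// Editor's sketch of how the kernel body consumes these macros (illustration
// only -- the authoritative sequence lives in WilsonKernelsAsmBodyA64FX.h;
// Dir and perm come from the stencil entry there):
//
//   MASK_REGS;                       // declare registers, set p5, zero z31
//   LOAD_CHIMU(base);                // 12 spinor vectors -> z12..z23
//   XP_PROJ;                         // 4-spinor -> half spinor
//   LOAD_TABLE(PERMUTE_DIR0);        // permute pattern into z30
//   MAYBEPERM(PERMUTE_DIR0, perm);   // lane shuffle if the hop wraps a node
//   MULT_2SPIN_1(Dir); MULT_2SPIN_2; // U times chi
//   XP_RECON;                        // reconstruct the 4-spinor result
//   ...                              // likewise for the other seven legs
//   SAVE_RESULT(base, basep);
//
// Note in passing: lut[3] below is labelled identity but reads
// {0, 1, 2, 4, 5, 6, 7, 8}; it is dead data in double precision, since
// MAYBEPERM above skips Dir == 3 and LOAD_TABLE never selects index 3.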
-// DECLARATIONS -#define DECLARATIONS_A64FXd \ - uint64_t baseU; \ - const uint64_t lut[4][8] = { \ - {4, 5, 6, 7, 0, 1, 2, 3}, \ - {2, 3, 0, 1, 6, 7, 4, 5}, \ - {1, 0, 3, 2, 5, 4, 7, 6}, \ - {0, 1, 2, 4, 5, 6, 7, 8} };\ -asm ( \ - "ptrue p5.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -asm ( \ - "fmov z31.d , 0 \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// RESULT -#define RESULT_A64FXd(base) \ -{ \ -asm ( \ - "str z0, [%[storeptr], -6, mul vl] \n\t" \ - "str z1, [%[storeptr], -5, mul vl] \n\t" \ - "str z2, [%[storeptr], -4, mul vl] \n\t" \ - "str z3, [%[storeptr], -3, mul vl] \n\t" \ - "str z4, [%[storeptr], -2, mul vl] \n\t" \ - "str z5, [%[storeptr], -1, mul vl] \n\t" \ - "str z6, [%[storeptr], 0, mul vl] \n\t" \ - "str z7, [%[storeptr], 1, mul vl] \n\t" \ - "str z8, [%[storeptr], 2, mul vl] \n\t" \ - "str z9, [%[storeptr], 3, mul vl] \n\t" \ - "str z10, [%[storeptr], 4, mul vl] \n\t" \ - "str z11, [%[storeptr], 5, mul vl] \n\t" \ - : \ - : [storeptr] "r" (base + 2 * 3 * 64) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// PREFETCH_CHIMU_L2 (prefetch to L2) -#define PREFETCH_CHIMU_L2_INTERNAL_A64FXd(base) \ -{ \ -asm ( \ - "prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (base) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// PREFETCH_CHIMU_L1 (prefetch to L1) -#define PREFETCH_CHIMU_L1_INTERNAL_A64FXd(base) \ -{ \ -asm ( \ - "prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ - "prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ - "prfd PLDL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (base) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// PREFETCH_GAUGE_L2 (prefetch to L2) -#define PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A) \ -{ \ - const auto & ref(U[sUn](A)); baseU = (uint64_t)&ref + 3 * 3 * 64; \ -asm ( \ - "prfd PLDL2STRM, p5, [%[fetchptr], -4, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 12, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 16, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 20, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 24, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 28, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (baseU) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// 
PREFETCH_GAUGE_L1 (prefetch to L1) -#define PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A) \ -{ \ - const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ -asm ( \ - "prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ - "prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ - "prfd PLDL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (baseU) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// LOAD_CHI -#define LOAD_CHI_A64FXd(base) \ -{ \ -asm ( \ - "ld1d { z12.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1d { z13.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "ld1d { z14.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ - "ld1d { z15.d }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ - "ld1d { z16.d }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ - "ld1d { z17.d }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (base) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// LOAD_CHIMU -#define LOAD_CHIMU_INTERLEAVED_A64FXd(base) \ -{ \ -asm ( \ - "ld1d { z12.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1d { z21.d }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ - "ld1d { z15.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1d { z18.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1d { z13.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1d { z22.d }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ - "ld1d { z16.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1d { z19.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "ld1d { z14.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ - "ld1d { z23.d }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ - "ld1d { z17.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ - "ld1d { z20.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (base + 2 * 3 * 64) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// LOAD_CHIMU_0213 -#define LOAD_CHIMU_0213_A64FXd \ -{ \ - const SiteSpinor & ref(in[offset]); \ -asm ( \ - "ld1d { z12.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1d { z18.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1d { z13.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1d { z19.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "ld1d { z14.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ - "ld1d { z20.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ - "ld1d { z15.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1d { z21.d }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ - "ld1d { z16.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1d { z22.d }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ - "ld1d { z17.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ - "ld1d { z23.d }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (&ref[2][0]) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// LOAD_CHIMU_0312 -#define LOAD_CHIMU_0312_A64FXd \ -{ \ - const SiteSpinor & ref(in[offset]); \ -asm ( \ - "ld1d { z12.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1d { z21.d }, p5/z, [%[fetchptr], 3, mul vl] 
\n\t" \ - "ld1d { z13.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1d { z22.d }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ - "ld1d { z14.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ - "ld1d { z23.d }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ - "ld1d { z15.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1d { z18.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1d { z16.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1d { z19.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "ld1d { z17.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ - "ld1d { z20.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (&ref[2][0]) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// LOAD_TABLE0 -#define LOAD_TABLE0 \ -asm ( \ - "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ - : \ - : [tableptr] "r" (&lut[0]),[index] "i" (0) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// LOAD_TABLE1 -#define LOAD_TABLE1 \ -asm ( \ - "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ - : \ - : [tableptr] "r" (&lut[0]),[index] "i" (1) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// LOAD_TABLE2 -#define LOAD_TABLE2 \ -asm ( \ - "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ - : \ - : [tableptr] "r" (&lut[0]),[index] "i" (2) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// LOAD_TABLE3 -#define LOAD_TABLE3 \ -asm ( \ - "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ - : \ - : [tableptr] "r" (&lut[0]),[index] "i" (3) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// PERMUTE -#define PERMUTE_A64FXd \ -asm ( \ - "tbl z12.d, { z12.d }, z30.d \n\t" \ - "tbl z13.d, { z13.d }, z30.d \n\t" \ - "tbl z14.d, { z14.d }, z30.d \n\t" \ - "tbl z15.d, { z15.d }, z30.d \n\t" \ - "tbl z16.d, { z16.d }, z30.d \n\t" \ - "tbl z17.d, { z17.d }, z30.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// LOAD_GAUGE -#define LOAD_GAUGE(A) \ -{ \ - const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ -asm ( \ - "ld1d { z24.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1d { z25.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1d { z26.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1d { z27.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1d { z28.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1d { z29.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (baseU + 2 * 3 * 64) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// MULT_2SPIN 
-#define MULT_2SPIN_1_A64FXd(A) \ -{ \ - const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ -asm ( \ - "ld1d { z24.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1d { z25.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1d { z26.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1d { z27.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1d { z28.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1d { z29.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "movprfx z18.d, p5/m, z31.d \n\t" \ - "fcmla z18.d, p5/m, z24.d, z12.d, 0 \n\t" \ - "movprfx z21.d, p5/m, z31.d \n\t" \ - "fcmla z21.d, p5/m, z24.d, z15.d, 0 \n\t" \ - "movprfx z19.d, p5/m, z31.d \n\t" \ - "fcmla z19.d, p5/m, z25.d, z12.d, 0 \n\t" \ - "movprfx z22.d, p5/m, z31.d \n\t" \ - "fcmla z22.d, p5/m, z25.d, z15.d, 0 \n\t" \ - "movprfx z20.d, p5/m, z31.d \n\t" \ - "fcmla z20.d, p5/m, z26.d, z12.d, 0 \n\t" \ - "movprfx z23.d, p5/m, z31.d \n\t" \ - "fcmla z23.d, p5/m, z26.d, z15.d, 0 \n\t" \ - "fcmla z18.d, p5/m, z24.d, z12.d, 90 \n\t" \ - "fcmla z21.d, p5/m, z24.d, z15.d, 90 \n\t" \ - "fcmla z19.d, p5/m, z25.d, z12.d, 90 \n\t" \ - "fcmla z22.d, p5/m, z25.d, z15.d, 90 \n\t" \ - "fcmla z20.d, p5/m, z26.d, z12.d, 90 \n\t" \ - "fcmla z23.d, p5/m, z26.d, z15.d, 90 \n\t" \ - "ld1d { z24.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ - "ld1d { z25.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ - "ld1d { z26.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (baseU + 2 * 3 * 64) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// MULT_2SPIN_BACKEND -#define MULT_2SPIN_2_A64FXd \ -{ \ -asm ( \ - "fcmla z18.d, p5/m, z27.d, z13.d, 0 \n\t" \ - "fcmla z21.d, p5/m, z27.d, z16.d, 0 \n\t" \ - "fcmla z19.d, p5/m, z28.d, z13.d, 0 \n\t" \ - "fcmla z22.d, p5/m, z28.d, z16.d, 0 \n\t" \ - "fcmla z20.d, p5/m, z29.d, z13.d, 0 \n\t" \ - "fcmla z23.d, p5/m, z29.d, z16.d, 0 \n\t" \ - "fcmla z18.d, p5/m, z27.d, z13.d, 90 \n\t" \ - "fcmla z21.d, p5/m, z27.d, z16.d, 90 \n\t" \ - "fcmla z19.d, p5/m, z28.d, z13.d, 90 \n\t" \ - "fcmla z22.d, p5/m, z28.d, z16.d, 90 \n\t" \ - "fcmla z20.d, p5/m, z29.d, z13.d, 90 \n\t" \ - "fcmla z23.d, p5/m, z29.d, z16.d, 90 \n\t" \ - "fcmla z18.d, p5/m, z24.d, z14.d, 0 \n\t" \ - "fcmla z21.d, p5/m, z24.d, z17.d, 0 \n\t" \ - "fcmla z19.d, p5/m, z25.d, z14.d, 0 \n\t" \ - "fcmla z22.d, p5/m, z25.d, z17.d, 0 \n\t" \ - "fcmla z20.d, p5/m, z26.d, z14.d, 0 \n\t" \ - "fcmla z23.d, p5/m, z26.d, z17.d, 0 \n\t" \ - "fcmla z18.d, p5/m, z24.d, z14.d, 90 \n\t" \ - "fcmla z21.d, p5/m, z24.d, z17.d, 90 \n\t" \ - "fcmla z19.d, p5/m, z25.d, z14.d, 90 \n\t" \ - "fcmla z22.d, p5/m, z25.d, z17.d, 90 \n\t" \ - "fcmla z20.d, p5/m, z26.d, z14.d, 90 \n\t" \ - "fcmla z23.d, p5/m, z26.d, z17.d, 90 \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// XP_PROJ -#define XP_PROJ_A64FXd \ -{ \ -asm ( \ - "fcadd z12.d, p5/m, z12.d, z21.d, 90 \n\t" \ - "fcadd z13.d, p5/m, z13.d, z22.d, 90 \n\t" \ - "fcadd z14.d, p5/m, z14.d, z23.d, 90 \n\t" \ - "fcadd z15.d, p5/m, z15.d, z18.d, 90 \n\t" \ - "fcadd z16.d, p5/m, z16.d, z19.d, 90 \n\t" \ - "fcadd z17.d, p5/m, z17.d, z20.d, 90 \n\t" \ - : \ - : \ - : 
"p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// XP_RECON -#define XP_RECON_A64FXd \ -asm ( \ - "movprfx z6.d, p5/m, z31.d \n\t" \ - "fcadd z6.d, p5/m, z6.d, z21.d, 270 \n\t" \ - "movprfx z7.d, p5/m, z31.d \n\t" \ - "fcadd z7.d, p5/m, z7.d, z22.d, 270 \n\t" \ - "movprfx z8.d, p5/m, z31.d \n\t" \ - "fcadd z8.d, p5/m, z8.d, z23.d, 270 \n\t" \ - "movprfx z9.d, p5/m, z31.d \n\t" \ - "fcadd z9.d, p5/m, z9.d, z18.d, 270 \n\t" \ - "movprfx z10.d, p5/m, z31.d \n\t" \ - "fcadd z10.d, p5/m, z10.d, z19.d, 270 \n\t" \ - "movprfx z11.d, p5/m, z31.d \n\t" \ - "fcadd z11.d, p5/m, z11.d, z20.d, 270 \n\t" \ - "mov z0.d, p5/m, z18.d \n\t" \ - "mov z1.d, p5/m, z19.d \n\t" \ - "mov z2.d, p5/m, z20.d \n\t" \ - "mov z3.d, p5/m, z21.d \n\t" \ - "mov z4.d, p5/m, z22.d \n\t" \ - "mov z5.d, p5/m, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// XP_RECON_ACCUM -#define XP_RECON_ACCUM_A64FXd \ -asm ( \ - "fcadd z9.d, p5/m, z9.d, z18.d, 270 \n\t" \ - "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ - "fcadd z10.d, p5/m, z10.d, z19.d, 270 \n\t" \ - "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ - "fcadd z11.d, p5/m, z11.d, z20.d, 270 \n\t" \ - "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ - "fcadd z6.d, p5/m, z6.d, z21.d, 270 \n\t" \ - "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ - "fcadd z7.d, p5/m, z7.d, z22.d, 270 \n\t" \ - "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ - "fcadd z8.d, p5/m, z8.d, z23.d, 270 \n\t" \ - "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// YP_PROJ -#define YP_PROJ_A64FXd \ -{ \ -asm ( \ - "fsub z12.d, p5/m, z12.d, z21.d \n\t" \ - "fsub z13.d, p5/m, z13.d, z22.d \n\t" \ - "fsub z14.d, p5/m, z14.d, z23.d \n\t" \ - "fadd z15.d, p5/m, z15.d, z18.d \n\t" \ - "fadd z16.d, p5/m, z16.d, z19.d \n\t" \ - "fadd z17.d, p5/m, z17.d, z20.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// ZP_PROJ -#define ZP_PROJ_A64FXd \ -{ \ -asm ( \ - "fcadd z12.d, p5/m, z12.d, z18.d, 90 \n\t" \ - "fcadd z13.d, p5/m, z13.d, z19.d, 90 \n\t" \ - "fcadd z14.d, p5/m, z14.d, z20.d, 90 \n\t" \ - "fcadd z15.d, p5/m, z15.d, z21.d, 270 \n\t" \ - "fcadd z16.d, p5/m, z16.d, z22.d, 270 \n\t" \ - "fcadd z17.d, p5/m, z17.d, z23.d, 270 \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// TP_PROJ -#define TP_PROJ_A64FXd \ -{ \ -asm ( \ - "fadd z12.d, p5/m, z12.d, z18.d \n\t" \ - "fadd z13.d, p5/m, z13.d, z19.d \n\t" \ - "fadd z14.d, p5/m, z14.d, z20.d \n\t" \ - "fadd z15.d, p5/m, z15.d, z21.d \n\t" \ - "fadd z16.d, p5/m, z16.d, z22.d \n\t" \ - "fadd z17.d, p5/m, z17.d, z23.d \n\t" \ - : \ - : \ - : 
"p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// XM_PROJ -#define XM_PROJ_A64FXd \ -{ \ -asm ( \ - "fcadd z12.d, p5/m, z12.d, z21.d, 270 \n\t" \ - "fcadd z13.d, p5/m, z13.d, z22.d, 270 \n\t" \ - "fcadd z14.d, p5/m, z14.d, z23.d, 270 \n\t" \ - "fcadd z15.d, p5/m, z15.d, z18.d, 270 \n\t" \ - "fcadd z16.d, p5/m, z16.d, z19.d, 270 \n\t" \ - "fcadd z17.d, p5/m, z17.d, z20.d, 270 \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// XM_RECON -#define XM_RECON_A64FXd \ -asm ( \ - "movprfx z6.d, p5/m, z31.d \n\t" \ - "fcadd z6.d, p5/m, z6.d, z21.d, 90 \n\t" \ - "movprfx z7.d, p5/m, z31.d \n\t" \ - "fcadd z7.d, p5/m, z7.d, z22.d, 90 \n\t" \ - "movprfx z8.d, p5/m, z31.d \n\t" \ - "fcadd z8.d, p5/m, z8.d, z23.d, 90 \n\t" \ - "movprfx z9.d, p5/m, z31.d \n\t" \ - "fcadd z9.d, p5/m, z9.d, z18.d, 90 \n\t" \ - "movprfx z10.d, p5/m, z31.d \n\t" \ - "fcadd z10.d, p5/m, z10.d, z19.d, 90 \n\t" \ - "movprfx z11.d, p5/m, z31.d \n\t" \ - "fcadd z11.d, p5/m, z11.d, z20.d, 90 \n\t" \ - "mov z0.d, p5/m, z18.d \n\t" \ - "mov z1.d, p5/m, z19.d \n\t" \ - "mov z2.d, p5/m, z20.d \n\t" \ - "mov z3.d, p5/m, z21.d \n\t" \ - "mov z4.d, p5/m, z22.d \n\t" \ - "mov z5.d, p5/m, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// YM_PROJ -#define YM_PROJ_A64FXd \ -{ \ -asm ( \ - "fadd z12.d, p5/m, z12.d, z21.d \n\t" \ - "fadd z13.d, p5/m, z13.d, z22.d \n\t" \ - "fadd z14.d, p5/m, z14.d, z23.d \n\t" \ - "fsub z15.d, p5/m, z15.d, z18.d \n\t" \ - "fsub z16.d, p5/m, z16.d, z19.d \n\t" \ - "fsub z17.d, p5/m, z17.d, z20.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// ZM_PROJ -#define ZM_PROJ_A64FXd \ -{ \ -asm ( \ - "fcadd z12.d, p5/m, z12.d, z18.d, 270 \n\t" \ - "fcadd z13.d, p5/m, z13.d, z19.d, 270 \n\t" \ - "fcadd z14.d, p5/m, z14.d, z20.d, 270 \n\t" \ - "fcadd z15.d, p5/m, z15.d, z21.d, 90 \n\t" \ - "fcadd z16.d, p5/m, z16.d, z22.d, 90 \n\t" \ - "fcadd z17.d, p5/m, z17.d, z23.d, 90 \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// TM_PROJ -#define TM_PROJ_A64FXd \ -{ \ -asm ( \ - "fsub z12.d, p5/m, z12.d, z18.d \n\t" \ - "fsub z13.d, p5/m, z13.d, z19.d \n\t" \ - "fsub z14.d, p5/m, z14.d, z20.d \n\t" \ - "fsub z15.d, p5/m, z15.d, z21.d \n\t" \ - "fsub z16.d, p5/m, z16.d, z22.d \n\t" \ - "fsub z17.d, p5/m, z17.d, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// XM_RECON_ACCUM -#define XM_RECON_ACCUM_A64FXd \ -asm ( \ - "fcadd z9.d, p5/m, z9.d, z18.d, 90 \n\t" \ - "fcadd z10.d, p5/m, z10.d, z19.d, 90 \n\t" \ - "fcadd z11.d, p5/m, z11.d, z20.d, 90 \n\t" \ - "fcadd z6.d, p5/m, z6.d, 
z21.d, 90 \n\t" \ - "fcadd z7.d, p5/m, z7.d, z22.d, 90 \n\t" \ - "fcadd z8.d, p5/m, z8.d, z23.d, 90 \n\t" \ - "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ - "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ - "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ - "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ - "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ - "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// YP_RECON_ACCUM -#define YP_RECON_ACCUM_A64FXd \ -asm ( \ - "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ - "fsub z9.d, p5/m, z9.d, z18.d \n\t" \ - "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ - "fsub z10.d, p5/m, z10.d, z19.d \n\t" \ - "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ - "fsub z11.d, p5/m, z11.d, z20.d \n\t" \ - "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ - "fadd z6.d, p5/m, z6.d, z21.d \n\t" \ - "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ - "fadd z7.d, p5/m, z7.d, z22.d \n\t" \ - "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ - "fadd z8.d, p5/m, z8.d, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// YM_RECON_ACCUM -#define YM_RECON_ACCUM_A64FXd \ -asm ( \ - "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ - "fadd z9.d, p5/m, z9.d, z18.d \n\t" \ - "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ - "fadd z10.d, p5/m, z10.d, z19.d \n\t" \ - "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ - "fadd z11.d, p5/m, z11.d, z20.d \n\t" \ - "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ - "fsub z6.d, p5/m, z6.d, z21.d \n\t" \ - "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ - "fsub z7.d, p5/m, z7.d, z22.d \n\t" \ - "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ - "fsub z8.d, p5/m, z8.d, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// ZP_RECON_ACCUM -#define ZP_RECON_ACCUM_A64FXd \ -asm ( \ - "fcadd z6.d, p5/m, z6.d, z18.d, 270 \n\t" \ - "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ - "fcadd z7.d, p5/m, z7.d, z19.d, 270 \n\t" \ - "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ - "fcadd z8.d, p5/m, z8.d, z20.d, 270 \n\t" \ - "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ - "fcadd z9.d, p5/m, z9.d, z21.d, 90 \n\t" \ - "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ - "fcadd z10.d, p5/m, z10.d, z22.d, 90 \n\t" \ - "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ - "fcadd z11.d, p5/m, z11.d, z23.d, 90 \n\t" \ - "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// ZM_RECON_ACCUM -#define ZM_RECON_ACCUM_A64FXd \ -asm ( \ - "fcadd z6.d, p5/m, z6.d, z18.d, 90 \n\t" \ - "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ - "fcadd z7.d, p5/m, z7.d, z19.d, 90 \n\t" \ - "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ - "fcadd z8.d, p5/m, z8.d, z20.d, 90 \n\t" \ - "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ - "fcadd z9.d, p5/m, z9.d, z21.d, 270 \n\t" \ - "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ - "fcadd z10.d, p5/m, z10.d, z22.d, 270 \n\t" \ - "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ - "fcadd z11.d, p5/m, z11.d, z23.d, 270 \n\t" \ - "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ - : \ - : \ - : 
"p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// TP_RECON_ACCUM -#define TP_RECON_ACCUM_A64FXd \ -asm ( \ - "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ - "fadd z6.d, p5/m, z6.d, z18.d \n\t" \ - "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ - "fadd z7.d, p5/m, z7.d, z19.d \n\t" \ - "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ - "fadd z8.d, p5/m, z8.d, z20.d \n\t" \ - "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ - "fadd z9.d, p5/m, z9.d, z21.d \n\t" \ - "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ - "fadd z10.d, p5/m, z10.d, z22.d \n\t" \ - "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ - "fadd z11.d, p5/m, z11.d, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// TM_RECON_ACCUM -#define TM_RECON_ACCUM_A64FXd \ -asm ( \ - "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ - "fsub z6.d, p5/m, z6.d, z18.d \n\t" \ - "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ - "fsub z7.d, p5/m, z7.d, z19.d \n\t" \ - "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ - "fsub z8.d, p5/m, z8.d, z20.d \n\t" \ - "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ - "fsub z9.d, p5/m, z9.d, z21.d \n\t" \ - "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ - "fsub z10.d, p5/m, z10.d, z22.d \n\t" \ - "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ - "fsub z11.d, p5/m, z11.d, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// ZERO_PSI -#define ZERO_PSI_A64FXd \ -asm ( \ - "fmov z0.d , 0 \n\t" \ - "fmov z1.d , 0 \n\t" \ - "fmov z2.d , 0 \n\t" \ - "fmov z3.d , 0 \n\t" \ - "fmov z4.d , 0 \n\t" \ - "fmov z5.d , 0 \n\t" \ - "fmov z6.d , 0 \n\t" \ - "fmov z7.d , 0 \n\t" \ - "fmov z8.d , 0 \n\t" \ - "fmov z9.d , 0 \n\t" \ - "fmov z10.d , 0 \n\t" \ - "fmov z11.d , 0 \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// PREFETCH_RESULT_L2_STORE (uses DC ZVA for cache line zeroing) -#define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXd(base) \ -{ \ -asm ( \ - "dc zva, %[fetchptr]\n\t" \ - "dc zva, %[fetchptr]\n\t" \ - "dc zva, %[fetchptr]\n\t" \ - : \ - : [fetchptr] "r" (base) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// PREFETCH_RESULT_L1_STORE (prefetch store to L1) -#define PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXd(base) \ -{ \ -asm ( \ - "prfd PSTL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ - "prfd PSTL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ - "prfd PSTL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (base) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// ADD_RESULT_INTERNAL -#define ADD_RESULT_INTERNAL_A64FXd \ -asm ( \ - "fadd z0.d, p5/m, z0.d, z12.d \n\t" \ - "fadd z1.d, p5/m, z1.d, z13.d \n\t" \ - "fadd z2.d, p5/m, z2.d, z14.d \n\t" \ - "fadd z3.d, p5/m, z3.d, z15.d \n\t" \ - "fadd z4.d, 
p5/m, z4.d, z16.d \n\t" \ - "fadd z5.d, p5/m, z5.d, z17.d \n\t" \ - "fadd z6.d, p5/m, z6.d, z18.d \n\t" \ - "fadd z7.d, p5/m, z7.d, z19.d \n\t" \ - "fadd z8.d, p5/m, z8.d, z20.d \n\t" \ - "fadd z9.d, p5/m, z9.d, z21.d \n\t" \ - "fadd z10.d, p5/m, z10.d, z22.d \n\t" \ - "fadd z11.d, p5/m, z11.d, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - diff --git a/Grid/simd/Fujitsu_A64FX_asm_single.h b/Grid/simd/Fujitsu_A64FX_asm_single.h deleted file mode 100644 index e629f617..00000000 --- a/Grid/simd/Fujitsu_A64FX_asm_single.h +++ /dev/null @@ -1,781 +0,0 @@ -/************************************************************************************* - - Grid physics library, www.github.com/paboyle/Grid - - Source file: Fujitsu_A64FX_asm_single.h - - Copyright (C) 2020 - -Author: Nils Meyer - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - - See the full license in the file "LICENSE" in the top level distribution directory -*************************************************************************************/ -/* END LEGAL */ -#define LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_A64FXf(base) -#define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXf(A) -#define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A) -#define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXf(A) -#define PREFETCH_GAUGE_L2(A) PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A) -#define PF_GAUGE(A) -#define PREFETCH_RESULT_L2_STORE(A) PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXf(A) -#define PREFETCH_RESULT_L1_STORE(A) PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXf(A) -#define PREFETCH1_CHIMU(A) PREFETCH_CHIMU_L1(A) -#define PREFETCH_CHIMU(A) PREFETCH_CHIMU_L1(A) -#define LOCK_GAUGE(A) -#define UNLOCK_GAUGE(A) -#define MASK_REGS DECLARATIONS_A64FXf -#define SAVE_RESULT(A,B) RESULT_A64FXf(A); -#define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXf(Dir) -#define MULT_2SPIN_2 MULT_2SPIN_2_A64FXf -#define LOAD_CHI(base) LOAD_CHI_A64FXf(base) -#define ZERO_PSI ZERO_PSI_A64FXf -#define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base) -#define XP_PROJ XP_PROJ_A64FXf -#define YP_PROJ YP_PROJ_A64FXf -#define ZP_PROJ ZP_PROJ_A64FXf -#define TP_PROJ TP_PROJ_A64FXf -#define XM_PROJ XM_PROJ_A64FXf -#define YM_PROJ YM_PROJ_A64FXf -#define ZM_PROJ ZM_PROJ_A64FXf -#define TM_PROJ TM_PROJ_A64FXf -#define XP_RECON XP_RECON_A64FXf -#define XM_RECON XM_RECON_A64FXf -#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXf -#define YM_RECON_ACCUM YM_RECON_ACCUM_A64FXf -#define ZM_RECON_ACCUM ZM_RECON_ACCUM_A64FXf -#define TM_RECON_ACCUM TM_RECON_ACCUM_A64FXf -#define XP_RECON_ACCUM XP_RECON_ACCUM_A64FXf -#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXf -#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXf -#define TP_RECON_ACCUM 
TP_RECON_ACCUM_A64FXf -#define PERMUTE_DIR0 0 -#define PERMUTE_DIR1 1 -#define PERMUTE_DIR2 2 -#define PERMUTE_DIR3 3 -#define PERMUTE PERMUTE_A64FXf; -#define LOAD_TABLE(Dir) if (Dir == 0) { LOAD_TABLE0; } else if (Dir == 1) { LOAD_TABLE1 } else if (Dir == 2) { LOAD_TABLE2; } else if (Dir == 3) { LOAD_TABLE3; } -#define MAYBEPERM(A,perm) if (perm) { PERMUTE; } -// DECLARATIONS -#define DECLARATIONS_A64FXf \ - uint64_t baseU; \ - const uint32_t lut[4][16] = { \ - {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}, \ - {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}, \ - {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}, \ - {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14} }; \ -asm ( \ - "ptrue p5.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -asm ( \ - "fmov z31.s , 0 \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// RESULT -#define RESULT_A64FXf(base) \ -{ \ -asm ( \ - "str z0, [%[storeptr], -6, mul vl] \n\t" \ - "str z1, [%[storeptr], -5, mul vl] \n\t" \ - "str z2, [%[storeptr], -4, mul vl] \n\t" \ - "str z3, [%[storeptr], -3, mul vl] \n\t" \ - "str z4, [%[storeptr], -2, mul vl] \n\t" \ - "str z5, [%[storeptr], -1, mul vl] \n\t" \ - "str z6, [%[storeptr], 0, mul vl] \n\t" \ - "str z7, [%[storeptr], 1, mul vl] \n\t" \ - "str z8, [%[storeptr], 2, mul vl] \n\t" \ - "str z9, [%[storeptr], 3, mul vl] \n\t" \ - "str z10, [%[storeptr], 4, mul vl] \n\t" \ - "str z11, [%[storeptr], 5, mul vl] \n\t" \ - : \ - : [storeptr] "r" (base + 2 * 3 * 64) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// PREFETCH_CHIMU_L2 (prefetch to L2) -#define PREFETCH_CHIMU_L2_INTERNAL_A64FXf(base) \ -{ \ -asm ( \ - "prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (base) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// PREFETCH_CHIMU_L1 (prefetch to L1) -#define PREFETCH_CHIMU_L1_INTERNAL_A64FXf(base) \ -{ \ -asm ( \ - "prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ - "prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ - "prfd PLDL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (base) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// PREFETCH_GAUGE_L2 (prefetch to L2) -#define PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A) \ -{ \ - const auto & ref(U[sUn](A)); baseU = (uint64_t)&ref + 3 * 3 * 64; \ -asm ( \ - "prfd PLDL2STRM, p5, [%[fetchptr], -4, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 12, mul vl] 
\n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 16, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 20, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 24, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 28, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (baseU) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// PREFETCH_GAUGE_L1 (prefetch to L1) -#define PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A) \ -{ \ - const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ -asm ( \ - "prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ - "prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ - "prfd PLDL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (baseU) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// LOAD_CHI -#define LOAD_CHI_A64FXf(base) \ -{ \ -asm ( \ - "ld1w { z12.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1w { z13.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "ld1w { z14.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ - "ld1w { z15.s }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ - "ld1w { z16.s }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ - "ld1w { z17.s }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (base) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// LOAD_CHIMU -#define LOAD_CHIMU_INTERLEAVED_A64FXf(base) \ -{ \ -asm ( \ - "ld1w { z12.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1w { z21.s }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ - "ld1w { z15.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1w { z18.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1w { z13.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1w { z22.s }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ - "ld1w { z16.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1w { z19.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "ld1w { z14.s }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ - "ld1w { z23.s }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ - "ld1w { z17.s }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ - "ld1w { z20.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (base + 2 * 3 * 64) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// LOAD_CHIMU_0213 -#define LOAD_CHIMU_0213_A64FXf \ -{ \ - const SiteSpinor & ref(in[offset]); \ -asm ( \ - "ld1w { z12.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1w { z18.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1w { z13.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1w { z19.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "ld1w { z14.s }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ - "ld1w { z20.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ - "ld1w { z15.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1w { z21.s }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ - "ld1w { z16.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1w { z22.s }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ - "ld1w { z17.s }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ - "ld1w { z23.s }, p5/z, [%[fetchptr], 
5, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (&ref[2][0]) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// LOAD_CHIMU_0312 -#define LOAD_CHIMU_0312_A64FXf \ -{ \ - const SiteSpinor & ref(in[offset]); \ -asm ( \ - "ld1w { z12.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1w { z21.s }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ - "ld1w { z13.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1w { z22.s }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ - "ld1w { z14.s }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ - "ld1w { z23.s }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ - "ld1w { z15.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1w { z18.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1w { z16.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1w { z19.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "ld1w { z17.s }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ - "ld1w { z20.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (&ref[2][0]) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// LOAD_TABLE0 -#define LOAD_TABLE0 \ -asm ( \ - "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ - : \ - : [tableptr] "r" (&lut[0]),[index] "i" (0) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// LOAD_TABLE1 -#define LOAD_TABLE1 \ -asm ( \ - "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ - : \ - : [tableptr] "r" (&lut[0]),[index] "i" (1) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// LOAD_TABLE2 -#define LOAD_TABLE2 \ -asm ( \ - "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ - : \ - : [tableptr] "r" (&lut[0]),[index] "i" (2) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// LOAD_TABLE3 -#define LOAD_TABLE3 \ -asm ( \ - "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ - : \ - : [tableptr] "r" (&lut[0]),[index] "i" (3) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// PERMUTE -#define PERMUTE_A64FXf \ -asm ( \ - "tbl z12.s, { z12.s }, z30.s \n\t" \ - "tbl z13.s, { z13.s }, z30.s \n\t" \ - "tbl z14.s, { z14.s }, z30.s \n\t" \ - "tbl z15.s, { z15.s }, z30.s \n\t" \ - "tbl z16.s, { z16.s }, z30.s \n\t" \ - "tbl z17.s, { z17.s }, z30.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// LOAD_GAUGE -#define LOAD_GAUGE(A) \ -{ \ - const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ -asm ( \ - "ld1w { z24.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1w { z25.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1w { z26.s }, p5/z, 
[%[fetchptr], 0, mul vl] \n\t" \ - "ld1w { z27.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1w { z28.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1w { z29.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (baseU + 2 * 3 * 64) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// MULT_2SPIN -#define MULT_2SPIN_1_A64FXf(A) \ -{ \ - const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ -asm ( \ - "ld1w { z24.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1w { z25.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1w { z26.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1w { z27.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1w { z28.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1w { z29.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "movprfx z18.s, p5/m, z31.s \n\t" \ - "fcmla z18.s, p5/m, z24.s, z12.s, 0 \n\t" \ - "movprfx z21.s, p5/m, z31.s \n\t" \ - "fcmla z21.s, p5/m, z24.s, z15.s, 0 \n\t" \ - "movprfx z19.s, p5/m, z31.s \n\t" \ - "fcmla z19.s, p5/m, z25.s, z12.s, 0 \n\t" \ - "movprfx z22.s, p5/m, z31.s \n\t" \ - "fcmla z22.s, p5/m, z25.s, z15.s, 0 \n\t" \ - "movprfx z20.s, p5/m, z31.s \n\t" \ - "fcmla z20.s, p5/m, z26.s, z12.s, 0 \n\t" \ - "movprfx z23.s, p5/m, z31.s \n\t" \ - "fcmla z23.s, p5/m, z26.s, z15.s, 0 \n\t" \ - "fcmla z18.s, p5/m, z24.s, z12.s, 90 \n\t" \ - "fcmla z21.s, p5/m, z24.s, z15.s, 90 \n\t" \ - "fcmla z19.s, p5/m, z25.s, z12.s, 90 \n\t" \ - "fcmla z22.s, p5/m, z25.s, z15.s, 90 \n\t" \ - "fcmla z20.s, p5/m, z26.s, z12.s, 90 \n\t" \ - "fcmla z23.s, p5/m, z26.s, z15.s, 90 \n\t" \ - "ld1w { z24.s }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ - "ld1w { z25.s }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ - "ld1w { z26.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (baseU + 2 * 3 * 64) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// MULT_2SPIN_BACKEND -#define MULT_2SPIN_2_A64FXf \ -{ \ -asm ( \ - "fcmla z18.s, p5/m, z27.s, z13.s, 0 \n\t" \ - "fcmla z21.s, p5/m, z27.s, z16.s, 0 \n\t" \ - "fcmla z19.s, p5/m, z28.s, z13.s, 0 \n\t" \ - "fcmla z22.s, p5/m, z28.s, z16.s, 0 \n\t" \ - "fcmla z20.s, p5/m, z29.s, z13.s, 0 \n\t" \ - "fcmla z23.s, p5/m, z29.s, z16.s, 0 \n\t" \ - "fcmla z18.s, p5/m, z27.s, z13.s, 90 \n\t" \ - "fcmla z21.s, p5/m, z27.s, z16.s, 90 \n\t" \ - "fcmla z19.s, p5/m, z28.s, z13.s, 90 \n\t" \ - "fcmla z22.s, p5/m, z28.s, z16.s, 90 \n\t" \ - "fcmla z20.s, p5/m, z29.s, z13.s, 90 \n\t" \ - "fcmla z23.s, p5/m, z29.s, z16.s, 90 \n\t" \ - "fcmla z18.s, p5/m, z24.s, z14.s, 0 \n\t" \ - "fcmla z21.s, p5/m, z24.s, z17.s, 0 \n\t" \ - "fcmla z19.s, p5/m, z25.s, z14.s, 0 \n\t" \ - "fcmla z22.s, p5/m, z25.s, z17.s, 0 \n\t" \ - "fcmla z20.s, p5/m, z26.s, z14.s, 0 \n\t" \ - "fcmla z23.s, p5/m, z26.s, z17.s, 0 \n\t" \ - "fcmla z18.s, p5/m, z24.s, z14.s, 90 \n\t" \ - "fcmla z21.s, p5/m, z24.s, z17.s, 90 \n\t" \ - "fcmla z19.s, p5/m, z25.s, z14.s, 90 \n\t" \ - "fcmla z22.s, p5/m, z25.s, z17.s, 90 \n\t" \ - "fcmla z20.s, p5/m, z26.s, z14.s, 90 \n\t" \ - "fcmla z23.s, p5/m, z26.s, z17.s, 90 \n\t" \ - : \ - : \ - : 
"p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// XP_PROJ -#define XP_PROJ_A64FXf \ -{ \ -asm ( \ - "fcadd z12.s, p5/m, z12.s, z21.s, 90 \n\t" \ - "fcadd z13.s, p5/m, z13.s, z22.s, 90 \n\t" \ - "fcadd z14.s, p5/m, z14.s, z23.s, 90 \n\t" \ - "fcadd z15.s, p5/m, z15.s, z18.s, 90 \n\t" \ - "fcadd z16.s, p5/m, z16.s, z19.s, 90 \n\t" \ - "fcadd z17.s, p5/m, z17.s, z20.s, 90 \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// XP_RECON -#define XP_RECON_A64FXf \ -asm ( \ - "movprfx z6.s, p5/m, z31.s \n\t" \ - "fcadd z6.s, p5/m, z6.s, z21.s, 270 \n\t" \ - "movprfx z7.s, p5/m, z31.s \n\t" \ - "fcadd z7.s, p5/m, z7.s, z22.s, 270 \n\t" \ - "movprfx z8.s, p5/m, z31.s \n\t" \ - "fcadd z8.s, p5/m, z8.s, z23.s, 270 \n\t" \ - "movprfx z9.s, p5/m, z31.s \n\t" \ - "fcadd z9.s, p5/m, z9.s, z18.s, 270 \n\t" \ - "movprfx z10.s, p5/m, z31.s \n\t" \ - "fcadd z10.s, p5/m, z10.s, z19.s, 270 \n\t" \ - "movprfx z11.s, p5/m, z31.s \n\t" \ - "fcadd z11.s, p5/m, z11.s, z20.s, 270 \n\t" \ - "mov z0.s, p5/m, z18.s \n\t" \ - "mov z1.s, p5/m, z19.s \n\t" \ - "mov z2.s, p5/m, z20.s \n\t" \ - "mov z3.s, p5/m, z21.s \n\t" \ - "mov z4.s, p5/m, z22.s \n\t" \ - "mov z5.s, p5/m, z23.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// XP_RECON_ACCUM -#define XP_RECON_ACCUM_A64FXf \ -asm ( \ - "fcadd z9.s, p5/m, z9.s, z18.s, 270 \n\t" \ - "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ - "fcadd z10.s, p5/m, z10.s, z19.s, 270 \n\t" \ - "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ - "fcadd z11.s, p5/m, z11.s, z20.s, 270 \n\t" \ - "fadd z2.s, p5/m, z2.s, z20.s \n\t" \ - "fcadd z6.s, p5/m, z6.s, z21.s, 270 \n\t" \ - "fadd z3.s, p5/m, z3.s, z21.s \n\t" \ - "fcadd z7.s, p5/m, z7.s, z22.s, 270 \n\t" \ - "fadd z4.s, p5/m, z4.s, z22.s \n\t" \ - "fcadd z8.s, p5/m, z8.s, z23.s, 270 \n\t" \ - "fadd z5.s, p5/m, z5.s, z23.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// YP_PROJ -#define YP_PROJ_A64FXf \ -{ \ -asm ( \ - "fsub z12.s, p5/m, z12.s, z21.s \n\t" \ - "fsub z13.s, p5/m, z13.s, z22.s \n\t" \ - "fsub z14.s, p5/m, z14.s, z23.s \n\t" \ - "fadd z15.s, p5/m, z15.s, z18.s \n\t" \ - "fadd z16.s, p5/m, z16.s, z19.s \n\t" \ - "fadd z17.s, p5/m, z17.s, z20.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// ZP_PROJ -#define ZP_PROJ_A64FXf \ -{ \ -asm ( \ - "fcadd z12.s, p5/m, z12.s, z18.s, 90 \n\t" \ - "fcadd z13.s, p5/m, z13.s, z19.s, 90 \n\t" \ - "fcadd z14.s, p5/m, z14.s, z20.s, 90 \n\t" \ - "fcadd z15.s, p5/m, z15.s, z21.s, 270 \n\t" \ - "fcadd z16.s, p5/m, z16.s, z22.s, 270 \n\t" \ - "fcadd z17.s, p5/m, z17.s, z23.s, 270 \n\t" \ - : \ - : \ - : 
"p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// TP_PROJ -#define TP_PROJ_A64FXf \ -{ \ -asm ( \ - "fadd z12.s, p5/m, z12.s, z18.s \n\t" \ - "fadd z13.s, p5/m, z13.s, z19.s \n\t" \ - "fadd z14.s, p5/m, z14.s, z20.s \n\t" \ - "fadd z15.s, p5/m, z15.s, z21.s \n\t" \ - "fadd z16.s, p5/m, z16.s, z22.s \n\t" \ - "fadd z17.s, p5/m, z17.s, z23.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// XM_PROJ -#define XM_PROJ_A64FXf \ -{ \ -asm ( \ - "fcadd z12.s, p5/m, z12.s, z21.s, 270 \n\t" \ - "fcadd z13.s, p5/m, z13.s, z22.s, 270 \n\t" \ - "fcadd z14.s, p5/m, z14.s, z23.s, 270 \n\t" \ - "fcadd z15.s, p5/m, z15.s, z18.s, 270 \n\t" \ - "fcadd z16.s, p5/m, z16.s, z19.s, 270 \n\t" \ - "fcadd z17.s, p5/m, z17.s, z20.s, 270 \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// XM_RECON -#define XM_RECON_A64FXf \ -asm ( \ - "movprfx z6.s, p5/m, z31.s \n\t" \ - "fcadd z6.s, p5/m, z6.s, z21.s, 90 \n\t" \ - "movprfx z7.s, p5/m, z31.s \n\t" \ - "fcadd z7.s, p5/m, z7.s, z22.s, 90 \n\t" \ - "movprfx z8.s, p5/m, z31.s \n\t" \ - "fcadd z8.s, p5/m, z8.s, z23.s, 90 \n\t" \ - "movprfx z9.s, p5/m, z31.s \n\t" \ - "fcadd z9.s, p5/m, z9.s, z18.s, 90 \n\t" \ - "movprfx z10.s, p5/m, z31.s \n\t" \ - "fcadd z10.s, p5/m, z10.s, z19.s, 90 \n\t" \ - "movprfx z11.s, p5/m, z31.s \n\t" \ - "fcadd z11.s, p5/m, z11.s, z20.s, 90 \n\t" \ - "mov z0.s, p5/m, z18.s \n\t" \ - "mov z1.s, p5/m, z19.s \n\t" \ - "mov z2.s, p5/m, z20.s \n\t" \ - "mov z3.s, p5/m, z21.s \n\t" \ - "mov z4.s, p5/m, z22.s \n\t" \ - "mov z5.s, p5/m, z23.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// YM_PROJ -#define YM_PROJ_A64FXf \ -{ \ -asm ( \ - "fadd z12.s, p5/m, z12.s, z21.s \n\t" \ - "fadd z13.s, p5/m, z13.s, z22.s \n\t" \ - "fadd z14.s, p5/m, z14.s, z23.s \n\t" \ - "fsub z15.s, p5/m, z15.s, z18.s \n\t" \ - "fsub z16.s, p5/m, z16.s, z19.s \n\t" \ - "fsub z17.s, p5/m, z17.s, z20.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// ZM_PROJ -#define ZM_PROJ_A64FXf \ -{ \ -asm ( \ - "fcadd z12.s, p5/m, z12.s, z18.s, 270 \n\t" \ - "fcadd z13.s, p5/m, z13.s, z19.s, 270 \n\t" \ - "fcadd z14.s, p5/m, z14.s, z20.s, 270 \n\t" \ - "fcadd z15.s, p5/m, z15.s, z21.s, 90 \n\t" \ - "fcadd z16.s, p5/m, z16.s, z22.s, 90 \n\t" \ - "fcadd z17.s, p5/m, z17.s, z23.s, 90 \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// TM_PROJ -#define TM_PROJ_A64FXf \ -{ \ -asm ( \ - "fsub z12.s, p5/m, z12.s, z18.s \n\t" \ - "fsub z13.s, p5/m, z13.s, z19.s \n\t" \ - "fsub z14.s, p5/m, z14.s, z20.s \n\t" \ - "fsub z15.s, p5/m, z15.s, z21.s \n\t" \ - "fsub 
z16.s, p5/m, z16.s, z22.s \n\t" \ - "fsub z17.s, p5/m, z17.s, z23.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// XM_RECON_ACCUM -#define XM_RECON_ACCUM_A64FXf \ -asm ( \ - "fcadd z9.s, p5/m, z9.s, z18.s, 90 \n\t" \ - "fcadd z10.s, p5/m, z10.s, z19.s, 90 \n\t" \ - "fcadd z11.s, p5/m, z11.s, z20.s, 90 \n\t" \ - "fcadd z6.s, p5/m, z6.s, z21.s, 90 \n\t" \ - "fcadd z7.s, p5/m, z7.s, z22.s, 90 \n\t" \ - "fcadd z8.s, p5/m, z8.s, z23.s, 90 \n\t" \ - "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ - "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ - "fadd z2.s, p5/m, z2.s, z20.s \n\t" \ - "fadd z3.s, p5/m, z3.s, z21.s \n\t" \ - "fadd z4.s, p5/m, z4.s, z22.s \n\t" \ - "fadd z5.s, p5/m, z5.s, z23.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// YP_RECON_ACCUM -#define YP_RECON_ACCUM_A64FXf \ -asm ( \ - "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ - "fsub z9.s, p5/m, z9.s, z18.s \n\t" \ - "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ - "fsub z10.s, p5/m, z10.s, z19.s \n\t" \ - "fadd z2.s, p5/m, z2.s, z20.s \n\t" \ - "fsub z11.s, p5/m, z11.s, z20.s \n\t" \ - "fadd z3.s, p5/m, z3.s, z21.s \n\t" \ - "fadd z6.s, p5/m, z6.s, z21.s \n\t" \ - "fadd z4.s, p5/m, z4.s, z22.s \n\t" \ - "fadd z7.s, p5/m, z7.s, z22.s \n\t" \ - "fadd z5.s, p5/m, z5.s, z23.s \n\t" \ - "fadd z8.s, p5/m, z8.s, z23.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// YM_RECON_ACCUM -#define YM_RECON_ACCUM_A64FXf \ -asm ( \ - "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ - "fadd z9.s, p5/m, z9.s, z18.s \n\t" \ - "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ - "fadd z10.s, p5/m, z10.s, z19.s \n\t" \ - "fadd z2.s, p5/m, z2.s, z20.s \n\t" \ - "fadd z11.s, p5/m, z11.s, z20.s \n\t" \ - "fadd z3.s, p5/m, z3.s, z21.s \n\t" \ - "fsub z6.s, p5/m, z6.s, z21.s \n\t" \ - "fadd z4.s, p5/m, z4.s, z22.s \n\t" \ - "fsub z7.s, p5/m, z7.s, z22.s \n\t" \ - "fadd z5.s, p5/m, z5.s, z23.s \n\t" \ - "fsub z8.s, p5/m, z8.s, z23.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// ZP_RECON_ACCUM -#define ZP_RECON_ACCUM_A64FXf \ -asm ( \ - "fcadd z6.s, p5/m, z6.s, z18.s, 270 \n\t" \ - "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ - "fcadd z7.s, p5/m, z7.s, z19.s, 270 \n\t" \ - "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ - "fcadd z8.s, p5/m, z8.s, z20.s, 270 \n\t" \ - "fadd z2.s, p5/m, z2.s, z20.s \n\t" \ - "fcadd z9.s, p5/m, z9.s, z21.s, 90 \n\t" \ - "fadd z3.s, p5/m, z3.s, z21.s \n\t" \ - "fcadd z10.s, p5/m, z10.s, z22.s, 90 \n\t" \ - "fadd z4.s, p5/m, z4.s, z22.s \n\t" \ - "fcadd z11.s, p5/m, z11.s, z23.s, 90 \n\t" \ - "fadd z5.s, p5/m, z5.s, z23.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// ZM_RECON_ACCUM -#define ZM_RECON_ACCUM_A64FXf \ -asm ( \ - "fcadd z6.s, p5/m, z6.s, z18.s, 90 \n\t" \ - "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ - 
"fcadd z7.s, p5/m, z7.s, z19.s, 90 \n\t" \ - "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ - "fcadd z8.s, p5/m, z8.s, z20.s, 90 \n\t" \ - "fadd z2.s, p5/m, z2.s, z20.s \n\t" \ - "fcadd z9.s, p5/m, z9.s, z21.s, 270 \n\t" \ - "fadd z3.s, p5/m, z3.s, z21.s \n\t" \ - "fcadd z10.s, p5/m, z10.s, z22.s, 270 \n\t" \ - "fadd z4.s, p5/m, z4.s, z22.s \n\t" \ - "fcadd z11.s, p5/m, z11.s, z23.s, 270 \n\t" \ - "fadd z5.s, p5/m, z5.s, z23.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// TP_RECON_ACCUM -#define TP_RECON_ACCUM_A64FXf \ -asm ( \ - "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ - "fadd z6.s, p5/m, z6.s, z18.s \n\t" \ - "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ - "fadd z7.s, p5/m, z7.s, z19.s \n\t" \ - "fadd z2.s, p5/m, z2.s, z20.s \n\t" \ - "fadd z8.s, p5/m, z8.s, z20.s \n\t" \ - "fadd z3.s, p5/m, z3.s, z21.s \n\t" \ - "fadd z9.s, p5/m, z9.s, z21.s \n\t" \ - "fadd z4.s, p5/m, z4.s, z22.s \n\t" \ - "fadd z10.s, p5/m, z10.s, z22.s \n\t" \ - "fadd z5.s, p5/m, z5.s, z23.s \n\t" \ - "fadd z11.s, p5/m, z11.s, z23.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// TM_RECON_ACCUM -#define TM_RECON_ACCUM_A64FXf \ -asm ( \ - "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ - "fsub z6.s, p5/m, z6.s, z18.s \n\t" \ - "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ - "fsub z7.s, p5/m, z7.s, z19.s \n\t" \ - "fadd z2.s, p5/m, z2.s, z20.s \n\t" \ - "fsub z8.s, p5/m, z8.s, z20.s \n\t" \ - "fadd z3.s, p5/m, z3.s, z21.s \n\t" \ - "fsub z9.s, p5/m, z9.s, z21.s \n\t" \ - "fadd z4.s, p5/m, z4.s, z22.s \n\t" \ - "fsub z10.s, p5/m, z10.s, z22.s \n\t" \ - "fadd z5.s, p5/m, z5.s, z23.s \n\t" \ - "fsub z11.s, p5/m, z11.s, z23.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// ZERO_PSI -#define ZERO_PSI_A64FXf \ -asm ( \ - "fmov z0.s , 0 \n\t" \ - "fmov z1.s , 0 \n\t" \ - "fmov z2.s , 0 \n\t" \ - "fmov z3.s , 0 \n\t" \ - "fmov z4.s , 0 \n\t" \ - "fmov z5.s , 0 \n\t" \ - "fmov z6.s , 0 \n\t" \ - "fmov z7.s , 0 \n\t" \ - "fmov z8.s , 0 \n\t" \ - "fmov z9.s , 0 \n\t" \ - "fmov z10.s , 0 \n\t" \ - "fmov z11.s , 0 \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// PREFETCH_RESULT_L2_STORE (uses DC ZVA for cache line zeroing) -#define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXf(base) \ -{ \ -asm ( \ - "dc zva, %[fetchptr]\n\t" \ - "dc zva, %[fetchptr]\n\t" \ - "dc zva, %[fetchptr]\n\t" \ - : \ - : [fetchptr] "r" (base) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// PREFETCH_RESULT_L1_STORE (prefetch store to L1) -#define PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXf(base) \ -{ \ -asm ( \ - "prfd PSTL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ - "prfd PSTL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ - "prfd PSTL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (base) \ - : 
"p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// ADD_RESULT_INTERNAL -#define ADD_RESULT_INTERNAL_A64FXf \ -asm ( \ - "fadd z0.s, p5/m, z0.s, z12.s \n\t" \ - "fadd z1.s, p5/m, z1.s, z13.s \n\t" \ - "fadd z2.s, p5/m, z2.s, z14.s \n\t" \ - "fadd z3.s, p5/m, z3.s, z15.s \n\t" \ - "fadd z4.s, p5/m, z4.s, z16.s \n\t" \ - "fadd z5.s, p5/m, z5.s, z17.s \n\t" \ - "fadd z6.s, p5/m, z6.s, z18.s \n\t" \ - "fadd z7.s, p5/m, z7.s, z19.s \n\t" \ - "fadd z8.s, p5/m, z8.s, z20.s \n\t" \ - "fadd z9.s, p5/m, z9.s, z21.s \n\t" \ - "fadd z10.s, p5/m, z10.s, z22.s \n\t" \ - "fadd z11.s, p5/m, z11.s, z23.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - From 45d49d86487427ea1e0b34c0d530d475f8e3e31a Mon Sep 17 00:00:00 2001 From: Nils Meyer Date: Sat, 19 Dec 2020 03:35:18 +0100 Subject: [PATCH 06/16] clean up --- .../implementation/WilsonKernelsAsmBodyA64FX.h | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h index 83588a7d..4e463438 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h @@ -26,9 +26,9 @@ Author: Nils Meyer Regensburg University *************************************************************************************/ /* END LEGAL */ -// GCC 10 messes up SVE instruction scheduling using -O3 only, -// using -O3 -fno-schedule-insns -fno-schedule-insns2 does wonders -// performance is better than armclang 20.2 +// GCC 10 messes up SVE instruction scheduling using -O3, but +// -O3 -fno-schedule-insns -fno-schedule-insns2 does wonders +// performance now is better than armclang 20.2 #ifdef KERNEL_DAG #define DIR0_PROJ XP_PROJ @@ -118,10 +118,6 @@ Author: Nils Meyer Regensburg University /* NB: picking PREFETCH_GAUGE_L2(Dir+4); here results in performance penalty though I expected that it would improve on performance - - if (s == 0) { \ - if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \ - } \ */ #define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ @@ -149,7 +145,7 @@ NB: picking PREFETCH_GAUGE_L2(Dir+4); here results in performance penalty if ( local || st.same_node[Dir] ) { \ MULT_2SPIN_1(Dir); \ MULT_2SPIN_2; \ - RECON; \ + RECON; \ } \ base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \ PREFETCH_CHIMU(base); \ @@ -300,7 +296,7 @@ NB: picking PREFETCH_GAUGE_L2(Dir+4); here results in performance penalty // DC ZVA test // { uint64_t basestore = (uint64_t)&out[ss]; - // PREFETCH_RESULT_L2_STORE(basestore); } + // PREFETCH_RESULT_L2_STORE(basestore); } ASM_LEG(Ym,Zm,PERMUTE_DIR2,DIR5_PROJ,DIR5_RECON); @@ -336,8 +332,8 @@ NB: picking PREFETCH_GAUGE_L2(Dir+4); here results in performance penalty // DC ZVA test //{ uint64_t basestore = (uint64_t)&out[ss]; - // PREFETCH_RESULT_L2_STORE(basestore); - //} + // PREFETCH_RESULT_L2_STORE(basestore); } + ASM_LEG(Tm,Xp,PERMUTE_DIR0,DIR7_PROJ,DIR7_RECON); From a4afc3ea2aeb23a5a5a4dece03087e6344c9986b Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 14 Jan 2021 20:44:16 -0500 Subject: [PATCH 07/16] Red black coarse space --- 
tests/solver/Test_dwf_hdcr.cc | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/tests/solver/Test_dwf_hdcr.cc b/tests/solver/Test_dwf_hdcr.cc index 8e083231..f68e99ab 100644 --- a/tests/solver/Test_dwf_hdcr.cc +++ b/tests/solver/Test_dwf_hdcr.cc @@ -222,9 +222,16 @@ int main (int argc, char ** argv) GridCartesian *Coarse4d = SpaceTimeGrid::makeFourDimGrid(clatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());; GridCartesian *Coarse5d = SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d); - GridCartesian *CoarseCoarse4d = SpaceTimeGrid::makeFourDimGrid(cclatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());; + + + GridCartesian *CoarseCoarse4d = SpaceTimeGrid::makeFourDimGrid(cclatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); GridCartesian *CoarseCoarse5d = SpaceTimeGrid::makeFiveDimGrid(1,CoarseCoarse4d); + GridRedBlackCartesian * Coarse4dRB = SpaceTimeGrid::makeFourDimRedBlackGrid(Coarse4d); + GridRedBlackCartesian * Coarse5dRB = SpaceTimeGrid::makeFiveDimRedBlackGrid(1,Coarse4d); + GridRedBlackCartesian *CoarseCoarse4dRB = SpaceTimeGrid::makeFourDimRedBlackGrid(CoarseCoarse4d); + GridRedBlackCartesian *CoarseCoarse5dRB = SpaceTimeGrid::makeFiveDimRedBlackGrid(1,CoarseCoarse4d); + std::vector seeds4({1,2,3,4}); std::vector seeds5({5,6,7,8}); std::vector cseeds({5,6,7,8}); @@ -282,8 +289,7 @@ int main (int argc, char ** argv) Gamma5R5HermitianLinearOperator HermIndefOp(Ddwf); - Level1Op LDOp(*Coarse5d,1); LDOp.CoarsenOperator(FGrid,HermIndefOp,Aggregates); - + Level1Op LDOp(*Coarse5d,*Coarse5dRB,1); LDOp.CoarsenOperator(FGrid,HermIndefOp,Aggregates); ////////////////////////////////////////////////// // Deflate the course space. Recursive multigrid? @@ -311,12 +317,11 @@ int main (int argc, char ** argv) } } - Level2Op L2Op(*CoarseCoarse5d,1); // Hermitian matrix + Level2Op L2Op(*CoarseCoarse5d,*CoarseCoarse5dRB,1); // Hermitian matrix typedef Level2Op::CoarseVector CoarseCoarseVector; HermitianLinearOperator L1LinOp(LDOp); L2Op.CoarsenOperator(Coarse5d,L1LinOp,CoarseAggregates); - std::cout< Date: Thu, 14 Jan 2021 20:46:21 -0500 Subject: [PATCH 08/16] Coarsened vector test --- Grid/qcd/QCD.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Grid/qcd/QCD.h b/Grid/qcd/QCD.h index 76d7def4..858aead7 100644 --- a/Grid/qcd/QCD.h +++ b/Grid/qcd/QCD.h @@ -80,6 +80,13 @@ template struct isSpinor { template using IfSpinor = Invoke::value,int> > ; template using IfNotSpinor = Invoke::value,int> > ; +const int CoarseIndex = 4; +template struct isCoarsened { + static constexpr bool value = (CoarseIndex<=T::TensorLevel); +}; +template using IfCoarsened = Invoke::value,int> > ; +template using IfNotCoarsened = Invoke::value,int> > ; + // ChrisK very keen to add extra space for Gparity doubling. 
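
[Editor's note] The isCoarsened/IfCoarsened trait added above follows the
existing isSpinor/IfSpinor pattern: it keys on the tensor nesting depth
(CoarseIndex <= T::TensorLevel), so overloads can be enabled for coarsened
vectors via SFINAE. Patch 09 below uses it to give the gamma_5 projectors
coarse-space overloads; reconstructed with the template brackets that the
plain-text rendering has eaten, their shape is:

    template<class vtype, int N, IfCoarsened<iVector<vtype,N> > = 0>
    accelerator_inline void spProj5p (iVector<vtype,N> &rfspin, const iVector<vtype,N> &fspin)
    {
      const int hN = N>>1;          // P_+ keeps the first half of the
      for(int s=0;s<hN;s++){        // coarse components and zeroes the rest
        rfspin(s)    = fspin(s);
        rfspin(s+hN) = Zero();
      }
    }
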
// // Also add domain wall index, in a way where Wilson operator From eaff0f3aeb05635d49e17cb6e271621040f5b7f1 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 14 Jan 2021 20:46:58 -0500 Subject: [PATCH 09/16] Gamma5 on coaree spaces --- Grid/qcd/spin/TwoSpinor.h | 179 ++++++++------------------------------ 1 file changed, 35 insertions(+), 144 deletions(-) diff --git a/Grid/qcd/spin/TwoSpinor.h b/Grid/qcd/spin/TwoSpinor.h index 924594ab..8dad0cd0 100644 --- a/Grid/qcd/spin/TwoSpinor.h +++ b/Grid/qcd/spin/TwoSpinor.h @@ -128,7 +128,6 @@ template > = 0> accelerator_inline void s } template > = 0> accelerator_inline void spProjTm (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; hspin(0)=fspin(0)-fspin(2); hspin(1)=fspin(1)-fspin(3); } @@ -138,40 +137,50 @@ template > = 0> accelerator_inline void s * 0 0 -1 0 * 0 0 0 -1 */ - template > = 0> accelerator_inline void spProj5p (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; hspin(0)=fspin(0); hspin(1)=fspin(1); } template > = 0> accelerator_inline void spProj5m (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; hspin(0)=fspin(2); hspin(1)=fspin(3); } -// template accelerator_inline void fspProj5p (iVector &rfspin,const iVector &fspin) template > = 0> accelerator_inline void spProj5p (iVector &rfspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; rfspin(0)=fspin(0); rfspin(1)=fspin(1); rfspin(2)=Zero(); rfspin(3)=Zero(); } -// template accelerator_inline void fspProj5m (iVector &rfspin,const iVector &fspin) template > = 0> accelerator_inline void spProj5m (iVector &rfspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; rfspin(0)=Zero(); rfspin(1)=Zero(); rfspin(2)=fspin(2); rfspin(3)=fspin(3); } +template > = 0> accelerator_inline void spProj5p (iVector &rfspin,const iVector &fspin) +{ + const int hN = N>>1; + for(int s=0;s > = 0> accelerator_inline void spProj5m (iVector &rfspin,const iVector &fspin) +{ + const int hN = N>>1; + for(int s=0;s > = 0> accelerator_inline void s */ template > = 0> accelerator_inline void spReconXp (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)=hspin(0); fspin(1)=hspin(1); fspin(2)=timesMinusI(hspin(1)); @@ -191,7 +199,6 @@ template > = 0> accelerator_inline void s } template > = 0> accelerator_inline void spReconXm (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)=hspin(0); fspin(1)=hspin(1); fspin(2)=timesI(hspin(1)); @@ -199,7 +206,6 @@ template > = 0> accelerator_inline void s } template > = 0> accelerator_inline void accumReconXp (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)+=hspin(0); fspin(1)+=hspin(1); fspin(2)-=timesI(hspin(1)); @@ -207,7 +213,6 @@ template > = 0> accelerator_inline void a } template > = 0> accelerator_inline void accumReconXm (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)+=hspin(0); fspin(1)+=hspin(1); fspin(2)+=timesI(hspin(1)); @@ -221,7 +226,6 @@ template > = 0> accelerator_inline void a template > = 0> accelerator_inline void spReconYp (iVector &fspin,const iVector &hspin) { - //typename 
std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)=hspin(0); fspin(1)=hspin(1); fspin(2)= hspin(1); @@ -229,7 +233,6 @@ template > = 0> accelerator_inline void s } template > = 0> accelerator_inline void spReconYm (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)=hspin(0); fspin(1)=hspin(1); fspin(2)=-hspin(1); @@ -237,7 +240,6 @@ template > = 0> accelerator_inline void s } template > = 0> accelerator_inline void accumReconYp (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)+=hspin(0); fspin(1)+=hspin(1); fspin(2)+=hspin(1); @@ -245,7 +247,6 @@ template > = 0> accelerator_inline void a } template > = 0> accelerator_inline void accumReconYm (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)+=hspin(0); fspin(1)+=hspin(1); fspin(2)-=hspin(1); @@ -260,7 +261,6 @@ template > = 0> accelerator_inline void a */ template > = 0> accelerator_inline void spReconZp (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)=hspin(0); fspin(1)=hspin(1); fspin(2)=timesMinusI(hspin(0)); @@ -268,7 +268,6 @@ template > = 0> accelerator_inline void s } template > = 0> accelerator_inline void spReconZm (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)=hspin(0); fspin(1)=hspin(1); fspin(2)= timesI(hspin(0)); @@ -276,7 +275,6 @@ template > = 0> accelerator_inline void s } template > = 0> accelerator_inline void accumReconZp (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)+=hspin(0); fspin(1)+=hspin(1); fspin(2)-=timesI(hspin(0)); @@ -284,7 +282,6 @@ template > = 0> accelerator_inline void a } template > = 0> accelerator_inline void accumReconZm (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)+=hspin(0); fspin(1)+=hspin(1); fspin(2)+=timesI(hspin(0)); @@ -298,7 +295,6 @@ template > = 0> accelerator_inline void a */ template > = 0> accelerator_inline void spReconTp (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)=hspin(0); fspin(1)=hspin(1); fspin(2)=hspin(0); @@ -306,7 +302,6 @@ template > = 0> accelerator_inline void s } template > = 0> accelerator_inline void spReconTm (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)=hspin(0); fspin(1)=hspin(1); fspin(2)=-hspin(0); @@ -314,7 +309,6 @@ template > = 0> accelerator_inline void s } template > = 0> accelerator_inline void accumReconTp (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)+=hspin(0); fspin(1)+=hspin(1); fspin(2)+=hspin(0); @@ -322,7 +316,6 @@ template > = 0> accelerator_inline void a } template > = 0> accelerator_inline void accumReconTm (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)+=hspin(0); fspin(1)+=hspin(1); fspin(2)-=hspin(0); @@ -336,7 +329,6 @@ template > = 0> accelerator_inline void a */ template > = 0> accelerator_inline void spRecon5p (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; 
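// Context for the reconstruction just below: with gamma5 = diag(+1,+1,-1,-1),
// projection P+ = (1+gamma5)/2 keeps the upper two spin components, and
// reconstruction applies (1+gamma5) = 2 P+, hence fspin = hspin + hspin (the
// doubling written as an add rather than a multiply by 2, per the comment).
// The coarse-space overloads added in this patch generalise the same split by
// treating the first N/2 components of an iVector as the "+" half and the
// last N/2 as the "-" half. A standalone sketch of that index logic on a
// plain array (toy signature; Grid's real overloads are gated by IfCoarsened
// and zero entries with Zero()):

#include <array>

template<typename T, int N>
void coarseSpProj5p(std::array<T,N> &r, const std::array<T,N> &f) {
  static_assert(N % 2 == 0, "coarse basis splits into +/- halves");
  const int hN = N >> 1;
  for (int s = 0; s < hN; s++) {
    r[s]      = f[s];  // keep the "+" half
    r[s + hN] = T();   // zero the "-" half
  }
}

template<typename T, int N>
void coarseSpProj5m(std::array<T,N> &r, const std::array<T,N> &f) {
  const int hN = N >> 1;
  for (int s = 0; s < hN; s++) {
    r[s]      = T();        // zero the "+" half
    r[s + hN] = f[s + hN];  // keep the "-" half
  }
}

// (Patch 10 in this series then builds gamma5 on coarse fields exactly as
// spProj5p(x) - spProj5m(x); see the G5C rewrite in LinalgUtils.h.)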
fspin(0)=hspin(0)+hspin(0); // add is lower latency than mul fspin(1)=hspin(1)+hspin(1); // probably no measurable diffence though fspin(2)=Zero(); @@ -344,7 +336,6 @@ template > = 0> accelerator_inline void s } template > = 0> accelerator_inline void spRecon5m (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)=Zero(); fspin(1)=Zero(); fspin(2)=hspin(0)+hspin(0); @@ -352,7 +343,6 @@ template > = 0> accelerator_inline void s } template > = 0> accelerator_inline void accumRecon5p (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)+=hspin(0)+hspin(0); fspin(1)+=hspin(1)+hspin(1); } @@ -372,7 +362,6 @@ template > = 0> accelerator_inline void a ////////// template > = 0> accelerator_inline void spProjXp (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void accumReconXp (iM }} } - - //////// // Xm //////// template accelerator_inline void spProjXm (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spProjXm(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spProjXm (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void spProjXm (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spProjXm (iMatri template accelerator_inline void spReconXm (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spReconXm(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spReconXm (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void spReconXm (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spReconXm (iMatr template accelerator_inline void accumReconXm (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; accumReconXm(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void accumReconXm (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void accumReconXm (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spProjYp (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spProjYp(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spProjYp (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void spProjYp (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spProjYp (iMatri template accelerator_inline void spReconYp (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spReconYp(hspin._internal,fspin._internal); } template > 
= 0> accelerator_inline void spReconYp (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void spReconYp (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spReconYp (iMatr template accelerator_inline void accumReconYp (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; accumReconYp(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void accumReconYp (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void accumReconYp (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spProjYm (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spProjYm(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spProjYm (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void spProjYm (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spReconYm (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spReconYm(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spReconYm (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,const iVector >::type *temp; for(int i=0;i accelerator_inline void spReconYm (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spReconYm (iMatr template accelerator_inline void accumReconYm (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; accumReconYm(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void accumReconYm (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void accumReconYm (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void accumReconYm (iM //////// template accelerator_inline void spProjZp (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spProjZp(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spProjZp (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void spProjZp (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spReconZp (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spReconZp(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spReconZp (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i 
accelerator_inline void spReconZp (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void accumReconZp (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; accumReconZp(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void accumReconZp (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void accumReconZp (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void accumReconZp (iM //////// template accelerator_inline void spProjZm (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spProjZm(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spProjZm (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void spProjZm (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spReconZm (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spReconZm(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spReconZm (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void spReconZm (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void accumReconZm (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; accumReconZm(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void accumReconZm (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void accumReconZm (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void accumReconZm (iM //////// template accelerator_inline void spProjTp (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spProjTp(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spProjTp (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void spProjTp (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spReconTp (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spReconTp(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spReconTp (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void spReconTp (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spReconTp (iMatr template 
accelerator_inline void accumReconTp (iScalar &hspin, iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; accumReconTp(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void accumReconTp (iVector &hspin, const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void accumReconTp (iMatrix &hspin, const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spProjTm (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spProjTm(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spProjTm (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void spProjTm (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spProjTm (iMatri template accelerator_inline void spReconTm (iScalar &hspin, const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spReconTm(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spReconTm (iVector &hspin, const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void spReconTm (iMatrix &hspin, const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spReconTm (iMatr template accelerator_inline void accumReconTm (iScalar &hspin, const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; accumReconTm(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void accumReconTm (iVector &hspin, const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void accumReconTm (iMatrix &hspin, const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spProj5p (iScalar &hspin,const iScalar &fspin) +template > = 0> accelerator_inline void spProj5p (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spProj5p(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spProj5p (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void spProj5p (iMatrix &hspin,const iMatrix &fspin) +template > = 0> accelerator_inline void spProj5p (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spProj5p (iMatri template accelerator_inline void spRecon5p (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spRecon5p(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spRecon5p (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void spRecon5p (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i 
accelerator_inline void spRecon5p (iMatr template accelerator_inline void accumRecon5p (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; accumRecon5p(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void accumRecon5p (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void accumRecon5p (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void accumRecon5p (iM } // four spinor projectors for chiral proj -// template accelerator_inline void fspProj5p (iScalar &hspin,const iScalar &fspin) -template accelerator_inline void spProj5p (iScalar &hspin,const iScalar &fspin) +template > = 0> accelerator_inline void spProj5p (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spProj5p(hspin._internal,fspin._internal); } -// template accelerator_inline void fspProj5p (iVector &hspin,iVector &fspin) -template > = 0> accelerator_inline void spProj5p (iVector &hspin,const iVector &fspin) +template > = 0,IfNotCoarsened > = 0> accelerator_inline void spProj5p (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void fspProj5p (iMatrix &hspin,iMatrix &fspin) -template accelerator_inline void spProj5p (iMatrix &hspin,const iMatrix &fspin) +template > = 0> accelerator_inline void spProj5p (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spProj5p (iMatrix & // 5m //////// -template accelerator_inline void spProj5m (iScalar &hspin,const iScalar &fspin) +template > = 0> accelerator_inline void spProj5m (iScalar &hspin,const iScalar &fspin) { spProj5m(hspin._internal,fspin._internal); } -template > = 0> accelerator_inline void spProj5m (iVector &hspin,const iVector &fspin) +template > = 0,IfNotCoarsened > = 0> accelerator_inline void spProj5m (iVector &hspin,const iVector &fspin) { for(int i=0;i accelerator_inline void spProj5m (iMatrix &hspin,const iMatrix &fspin) +template > = 0> accelerator_inline void spProj5m (iMatrix &hspin,const iMatrix &fspin) { for(int i=0;i accelerator_inline void spProj5m (iMatri template accelerator_inline void spRecon5m (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spRecon5m(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spRecon5m (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void spRecon5m (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void accumRecon5m (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; accumRecon5m(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void accumRecon5m (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void accumRecon5m (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i 
accelerator_inline void accumRecon5m (iM // four spinor projectors for chiral proj -// template accelerator_inline void fspProj5m (iScalar &hspin,const iScalar &fspin) -template accelerator_inline void spProj5m (iScalar &hspin,const iScalar &fspin) +template > = 0> accelerator_inline void spProj5m (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spProj5m(hspin._internal,fspin._internal); } -// template accelerator_inline void fspProj5m (iVector &hspin,iVector &fspin) -template > = 0> accelerator_inline void spProj5m (iVector &hspin,const iVector &fspin) +template > = 0,IfNotCoarsened > = 0> accelerator_inline void spProj5m (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void fspProj5m (iMatrix &hspin,iMatrix &fspin) -template accelerator_inline void spProj5m (iMatrix &hspin,const iMatrix &fspin) +template > = 0> accelerator_inline void spProj5m (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i Date: Thu, 14 Jan 2021 20:47:28 -0500 Subject: [PATCH 10/16] G5 on coarse spaces --- Grid/qcd/utils/LinalgUtils.h | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/Grid/qcd/utils/LinalgUtils.h b/Grid/qcd/utils/LinalgUtils.h index 1e016e4e..964b83d5 100644 --- a/Grid/qcd/utils/LinalgUtils.h +++ b/Grid/qcd/utils/LinalgUtils.h @@ -154,8 +154,8 @@ void axpby_ssp_pminus(Lattice &z,Coeff a,const Lattice &x,Coeff b,co accelerator_for(sss,nloop,vobj::Nsimd(),{ uint64_t ss = sss*Ls; decltype(coalescedRead(y_v[ss+sp])) tmp; - spProj5m(tmp,y_v(ss+sp)); - tmp = a*x_v(ss+s)+b*tmp; + spProj5m(tmp,y_v(ss+sp)); + tmp = a*x_v(ss+s)+b*tmp; coalescedWrite(z_v[ss+s],tmp); }); } @@ -188,7 +188,6 @@ void G5R5(Lattice &z,const Lattice &x) z.Checkerboard() = x.Checkerboard(); conformable(x,z); int Ls = grid->_rdimensions[0]; - Gamma G5(Gamma::Algebra::Gamma5); autoView( x_v, x, AcceleratorRead); autoView( z_v, z, AcceleratorWrite); uint64_t nloop = grid->oSites()/Ls; @@ -196,7 +195,13 @@ void G5R5(Lattice &z,const Lattice &x) uint64_t ss = sss*Ls; for(int s=0;s &z, const Lattice &x) z.Checkerboard() = x.Checkerboard(); conformable(x, z); - Gamma G5(Gamma::Algebra::Gamma5); - z = G5 * x; + autoView( x_v, x, AcceleratorRead); + autoView( z_v, z, AcceleratorWrite); + uint64_t nloop = grid->oSites(); + accelerator_for(ss,nloop,vobj::Nsimd(),{ + auto tmp = x_v(ss); + decltype(tmp) tmp_p; + decltype(tmp) tmp_m; + spProj5p(tmp_p,tmp); + spProj5m(tmp_m,tmp); + coalescedWrite(z_v[ss],tmp_p - tmp_m); + }); } +/* template void G5C(Lattice> &z, const Lattice> &x) { @@ -234,6 +249,7 @@ void G5C(Lattice> &z, const Lattice Date: Thu, 14 Jan 2021 20:48:08 -0500 Subject: [PATCH 11/16] Red black support on coars --- Grid/algorithms/CoarsenedMatrix.h | 60 +++++++++++++++++++------------ 1 file changed, 37 insertions(+), 23 deletions(-) diff --git a/Grid/algorithms/CoarsenedMatrix.h b/Grid/algorithms/CoarsenedMatrix.h index 66b9c169..b9594678 100644 --- a/Grid/algorithms/CoarsenedMatrix.h +++ b/Grid/algorithms/CoarsenedMatrix.h @@ -775,7 +775,26 @@ public: for(int p=0;p FineComplexField; typedef typename Fobj::scalar_type scalar_type; + std::cout << GridLogMessage<< "CoarsenMatrix "<< std::endl; + FineComplexField one(FineGrid); one=scalar_type(1.0,0.0); FineComplexField zero(FineGrid); zero=scalar_type(0.0,0.0); @@ -847,11 +868,13 @@ public: CoarseScalar 
InnerProd(Grid()); + std::cout << GridLogMessage<< "CoarsenMatrix Orthog "<< std::endl; // Orthogonalise the subblocks over the basis blockOrthogonalise(InnerProd,Subspace.subspace); // Compute the matrix elements of linop between this orthonormal // set of vectors. + std::cout << GridLogMessage<< "CoarsenMatrix masks "<< std::endl; int self_stencil=-1; for(int p=0;poSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); }); + if ( hermitian && (disp==-1) ) { + for(int pp=0;pp = * + int dirp = geom.directions[pp]; + int dispp = geom.displacements[pp]; + if ( (dirp==dir) && (dispp==1) ){ + auto sft = conjugate(Cshift(oZProj,dir,1)); + autoView( sft_v , sft , AcceleratorWrite); + autoView( A_pp , A[pp], AcceleratorWrite); + accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_pp[ss](i,j),sft_v(ss)); }); + } + } + } } } @@ -957,33 +992,12 @@ public: } if(hermitian) { std::cout << GridLogMessage << " ForceHermitian, new code "<lSites(); From 579595f547bd36775ba42ecd07f9a881e4f12e85 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 14 Jan 2021 20:48:35 -0500 Subject: [PATCH 12/16] Red black on coarse space --- tests/solver/Test_dwf_hdcr_2level.cc | 8 ++++++-- tests/solver/Test_dwf_multigrid.cc | 9 +++++++-- tests/solver/Test_hw_multigrid.cc | 4 +++- 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/tests/solver/Test_dwf_hdcr_2level.cc b/tests/solver/Test_dwf_hdcr_2level.cc index df24c9d2..4fa1e302 100644 --- a/tests/solver/Test_dwf_hdcr_2level.cc +++ b/tests/solver/Test_dwf_hdcr_2level.cc @@ -262,6 +262,8 @@ int main (int argc, char ** argv) GridCartesian *Coarse4d = SpaceTimeGrid::makeFourDimGrid(clatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());; GridCartesian *Coarse5d = SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d); + GridRedBlackCartesian * Coarse4dRB = SpaceTimeGrid::makeFourDimRedBlackGrid(Coarse4d); + GridRedBlackCartesian * Coarse5dRB = SpaceTimeGrid::makeFiveDimRedBlackGrid(1,Coarse4d); std::vector seeds4({1,2,3,4}); std::vector seeds5({5,6,7,8}); @@ -328,7 +330,7 @@ int main (int argc, char ** argv) Gamma5R5HermitianLinearOperator HermIndefOp(Ddwf); - Level1Op LDOp(*Coarse5d,1); LDOp.CoarsenOperator(FGrid,HermIndefOp,Aggregates); + Level1Op LDOp(*Coarse5d,*Coarse5dRB,1); LDOp.CoarsenOperator(FGrid,HermIndefOp,Aggregates); std::cout< CoarseCG(0.01,1000); - ConjugateGradient CoarseCG(0.02,1000);// 14.7s + ConjugateGradient CoarseCG(0.01,2000);// 14.7s + eval.resize(0); + evec.resize(0,Coarse5d); DeflatedGuesser DeflCoarseGuesser(evec,eval); NormalEquations DeflCoarseCGNE(LDOp,CoarseCG,DeflCoarseGuesser); diff --git a/tests/solver/Test_dwf_multigrid.cc b/tests/solver/Test_dwf_multigrid.cc index 9e11c160..351e10fd 100644 --- a/tests/solver/Test_dwf_multigrid.cc +++ b/tests/solver/Test_dwf_multigrid.cc @@ -370,6 +370,11 @@ int main (int argc, char ** argv) GridCartesian *CoarseCoarse4d = SpaceTimeGrid::makeFourDimGrid(cclatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());; GridCartesian *CoarseCoarse5d = SpaceTimeGrid::makeFiveDimGrid(1,CoarseCoarse4d); + GridRedBlackCartesian * Coarse4dRB = SpaceTimeGrid::makeFourDimRedBlackGrid(Coarse4d); + GridRedBlackCartesian * Coarse5dRB = SpaceTimeGrid::makeFiveDimRedBlackGrid(1,Coarse4d); + GridRedBlackCartesian *CoarseCoarse4dRB = SpaceTimeGrid::makeFourDimRedBlackGrid(CoarseCoarse4d); + GridRedBlackCartesian *CoarseCoarse5dRB = SpaceTimeGrid::makeFiveDimRedBlackGrid(1,CoarseCoarse4d); + std::vector seeds4({1,2,3,4}); std::vector seeds5({5,6,7,8}); std::vector cseeds({5,6,7,8}); 
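// The recurring pattern across these test updates: every coarse level now
// carries a red-black (checkerboarded) companion grid next to the full grid,
// built from the same four-dimensional base. Collected from the hunks above
// (clatt is the coarse lattice size these tests already compute):

GridCartesian         *Coarse4d   = SpaceTimeGrid::makeFourDimGrid(clatt,
                                      GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
GridCartesian         *Coarse5d   = SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d);
GridRedBlackCartesian *Coarse4dRB = SpaceTimeGrid::makeFourDimRedBlackGrid(Coarse4d);
GridRedBlackCartesian *Coarse5dRB = SpaceTimeGrid::makeFiveDimRedBlackGrid(1,Coarse4d);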
@@ -434,8 +439,8 @@ int main (int argc, char ** argv) std::cout< seeds({1,2,3,4}); GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds); @@ -335,7 +337,7 @@ int main (int argc, char ** argv) NonHermitianLinearOperator LinOpDwf(Ddwf); - Level1Op LDOp (*Coarse5d,0); + Level1Op LDOp (*Coarse5d,*Coarse5dRB,0); std::cout< Date: Thu, 14 Jan 2021 20:49:13 -0500 Subject: [PATCH 13/16] Red black coarse space --- tests/solver/Test_dwf_hdcr_16_rb.cc | 397 +++++ tests/solver/Test_dwf_hdcr_24_regression.cc | 477 ++++++ tests/solver/Test_dwf_hdcr_48_rb.cc | 397 +++++ tests/solver/Test_dwf_hdcr_48_regression.cc | 473 ++++++ tests/solver/Test_hw_multigrid_mixed_48.cc | 1287 ++++++++++++++++ tests/solver/Test_hw_multigrid_mixed_48_rb.cc | 1326 +++++++++++++++++ 6 files changed, 4357 insertions(+) create mode 100644 tests/solver/Test_dwf_hdcr_16_rb.cc create mode 100644 tests/solver/Test_dwf_hdcr_24_regression.cc create mode 100644 tests/solver/Test_dwf_hdcr_48_rb.cc create mode 100644 tests/solver/Test_dwf_hdcr_48_regression.cc create mode 100644 tests/solver/Test_hw_multigrid_mixed_48.cc create mode 100644 tests/solver/Test_hw_multigrid_mixed_48_rb.cc diff --git a/tests/solver/Test_dwf_hdcr_16_rb.cc b/tests/solver/Test_dwf_hdcr_16_rb.cc new file mode 100644 index 00000000..b7900b04 --- /dev/null +++ b/tests/solver/Test_dwf_hdcr_16_rb.cc @@ -0,0 +1,397 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_dwf_hdcr.cc + + Copyright (C) 2015 + +Author: Antonin Portelli +Author: Peter Boyle +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
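// With the red-black grids in hand, the coarse operators are constructed from
// the (full grid, red-black grid, hermitian flag) triple, as the hunks above
// show; the trailing 1 (0 in the non-hermitian Test_hw_multigrid case) marks
// the operator hermitian. When the flag is set, CoarsenOperator also fills
// each displacement -1 stencil element from the conjugate shift of its +1
// partner instead of projecting it independently (see the CoarsenedMatrix.h
// hunk in patch 11). Call pattern assembled from the tests:

Level1Op LDOp(*Coarse5d,*Coarse5dRB,1);
LDOp.CoarsenOperator(FGrid,HermIndefOp,Aggregates);

Level2Op L2Op(*CoarseCoarse5d,*CoarseCoarse5dRB,1);  // Hermitian matrix
L2Op.CoarsenOperator(Coarse5d,L1LinOp,CoarseAggregates);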
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include + +using namespace std; +using namespace Grid; +/* Params + * Grid: + * block1(4) + * block2(4) + * + * Subspace + * * Fine : Subspace(nbasis,hi,lo,order,first,step) -- 32, 60,0.02,500,100,100 + * * Coarse: Subspace(nbasis,hi,lo,order,first,step) -- 32, 18,0.02,500,100,100 + + * Smoother: + * * Fine: Cheby(hi, lo, order) -- 60,0.5,10 + * * Coarse: Cheby(hi, lo, order) -- 12,0.1,4 + + * Lanczos: + * CoarseCoarse IRL( Nk, Nm, Nstop, poly(lo,hi,order)) 24,36,24,0.002,4.0,61 + */ +RealD InverseApproximation(RealD x){ + return 1.0/x; +} + +template class SolverWrapper : public LinearFunction { +private: + CheckerBoardedSparseMatrixBase & _Matrix; + SchurRedBlackBase & _Solver; +public: + + ///////////////////////////////////////////////////// + // Wrap the usual normal equations trick + ///////////////////////////////////////////////////// + SolverWrapper(CheckerBoardedSparseMatrixBase &Matrix, + SchurRedBlackBase &Solver) + : _Matrix(Matrix), _Solver(Solver) {}; + + void operator() (const Field &in, Field &out){ + + _Solver(_Matrix,in,out); // Mdag M out = Mdag in + + } +}; + +template class ChebyshevSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & _SmootherMatrix; + FineOperator & _SmootherOperator; + + Chebyshev Cheby; + + ChebyshevSmoother(RealD _lo,RealD _hi,int _ord, FineOperator &SmootherOperator,Matrix &SmootherMatrix) : + _SmootherOperator(SmootherOperator), + _SmootherMatrix(SmootherMatrix), + Cheby(_lo,_hi,_ord,InverseApproximation) + {}; + + void operator() (const Field &in, Field &out) + { + Field tmp(in.Grid()); + MdagMLinearOperator MdagMOp(_SmootherMatrix); + _SmootherOperator.AdjOp(in,tmp); + Cheby(MdagMOp,tmp,out); + } +}; +template class MirsSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & SmootherMatrix; + FineOperator & SmootherOperator; + RealD tol; + RealD shift; + int maxit; + + MirsSmoother(RealD _shift,RealD _tol,int _maxit,FineOperator &_SmootherOperator,Matrix &_SmootherMatrix) : + shift(_shift),tol(_tol),maxit(_maxit), + SmootherOperator(_SmootherOperator), + SmootherMatrix(_SmootherMatrix) + {}; + + void operator() (const Field &in, Field &out) + { + ZeroGuesser Guess; + ConjugateGradient CG(tol,maxit,false); + + Field src(in.Grid()); + + ShiftedMdagMLinearOperator,Field> MdagMOp(SmootherMatrix,shift); + SmootherOperator.AdjOp(in,src); + Guess(src,out); + CG(MdagMOp,src,out); + } +}; + +template +class MultiGridPreconditioner : public LinearFunction< Lattice > { +public: + + typedef Aggregation Aggregates; + typedef CoarsenedMatrix CoarseOperator; + typedef typename Aggregation::CoarseVector CoarseVector; + typedef typename Aggregation::CoarseMatrix CoarseMatrix; + typedef typename Aggregation::FineField FineField; + typedef LinearOperatorBase FineOperator; + typedef LinearFunction FineSmoother; + + Aggregates & _Aggregates; + CoarseOperator & _CoarseOperator; + Matrix & _FineMatrix; + FineOperator & _FineOperator; + Guesser & _Guess; + FineSmoother & _Smoother; + CoarseSolver & _CoarseSolve; + + int level; void Level(int lv) {level = lv; }; + +#define GridLogLevel std::cout << GridLogMessage < block ({2,2,2,2}); + std::vector blockc ({2,2,2,2}); + const int nbasis= 32; + const int nbasisc= 32; + auto clatt = GridDefaultLatt(); + for(int d=0;d 
seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + std::vector cseeds({5,6,7,8}); + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + GridParallelRNG CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds); + LatticeFermion src(FGrid); gaussian(RNG5,src);// src=src+g5*src; + LatticeFermion result(FGrid); + LatticeGaugeField Umu(UGrid); + + FieldMetaData header; + std::string file("./ckpoint_lat.4000"); + //std::string file("./ckpoint_lat.1000"); + NerscIO::readConfiguration(Umu,header,file); + + std::cout< Subspace; + typedef CoarsenedMatrix CoarseOperator; + typedef CoarseOperator::CoarseVector CoarseVector; + typedef CoarseOperator::siteVector siteVector; + std::cout< HermDefOp(Ddwf); + + Subspace Aggregates(Coarse5d,FGrid,0); + + assert ( (nbasis & 0x1)==0); + { + int nb=nbasis/2; + LatticeFermion A(FGrid); + LatticeFermion B(FGrid); + // Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.002,1000,800,100,0.0); + // Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.02,1000,800,100,0.0); + Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.05,500,200,150,0.0);// + // Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.01,1000,100,100,0.0); // Slightly faster + + for(int n=0;n Level1Op; + typedef CoarsenedMatrix,nbasisc> Level2Op; + + Gamma5R5HermitianLinearOperator HermIndefOp(Ddwf); + + + GridRedBlackCartesian * Coarse4dRB = SpaceTimeGrid::makeFourDimRedBlackGrid(Coarse4d); + std::cout << " Making 5D coarse RB grid " <,nbasisc> CoarseSubspace; + // CoarseSubspace CoarseAggregates(CoarseCoarse5d,Coarse5d,0); + + std::cout< PosdefLdop(LDOp); + typedef Level2Op::CoarseVector CoarseCoarseVector; + CoarseVector c_src(Coarse5d); c_src=1.0; + + std::cout< , SolverWrapper > TwoLevelMG; + typedef MultiGridPreconditioner,nbasisc,Level1Op, DeflatedGuesser, NormalEquations > CoarseMG; + typedef MultiGridPreconditioner, LinearFunction > ThreeLevelMG; + + ChebyshevSmoother FineSmoother(0.5,60.0,12,HermIndefOp,Ddwf); + std::cout< CoarseZeroGuesser; + ConjugateGradient CoarseCG(0.005,1000); + // SchurDiagMooeeOperator CoarseMpcDagMpc(LDOp); + SchurRedBlackDiagMooeeSolve CoarseRBCG(CoarseCG); + SolverWrapper CoarseSolver(LDOp,CoarseRBCG); + + // NormalEquations CoarseCGNE(LDOp,CoarseCG,CoarseZeroGuesser); + TwoLevelMG TwoLevelPrecon(Aggregates, LDOp, + HermIndefOp,Ddwf, + FineSmoother, + CoarseZeroGuesser, + CoarseSolver); + TwoLevelPrecon.Level(1); + PrecGeneralisedConjugateResidual l1PGCR(1.0e-8,20,HermIndefOp,TwoLevelPrecon,16,16); + l1PGCR.Level(1); + l1PGCR(src,result); + + std::cout< pCG(1.0e-8,60000); + result=Zero(); + // pCG(HermDefOp,src,result); + + std::cout< HermOpEO(Ddwf); + // pCG(HermOpEO,src_o,result_o); + + std::cout< PM; PM(HermDefOp,src); + std::cout< cPM; cPM(PosdefLdop,c_src); + // std::cout< ccPM; ccPM(IRLHermOpL2,cc_src); + + std::cout< +Author: Peter Boyle +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
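// How Test_dwf_hdcr_16_rb wires the pieces together. Template arguments are
// restored by hand here, since the flattened diff above drops angle-bracket
// contents; treat the exact parameters as a best-effort reconstruction.

ChebyshevSmoother<LatticeFermion,DomainWallFermionR> FineSmoother(0.5,60.0,12,HermIndefOp,Ddwf);

ZeroGuesser<CoarseVector> CoarseZeroGuesser;
ConjugateGradient<CoarseVector> CoarseCG(0.005,1000);
SchurRedBlackDiagMooeeSolve<CoarseVector> CoarseRBCG(CoarseCG); // red-black CG on the coarse grid
SolverWrapper<CoarseVector> CoarseSolver(LDOp,CoarseRBCG);      // adapts it to a LinearFunction

TwoLevelMG TwoLevelPrecon(Aggregates, LDOp,        // restrict/prolong + coarse operator
                          HermIndefOp,Ddwf,        // fine operator
                          FineSmoother,            // fine-level smoother
                          CoarseZeroGuesser,
                          CoarseSolver);           // coarse-level solve
TwoLevelPrecon.Level(1);
PrecGeneralisedConjugateResidual<LatticeFermion> l1PGCR(1.0e-8,20,HermIndefOp,TwoLevelPrecon,16,16);
l1PGCR.Level(1);
l1PGCR(src,result);  // outer flexible Krylov solve, MG-preconditioned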
+ + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include + +using namespace std; +using namespace Grid; +/* Params + * Grid: + * block1(4) + * block2(4) + * + * Subspace + * * Fine : Subspace(nbasis,hi,lo,order,first,step) -- 32, 60,0.02,500,100,100 + * * Coarse: Subspace(nbasis,hi,lo,order,first,step) -- 32, 18,0.02,500,100,100 + + * Smoother: + * * Fine: Cheby(hi, lo, order) -- 60,0.5,10 + * * Coarse: Cheby(hi, lo, order) -- 12,0.1,4 + + * Lanczos: + * CoarseCoarse IRL( Nk, Nm, Nstop, poly(lo,hi,order)) 24,36,24,0.002,4.0,61 + */ +RealD InverseApproximation(RealD x){ + return 1.0/x; +} + +template class ChebyshevSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & _SmootherMatrix; + FineOperator & _SmootherOperator; + + Chebyshev Cheby; + + ChebyshevSmoother(RealD _lo,RealD _hi,int _ord, FineOperator &SmootherOperator,Matrix &SmootherMatrix) : + _SmootherOperator(SmootherOperator), + _SmootherMatrix(SmootherMatrix), + Cheby(_lo,_hi,_ord,InverseApproximation) + {}; + + void operator() (const Field &in, Field &out) + { + Field tmp(in.Grid()); + MdagMLinearOperator MdagMOp(_SmootherMatrix); + _SmootherOperator.AdjOp(in,tmp); + Cheby(MdagMOp,tmp,out); + } +}; +template class MirsSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & SmootherMatrix; + FineOperator & SmootherOperator; + RealD tol; + RealD shift; + int maxit; + + MirsSmoother(RealD _shift,RealD _tol,int _maxit,FineOperator &_SmootherOperator,Matrix &_SmootherMatrix) : + shift(_shift),tol(_tol),maxit(_maxit), + SmootherOperator(_SmootherOperator), + SmootherMatrix(_SmootherMatrix) + {}; + + void operator() (const Field &in, Field &out) + { + ZeroGuesser Guess; + ConjugateGradient CG(tol,maxit,false); + + Field src(in.Grid()); + + ShiftedMdagMLinearOperator,Field> MdagMOp(SmootherMatrix,shift); + SmootherOperator.AdjOp(in,src); + Guess(src,out); + CG(MdagMOp,src,out); + } +}; + +template +class MultiGridPreconditioner : public LinearFunction< Lattice > { +public: + + typedef Aggregation Aggregates; + typedef CoarsenedMatrix CoarseOperator; + typedef typename Aggregation::CoarseVector CoarseVector; + typedef typename Aggregation::CoarseMatrix CoarseMatrix; + typedef typename Aggregation::FineField FineField; + typedef LinearOperatorBase FineOperator; + typedef LinearFunction FineSmoother; + + Aggregates & _Aggregates; + CoarseOperator & _CoarseOperator; + Matrix & _FineMatrix; + FineOperator & _FineOperator; + Guesser & _Guess; + FineSmoother & _Smoother; + CoarseSolver & _CoarseSolve; + + int level; void Level(int lv) {level = lv; }; + +#define GridLogLevel std::cout << GridLogMessage < block ({2,2,2,2}); + std::vector blockc ({2,2,2,2}); + const int nbasis= 40; + const int nbasisc= 40; + auto clatt = GridDefaultLatt(); + for(int d=0;d seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + std::vector cseeds({5,6,7,8}); + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + GridParallelRNG CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds); + LatticeFermion src(FGrid); gaussian(RNG5,src);// src=src+g5*src; + 
LatticeFermion result(FGrid); + LatticeGaugeField Umu(UGrid); + + FieldMetaData header; + // std::string file("./ckpoint_lat.4000"); + // std::string file("./ckpoint_lat.1000"); + // NerscIO::readConfiguration(Umu,header,file); + SU::HotConfiguration(RNG4,Umu); + + std::cout< Subspace; + typedef CoarsenedMatrix CoarseOperator; + typedef CoarseOperator::CoarseVector CoarseVector; + typedef CoarseOperator::siteVector siteVector; + std::cout< HermDefOp(Ddwf); + + Subspace Aggregates(Coarse5d,FGrid,0); + + assert ( (nbasis & 0x1)==0); + { + int nb=nbasis/2; + LatticeFermion A(FGrid); + LatticeFermion B(FGrid); + // Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.002,1000,800,100,0.0); + // Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.02,1000,800,100,0.0); + Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.01,400,50,50,0.0); // Slightly faster + + for(int n=0;n Level1Op; + typedef CoarsenedMatrix,nbasisc> Level2Op; + + Gamma5R5HermitianLinearOperator HermIndefOp(Ddwf); + + + GridRedBlackCartesian * Coarse4dRB = SpaceTimeGrid::makeFourDimRedBlackGrid(Coarse4d); + std::cout << " Making 5D coarse RB grid " <,nbasisc> CoarseSubspace; + // CoarseSubspace CoarseAggregates(CoarseCoarse5d,Coarse5d,0); + + std::cout< PosdefLdop(LDOp); + /* + { + int nb=nbasisc/2; + CoarseAggregates.CreateSubspaceChebyshev(CRNG,PosdefLdop,nb,15.0,0.02,1000,800,100,0.0); + for(int n=0;noSites();site++){ + subspace_g5[site](nn) = subspace[site](nn); + subspace_g5[site](nn+nb)=-subspace[site](nn+nb); + } + } + } + } + */ + typedef Level2Op::CoarseVector CoarseCoarseVector; + /* + Level2Op L2Op(*CoarseCoarse5d,1); // Hermitian matrix + HermitianLinearOperator L1LinOp(LDOp); + L2Op.CoarsenOperator(Coarse5d,L1LinOp,CoarseAggregates); + + + std::cout< IRLHermOpL2(L2Op); + CoarseCoarseVector cc_src(CoarseCoarse5d); cc_src=1.0; + */ + /* + Chebyshev IRLChebyL2(0.001,15.0,301); + FunctionHermOp IRLOpChebyL2(IRLChebyL2,IRLHermOpL2); + PlainHermOp IRLOpL2 (IRLHermOpL2); + int cNk=24; + int cNm=36; + int cNstop=24; + ImplicitlyRestartedLanczos IRLL2(IRLOpChebyL2,IRLOpL2,cNstop,cNk,cNm,1.0e-3,20); + + int cNconv; + std::vector eval2(cNm); + std::vector evec2(cNm,CoarseCoarse5d); + IRLL2.calc(eval2,evec2,cc_src,cNconv); + + ConjugateGradient CoarseCoarseCG(0.1,1000); + DeflatedGuesser DeflCoarseCoarseGuesser(evec2,eval2); + NormalEquations DeflCoarseCoarseCGNE(L2Op,CoarseCoarseCG,DeflCoarseCoarseGuesser); + */ + + /* + std::cout< IRLHermOp(LDOp); + // Chebyshev IRLCheby(0.001,15.0,301); + Chebyshev IRLCheby(0.03,12.0,101); + FunctionHermOp IRLOpCheby(IRLCheby,IRLHermOp); + PlainHermOp IRLOp (IRLHermOp); + int Nk=64; + int Nm=128; + int Nstop=Nk; + ImplicitlyRestartedLanczos IRL(IRLOpCheby,IRLOp,Nstop,Nk,Nm,1.0e-3,20); + + int Nconv; + std::vector eval(Nm); + std::vector evec(Nm,Coarse5d); + IRL.calc(eval,evec,c_src,Nconv); + */ + CoarseVector c_src(Coarse5d); c_src=1.0; + // DeflatedGuesser DeflCoarseGuesser(evec,eval); + // NormalEquations DeflCoarseCGNE(LDOp,CoarseCG,DeflCoarseGuesser); + + std::cout< , NormalEquations > TwoLevelMG; + typedef MultiGridPreconditioner , NormalEquations > TwoLevelMG; + typedef MultiGridPreconditioner,nbasisc,Level1Op, DeflatedGuesser, NormalEquations > CoarseMG; + typedef MultiGridPreconditioner, LinearFunction > ThreeLevelMG; + + ChebyshevSmoother FineSmoother(0.25,60.0,12,HermIndefOp,Ddwf); + /* + // MultiGrid preconditioner acting on the coarse space <-> coarsecoarse space + ChebyshevSmoother CoarseSmoother(0.1,15.0,3,L1LinOp,LDOp); + + // MirsSmoother 
CoarseCGSmoother(0.1,0.1,4,L1LinOp,LDOp); + // MirsSmoother FineCGSmoother(0.0,0.01,8,HermIndefOp,Ddwf); + + CoarseMG Level2Precon (CoarseAggregates, L2Op, + L1LinOp,LDOp, + CoarseSmoother, + DeflCoarseCoarseGuesser, + DeflCoarseCoarseCGNE); + Level2Precon.Level(2); + + // PGCR Applying this solver to solve the coarse space problem + PrecGeneralisedConjugateResidual l2PGCR(0.1, 100, L1LinOp,Level2Precon,16,16); + l2PGCR.Level(2); + + // Wrap the 2nd level solver in a MultiGrid preconditioner acting on the fine space + ZeroGuesser CoarseZeroGuesser; + ThreeLevelMG ThreeLevelPrecon(Aggregates, LDOp, + HermIndefOp,Ddwf, + FineSmoother, + CoarseZeroGuesser, + l2PGCR); + ThreeLevelPrecon.Level(1); + + // Apply the fine-coarse-coarsecoarse 2 deep MG preconditioner in an outer PGCR on the fine fgrid + PrecGeneralisedConjugateResidual l1PGCR(1.0e-8,1000,HermIndefOp,ThreeLevelPrecon,16,16); + l1PGCR.Level(1); + */ + std::cout< CoarseZeroGuesser; + ConjugateGradient CoarseCG(0.01,1000); + NormalEquations CoarseCGNE(LDOp,CoarseCG,CoarseZeroGuesser); + TwoLevelMG TwoLevelPrecon(Aggregates, LDOp, + HermIndefOp,Ddwf, + FineSmoother, + CoarseZeroGuesser, + CoarseCGNE); + TwoLevelPrecon.Level(1); + PrecGeneralisedConjugateResidual l1PGCR(1.0e-8,20,HermIndefOp,TwoLevelPrecon,16,16); + l1PGCR.Level(1); + l1PGCR(src,result); + + std::cout< pCG(1.0e-8,60000); + result=Zero(); + // pCG(HermDefOp,src,result); + + std::cout< HermOpEO(Ddwf); + // pCG(HermOpEO,src_o,result_o); + + std::cout< PM; PM(HermDefOp,src); + std::cout< cPM; cPM(PosdefLdop,c_src); + // std::cout< ccPM; ccPM(IRLHermOpL2,cc_src); + + std::cout< +Author: Peter Boyle +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
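// Why the _rb variants differ from this regression test: the coarse solve
// here uses the normal equations, CG on M^dag M x = M^dag b (hermitian
// positive definite), over the full coarse volume. The red-black versions
// instead solve via the standard parity-split Schur complement,
//
//   M = [ M_ee  M_eo ]   =>  (M_oo - M_oe M_ee^{-1} M_eo) x_o
//       [ M_oe  M_oo ]          = b_o - M_oe M_ee^{-1} b_e,
//   then x_e = M_ee^{-1} (b_e - M_eo x_o),
//
// which halves the volume the Krylov solver works on and improves the
// conditioning of the system handed to CG.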
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include + +using namespace std; +using namespace Grid; +/* Params + * Grid: + * block1(4) + * block2(4) + * + * Subspace + * * Fine : Subspace(nbasis,hi,lo,order,first,step) -- 32, 60,0.02,500,100,100 + * * Coarse: Subspace(nbasis,hi,lo,order,first,step) -- 32, 18,0.02,500,100,100 + + * Smoother: + * * Fine: Cheby(hi, lo, order) -- 60,0.5,10 + * * Coarse: Cheby(hi, lo, order) -- 12,0.1,4 + + * Lanczos: + * CoarseCoarse IRL( Nk, Nm, Nstop, poly(lo,hi,order)) 24,36,24,0.002,4.0,61 + */ +RealD InverseApproximation(RealD x){ + return 1.0/x; +} + +template class SolverWrapper : public LinearFunction { +private: + CheckerBoardedSparseMatrixBase & _Matrix; + SchurRedBlackBase & _Solver; +public: + + ///////////////////////////////////////////////////// + // Wrap the usual normal equations trick + ///////////////////////////////////////////////////// + SolverWrapper(CheckerBoardedSparseMatrixBase &Matrix, + SchurRedBlackBase &Solver) + : _Matrix(Matrix), _Solver(Solver) {}; + + void operator() (const Field &in, Field &out){ + + _Solver(_Matrix,in,out); // Mdag M out = Mdag in + + } +}; + +template class ChebyshevSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & _SmootherMatrix; + FineOperator & _SmootherOperator; + + Chebyshev Cheby; + + ChebyshevSmoother(RealD _lo,RealD _hi,int _ord, FineOperator &SmootherOperator,Matrix &SmootherMatrix) : + _SmootherOperator(SmootherOperator), + _SmootherMatrix(SmootherMatrix), + Cheby(_lo,_hi,_ord,InverseApproximation) + {}; + + void operator() (const Field &in, Field &out) + { + Field tmp(in.Grid()); + MdagMLinearOperator MdagMOp(_SmootherMatrix); + _SmootherOperator.AdjOp(in,tmp); + Cheby(MdagMOp,tmp,out); + } +}; +template class MirsSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & SmootherMatrix; + FineOperator & SmootherOperator; + RealD tol; + RealD shift; + int maxit; + + MirsSmoother(RealD _shift,RealD _tol,int _maxit,FineOperator &_SmootherOperator,Matrix &_SmootherMatrix) : + shift(_shift),tol(_tol),maxit(_maxit), + SmootherOperator(_SmootherOperator), + SmootherMatrix(_SmootherMatrix) + {}; + + void operator() (const Field &in, Field &out) + { + ZeroGuesser Guess; + ConjugateGradient CG(tol,maxit,false); + + Field src(in.Grid()); + + ShiftedMdagMLinearOperator,Field> MdagMOp(SmootherMatrix,shift); + SmootherOperator.AdjOp(in,src); + Guess(src,out); + CG(MdagMOp,src,out); + } +}; + +template +class MultiGridPreconditioner : public LinearFunction< Lattice > { +public: + + typedef Aggregation Aggregates; + typedef CoarsenedMatrix CoarseOperator; + typedef typename Aggregation::CoarseVector CoarseVector; + typedef typename Aggregation::CoarseMatrix CoarseMatrix; + typedef typename Aggregation::FineField FineField; + typedef LinearOperatorBase FineOperator; + typedef LinearFunction FineSmoother; + + Aggregates & _Aggregates; + CoarseOperator & _CoarseOperator; + Matrix & _FineMatrix; + FineOperator & _FineOperator; + Guesser & _Guess; + FineSmoother & _Smoother; + CoarseSolver & _CoarseSolve; + + int level; void Level(int lv) {level = lv; }; + +#define GridLogLevel std::cout << GridLogMessage < block ({2,2,2,2}); + //std::vector block ({2,2,2,2}); + const int nbasis= 40; + const int nbasisc= 40; + auto clatt = GridDefaultLatt(); + for(int d=0;d 
seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + std::vector cseeds({5,6,7,8}); + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + GridParallelRNG CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds); + LatticeFermion src(FGrid); gaussian(RNG5,src);// src=src+g5*src; + LatticeFermion result(FGrid); + LatticeGaugeField Umu(UGrid); + + FieldMetaData header; + //std::string file("./ckpoint_lat.4000"); + std::string file("./ckpoint_lat.1000"); + NerscIO::readConfiguration(Umu,header,file); + + std::cout< Subspace; + typedef CoarsenedMatrix CoarseOperator; + typedef CoarseOperator::CoarseVector CoarseVector; + typedef CoarseOperator::siteVector siteVector; + std::cout< HermDefOp(Ddwf); + + Subspace Aggregates(Coarse5d,FGrid,0); + + assert ( (nbasis & 0x1)==0); + { + int nb=nbasis/2; + LatticeFermion A(FGrid); + LatticeFermion B(FGrid); + // Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.002,1000,800,100,0.0); + // Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.02,1000,800,100,0.0); + Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.01,1000,100,100,0.0); // Slightly faster + + for(int n=0;n Level1Op; + typedef CoarsenedMatrix,nbasisc> Level2Op; + + Gamma5R5HermitianLinearOperator HermIndefOp(Ddwf); + + + GridRedBlackCartesian * Coarse4dRB = SpaceTimeGrid::makeFourDimRedBlackGrid(Coarse4d); + GridRedBlackCartesian * Coarse5dRB = SpaceTimeGrid::makeFiveDimRedBlackGrid(1,Coarse4d); + + Level1Op LDOp(*Coarse5d,*Coarse5dRB,1); LDOp.CoarsenOperator(FGrid,HermIndefOp,Aggregates); + + ////////////////////////////////////////////////// + // Deflate the course space. Recursive multigrid? + ////////////////////////////////////////////////// + typedef Aggregation,nbasisc> CoarseSubspace; + // CoarseSubspace CoarseAggregates(CoarseCoarse5d,Coarse5d,0); + + std::cout< PosdefLdop(LDOp); + typedef Level2Op::CoarseVector CoarseCoarseVector; + CoarseVector c_src(Coarse5d); c_src=1.0; + + std::cout< , SolverWrapper > TwoLevelMG; + typedef MultiGridPreconditioner,nbasisc,Level1Op, DeflatedGuesser, NormalEquations > CoarseMG; + typedef MultiGridPreconditioner, LinearFunction > ThreeLevelMG; + + std::cout< tols({0.015}); + std::vector ords({12}); + std::vector los({0.8}); + for(int l=0;l FineSmoother(los[l],60.0,ords[o],HermIndefOp,Ddwf); + ZeroGuesser CoarseZeroGuesser; + ConjugateGradient CoarseCG(tols[t],10000); + SchurRedBlackDiagMooeeSolve CoarseRBCG(CoarseCG); + SolverWrapper CoarseSolver(LDOp,CoarseRBCG); + + TwoLevelMG TwoLevelPrecon(Aggregates, LDOp, + HermIndefOp,Ddwf, + FineSmoother, + CoarseZeroGuesser, + CoarseSolver); + TwoLevelPrecon.Level(1); + PrecGeneralisedConjugateResidual l1PGCR(1.0e-8,20,HermIndefOp,TwoLevelPrecon,16,16); + l1PGCR.Level(1); + l1PGCR(src,result); + }}} + + ConjugateGradient pCG(1.0e-8,60000); + std::cout< HermOpEO(Ddwf); + pCG(HermOpEO,src_o,result_o); + + std::cout< PM; PM(HermDefOp,src); + std::cout< cPM; cPM(PosdefLdop,c_src); + // std::cout< ccPM; ccPM(IRLHermOpL2,cc_src); + + std::cout< +Author: Peter Boyle +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
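// Test_dwf_hdcr_48_rb scans smoother and coarse-solver settings via nested
// loops; the single-element vectors below are the committed defaults, but the
// structure is a sweep over the Chebyshev window low edge, the polynomial
// order, and the coarse CG tolerance (template arguments restored by hand, as
// in the sketches above):

std::vector<RealD> tols({0.015});  // coarse CG tolerance
std::vector<int>   ords({12});     // Chebyshev smoother order
std::vector<RealD> los({0.8});     // Chebyshev window low edge (high edge 60.0)
for(int l=0;l<los.size();l++){
for(int o=0;o<ords.size();o++){
for(int t=0;t<tols.size();t++){
  ChebyshevSmoother<LatticeFermion,DomainWallFermionR> FineSmoother(los[l],60.0,ords[o],HermIndefOp,Ddwf);
  ConjugateGradient<CoarseVector> CoarseCG(tols[t],10000);
  // ... build SolverWrapper / TwoLevelPrecon / l1PGCR and solve, as above ...
}}}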
See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include + +using namespace std; +using namespace Grid; +/* Params + * Grid: + * block1(4) + * block2(4) + * + * Subspace + * * Fine : Subspace(nbasis,hi,lo,order,first,step) -- 32, 60,0.02,500,100,100 + * * Coarse: Subspace(nbasis,hi,lo,order,first,step) -- 32, 18,0.02,500,100,100 + + * Smoother: + * * Fine: Cheby(hi, lo, order) -- 60,0.5,10 + * * Coarse: Cheby(hi, lo, order) -- 12,0.1,4 + + * Lanczos: + * CoarseCoarse IRL( Nk, Nm, Nstop, poly(lo,hi,order)) 24,36,24,0.002,4.0,61 + */ +RealD InverseApproximation(RealD x){ + return 1.0/x; +} + +template class ChebyshevSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & _SmootherMatrix; + FineOperator & _SmootherOperator; + + Chebyshev Cheby; + + ChebyshevSmoother(RealD _lo,RealD _hi,int _ord, FineOperator &SmootherOperator,Matrix &SmootherMatrix) : + _SmootherOperator(SmootherOperator), + _SmootherMatrix(SmootherMatrix), + Cheby(_lo,_hi,_ord,InverseApproximation) + {}; + + void operator() (const Field &in, Field &out) + { + Field tmp(in.Grid()); + MdagMLinearOperator MdagMOp(_SmootherMatrix); + _SmootherOperator.AdjOp(in,tmp); + Cheby(MdagMOp,tmp,out); + } +}; +template class MirsSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & SmootherMatrix; + FineOperator & SmootherOperator; + RealD tol; + RealD shift; + int maxit; + + MirsSmoother(RealD _shift,RealD _tol,int _maxit,FineOperator &_SmootherOperator,Matrix &_SmootherMatrix) : + shift(_shift),tol(_tol),maxit(_maxit), + SmootherOperator(_SmootherOperator), + SmootherMatrix(_SmootherMatrix) + {}; + + void operator() (const Field &in, Field &out) + { + ZeroGuesser Guess; + ConjugateGradient CG(tol,maxit,false); + + Field src(in.Grid()); + + ShiftedMdagMLinearOperator,Field> MdagMOp(SmootherMatrix,shift); + SmootherOperator.AdjOp(in,src); + Guess(src,out); + CG(MdagMOp,src,out); + } +}; + +template +class MultiGridPreconditioner : public LinearFunction< Lattice > { +public: + + typedef Aggregation Aggregates; + typedef CoarsenedMatrix CoarseOperator; + typedef typename Aggregation::CoarseVector CoarseVector; + typedef typename Aggregation::CoarseMatrix CoarseMatrix; + typedef typename Aggregation::FineField FineField; + typedef LinearOperatorBase FineOperator; + typedef LinearFunction FineSmoother; + + Aggregates & _Aggregates; + CoarseOperator & _CoarseOperator; + Matrix & _FineMatrix; + FineOperator & _FineOperator; + Guesser & _Guess; + FineSmoother & _Smoother; + CoarseSolver & _CoarseSolve; + + int level; void Level(int lv) {level = lv; }; + +#define GridLogLevel std::cout << GridLogMessage < block ({2,2,2,2}); + std::vector blockc ({2,2,2,2}); + const int nbasis= 40; + const int nbasisc= 40; + auto clatt = GridDefaultLatt(); + for(int d=0;d seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + std::vector cseeds({5,6,7,8}); + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + GridParallelRNG CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds); + LatticeFermion 
src(FGrid); gaussian(RNG5,src);// src=src+g5*src; + LatticeFermion result(FGrid); + LatticeGaugeField Umu(UGrid); + + FieldMetaData header; + // std::string file("./ckpoint_lat.4000"); + std::string file("./ckpoint_lat.1000"); + NerscIO::readConfiguration(Umu,header,file); + + std::cout< Subspace; + typedef CoarsenedMatrix CoarseOperator; + typedef CoarseOperator::CoarseVector CoarseVector; + typedef CoarseOperator::siteVector siteVector; + std::cout< HermDefOp(Ddwf); + + Subspace Aggregates(Coarse5d,FGrid,0); + + assert ( (nbasis & 0x1)==0); + { + int nb=nbasis/2; + LatticeFermion A(FGrid); + LatticeFermion B(FGrid); + // Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.002,1000,800,100,0.0); + // Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.02,1000,800,100,0.0); + Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.01,1000,100,100,0.0); // Slightly faster + + for(int n=0;n Level1Op; + typedef CoarsenedMatrix,nbasisc> Level2Op; + + Gamma5R5HermitianLinearOperator HermIndefOp(Ddwf); + + + GridRedBlackCartesian * Coarse4dRB = SpaceTimeGrid::makeFourDimRedBlackGrid(Coarse4d); + std::cout << " Making 5D coarse RB grid " <,nbasisc> CoarseSubspace; + // CoarseSubspace CoarseAggregates(CoarseCoarse5d,Coarse5d,0); + + std::cout< PosdefLdop(LDOp); + /* + { + int nb=nbasisc/2; + CoarseAggregates.CreateSubspaceChebyshev(CRNG,PosdefLdop,nb,15.0,0.02,1000,800,100,0.0); + for(int n=0;noSites();site++){ + subspace_g5[site](nn) = subspace[site](nn); + subspace_g5[site](nn+nb)=-subspace[site](nn+nb); + } + } + } + } + */ + typedef Level2Op::CoarseVector CoarseCoarseVector; + /* + Level2Op L2Op(*CoarseCoarse5d,1); // Hermitian matrix + HermitianLinearOperator L1LinOp(LDOp); + L2Op.CoarsenOperator(Coarse5d,L1LinOp,CoarseAggregates); + + + std::cout< IRLHermOpL2(L2Op); + CoarseCoarseVector cc_src(CoarseCoarse5d); cc_src=1.0; + */ + /* + Chebyshev IRLChebyL2(0.001,15.0,301); + FunctionHermOp IRLOpChebyL2(IRLChebyL2,IRLHermOpL2); + PlainHermOp IRLOpL2 (IRLHermOpL2); + int cNk=24; + int cNm=36; + int cNstop=24; + ImplicitlyRestartedLanczos IRLL2(IRLOpChebyL2,IRLOpL2,cNstop,cNk,cNm,1.0e-3,20); + + int cNconv; + std::vector eval2(cNm); + std::vector evec2(cNm,CoarseCoarse5d); + IRLL2.calc(eval2,evec2,cc_src,cNconv); + + ConjugateGradient CoarseCoarseCG(0.1,1000); + DeflatedGuesser DeflCoarseCoarseGuesser(evec2,eval2); + NormalEquations DeflCoarseCoarseCGNE(L2Op,CoarseCoarseCG,DeflCoarseCoarseGuesser); + */ + + /* + std::cout< IRLHermOp(LDOp); + // Chebyshev IRLCheby(0.001,15.0,301); + Chebyshev IRLCheby(0.03,12.0,101); + FunctionHermOp IRLOpCheby(IRLCheby,IRLHermOp); + PlainHermOp IRLOp (IRLHermOp); + int Nk=64; + int Nm=128; + int Nstop=Nk; + ImplicitlyRestartedLanczos IRL(IRLOpCheby,IRLOp,Nstop,Nk,Nm,1.0e-3,20); + + int Nconv; + std::vector eval(Nm); + std::vector evec(Nm,Coarse5d); + IRL.calc(eval,evec,c_src,Nconv); + */ + CoarseVector c_src(Coarse5d); c_src=1.0; + // DeflatedGuesser DeflCoarseGuesser(evec,eval); + // NormalEquations DeflCoarseCGNE(LDOp,CoarseCG,DeflCoarseGuesser); + + std::cout< , NormalEquations > TwoLevelMG; + typedef MultiGridPreconditioner , NormalEquations > TwoLevelMG; + typedef MultiGridPreconditioner,nbasisc,Level1Op, DeflatedGuesser, NormalEquations > CoarseMG; + typedef MultiGridPreconditioner, LinearFunction > ThreeLevelMG; + + ChebyshevSmoother FineSmoother(0.25,60.0,12,HermIndefOp,Ddwf); + /* + // MultiGrid preconditioner acting on the coarse space <-> coarsecoarse space + ChebyshevSmoother CoarseSmoother(0.1,15.0,3,L1LinOp,LDOp); + + // 
MirsSmoother CoarseCGSmoother(0.1,0.1,4,L1LinOp,LDOp); + // MirsSmoother FineCGSmoother(0.0,0.01,8,HermIndefOp,Ddwf); + + CoarseMG Level2Precon (CoarseAggregates, L2Op, + L1LinOp,LDOp, + CoarseSmoother, + DeflCoarseCoarseGuesser, + DeflCoarseCoarseCGNE); + Level2Precon.Level(2); + + // PGCR Applying this solver to solve the coarse space problem + PrecGeneralisedConjugateResidual l2PGCR(0.1, 100, L1LinOp,Level2Precon,16,16); + l2PGCR.Level(2); + + // Wrap the 2nd level solver in a MultiGrid preconditioner acting on the fine space + ZeroGuesser CoarseZeroGuesser; + ThreeLevelMG ThreeLevelPrecon(Aggregates, LDOp, + HermIndefOp,Ddwf, + FineSmoother, + CoarseZeroGuesser, + l2PGCR); + ThreeLevelPrecon.Level(1); + + // Apply the fine-coarse-coarsecoarse 2 deep MG preconditioner in an outer PGCR on the fine fgrid + PrecGeneralisedConjugateResidual l1PGCR(1.0e-8,1000,HermIndefOp,ThreeLevelPrecon,16,16); + l1PGCR.Level(1); + */ + std::cout< CoarseZeroGuesser; + ConjugateGradient CoarseCG(0.01,1000); + NormalEquations CoarseCGNE(LDOp,CoarseCG,CoarseZeroGuesser); + TwoLevelMG TwoLevelPrecon(Aggregates, LDOp, + HermIndefOp,Ddwf, + FineSmoother, + CoarseZeroGuesser, + CoarseCGNE); + TwoLevelPrecon.Level(1); + PrecGeneralisedConjugateResidual l1PGCR(1.0e-8,20,HermIndefOp,TwoLevelPrecon,16,16); + l1PGCR.Level(1); + l1PGCR(src,result); + + std::cout< pCG(1.0e-8,60000); + result=Zero(); + // pCG(HermDefOp,src,result); + + std::cout< HermOpEO(Ddwf); + pCG(HermOpEO,src_o,result_o); + + std::cout< PM; PM(HermDefOp,src); + std::cout< cPM; cPM(PosdefLdop,c_src); + // std::cout< ccPM; ccPM(IRLHermOpL2,cc_src); + + std::cout< + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include +#include +#include + +using namespace std; +using namespace Grid; + +// TODO +// +// Coarse Grid axpby_ssp_pminus // Inherit from spProj5pm +// Coarse Grid axpby_ssp_pplus + +template +class CayleyBase : public SparseMatrixBase +{ +public: + int Ls; + // protected: + RealD mass; + RealD M5; + // Save arguments to SetCoefficientsInternal + Vector _gamma; + RealD _zolo_hi; + RealD _b; + RealD _c; + + // Cayley form Moebius (tanh and zolotarev) + Vector omega; + Vector bs; // S dependent coeffs + Vector cs; + Vector as; + // For preconditioning Cayley form + Vector bee; + Vector cee; + Vector aee; + Vector beo; + Vector ceo; + Vector aeo; + // LDU factorisation of the eeoo matrix + Vector lee; + Vector leem; + Vector uee; + Vector ueem; + Vector dee; +public: + CayleyBase(RealD _M5, RealD _mass, int _Ls, RealD b_, RealD c_) : + M5(_M5), + mass(_mass), + Ls(_Ls), + _b(b_), + _c(c_) + { + RealD eps = 1.0; + Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);// eps is ignored for higham + this->SetCoefficientsTanh(zdata,1.0,0.0); + Approx::zolotarev_free(zdata); + } + ///////////////////////////////////////////////////////// + // Replicates functionality + // Use a common base class approach + ///////////////////////////////////////////////////////// + // Tanh + void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c) + { + Vector gamma(this->Ls); + for(int s=0;sLs;s++) gamma[s] = zdata->gamma[s]; + SetCoefficientsInternal(1.0,gamma,b,c); + } + //Zolo + void SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata,RealD b,RealD c) + { + Vector gamma(this->Ls); + for(int s=0;sLs;s++) gamma[s] = zdata->gamma[s]; + SetCoefficientsInternal(zolo_hi,gamma,b,c); + } + //Zolo + void SetCoefficientsInternal(RealD zolo_hi,Vector & gamma,RealD b,RealD c) + { + int Ls=this->Ls; + + /////////////////////////////////////////////////////////// + // The Cayley coeffs (unprec) + /////////////////////////////////////////////////////////// + assert(gamma.size()==Ls); + + omega.resize(Ls); + bs.resize(Ls); + cs.resize(Ls); + as.resize(Ls); + + double bpc = b+c; + double bmc = b-c; + _b = b; + _c = c; + _gamma = gamma; // Save the parameters so we can change mass later. 
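+    // For reference, the loop below realises the standard Moebius coefficient
+    // relations (directly as coded; bpc = b+c and bmc = b-c precomputed above):
+    //   omega_s = gamma_s * zolo_hi     (reciprocal of the Chroma NEF convention)
+    //   b_s     = 0.5*( (b+c)/omega_s + (b-c) )
+    //   c_s     = 0.5*( (b+c)/omega_s - (b-c) )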
+ _zolo_hi= zolo_hi; + for(int i=0; i < Ls; i++){ + as[i] = 1.0; + omega[i] = _gamma[i]*_zolo_hi; //NB reciprocal relative to Chroma NEF code + assert(omega[i]!=Coeff_t(0.0)); + bs[i] = 0.5*(bpc/omega[i] + bmc); + cs[i] = 0.5*(bpc/omega[i] - bmc); + } + + //////////////////////////////////////////////////////// + // Constants for the preconditioned matrix Cayley form + //////////////////////////////////////////////////////// + bee.resize(Ls); + cee.resize(Ls); + beo.resize(Ls); + ceo.resize(Ls); + + for(int i=0;iM5) +1.0); + assert(bee[i]!=Coeff_t(0.0)); + cee[i]=as[i]*(1.0-cs[i]*(4.0-this->M5)); + beo[i]=as[i]*bs[i]; + ceo[i]=-as[i]*cs[i]; + } + aee.resize(Ls); + aeo.resize(Ls); + for(int i=0;i &out){assert(0);}; + virtual void DW (const Field &psi, Field &chi)=0; + virtual void DWDag (const Field &psi, Field &chi)=0; + + void M (const Field &psi, Field &chi) + { + Field Din(psi.Grid()); + Meooe5D(psi,Din); + DW(Din,chi); + axpby(chi,1.0,1.0,chi,psi); + M5D(psi,chi); + } + void Mdag (const Field &psi, Field &chi) + { + Field Din(psi.Grid()); + DWDag(psi,Din); + MeooeDag5D(Din,chi); + M5Ddag(psi,chi); + axpby (chi,1.0,1.0,chi,psi); + } + ///////////////////////////////// + // P and Pdag - might be needed + ///////////////////////////////// + void P(const Field &psi, Field &chi) + { + int Ls= this->Ls; + chi=Zero(); + for(int s=0;sLs; + chi=Zero(); + for(int s=0;sLs; + Vector diag (Ls,1.0); + Vector upper(Ls,-1.0); upper[Ls-1]=mass; + Vector lower(Ls,-1.0); lower[0] =mass; + M5D(psi,chi,chi,lower,diag,upper); + } + void M5Ddag (const Field &psi, Field &chi) + { + int Ls=this->Ls; + Vector diag(Ls,1.0); + Vector upper(Ls,-1.0); + Vector lower(Ls,-1.0); + upper[Ls-1]=-mass*upper[Ls-1]; + lower[0] =-mass*lower[0]; + M5Ddag(psi,chi,chi,lower,diag,upper); + } + void Meooe5D (const Field &psi, Field &Din) + { + int Ls=this->Ls; + Vector diag = bs; + Vector upper= cs; + Vector lower= cs; + upper[Ls-1]=-mass*upper[Ls-1]; + lower[0] =-mass*lower[0]; + M5D(psi,psi,Din,lower,diag,upper); + } + void MeooeDag5D (const Field &psi, Field &Din) + { + int Ls=this->Ls; + Vector diag =bs; + Vector upper=cs; + Vector lower=cs; + + for (int s=0;s &lower, + Vector &diag, + Vector &upper) + { + chi_i.Checkerboard()=psi_i.Checkerboard(); + GridBase *grid=psi_i.Grid(); + autoView(psi , psi_i,AcceleratorRead); + autoView(phi , phi_i,AcceleratorRead); + autoView(chi , chi_i,AcceleratorWrite); + assert(phi.Checkerboard() == psi.Checkerboard()); + + auto pdiag = &diag[0]; + auto pupper = &upper[0]; + auto plower = &lower[0]; + + int Ls =this->Ls; + + // 10 = 3 complex mult + 2 complex add + // Flops = 10.0*(Nc*Ns) *Ls*vol (/2 for red black counting) + uint64_t nloop = grid->oSites()/Ls; + + const int Nsimd = Field::vector_type::Nsimd(); + accelerator_for(sss,nloop,Nsimd,{ + uint64_t ss= sss*Ls; + typedef decltype(coalescedRead(psi[0])) spinor; + spinor tmp1, tmp2; + for(int s=0;s &lower, + Vector &diag, + Vector &upper) + { + chi_i.Checkerboard()=psi_i.Checkerboard(); + GridBase *grid=psi_i.Grid(); + autoView(psi , psi_i,AcceleratorRead); + autoView(phi , phi_i,AcceleratorRead); + autoView(chi , chi_i,AcceleratorWrite); + assert(phi.Checkerboard() == psi.Checkerboard()); + + auto pdiag = &diag[0]; + auto pupper = &upper[0]; + auto plower = &lower[0]; + + int Ls=this->Ls; + + uint64_t nloop = grid->oSites()/Ls; + const int Nsimd = Field::vector_type::Nsimd(); + accelerator_for(sss,nloop,Nsimd,{ + uint64_t ss=sss*Ls; + typedef decltype(coalescedRead(psi[0])) spinor; + spinor tmp1,tmp2; + for(int s=0;s +class 
CoarseCayleyFermion : public CayleyBase< Lattice > , ComplexD > +{ +public: + typedef iVector siteVector; + typedef Lattice CoarseComplexField; + typedef Lattice CoarseVector; + typedef Lattice > CoarseMatrix; + typedef iMatrix Cobj; + typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field + typedef Lattice FineField; + + // Similar to the CoarseOperator but add 5D support. + Geometry geom; + GridBase *Coarse5D; + GridBase *Coarse4D; + CartesianStencil Stencil; + CoarsenedMatrix &Dw; + + GridBase * Grid(void) { return Coarse5D; }; // this is all the linalg routines need to know + + CoarseCayleyFermion(GridCartesian &CoarseGrid4, + GridCartesian &CoarseGrid5, + CoarsenedMatrix &_Dw, + RealD M5, RealD mass, int Ls, RealD b, RealD c) : + CayleyBase(M5,mass,Ls,b,c), + Coarse4D(&CoarseGrid4), + Coarse5D(&CoarseGrid5), + Dw(_Dw), + geom(CoarseGrid5._ndimension), + Stencil( &CoarseGrid5,geom.npoint,Even,geom.directions,geom.displacements,0) + { + }; + +public: + void Project( CoarseVector &C ) + { + const int Nsimd = CComplex::Nsimd(); + autoView(Cv,C, AcceleratorWrite); + int Ls = this->Ls; + for(int s=0;soSites(), Nsimd, { + int sF= sU*Ls+s; + auto tmp = coalescedRead(Cv[sF]); + coalescedWrite(Cv[sF],tmp); + }); + } + } + //////////////////////////////////////////////// + // This is specific to Coarse Grid Cayley + //////////////////////////////////////////////// + virtual void Mdiag (const CoarseVector &in, CoarseVector &out) + { + std::vector allout(9,in.Grid()); + this->MdirAll(in,allout); + out = allout[8]; + } + virtual void Mdir (const CoarseVector &in, CoarseVector &out,int dir, int disp) + { + assert(0); + } + virtual void MdirAll (const CoarseVector &in, std::vector &out) + { + conformable(Coarse5D,in.Grid()); + + SimpleCompressor compressor; + + Stencil.HaloExchange(in,compressor); + typedef LatticeView Aview; + + const int Nsimd = CComplex::Nsimd(); + + // Ls loop for2D + int Ls=this->Ls; + + siteVector *CBp=Stencil.CommBuf(); + + int ptype; + int nb2=nbasis/2; + + autoView(in_v , in, AcceleratorRead); + autoView(st, Stencil, AcceleratorRead); + for(int point=0;pointoSites(), b, nbasis, Nsimd, { + + typedef decltype(coalescedRead(in_v[0])) calcVector; + typedef decltype(coalescedRead(in_v[0](0))) calcComplex; + int sU = sF/Ls; + int s = sF%Ls; + + calcComplex res = Zero(); + calcVector nbr; + int ptype; + + StencilEntry *SE=st.GetEntry(ptype,point,sF); + + if(SE->_is_local) { + nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); + } else { + nbr = coalescedRead(CBp[SE->_offset]); + } + acceleratorSynchronise(); + + for(int bb=0;bb compressor; + + Stencil.HaloExchange(in,compressor); + typedef LatticeView Aview; + + const int Nsimd = CComplex::Nsimd(); + + // Ls loop for2D + int Ls=this->Ls; + + Vector AcceleratorViewContainer; + for(int p=0;poSites(), b, nbasis, Nsimd, { + + typedef decltype(coalescedRead(in_v[0])) calcVector; + typedef decltype(coalescedRead(in_v[0](0))) calcComplex; + int sU = sF/Ls; + int s = sF%Ls; + + calcComplex res = Zero(); + + { + calcVector nbr; + int ptype; + + for(int point=0;point_is_local) { + nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); + } else { + nbr = coalescedRead(CBp[SE->_offset]); + } + acceleratorSynchronise(); + + for(int bb=0;bb Aggregates; + + void PromoteFromSubspace(Aggregates &_Aggregates,CoarseVector &C,FineField &F) + { + auto FineGrid4 = _Aggregates.FineGrid; + FineField F4(FineGrid4); + CoarseVector C4(Coarse4D); + for(int s=0;sLs;s++){ + ExtractSlice(C4,C,s,0); + 
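+      // The aggregates are four dimensional, so the 5D coarse vector is lifted
+      // slice by slice: promote this s-slice to the fine grid, then insert it
+      // back into the 5D fine field below.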
_Aggregates.PromoteFromSubspace(C4,F4); + InsertSlice(F4,F,s,0); + } + } + void ProjectToSubspace(Aggregates &_Aggregates,CoarseVector &C,FineField &F) + { + auto FineGrid4 = _Aggregates.FineGrid; + FineField F4(FineGrid4); + CoarseVector C4(Coarse4D); + for(int s=0;sLs;s++){ + ExtractSlice(F4,F,s,0); + _Aggregates.ProjectToSubspace (C4,F4); + InsertSlice(C4,C,s,0); + } + Project(C); + } + template + void Test(Aggregates &_Aggregates,GridBase *FineGrid, Ddwf &_Ddwf) + { + typedef Lattice FineField; + CoarseVector Cin(Coarse5D); + CoarseVector Cout(Coarse5D); + CoarseVector CFout(Coarse5D); + + FineField Fin(FineGrid); + FineField Fout(FineGrid); + + + std::vector seeds({1,2,3,4,5}); + GridParallelRNG RNG(Coarse5D); RNG.SeedFixedIntegers(seeds); + + gaussian(RNG,Cin); + PromoteFromSubspace(_Aggregates,Cin,Fin); + ProjectToSubspace(_Aggregates,Cin,Fin); + + std::cout << GridLogMessage<< "************ "<M(Cin,Cout); + this->Project(Cout); + std::cout << GridLogMessage<< " Cout "<Mdag(Cin,Cout); + this->Project(Cout); + std::cout << GridLogMessage<< " Cout "< Directions(void) { return geom.directions;}; + virtual std::vector Displacements(void){ return geom.displacements;}; +}; + + +template class SolverWrapper : public LinearFunction { +private: + LinearOperatorBase & _Matrix; + OperatorFunction & _Solver; + LinearFunction & _Guess; +public: + + ///////////////////////////////////////////////////// + // Wrap the usual normal equations trick + ///////////////////////////////////////////////////// + SolverWrapper(LinearOperatorBase &Matrix, + OperatorFunction &Solver, + LinearFunction &Guess) + : _Matrix(Matrix), _Solver(Solver), _Guess(Guess) {}; + + void operator() (const Field &in, Field &out){ + + _Guess(in,out); + _Solver(_Matrix,in,out); // Mdag M out = Mdag in + + } +}; + +// Must use a non-hermitian solver +template +class PVdagMLinearOperator : public LinearOperatorBase { + Matrix &_Mat; + Matrix &_PV; +public: + PVdagMLinearOperator(Matrix &Mat,Matrix &PV): _Mat(Mat),_PV(PV){}; + + virtual std::vector Directions(void) { return _Mat.Directions();}; + virtual std::vector Displacements(void){ return _Mat.Displacements();}; + + void OpDiag (const Field &in, Field &out) { + assert(0); + } + void OpDir (const Field &in, Field &out,int dir,int disp) { + assert(0); + } + void OpDirAll (const Field &in, std::vector &out){ + assert(0); + }; + void Op (const Field &in, Field &out){ + Field tmp(in.Grid()); + _Mat.M(in,tmp); + _PV.Mdag(tmp,out); + } + void AdjOp (const Field &in, Field &out){ + Field tmp(in.Grid()); + _PV.M(tmp,out); + _Mat.Mdag(in,tmp); + } + void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ + assert(0); + } + void HermOp(const Field &in, Field &out){ + assert(0); + } +}; + +RealD InverseApproximation(RealD x){ + return 1.0/x; +} + +template class ChebyshevSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & _SmootherMatrix; + FineOperator & _SmootherOperator; + + Chebyshev Cheby; + + ChebyshevSmoother(RealD _lo,RealD _hi,int _ord, FineOperator &SmootherOperator,Matrix &SmootherMatrix) : + _SmootherOperator(SmootherOperator), + _SmootherMatrix(SmootherMatrix), + Cheby(_lo,_hi,_ord,InverseApproximation) + {}; + + void operator() (const Field &in, Field &out) + { + Field tmp(in.Grid()); + MdagMLinearOperator MdagMOp(_SmootherMatrix); + _SmootherOperator.AdjOp(in,tmp); + Cheby(MdagMOp,tmp,out); + } +}; +template +class MGPreconditioner : public LinearFunction< Lattice > { +public: + + typedef Aggregation Aggregates; + 
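+  // Schematic of the two-level cycle that operator() below implements (a sketch
+  // of the flow through the members above, not the literal statement order):
+  //   out  = PreSmooth(in)             damp the high modes
+  //   Csrc = Project(in - M out)       restrict the residual
+  //   Csol = CoarseSolve(Csrc)         low-mode correction
+  //   out += Promote(Csol)             prolongate back to the fine grid
+  //   then post-smooth the remaining residual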
typedef typename Aggregation::CoarseVector CoarseVector; + typedef typename Aggregation::CoarseMatrix CoarseMatrix; + typedef typename Aggregation::FineField FineField; + typedef LinearOperatorBase FineOperator; + typedef LinearFunction FineSmoother; + typedef CoarseCayleyFermion CoarseOperator; + // typedef SparseMatrixBase CoarseOperator; + + Aggregates & _Aggregates; + FineOperator & _FineOperator; + FineSmoother & _PreSmoother; + FineSmoother & _PostSmoother; + CoarseOperator & _CoarseOperator; + CoarseSolver & _CoarseSolve; + + int level; void Level(int lv) {level = lv; }; + + MGPreconditioner(Aggregates &Agg, + FineOperator &Fine, + FineSmoother &PreSmoother, + FineSmoother &PostSmoother, + CoarseOperator &CoarseOperator_, + CoarseSolver &CoarseSolve_) + : _Aggregates(Agg), + _FineOperator(Fine), + _PreSmoother(PreSmoother), + _PostSmoother(PostSmoother), + _CoarseOperator(CoarseOperator_), + _CoarseSolve(CoarseSolve_), + level(1) { } + + virtual void operator()(const FineField &in, FineField & out) + { + auto CoarseGrid = _CoarseOperator.Grid(); + CoarseVector Csrc(CoarseGrid); + CoarseVector Csol(CoarseGrid); + FineField vec1(in.Grid()); + FineField vec2(in.Grid()); + + std::cout< +class HDCRPreconditioner : public LinearFunction< Lattice > { +public: + + typedef Aggregation Aggregates; + typedef typename Aggregation::CoarseVector CoarseVector; + typedef typename Aggregation::CoarseMatrix CoarseMatrix; + typedef typename Aggregation::FineField FineField; + typedef LinearOperatorBase FineOperator; + typedef LinearFunction FineSmoother; + //typedef CoarseCayleyFermion CoarseOperator; + typedef SparseMatrixBase CoarseOperator; + + Aggregates & _Aggregates; + FineOperator & _FineOperator; + FineSmoother & _PreSmoother; + FineSmoother & _PostSmoother; + CoarseOperator & _CoarseOperator; + CoarseSolver & _CoarseSolve; + + int level; void Level(int lv) {level = lv; }; + + HDCRPreconditioner(Aggregates &Agg, + FineOperator &Fine, + FineSmoother &PreSmoother, + FineSmoother &PostSmoother, + CoarseOperator &CoarseOperator_, + CoarseSolver &CoarseSolve_) + : _Aggregates(Agg), + _FineOperator(Fine), + _PreSmoother(PreSmoother), + _PostSmoother(PostSmoother), + _CoarseOperator(CoarseOperator_), + _CoarseSolve(CoarseSolve_), + level(1) { } + + virtual void operator()(const FineField &in, FineField & out) + { + auto CoarseGrid = _CoarseOperator.Grid(); + CoarseVector Csrc(CoarseGrid); + CoarseVector g5Csrc(CoarseGrid); + CoarseVector Csol(CoarseGrid); + FineField vec1(in.Grid()); + FineField vec2(in.Grid()); + + std::cout< block ({2,2,2,2}); // 4,2,2,2 gets worse + std::vector blockc ({1,1,1,1}); + const int nbasis= 24; + const int nbasisc= 32; // decrease, not improvement + + auto clatt = GridDefaultLatt(); + for(int d=0;d seeds({1,2,3,4}); + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds); + GridParallelRNG CRNG(Coarse4d);CRNG.SeedFixedIntegers(seeds); + + LatticeGaugeField Umu(UGrid); +#if 0 + SU3::TepidConfiguration(RNG4,Umu); + RealD M5=1.0; +#else + std::string file("./ckpoint_lat.1000"); + FieldMetaData header; + NerscIO::readConfiguration(Umu,header,file); + RealD M5=1.8; +#endif + + std::cout< Subspace; + typedef CoarsenedMatrix CoarseOperator; + typedef CoarseOperator::CoarseVector CoarseVector; + typedef CoarseOperator::siteVector siteVector; + + std::cout< MdagM_Dw(Dw_null); + + std::cout< WilsonCG(1.0e-10,40000); + LatticeFermion w_src(UGrid); w_src=1.0; + LatticeFermion w_res(UGrid); + 
WilsonCG(MdagM_Dw,w_src,w_res); + exit(0); + */ + std::cout< Level1Op4; + typedef CoarseCayleyFermion Level1Op5; + Level1Op4 c_Dw (*Coarse4d,0); + NonHermitianLinearOperator LinOpDw(Dw); + c_Dw.CoarsenOperator(UGrid,LinOpDw,Aggregates4D); // contains the M5 from Dw(-M5) + // c_Dw.Test(Aggregates4D,UGrid,LinOpDw); + + std::cout< MdagM_cDwf(c_Dwf); + + std::cout<,nbasisc> Level2Op; + typedef Aggregation,nbasisc> CoarseSubspace; + CoarseSubspace CoarseAggregates(CoarseCoarse5d,Coarse5d,0); + + std::cout< L1Hdwf(c_Dwf); + GridRedBlackCartesian * CoarseCoarse5dRB = SpaceTimeGrid::makeFiveDimRedBlackGrid(1,CoarseCoarse4d); + Level2Op cc_Dwf (*CoarseCoarse5d,*CoarseCoarse5dRB,1); // say it is hermitian + cc_Dwf.CoarsenOperator(Coarse5d,L1Hdwf,CoarseAggregates); + // cc_Dwf.Test(CoarseAggregates,Coarse5d,L1Hdwf); + + typedef Level2Op::CoarseVector CoarseCoarseVector; + + std::cout< CoarseCG(tol,MaxIt); + ConjugateGradient FineCG(tol,MaxIt); + + NonHermitianLinearOperator FineM(Ddwf); + MdagMLinearOperator FineMdagM(Ddwf); // M^\dag M + + NonHermitianLinearOperator CoarseM(c_Dwf); + MdagMLinearOperator CoarseMdagM(c_Dwf); + + NonHermitianLinearOperator CoarseCoarseM(cc_Dwf); + MdagMLinearOperator CoarseCoarseMdagM(cc_Dwf); + + + std::cout< PM; PM(MdagM_Dw,w_src); + std::cout< cPM; cPM(CoarseMdagM,c_src); + + cc_src=1.0; + PowerMethod ccPM; ccPM(CoarseCoarseMdagM,cc_src); + + std::cout< IRLHermOpL2(cc_Dwf); + Chebyshev IRLChebyL2(IRL_lo,IRL_hi,IRL_ord); + FunctionHermOp IRLOpChebyL2(IRLChebyL2,IRLHermOpL2); + PlainHermOp IRLOpL2 (IRLHermOpL2); + ImplicitlyRestartedLanczos IRLL2(IRLOpChebyL2,IRLOpL2,cNstop,cNk,cNm,1.0e-3,20); + + int cNconv; + cNm=0; + std::vector eval2(cNm); + std::vector evec2(cNm,CoarseCoarse5d); + cc_src=1.0; + // IRLL2.calc(eval2,evec2,cc_src,cNconv); + + ConjugateGradient CoarseCoarseCG(0.02,10000); + DeflatedGuesser DeflCoarseCoarseGuesser(evec2,eval2); + NormalEquations DeflCoarseCoarseCGNE(cc_Dwf,CoarseCoarseCG,DeflCoarseCoarseGuesser); + + ZeroGuesser CoarseZeroGuesser; + ZeroGuesser CoarseCoarseZeroGuesser; + + std::cout< CoarseCoarseCGNE(cc_Dwf,CoarseCoarseCG,CoarseCoarseZeroGuesser); + { +typedef HDCRPreconditioner,nbasisc,NormalEquations > CoarseMG; + typedef MGPreconditioner > ThreeLevelMG; + + // MultiGrid preconditioner acting on the coarse space <-> coarsecoarse space + ChebyshevSmoother CoarseSmoother1(0.5,22.0,12,CoarseM,c_Dwf); // 37s, 26 iter + ChebyshevSmoother CoarseSmoother2(0.5,22.0,12,CoarseM,c_Dwf); + + // ChebyshevSmoother CoarseSmoother1(0.5,22.0,7,CoarseM,c_Dwf); // 38s, 26 iter + // ChebyshevSmoother CoarseSmoother2(0.5,22.0,7,CoarseM,c_Dwf); + // ChebyshevSmoother CoarseSmoother1(0.4,22.0,7,CoarseM,c_Dwf); // 41s, 27 iter + // ChebyshevSmoother CoarseSmoother2(0.4,22.0,7,CoarseM,c_Dwf); + // ChebyshevSmoother CoarseSmoother1(0.6,22.0,6,CoarseM,c_Dwf); // 26 iter + // ChebyshevSmoother CoarseSmoother2(0.6,22.0,6,CoarseM,c_Dwf); + // ChebyshevSmoother CoarseSmoother1(0.5,22.0,5,CoarseM,c_Dwf); // 33 iter, 55s + // ChebyshevSmoother CoarseSmoother2(0.5,22.0,5,CoarseM,c_Dwf); + + + CoarseMG Level2Precon (CoarseAggregates, + CoarseM, + CoarseSmoother1, + CoarseSmoother2, + cc_Dwf, + DeflCoarseCoarseCGNE); + Level2Precon.Level(2); + + //PrecGeneralisedConjugateResidualNonHermitian L2PGCR(0.5, 100, CoarseM,Level2Precon,16,16); // 26 iter, 37s + // PGCR Applying this solver to solve the coarse space problem + // COULD BE FIXED??? 
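+  // Reading the constructor arguments as (tol, maxit, op, prec, mmax, nstep),
+  // consistent with the (1.0e-8, 1000, ...) outer solver below, tol=0.0 with
+  // maxit=1 appears to make this PGCR a single fixed-cost application of
+  // Level2Precon rather than an iterated coarse solve.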
+ PrecGeneralisedConjugateResidualNonHermitian L2PGCR(0.0, 1, CoarseM,Level2Precon,2,2); + + //PrecGeneralisedConjugateResidualNonHermitian L2PGCR(1.0, 100, CoarseM,Level2Precon,16,16); // 35 iter, 45s + //PrecGeneralisedConjugateResidualNonHermitian L2PGCR(0.6, 100, CoarseM,Level2Precon,16,16); // 26,38 (diifferene is measurement noise) + //PrecGeneralisedConjugateResidualNonHermitian L2PGCR(0.2, 100, CoarseM,Level2Precon,16,16); // 26 iter, 47s + L2PGCR.Level(2); + + // Wrap the 2nd level solver in a MultiGrid preconditioner acting on the fine space + + // ChebyshevSmoother FineSmoother1(0.5,60.0,14,FineM,Ddwf); // 26 iter, 39s + // ChebyshevSmoother FineSmoother2(0.5,60.0,14,FineM,Ddwf); + + // ChebyshevSmoother FineSmoother1(0.5,60.0,12,FineM,Ddwf); // 25 iter, 38s + // ChebyshevSmoother FineSmoother2(0.5,60.0,16,FineM,Ddwf); + + // ChebyshevSmoother FineSmoother1(0.5,60.0,12,FineM,Ddwf); // 23 iter, 39s + // ChebyshevSmoother FineSmoother2(0.5,60.0,20,FineM,Ddwf); + + // ChebyshevSmoother FineSmoother1(0.5,60.0,10,FineM,Ddwf);24 iter, 44s + // ChebyshevSmoother FineSmoother2(0.5,60.0,24,FineM,Ddwf); + + // ChebyshevSmoother FineSmoother1(0.5,60.0,12,FineM,Ddwf); // odd convergence tail at 10^-9 ish + // ChebyshevSmoother FineSmoother2(0.1,60.0,24,FineM,Ddwf); // 33 iter, waas O(10-9 by 26) + + // ChebyshevSmoother FineSmoother1(0.5,60.0,12,FineM,Ddwf); // 25 iter, 39s + // ChebyshevSmoother FineSmoother2(0.5,60.0,18,FineM,Ddwf); // + + ChebyshevSmoother FineSmoother1(0.5,60.0,16,FineM,Ddwf); + ChebyshevSmoother FineSmoother2(0.5,60.0,16,FineM,Ddwf); // + + // ChebyshevSmoother FineSmoother1(0.5,60.0,11,FineM,Ddwf); // 33 iter, 49s + // ChebyshevSmoother FineSmoother2(0.5,60.0,11,FineM,Ddwf); + // ChebyshevSmoother FineSmoother1(0.5,60.0,12,FineM,Ddwf); // 26 iter, 37s + // ChebyshevSmoother FineSmoother2(0.5,60.0,12,FineM,Ddwf); + // ChebyshevSmoother FineSmoother1(0.4,60.0,12,FineM,Ddwf); // iter 26 no change in final residual + // ChebyshevSmoother FineSmoother2(0.4,60.0,12,FineM,Ddwf); + // ChebyshevSmoother FineSmoother1(0.3,60.0,12,FineM,Ddwf); // 27 iter 39s. + // ChebyshevSmoother FineSmoother2(0.3,60.0,12,FineM,Ddwf); + // ChebyshevSmoother FineSmoother1(0.3,60.0,13,FineM,Ddwf); // 26 iter, but slower + // ChebyshevSmoother FineSmoother2(0.3,60.0,13,FineM,Ddwf); + // ChebyshevSmoother FineSmoother1(1.0,60.0,12,FineM,Ddwf); // 34 iter, slower + // ChebyshevSmoother FineSmoother2(1.0,60.0,12,FineM,Ddwf); + + ThreeLevelMG ThreeLevelPrecon(Aggregates4D, + FineM, + FineSmoother1, + FineSmoother2, + c_Dwf, + L2PGCR); + ThreeLevelPrecon.Level(1); + + PrecGeneralisedConjugateResidualNonHermitian L1PGCR(1.0e-8,1000,FineM,ThreeLevelPrecon,16,16); + L1PGCR.Level(1); + + f_res=Zero(); + L1PGCR(f_src,f_res); + } + + std::cout< + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include +#include +#include + +using namespace std; +using namespace Grid; + +// TODO +// +// Coarse Grid axpby_ssp_pminus // Inherit from spProj5pm +// Coarse Grid axpby_ssp_pplus + +template +class CayleyBase : public SparseMatrixBase +{ +public: + int Ls; + // protected: + RealD mass; + RealD M5; + // Save arguments to SetCoefficientsInternal + Vector _gamma; + RealD _zolo_hi; + RealD _b; + RealD _c; + + // Cayley form Moebius (tanh and zolotarev) + Vector omega; + Vector bs; // S dependent coeffs + Vector cs; + Vector as; + // For preconditioning Cayley form + Vector bee; + Vector cee; + Vector aee; + Vector beo; + Vector ceo; + Vector aeo; + // LDU factorisation of the eeoo matrix + Vector lee; + Vector leem; + Vector uee; + Vector ueem; + Vector dee; +public: + CayleyBase(RealD _M5, RealD _mass, int _Ls, RealD b_, RealD c_) : + M5(_M5), + mass(_mass), + Ls(_Ls), + _b(b_), + _c(c_) + { + RealD eps = 1.0; + Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);// eps is ignored for higham + this->SetCoefficientsTanh(zdata,1.0,0.0); + Approx::zolotarev_free(zdata); + } + ///////////////////////////////////////////////////////// + // Replicates functionality + // Use a common base class approach + ///////////////////////////////////////////////////////// + // Tanh + void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c) + { + Vector gamma(this->Ls); + for(int s=0;sLs;s++) gamma[s] = zdata->gamma[s]; + SetCoefficientsInternal(1.0,gamma,b,c); + } + //Zolo + void SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata,RealD b,RealD c) + { + Vector gamma(this->Ls); + for(int s=0;sLs;s++) gamma[s] = zdata->gamma[s]; + SetCoefficientsInternal(zolo_hi,gamma,b,c); + } + //Zolo + void SetCoefficientsInternal(RealD zolo_hi,Vector & gamma,RealD b,RealD c) + { + int Ls=this->Ls; + + /////////////////////////////////////////////////////////// + // The Cayley coeffs (unprec) + /////////////////////////////////////////////////////////// + assert(gamma.size()==Ls); + + omega.resize(Ls); + bs.resize(Ls); + cs.resize(Ls); + as.resize(Ls); + + double bpc = b+c; + double bmc = b-c; + _b = b; + _c = c; + _gamma = gamma; // Save the parameters so we can change mass later. 
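+    // The even-even Cayley coefficients assigned below follow from the same
+    // b_s, c_s (with as[i] = 1 here):
+    //   bee_s = b_s*(4 - M5) + 1 ,   cee_s = 1 - c_s*(4 - M5)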
+ _zolo_hi= zolo_hi; + for(int i=0; i < Ls; i++){ + as[i] = 1.0; + omega[i] = _gamma[i]*_zolo_hi; //NB reciprocal relative to Chroma NEF code + assert(omega[i]!=Coeff_t(0.0)); + bs[i] = 0.5*(bpc/omega[i] + bmc); + cs[i] = 0.5*(bpc/omega[i] - bmc); + } + + //////////////////////////////////////////////////////// + // Constants for the preconditioned matrix Cayley form + //////////////////////////////////////////////////////// + bee.resize(Ls); + cee.resize(Ls); + beo.resize(Ls); + ceo.resize(Ls); + + for(int i=0;iM5) +1.0); + assert(bee[i]!=Coeff_t(0.0)); + cee[i]=as[i]*(1.0-cs[i]*(4.0-this->M5)); + beo[i]=as[i]*bs[i]; + ceo[i]=-as[i]*cs[i]; + } + aee.resize(Ls); + aeo.resize(Ls); + for(int i=0;i &out){assert(0);}; + virtual void DW (const Field &psi, Field &chi)=0; + virtual void DWDag (const Field &psi, Field &chi)=0; + + void M (const Field &psi, Field &chi) + { + Field Din(psi.Grid()); + Meooe5D(psi,Din); + DW(Din,chi); + axpby(chi,1.0,1.0,chi,psi); + M5D(psi,chi); + } + void Mdag (const Field &psi, Field &chi) + { + Field Din(psi.Grid()); + DWDag(psi,Din); + MeooeDag5D(Din,chi); + M5Ddag(psi,chi); + axpby (chi,1.0,1.0,chi,psi); + } + ///////////////////////////////// + // P and Pdag - might be needed + ///////////////////////////////// + void P(const Field &psi, Field &chi) + { + int Ls= this->Ls; + chi=Zero(); + for(int s=0;sLs; + chi=Zero(); + for(int s=0;sLs; + Vector diag (Ls,1.0); + Vector upper(Ls,-1.0); upper[Ls-1]=mass; + Vector lower(Ls,-1.0); lower[0] =mass; + M5D(psi,chi,chi,lower,diag,upper); + } + void M5Ddag (const Field &psi, Field &chi) + { + int Ls=this->Ls; + Vector diag(Ls,1.0); + Vector upper(Ls,-1.0); + Vector lower(Ls,-1.0); + upper[Ls-1]=-mass*upper[Ls-1]; + lower[0] =-mass*lower[0]; + M5Ddag(psi,chi,chi,lower,diag,upper); + } + void Meooe5D (const Field &psi, Field &Din) + { + int Ls=this->Ls; + Vector diag = bs; + Vector upper= cs; + Vector lower= cs; + upper[Ls-1]=-mass*upper[Ls-1]; + lower[0] =-mass*lower[0]; + M5D(psi,psi,Din,lower,diag,upper); + } + void MeooeDag5D (const Field &psi, Field &Din) + { + int Ls=this->Ls; + Vector diag =bs; + Vector upper=cs; + Vector lower=cs; + + for (int s=0;s &lower, + Vector &diag, + Vector &upper) + { + chi_i.Checkerboard()=psi_i.Checkerboard(); + GridBase *grid=psi_i.Grid(); + autoView(psi , psi_i,AcceleratorRead); + autoView(phi , phi_i,AcceleratorRead); + autoView(chi , chi_i,AcceleratorWrite); + assert(phi.Checkerboard() == psi.Checkerboard()); + + auto pdiag = &diag[0]; + auto pupper = &upper[0]; + auto plower = &lower[0]; + + int Ls =this->Ls; + + // 10 = 3 complex mult + 2 complex add + // Flops = 10.0*(Nc*Ns) *Ls*vol (/2 for red black counting) + uint64_t nloop = grid->oSites()/Ls; + + const int Nsimd = Field::vector_type::Nsimd(); + accelerator_for(sss,nloop,Nsimd,{ + uint64_t ss= sss*Ls; + typedef decltype(coalescedRead(psi[0])) spinor; + spinor tmp1, tmp2; + for(int s=0;s &lower, + Vector &diag, + Vector &upper) + { + chi_i.Checkerboard()=psi_i.Checkerboard(); + GridBase *grid=psi_i.Grid(); + autoView(psi , psi_i,AcceleratorRead); + autoView(phi , phi_i,AcceleratorRead); + autoView(chi , chi_i,AcceleratorWrite); + assert(phi.Checkerboard() == psi.Checkerboard()); + + auto pdiag = &diag[0]; + auto pupper = &upper[0]; + auto plower = &lower[0]; + + int Ls=this->Ls; + + uint64_t nloop = grid->oSites()/Ls; + const int Nsimd = Field::vector_type::Nsimd(); + accelerator_for(sss,nloop,Nsimd,{ + uint64_t ss=sss*Ls; + typedef decltype(coalescedRead(psi[0])) spinor; + spinor tmp1,tmp2; + for(int s=0;s +class 
CoarseCayleyFermion : public CayleyBase< Lattice > , ComplexD > +{ +public: + typedef iVector siteVector; + typedef Lattice CoarseComplexField; + typedef Lattice CoarseVector; + typedef Lattice > CoarseMatrix; + typedef iMatrix Cobj; + typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field + typedef Lattice FineField; + + // Similar to the CoarseOperator but add 5D support. + Geometry geom; + GridBase *Coarse5D; + GridBase *Coarse4D; + CartesianStencil Stencil; + CoarsenedMatrix &Dw; + + GridBase * Grid(void) { return Coarse5D; }; // this is all the linalg routines need to know + + CoarseCayleyFermion(GridCartesian &CoarseGrid4, + GridCartesian &CoarseGrid5, + CoarsenedMatrix &_Dw, + RealD M5, RealD mass, int Ls, RealD b, RealD c) : + CayleyBase(M5,mass,Ls,b,c), + Coarse4D(&CoarseGrid4), + Coarse5D(&CoarseGrid5), + Dw(_Dw), + geom(CoarseGrid5._ndimension), + Stencil( &CoarseGrid5,geom.npoint,Even,geom.directions,geom.displacements,0) + { + }; + +public: + void Project( CoarseVector &C ) + { + const int Nsimd = CComplex::Nsimd(); + autoView(Cv,C, AcceleratorWrite); + int Ls = this->Ls; + for(int s=0;soSites(), Nsimd, { + int sF= sU*Ls+s; + auto tmp = coalescedRead(Cv[sF]); + coalescedWrite(Cv[sF],tmp); + }); + } + } + //////////////////////////////////////////////// + // This is specific to Coarse Grid Cayley + //////////////////////////////////////////////// + virtual void Mdiag (const CoarseVector &in, CoarseVector &out) + { + std::vector allout(9,in.Grid()); + this->MdirAll(in,allout); + out = allout[8]; + } + virtual void Mdir (const CoarseVector &in, CoarseVector &out,int dir, int disp) + { + assert(0); + } + virtual void MdirAll (const CoarseVector &in, std::vector &out) + { + conformable(Coarse5D,in.Grid()); + + SimpleCompressor compressor; + + Stencil.HaloExchange(in,compressor); + typedef LatticeView Aview; + + const int Nsimd = CComplex::Nsimd(); + + // Ls loop for2D + int Ls=this->Ls; + + siteVector *CBp=Stencil.CommBuf(); + + int ptype; + int nb2=nbasis/2; + + autoView(in_v , in, AcceleratorRead); + autoView(st, Stencil, AcceleratorRead); + for(int point=0;pointoSites(), b, nbasis, Nsimd, { + + typedef decltype(coalescedRead(in_v[0])) calcVector; + typedef decltype(coalescedRead(in_v[0](0))) calcComplex; + int sU = sF/Ls; + int s = sF%Ls; + + calcComplex res = Zero(); + calcVector nbr; + int ptype; + + StencilEntry *SE=st.GetEntry(ptype,point,sF); + + if(SE->_is_local) { + nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); + } else { + nbr = coalescedRead(CBp[SE->_offset]); + } + acceleratorSynchronise(); + + for(int bb=0;bb compressor; + + Stencil.HaloExchange(in,compressor); + typedef LatticeView Aview; + + const int Nsimd = CComplex::Nsimd(); + + // Ls loop for2D + int Ls=this->Ls; + + Vector AcceleratorViewContainer; + for(int p=0;poSites(), b, nbasis, Nsimd, { + + typedef decltype(coalescedRead(in_v[0])) calcVector; + typedef decltype(coalescedRead(in_v[0](0))) calcComplex; + int sU = sF/Ls; + int s = sF%Ls; + + calcComplex res = Zero(); + + { + calcVector nbr; + int ptype; + + for(int point=0;point_is_local) { + nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); + } else { + nbr = coalescedRead(CBp[SE->_offset]); + } + acceleratorSynchronise(); + + for(int bb=0;bb Aggregates; + + void PromoteFromSubspace(Aggregates &_Aggregates,CoarseVector &C,FineField &F) + { + auto FineGrid4 = _Aggregates.FineGrid; + FineField F4(FineGrid4); + CoarseVector C4(Coarse4D); + for(int s=0;sLs;s++){ + ExtractSlice(C4,C,s,0); + 
_Aggregates.PromoteFromSubspace(C4,F4); + InsertSlice(F4,F,s,0); + } + } + void ProjectToSubspace(Aggregates &_Aggregates,CoarseVector &C,FineField &F) + { + auto FineGrid4 = _Aggregates.FineGrid; + FineField F4(FineGrid4); + CoarseVector C4(Coarse4D); + for(int s=0;sLs;s++){ + ExtractSlice(F4,F,s,0); + _Aggregates.ProjectToSubspace (C4,F4); + InsertSlice(C4,C,s,0); + } + Project(C); + } + template + void Test(Aggregates &_Aggregates,GridBase *FineGrid, Ddwf &_Ddwf) + { + typedef Lattice FineField; + CoarseVector Cin(Coarse5D); + CoarseVector Cout(Coarse5D); + CoarseVector CFout(Coarse5D); + + FineField Fin(FineGrid); + FineField Fout(FineGrid); + + + std::vector seeds({1,2,3,4,5}); + GridParallelRNG RNG(Coarse5D); RNG.SeedFixedIntegers(seeds); + + gaussian(RNG,Cin); + PromoteFromSubspace(_Aggregates,Cin,Fin); + ProjectToSubspace(_Aggregates,Cin,Fin); + + std::cout << GridLogMessage<< "************ "<M(Cin,Cout); + this->Project(Cout); + std::cout << GridLogMessage<< " Cout "<Mdag(Cin,Cout); + this->Project(Cout); + std::cout << GridLogMessage<< " Cout "< Directions(void) { return geom.directions;}; + virtual std::vector Displacements(void){ return geom.displacements;}; +}; + +template class SchurSolverWrapper : public LinearFunction { +private: + CheckerBoardedSparseMatrixBase & _Matrix; + SchurRedBlackBase & _Solver; +public: + + ///////////////////////////////////////////////////// + // Wrap the usual normal equations trick + ///////////////////////////////////////////////////// + SchurSolverWrapper(CheckerBoardedSparseMatrixBase &Matrix, + SchurRedBlackBase &Solver) + : _Matrix(Matrix), _Solver(Solver) {}; + + void operator() (const Field &in, Field &out){ + + _Solver(_Matrix,in,out); // Mdag M out = Mdag in + + } +}; + +template class SolverWrapper : public LinearFunction { +private: + LinearOperatorBase & _Matrix; + OperatorFunction & _Solver; + LinearFunction & _Guess; +public: + + ///////////////////////////////////////////////////// + // Wrap the usual normal equations trick + ///////////////////////////////////////////////////// + SolverWrapper(LinearOperatorBase &Matrix, + OperatorFunction &Solver, + LinearFunction &Guess) + : _Matrix(Matrix), _Solver(Solver), _Guess(Guess) {}; + + void operator() (const Field &in, Field &out){ + + _Guess(in,out); + _Solver(_Matrix,in,out); // Mdag M out = Mdag in + + } +}; + +// Must use a non-hermitian solver +template +class PVdagMLinearOperator : public LinearOperatorBase { + Matrix &_Mat; + Matrix &_PV; +public: + PVdagMLinearOperator(Matrix &Mat,Matrix &PV): _Mat(Mat),_PV(PV){}; + + virtual std::vector Directions(void) { return _Mat.Directions();}; + virtual std::vector Displacements(void){ return _Mat.Displacements();}; + + void OpDiag (const Field &in, Field &out) { + assert(0); + } + void OpDir (const Field &in, Field &out,int dir,int disp) { + assert(0); + } + void OpDirAll (const Field &in, std::vector &out){ + assert(0); + }; + void Op (const Field &in, Field &out){ + Field tmp(in.Grid()); + _Mat.M(in,tmp); + _PV.Mdag(tmp,out); + } + void AdjOp (const Field &in, Field &out){ + Field tmp(in.Grid()); + _PV.M(tmp,out); + _Mat.Mdag(in,tmp); + } + void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ + assert(0); + } + void HermOp(const Field &in, Field &out){ + assert(0); + } +}; + +RealD InverseApproximation(RealD x){ + return 1.0/x; +} + +template class ChebyshevSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & _SmootherMatrix; + FineOperator & _SmootherOperator; + + 
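+  // Smoother action (see operator() below): tmp = Mdag in; out = Cheby(MdagM) tmp,
+  // i.e. out ~ (MdagM)^{-1} Mdag in = M^{-1} in, with the Chebyshev polynomial
+  // approximating 1/x on [_lo,_hi], so it acts as an approximate inverse on the
+  // upper part of the spectrum.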
Chebyshev Cheby; + + ChebyshevSmoother(RealD _lo,RealD _hi,int _ord, FineOperator &SmootherOperator,Matrix &SmootherMatrix) : + _SmootherOperator(SmootherOperator), + _SmootherMatrix(SmootherMatrix), + Cheby(_lo,_hi,_ord,InverseApproximation) + {}; + + void operator() (const Field &in, Field &out) + { + Field tmp(in.Grid()); + MdagMLinearOperator MdagMOp(_SmootherMatrix); + _SmootherOperator.AdjOp(in,tmp); + Cheby(MdagMOp,tmp,out); + } +}; +template +class MGPreconditioner : public LinearFunction< Lattice > { +public: + + typedef Aggregation Aggregates; + typedef typename Aggregation::CoarseVector CoarseVector; + typedef typename Aggregation::CoarseMatrix CoarseMatrix; + typedef typename Aggregation::FineField FineField; + typedef LinearOperatorBase FineOperator; + typedef LinearFunction FineSmoother; + typedef CoarseCayleyFermion CoarseOperator; + // typedef SparseMatrixBase CoarseOperator; + + Aggregates & _Aggregates; + FineOperator & _FineOperator; + FineSmoother & _PreSmoother; + FineSmoother & _PostSmoother; + CoarseOperator & _CoarseOperator; + CoarseSolver & _CoarseSolve; + + int level; void Level(int lv) {level = lv; }; + + MGPreconditioner(Aggregates &Agg, + FineOperator &Fine, + FineSmoother &PreSmoother, + FineSmoother &PostSmoother, + CoarseOperator &CoarseOperator_, + CoarseSolver &CoarseSolve_) + : _Aggregates(Agg), + _FineOperator(Fine), + _PreSmoother(PreSmoother), + _PostSmoother(PostSmoother), + _CoarseOperator(CoarseOperator_), + _CoarseSolve(CoarseSolve_), + level(1) { } + + virtual void operator()(const FineField &in, FineField & out) + { + auto CoarseGrid = _CoarseOperator.Grid(); + CoarseVector Csrc(CoarseGrid); + CoarseVector Csol(CoarseGrid); + FineField vec1(in.Grid()); + FineField vec2(in.Grid()); + + std::cout< +class HDCRPreconditioner : public LinearFunction< Lattice > { +public: + + typedef Aggregation Aggregates; + typedef typename Aggregation::CoarseVector CoarseVector; + typedef typename Aggregation::CoarseMatrix CoarseMatrix; + typedef typename Aggregation::FineField FineField; + typedef LinearOperatorBase FineOperator; + typedef LinearFunction FineSmoother; + //typedef CoarseCayleyFermion CoarseOperator; + typedef SparseMatrixBase CoarseOperator; + + Aggregates & _Aggregates; + FineOperator & _FineOperator; + FineSmoother & _PreSmoother; + FineSmoother & _PostSmoother; + CoarseOperator & _CoarseOperator; + CoarseSolver & _CoarseSolve; + + int level; void Level(int lv) {level = lv; }; + + HDCRPreconditioner(Aggregates &Agg, + FineOperator &Fine, + FineSmoother &PreSmoother, + FineSmoother &PostSmoother, + CoarseOperator &CoarseOperator_, + CoarseSolver &CoarseSolve_) + : _Aggregates(Agg), + _FineOperator(Fine), + _PreSmoother(PreSmoother), + _PostSmoother(PostSmoother), + _CoarseOperator(CoarseOperator_), + _CoarseSolve(CoarseSolve_), + level(1) { } + + virtual void operator()(const FineField &in, FineField & out) + { + auto CoarseGrid = _CoarseOperator.Grid(); + CoarseVector Csrc(CoarseGrid); + CoarseVector g5Csrc(CoarseGrid); + CoarseVector Csol(CoarseGrid); + FineField vec1(in.Grid()); + FineField vec2(in.Grid()); + + std::cout< block ({2,2,2,2}); // 4,2,2,2 gets worse + std::vector blockc ({1,1,1,1}); + const int nbasis= 24; + const int nbasisc= 40; // decrease, not improvement + + auto clatt = GridDefaultLatt(); + for(int d=0;d seeds({1,2,3,4}); + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds); + GridParallelRNG CRNG(Coarse4d);CRNG.SeedFixedIntegers(seeds); + + 
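+  // Gauge field source: the disabled "#if 0" branch would use a tepid SU(3)
+  // start with M5=1.0; as compiled, the NERSC checkpoint ./ckpoint_lat.1000
+  // is read and M5=1.8 is used.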
LatticeGaugeField Umu(UGrid); +#if 0 + SU3::TepidConfiguration(RNG4,Umu); + RealD M5=1.0; +#else + std::string file("./ckpoint_lat.1000"); + FieldMetaData header; + NerscIO::readConfiguration(Umu,header,file); + RealD M5=1.8; +#endif + + std::cout< Subspace; + typedef CoarsenedMatrix CoarseOperator; + typedef CoarseOperator::CoarseVector CoarseVector; + typedef CoarseOperator::siteVector siteVector; + + std::cout< MdagM_Dw(Dw_null); + + std::cout< WilsonCG(1.0e-10,40000); + LatticeFermion w_src(UGrid); w_src=1.0; + LatticeFermion w_res(UGrid); + WilsonCG(MdagM_Dw,w_src,w_res); + exit(0); + */ + std::cout< Level1Op4; + typedef CoarseCayleyFermion Level1Op5; + Level1Op4 c_Dw (*Coarse4d,0); + NonHermitianLinearOperator LinOpDw(Dw); + c_Dw.CoarsenOperator(UGrid,LinOpDw,Aggregates4D); // contains the M5 from Dw(-M5) + // c_Dw.Test(Aggregates4D,UGrid,LinOpDw); + + std::cout< MdagM_cDwf(c_Dwf); + + std::cout<,nbasisc> Level2Op; + typedef Aggregation,nbasisc> CoarseSubspace; + CoarseSubspace CoarseAggregates(CoarseCoarse5d,Coarse5d,0); + + std::cout< L1Hdwf(c_Dwf); + Level2Op cc_Dwf (*CoarseCoarse5d,*CoarseCoarse5dRB,1); // say it is hermitian + cc_Dwf.CoarsenOperator(Coarse5d,L1Hdwf,CoarseAggregates); + // cc_Dwf.Test(CoarseAggregates,Coarse5d,L1Hdwf); + + typedef Level2Op::CoarseVector CoarseCoarseVector; + + std::cout< CoarseCG(tol,MaxIt); + ConjugateGradient FineCG(tol,MaxIt); + + NonHermitianLinearOperator FineM(Ddwf); + MdagMLinearOperator FineMdagM(Ddwf); // M^\dag M + + NonHermitianLinearOperator CoarseM(c_Dwf); + MdagMLinearOperator CoarseMdagM(c_Dwf); + + NonHermitianLinearOperator CoarseCoarseM(cc_Dwf); + MdagMLinearOperator CoarseCoarseMdagM(cc_Dwf); + + + std::cout< PM; PM(MdagM_Dw,w_src); + std::cout< cPM; cPM(CoarseMdagM,c_src); + + cc_src=1.0; + PowerMethod ccPM; ccPM(CoarseCoarseMdagM,cc_src); + + std::cout< IRLHermOpL2(cc_Dwf); + Chebyshev IRLChebyL2(IRL_lo,IRL_hi,IRL_ord); + FunctionHermOp IRLOpChebyL2(IRLChebyL2,IRLHermOpL2); + PlainHermOp IRLOpL2 (IRLHermOpL2); + ImplicitlyRestartedLanczos IRLL2(IRLOpChebyL2,IRLOpL2,cNstop,cNk,cNm,1.0e-3,20); + + int cNconv; + cNm=0; + std::vector eval2(cNm); + std::vector evec2(cNm,CoarseCoarse5d); + cc_src=1.0; + // IRLL2.calc(eval2,evec2,cc_src,cNconv); + + std::vector tols ({0.005,0.001}); + std::vector c_los ({0.1,0.05}); + std::vector c_his ({22.0}); + std::vector f_los ({0.5,0.2}); + std::vector f_his ({60.0}); + std::vector ws ({2,3}); + std::vector c_ords ({32,24}); + std::vector f_ords ({20,16}); + + for(auto w : ws ) { + for(auto tol : tols ) { + for(auto f_ord : f_ords ) { + for(auto c_ord : c_ords ) { + for(auto c_lo : c_los ) { + for(auto c_hi : c_his ) { + for(auto f_lo : f_los ) { + for(auto f_hi : f_his ) { + ZeroGuesser CoarseZeroGuesser; + ZeroGuesser CoarseCoarseZeroGuesser; + ConjugateGradient CoarseCoarseCG(tol,10000); + ZeroGuesser CoarseCoarseGuesser; + SchurRedBlackDiagMooeeSolve CoarseCoarseRBCG(CoarseCoarseCG); + SchurSolverWrapper CoarseCoarseSolver(cc_Dwf,CoarseCoarseRBCG); + + std::cout< CoarseCoarseCGNE(cc_Dwf,CoarseCoarseCG,CoarseCoarseZeroGuesser); + { +typedef HDCRPreconditioner,nbasisc,LinearFunction > CoarseMG; + typedef MGPreconditioner > ThreeLevelMG; + + // MultiGrid preconditioner acting on the coarse space <-> coarsecoarse space + // ChebyshevSmoother CoarseSmoother1(0.5,22.0,c_ord,CoarseM,c_Dwf); // 37s, 26 iter + // ChebyshevSmoother CoarseSmoother2(0.5,22.0,c_ord,CoarseM,c_Dwf); + ChebyshevSmoother CoarseSmoother(c_lo,c_hi,c_ord,CoarseM,c_Dwf); // 37s, 26 iter + + // ChebyshevSmoother 
CoarseSmoother1(0.5,22.0,7,CoarseM,c_Dwf); // 38s, 26 iter + // ChebyshevSmoother CoarseSmoother2(0.5,22.0,7,CoarseM,c_Dwf); + // ChebyshevSmoother CoarseSmoother1(0.4,22.0,7,CoarseM,c_Dwf); // 41s, 27 iter + // ChebyshevSmoother CoarseSmoother2(0.4,22.0,7,CoarseM,c_Dwf); + // ChebyshevSmoother CoarseSmoother1(0.6,22.0,6,CoarseM,c_Dwf); // 26 iter + // ChebyshevSmoother CoarseSmoother2(0.6,22.0,6,CoarseM,c_Dwf); + // ChebyshevSmoother CoarseSmoother1(0.5,22.0,5,CoarseM,c_Dwf); // 33 iter, 55s + // ChebyshevSmoother CoarseSmoother2(0.5,22.0,5,CoarseM,c_Dwf); + + + CoarseMG Level2Precon (CoarseAggregates, + CoarseM, + CoarseSmoother, + CoarseSmoother, + cc_Dwf, + CoarseCoarseSolver); + Level2Precon.Level(2); + + //PrecGeneralisedConjugateResidualNonHermitian L2PGCR(0.5, 100, CoarseM,Level2Precon,16,16); // 26 iter, 37s + // PrecGeneralisedConjugateResidualNonHermitian L2PGCR(0.0, 1, CoarseM,Level2Precon,2,2); // 296 s, 50 iter + // PrecGeneralisedConjugateResidualNonHermitian L2PGCR(0.0, 1, CoarseM,Level2Precon,2,2); // 250 s, 37 iter + PrecGeneralisedConjugateResidualNonHermitian L2PGCR(0.0, 1, CoarseM,Level2Precon,2,2); + + //PrecGeneralisedConjugateResidualNonHermitian L2PGCR(1.0, 100, CoarseM,Level2Precon,16,16); // 35 iter, 45s + //PrecGeneralisedConjugateResidualNonHermitian L2PGCR(0.6, 100, CoarseM,Level2Precon,16,16); // 26,38 (diifferene is measurement noise) + //PrecGeneralisedConjugateResidualNonHermitian L2PGCR(0.2, 100, CoarseM,Level2Precon,16,16); // 26 iter, 47s + L2PGCR.Level(2); + + // Wrap the 2nd level solver in a MultiGrid preconditioner acting on the fine space + + // ChebyshevSmoother FineSmoother1(0.5,60.0,14,FineM,Ddwf); // 26 iter, 39s + // ChebyshevSmoother FineSmoother2(0.5,60.0,14,FineM,Ddwf); + + // ChebyshevSmoother FineSmoother1(0.5,60.0,12,FineM,Ddwf); // 25 iter, 38s + // ChebyshevSmoother FineSmoother2(0.5,60.0,16,FineM,Ddwf); + + // ChebyshevSmoother FineSmoother1(0.5,60.0,12,FineM,Ddwf); // 23 iter, 39s + // ChebyshevSmoother FineSmoother2(0.5,60.0,20,FineM,Ddwf); + + // ChebyshevSmoother FineSmoother1(0.5,60.0,10,FineM,Ddwf);24 iter, 44s + // ChebyshevSmoother FineSmoother2(0.5,60.0,24,FineM,Ddwf); + + // ChebyshevSmoother FineSmoother1(0.5,60.0,12,FineM,Ddwf); // odd convergence tail at 10^-9 ish + // ChebyshevSmoother FineSmoother2(0.1,60.0,24,FineM,Ddwf); // 33 iter, waas O(10-9 by 26) + + // ChebyshevSmoother FineSmoother1(0.5,60.0,12,FineM,Ddwf); // 25 iter, 39s + // ChebyshevSmoother FineSmoother2(0.5,60.0,18,FineM,Ddwf); // + + ChebyshevSmoother FineSmoother(f_lo,f_hi,f_ord,FineM,Ddwf); + + // ChebyshevSmoother FineSmoother1(0.5,60.0,11,FineM,Ddwf); // 33 iter, 49s + // ChebyshevSmoother FineSmoother2(0.5,60.0,11,FineM,Ddwf); + // ChebyshevSmoother FineSmoother1(0.5,60.0,12,FineM,Ddwf); // 26 iter, 37s + // ChebyshevSmoother FineSmoother2(0.5,60.0,12,FineM,Ddwf); + // ChebyshevSmoother FineSmoother1(0.4,60.0,12,FineM,Ddwf); // iter 26 no change in final residual + // ChebyshevSmoother FineSmoother2(0.4,60.0,12,FineM,Ddwf); + // ChebyshevSmoother FineSmoother1(0.3,60.0,12,FineM,Ddwf); // 27 iter 39s. 
+ // ChebyshevSmoother FineSmoother2(0.3,60.0,12,FineM,Ddwf); + // ChebyshevSmoother FineSmoother1(0.3,60.0,13,FineM,Ddwf); // 26 iter, but slower + // ChebyshevSmoother FineSmoother2(0.3,60.0,13,FineM,Ddwf); + // ChebyshevSmoother FineSmoother1(1.0,60.0,12,FineM,Ddwf); // 34 iter, slower + // ChebyshevSmoother FineSmoother2(1.0,60.0,12,FineM,Ddwf); + + ThreeLevelMG ThreeLevelPrecon(Aggregates4D, + FineM, + FineSmoother, + FineSmoother, + c_Dwf, + L2PGCR); + ThreeLevelPrecon.Level(1); + + PrecGeneralisedConjugateResidualNonHermitian L1PGCR(1.0e-8,1000,FineM,ThreeLevelPrecon,16,16); + L1PGCR.Level(1); + + f_res=Zero(); + L1PGCR(f_src,f_res); + } + }}}} + }}} + } + std::cout< Date: Thu, 14 Jan 2021 21:00:36 -0500 Subject: [PATCH 14/16] Gparity fix, and plaquette IO --- Grid/parallelIO/IldgIO.h | 22 +++--- Grid/parallelIO/MetaData.h | 34 +++----- Grid/parallelIO/NerscIO.h | 46 +++++------ Grid/parallelIO/OpenQcdIO.h | 2 +- Grid/parallelIO/OpenQcdIOChromaReference.h | 2 +- Grid/qcd/action/gauge/Gauge.cc | 38 +++++++++ Grid/qcd/action/gauge/GaugeImplementations.h | 79 +++++++++++-------- Grid/qcd/hmc/checkpointers/BaseCheckpointer.h | 3 +- Grid/qcd/hmc/checkpointers/ILDGCheckpointer.h | 5 +- .../qcd/hmc/checkpointers/NerscCheckpointer.h | 7 +- Grid/qcd/modules/Modules.h | 2 +- Grid/qcd/utils/CovariantCshift.h | 51 ++++++++++++ Grid/tensors/Tensor_Ta.h | 14 +++- tests/core/Test_reunitarise.cc | 3 +- tests/hmc/Test_hmc_EODWFRatio_Gparity.cc | 7 +- tests/hmc/Test_hmc_GparityIwasakiGauge.cc | 4 + tests/hmc/Test_hmc_GparityWilsonGauge.cc | 3 + 17 files changed, 220 insertions(+), 102 deletions(-) create mode 100644 Grid/qcd/action/gauge/Gauge.cc diff --git a/Grid/parallelIO/IldgIO.h b/Grid/parallelIO/IldgIO.h index b564371b..ef42c159 100644 --- a/Grid/parallelIO/IldgIO.h +++ b/Grid/parallelIO/IldgIO.h @@ -123,7 +123,7 @@ assert(GRID_FIELD_NORM_CALC(FieldNormMetaData_, n2ck) < 1.0e-5); //////////////////////////////////////////////////////////// // Helper to fill out metadata //////////////////////////////////////////////////////////// - template void ScidacMetaData(Lattice & field, +template void ScidacMetaData(Lattice & field, FieldMetaData &header, scidacRecord & _scidacRecord, scidacFile & _scidacFile) @@ -619,12 +619,12 @@ class IldgWriter : public ScidacWriter { // Don't require scidac records EXCEPT checksum // Use Grid MetaData object if present. //////////////////////////////////////////////////////////////// - template - void writeConfiguration(Lattice > &Umu,int sequence,std::string LFN,std::string description) + template + void writeConfiguration(Lattice &Umu,int sequence,std::string LFN,std::string description) { GridBase * grid = Umu.Grid(); - typedef Lattice > GaugeField; - typedef iLorentzColourMatrix vobj; + typedef Lattice GaugeField; + typedef vLorentzColourMatrixD vobj; typedef typename vobj::scalar_object sobj; //////////////////////////////////////// @@ -636,6 +636,9 @@ class IldgWriter : public ScidacWriter { ScidacMetaData(Umu,header,_scidacRecord,_scidacFile); + stats Stats; + Stats(Umu,header); + std::string format = header.floating_point; header.ensemble_id = description; header.ensemble_label = description; @@ -705,10 +708,10 @@ class IldgReader : public GridLimeReader { // Else use ILDG MetaData object if present. // Else use SciDAC MetaData object if present. 
//////////////////////////////////////////////////////////////// - template - void readConfiguration(Lattice > &Umu, FieldMetaData &FieldMetaData_) { + template + void readConfiguration(Lattice &Umu, FieldMetaData &FieldMetaData_) { - typedef Lattice > GaugeField; + typedef Lattice GaugeField; typedef typename GaugeField::vector_object vobj; typedef typename vobj::scalar_object sobj; @@ -921,7 +924,8 @@ class IldgReader : public GridLimeReader { if ( found_FieldMetaData || found_usqcdInfo ) { FieldMetaData checker; - GaugeStatistics(Umu,checker); + stats Stats; + Stats(Umu,checker); assert(fabs(checker.plaquette - FieldMetaData_.plaquette )<1.0e-5); assert(fabs(checker.link_trace - FieldMetaData_.link_trace)<1.0e-5); std::cout << GridLogMessage<<"Plaquette and link trace match " << std::endl; diff --git a/Grid/parallelIO/MetaData.h b/Grid/parallelIO/MetaData.h index 4c1cfbdb..d30ba523 100644 --- a/Grid/parallelIO/MetaData.h +++ b/Grid/parallelIO/MetaData.h @@ -176,29 +176,18 @@ template inline void PrepareMetaData(Lattice & field, FieldMet GridMetaData(grid,header); MachineCharacteristics(header); } -inline void GaugeStatistics(Lattice & data,FieldMetaData &header) +template +class GaugeStatistics { - // How to convert data precision etc... - header.link_trace=WilsonLoops::linkTrace(data); - header.plaquette =WilsonLoops::avgPlaquette(data); -} -inline void GaugeStatistics(Lattice & data,FieldMetaData &header) -{ - // How to convert data precision etc... - header.link_trace=WilsonLoops::linkTrace(data); - header.plaquette =WilsonLoops::avgPlaquette(data); -} -template<> inline void PrepareMetaData(Lattice & field, FieldMetaData &header) -{ - - GridBase *grid = field.Grid(); - std::string format = getFormatString(); - header.floating_point = format; - header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac - GridMetaData(grid,header); - GaugeStatistics(field,header); - MachineCharacteristics(header); -} +public: + void operator()(Lattice & data,FieldMetaData &header) + { + header.link_trace=WilsonLoops::linkTrace(data); + header.plaquette =WilsonLoops::avgPlaquette(data); + } +}; +typedef GaugeStatistics PeriodicGaugeStatistics; +typedef GaugeStatistics ConjugateGaugeStatistics; template<> inline void PrepareMetaData(Lattice & field, FieldMetaData &header) { GridBase *grid = field.Grid(); @@ -206,7 +195,6 @@ template<> inline void PrepareMetaData(Lattice GaugeField; + static inline void truncate(std::string file){ std::ofstream fout(file,std::ios::out); } @@ -129,12 +131,12 @@ public: // Now the meat: the object readers ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - template - static inline void readConfiguration(Lattice > &Umu, + template + static inline void readConfiguration(GaugeField &Umu, FieldMetaData& header, - std::string file) + std::string file, + GaugeStats GaugeStatisticsCalculator=GaugeStats()) { - typedef Lattice > GaugeField; GridBase *grid = Umu.Grid(); uint64_t offset = readHeader(file,Umu.Grid(),header); @@ -153,23 +155,23 @@ public: // munger is a function of if ( header.data_type == std::string("4D_SU3_GAUGE") ) { if ( ieee32 || ieee32big ) { - BinaryIO::readLatticeObject, LorentzColour2x3F> + BinaryIO::readLatticeObject (Umu,file,Gauge3x2munger(), offset,format, nersc_csum,scidac_csuma,scidac_csumb); } if ( ieee64 || ieee64big ) { - BinaryIO::readLatticeObject, LorentzColour2x3D> + BinaryIO::readLatticeObject (Umu,file,Gauge3x2munger(),offset,format, nersc_csum,scidac_csuma,scidac_csumb); 
diff --git a/Grid/parallelIO/NerscIO.h b/Grid/parallelIO/NerscIO.h
--- a/Grid/parallelIO/NerscIO.h
+++ b/Grid/parallelIO/NerscIO.h
@@ -40,6 +40,8 @@ using namespace Grid;
 class NerscIO : public BinaryIO {
  public:
+  typedef Lattice<vLorentzColourMatrixD> GaugeField;
+
   static inline void truncate(std::string file){
     std::ofstream fout(file,std::ios::out);
   }
@@ -129,12 +131,12 @@ public:
   // Now the meat: the object readers
   /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-  template<class vsimd>
-  static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,
+  template<class GaugeStats=PeriodicGaugeStatistics>
+  static inline void readConfiguration(GaugeField &Umu,
                                        FieldMetaData& header,
-                                       std::string file)
+                                       std::string file,
+                                       GaugeStats GaugeStatisticsCalculator=GaugeStats())
   {
-    typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
     GridBase *grid = Umu.Grid();
     uint64_t offset = readHeader(file,Umu.Grid(),header);
@@ -153,23 +155,23 @@ public:
     // munger is a function of <floating point, Real, data_type>
     if ( header.data_type == std::string("4D_SU3_GAUGE") ) {
       if ( ieee32 || ieee32big ) {
-        BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>, LorentzColour2x3F>
+        BinaryIO::readLatticeObject<vLorentzColourMatrixD, LorentzColour2x3F>
          (Umu,file,Gauge3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format,
           nersc_csum,scidac_csuma,scidac_csumb);
       }
       if ( ieee64 || ieee64big ) {
-        BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>, LorentzColour2x3D>
+        BinaryIO::readLatticeObject<vLorentzColourMatrixD, LorentzColour2x3D>
          (Umu,file,Gauge3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format,
           nersc_csum,scidac_csuma,scidac_csumb);
       }
     } else if ( header.data_type == std::string("4D_SU3_GAUGE_3x3") ) {
       if ( ieee32 || ieee32big ) {
-        BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>,LorentzColourMatrixF>
+        BinaryIO::readLatticeObject<vLorentzColourMatrixD,LorentzColourMatrixF>
          (Umu,file,GaugeSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format,
           nersc_csum,scidac_csuma,scidac_csumb);
       }
       if ( ieee64 || ieee64big ) {
-        BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>,LorentzColourMatrixD>
+        BinaryIO::readLatticeObject<vLorentzColourMatrixD,LorentzColourMatrixD>
          (Umu,file,GaugeSimpleMunger<LorentzColourMatrixD,LorentzColourMatrix>(),offset,format,
           nersc_csum,scidac_csuma,scidac_csumb);
       }
@@ -177,7 +179,7 @@ public:
       assert(0);
     }

-    GaugeStatistics(Umu,clone);
+    GaugeStats Stats; Stats(Umu,clone);

     std::cout<<GridLogMessage<<"NERSC Configuration "<<file<<" checksum "<<std::hex<<nersc_csum<<std::dec<<std::endl;
@@ -198,13 +200,11 @@ public:
-  template<class vsimd>
-  static inline void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,
+  template<class GaugeStats=PeriodicGaugeStatistics>
+  static inline void writeConfiguration(Lattice<vLorentzColourMatrixD> &Umu,
                                         std::string file,
                                         int two_row,
                                         int bits32)
   {
-    typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
-
-    typedef iLorentzColourMatrix<vsimd> vobj;
+    typedef vLorentzColourMatrixD vobj;
     typedef typename vobj::scalar_object sobj;

     FieldMetaData header;
@@ -229,7 +229,7 @@ public:
     GridMetaData(grid,header);
     assert(header.nd==4);
-    GaugeStatistics(Umu,header);
+    GaugeStats Stats; Stats(Umu,header);
     MachineCharacteristics(header);

     uint64_t offset;
@@ -238,19 +238,19 @@ public:
     header.floating_point = std::string("IEEE64BIG");
     header.data_type      = std::string("4D_SU3_GAUGE_3x3");
     GaugeSimpleUnmunger<fobj3D,sobj> munge;
-    if ( grid->IsBoss() ) {
-      truncate(file);
-      offset = writeHeader(header,file);
-    }
-    grid->Broadcast(0,(void *)&offset,sizeof(offset));
+    if ( grid->IsBoss() ) {
+      truncate(file);
+      offset = writeHeader(header,file);
+    }
+    grid->Broadcast(0,(void *)&offset,sizeof(offset));

     uint32_t nersc_csum,scidac_csuma,scidac_csumb;
     BinaryIO::writeLatticeObject<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point,
                                               nersc_csum,scidac_csuma,scidac_csumb);
     header.checksum = nersc_csum;
-    if ( grid->IsBoss() ) {
-      writeHeader(header,file);
-    }
+    if ( grid->IsBoss() ) {
+      writeHeader(header,file);
+    }

     std::cout<<GridLogMessage<<"Written NERSC Configuration on "<<file<<std::endl;
diff --git a/Grid/parallelIO/OpenQcdIO.h b/Grid/parallelIO/OpenQcdIO.h
--- a/Grid/parallelIO/OpenQcdIO.h
+++ b/Grid/parallelIO/OpenQcdIO.h
@@ -224,7 +224,7 @@ public:
   grid->Barrier();
   timer.Stop();
   std::cout << Grid::GridLogMessage << "OpenQcdIO::readConfiguration: redistribute overhead " << timer.Elapsed() << std::endl;

-  GaugeStatistics(Umu, clone);
+  PeriodicGaugeStatistics Stats; Stats(Umu, clone);

   RealD plaq_diff = fabs(clone.plaquette - header.plaquette);
diff --git a/Grid/parallelIO/OpenQcdIOChromaReference.h b/Grid/parallelIO/OpenQcdIOChromaReference.h
index bab54fe8..886536ad 100644
--- a/Grid/parallelIO/OpenQcdIOChromaReference.h
+++ b/Grid/parallelIO/OpenQcdIOChromaReference.h
@@ -208,7 +208,7 @@ public:
   FieldMetaData clone(header);

-  GaugeStatistics(Umu, clone);
+  PeriodicGaugeStatistics Stats; Stats(Umu, clone);

   RealD plaq_diff = fabs(clone.plaquette - header.plaquette);
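With NerscIO::readConfiguration and writeConfiguration now templated on a statistics policy (defaulting to PeriodicGaugeStatistics), a caller opts into conjugate statistics explicitly. A hedged usage sketch; the file name, grid pointer and direction choice are illustrative:

    #include <Grid/Grid.h>

    using namespace Grid;

    // Illustrative reader, assuming a 4D grid has already been constructed.
    void readNerscExamples(GridCartesian *UGrid)
    {
      LatticeGaugeFieldD Umu(UGrid);
      FieldMetaData header;

      // Default: plaquette/link trace verified with periodic statistics.
      NerscIO::readConfiguration(Umu, header, "ckpoint_lat.1000");

      // Charge-conjugate ensemble: fix the conjugate directions first, then
      // select the matching statistics policy explicitly.
      std::vector<int> conj_dirs(Nd, 0);
      conj_dirs[3] = 1;
      ConjugateGimplD::setDirections(conj_dirs);
      NerscIO::readConfiguration<ConjugateGaugeStatistics>(Umu, header, "ckpoint_lat.1000");
    }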
diff --git a/Grid/qcd/action/gauge/Gauge.cc b/Grid/qcd/action/gauge/Gauge.cc
new file mode 100644
index 00000000..2b5e2691
--- /dev/null
+++ b/Grid/qcd/action/gauge/Gauge.cc
@@ -0,0 +1,38 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/gauge/Gauge.cc
+
+Copyright (C) 2020
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/* END LEGAL */
+#include <Grid/qcd/action/gauge/GaugeImplementations.h>
+
+NAMESPACE_BEGIN(Grid);
+
+std::vector<int> ConjugateGaugeImplBase::_conjDirs;
+
+NAMESPACE_END(Grid);
+
diff --git a/Grid/qcd/action/gauge/GaugeImplementations.h b/Grid/qcd/action/gauge/GaugeImplementations.h
index a14aec1b..16147c77 100644
--- a/Grid/qcd/action/gauge/GaugeImplementations.h
+++ b/Grid/qcd/action/gauge/GaugeImplementations.h
@@ -59,14 +59,14 @@ public:
   }
   static inline GaugeLinkField
   CovShiftIdentityBackward(const GaugeLinkField &Link, int mu) {
-    return Cshift(adj(Link), mu, -1);
+    return PeriodicBC::CovShiftIdentityBackward(Link, mu);
   }
   static inline GaugeLinkField
   CovShiftIdentityForward(const GaugeLinkField &Link, int mu) {
-    return Link;
+    return PeriodicBC::CovShiftIdentityForward(Link,mu);
   }
   static inline GaugeLinkField ShiftStaple(const GaugeLinkField &Link, int mu) {
-    return Cshift(Link, mu, 1);
+    return PeriodicBC::ShiftStaple(Link,mu);
   }

   static inline bool isPeriodicGaugeField(void) { return true; }
@@ -74,7 +74,13 @@ public:
 // Composition with smeared link, bc's etc.. probably need multiple inheritance
 // Variable precision "S" and variable Nc
-template <class GimplTypes> class ConjugateGaugeImpl : public GimplTypes {
+class ConjugateGaugeImplBase {
+protected:
+  static std::vector<int> _conjDirs;
+};
+
+ template <class GimplTypes> class ConjugateGaugeImpl : public GimplTypes, ConjugateGaugeImplBase {
+private:
 public:
   INHERIT_GIMPL_TYPES(GimplTypes);

@@ -84,47 +90,56 @@ public:
   ////////////////////////////////////////////////////////////////////////////////////////////////////////////
   template <class covariant>
   static Lattice<covariant> CovShiftForward(const GaugeLinkField &Link, int mu,
-                                            const Lattice<covariant> &field) {
-    return ConjugateBC::CovShiftForward(Link, mu, field);
+                                            const Lattice<covariant> &field)
+  {
+    assert(_conjDirs.size() == Nd);
+    if(_conjDirs[mu])
+      return ConjugateBC::CovShiftForward(Link, mu, field);
+    else
+      return PeriodicBC::CovShiftForward(Link, mu, field);
   }
   template <class covariant>
   static Lattice<covariant> CovShiftBackward(const GaugeLinkField &Link, int mu,
-                                             const Lattice<covariant> &field) {
-    return ConjugateBC::CovShiftBackward(Link, mu, field);
+                                             const Lattice<covariant> &field)
+  {
+    assert(_conjDirs.size() == Nd);
+    if(_conjDirs[mu])
+      return ConjugateBC::CovShiftBackward(Link, mu, field);
+    else
+      return PeriodicBC::CovShiftBackward(Link, mu, field);
   }
   static inline GaugeLinkField
-  CovShiftIdentityBackward(const GaugeLinkField &Link, int mu) {
-    GridBase *grid = Link.Grid();
-    int Lmu = grid->GlobalDimensions()[mu] - 1;
-
-    Lattice<iScalar<vInteger>> coor(grid);
-    LatticeCoordinate(coor, mu);
-
-    GaugeLinkField tmp(grid);
-    tmp = adj(Link);
-    tmp = where(coor == Lmu, conjugate(tmp), tmp);
-    return Cshift(tmp, mu, -1); // moves towards positive mu
+  CovShiftIdentityBackward(const GaugeLinkField &Link, int mu)
+  {
+    assert(_conjDirs.size() == Nd);
+    if(_conjDirs[mu])
+      return ConjugateBC::CovShiftIdentityBackward(Link, mu);
+    else
+      return PeriodicBC::CovShiftIdentityBackward(Link, mu);
   }
   static inline GaugeLinkField
-  CovShiftIdentityForward(const GaugeLinkField &Link, int mu) {
-    return Link;
+  CovShiftIdentityForward(const GaugeLinkField &Link, int mu)
+  {
+    assert(_conjDirs.size() == Nd);
+    if(_conjDirs[mu])
+      return ConjugateBC::CovShiftIdentityForward(Link,mu);
+    else
+      return PeriodicBC::CovShiftIdentityForward(Link,mu);
   }
-  static inline GaugeLinkField ShiftStaple(const GaugeLinkField &Link, int mu) {
-    GridBase *grid = Link.Grid();
-    int Lmu = grid->GlobalDimensions()[mu] - 1;
-
-    Lattice<iScalar<vInteger>> coor(grid);
-    LatticeCoordinate(coor, mu);
-
-    GaugeLinkField tmp(grid);
-    tmp = Cshift(Link, mu, 1);
-    tmp = where(coor == Lmu, conjugate(tmp), tmp);
-    return tmp;
+  static inline GaugeLinkField ShiftStaple(const GaugeLinkField &Link, int mu)
+  {
+    assert(_conjDirs.size() == Nd);
+    if(_conjDirs[mu])
+      return ConjugateBC::ShiftStaple(Link,mu);
+    else
+      return PeriodicBC::ShiftStaple(Link,mu);
   }

+  static inline void setDirections(std::vector<int> &conjDirs) { _conjDirs=conjDirs; }
+  static inline std::vector<int> getDirections(void) { return _conjDirs; }
   static inline bool isPeriodicGaugeField(void) { return false; }
 };
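ConjugateGaugeImpl now chooses between conjugate and periodic covariant shifts per direction at run time, driven by the static _conjDirs vector defined in the new Gauge.cc. Since that vector is empty until set, every shift asserts that setDirections() has been called. A minimal initialisation sketch (the direction choice is illustrative):

    #include <Grid/Grid.h>
    #include <cassert>

    using namespace Grid;

    int main(int argc, char **argv)
    {
      Grid_init(&argc, &argv);

      // _conjDirs is an empty static until setDirections() runs, and every
      // covariant shift asserts _conjDirs.size() == Nd, so the directions must
      // be fixed before any conjugate-BC gauge action, Wilson loop or gauge
      // statistics is evaluated.
      std::vector<int> conj_dirs(Nd, 0);
      conj_dirs[1] = 1; // charge-conjugate BC in y only; illustrative choice
      ConjugateGimplD::setDirections(conj_dirs);

      assert(ConjugateGimplD::getDirections() == conj_dirs);

      Grid_finalize();
      return 0;
    }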
diff --git a/Grid/qcd/hmc/checkpointers/BaseCheckpointer.h b/Grid/qcd/hmc/checkpointers/BaseCheckpointer.h
index 3cd05ebc..c09fdeeb 100644
--- a/Grid/qcd/hmc/checkpointers/BaseCheckpointer.h
+++ b/Grid/qcd/hmc/checkpointers/BaseCheckpointer.h
@@ -74,7 +74,7 @@ public:
       conf_file = os.str();
     }
   }
-
+  virtual ~BaseHmcCheckpointer(){};
   void check_filename(const std::string &filename){
     std::ifstream f(filename.c_str());
     if(!f.good()){
@@ -82,7 +82,6 @@ public:
       abort();
     };
   }
-
   virtual void initialize(const CheckpointerParameters &Params) = 0;

   virtual void CheckpointRestore(int traj, typename Impl::Field &U,
diff --git a/Grid/qcd/hmc/checkpointers/ILDGCheckpointer.h b/Grid/qcd/hmc/checkpointers/ILDGCheckpointer.h
index 269caa6e..1bb8aa1a 100644
--- a/Grid/qcd/hmc/checkpointers/ILDGCheckpointer.h
+++ b/Grid/qcd/hmc/checkpointers/ILDGCheckpointer.h
@@ -45,6 +45,7 @@ private:

 public:
   INHERIT_GIMPL_TYPES(Implementation);
+  typedef GaugeStatistics<Implementation> GaugeStats;

   ILDGHmcCheckpointer(const CheckpointerParameters &Params_) { initialize(Params_); }

@@ -78,7 +79,7 @@ public:
       BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
       IldgWriter _IldgWriter(grid->IsBoss());
       _IldgWriter.open(config);
-      _IldgWriter.writeConfiguration(U, traj, config, config);
+      _IldgWriter.writeConfiguration<GaugeStats>(U, traj, config, config);
       _IldgWriter.close();

       std::cout << GridLogMessage << "Written ILDG Configuration on " << config
@@ -105,7 +106,7 @@ public:
     FieldMetaData header;
     IldgReader _IldgReader;
     _IldgReader.open(config);
-    _IldgReader.readConfiguration(U,header);  // format from the header
+    _IldgReader.readConfiguration<GaugeStats>(U,header);  // format from the header
     _IldgReader.close();

     std::cout << GridLogMessage << "Read ILDG Configuration from " << config
diff --git a/Grid/qcd/hmc/checkpointers/NerscCheckpointer.h b/Grid/qcd/hmc/checkpointers/NerscCheckpointer.h
index cfcc44d8..4534e4c4 100644
--- a/Grid/qcd/hmc/checkpointers/NerscCheckpointer.h
+++ b/Grid/qcd/hmc/checkpointers/NerscCheckpointer.h
@@ -43,7 +43,8 @@ private:

 public:
   INHERIT_GIMPL_TYPES(Gimpl);  // only for gauge configurations
-
+  typedef GaugeStatistics<Gimpl> GaugeStats;
+
   NerscHmcCheckpointer(const CheckpointerParameters &Params_) { initialize(Params_); }

   void initialize(const CheckpointerParameters &Params_) {
@@ -60,7 +61,7 @@ public:
       int precision32 = 1;
       int tworow = 0;
       NerscIO::writeRNGState(sRNG, pRNG, rng);
-      NerscIO::writeConfiguration(U, config, tworow, precision32);
+      NerscIO::writeConfiguration<GaugeStats>(U, config, tworow, precision32);
     }
   };

@@ -74,7 +75,7 @@ public:
     FieldMetaData header;
     NerscIO::readRNGState(sRNG, pRNG, header, rng);
-    NerscIO::readConfiguration(U, header, config);
+    NerscIO::readConfiguration<GaugeStats>(U, header, config);
   };
 };
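Each checkpointer now derives its GaugeStats policy from its gauge implementation, so plaquette verification on checkpoint restore matches the boundary conditions of the run without extra wiring. A hedged sketch of a G-parity HMC checkpointer; the parameter values are illustrative and the CheckpointerParameters constructor signature (config_prefix, rng_prefix, saveInterval, format) is assumed from Grid's HMC module:

    #include <Grid/Grid.h>

    using namespace Grid;

    // Sketch only: a NERSC checkpointer for a charge-conjugate (G-parity)
    // stream. Via typedef GaugeStatistics<Gimpl> GaugeStats, its reads and
    // writes are instantiated with GaugeStatistics<ConjugateGimplD>.
    void makeConjugateCheckpointer()
    {
      std::vector<int> conj_dirs(Nd, 0);
      conj_dirs[3] = 1;
      ConjugateGimplD::setDirections(conj_dirs); // required before any statistics call

      CheckpointerParameters cp("ckpoint_lat", "ckpoint_rng", 5, "IEEE64BIG");
      NerscHmcCheckpointer<ConjugateGimplD> checkpoint(cp);
    }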
diff --git a/Grid/qcd/modules/Modules.h b/Grid/qcd/modules/Modules.h
index 1c1c8889..7aa3f0ac 100644
--- a/Grid/qcd/modules/Modules.h
+++ b/Grid/qcd/modules/Modules.h
@@ -99,7 +99,7 @@ public:
   virtual Prod* getPtr() = 0;

   // add a getReference?
-
+  virtual ~HMCModuleBase(){};
   virtual void print_parameters(){};  // default to nothing
 };
diff --git a/Grid/qcd/utils/CovariantCshift.h b/Grid/qcd/utils/CovariantCshift.h
index cee1fa12..6c70706f 100644
--- a/Grid/qcd/utils/CovariantCshift.h
+++ b/Grid/qcd/utils/CovariantCshift.h
@@ -53,6 +53,24 @@ namespace PeriodicBC {
     return Cshift(tmp,mu,-1);// moves towards positive mu
   }

+  template<class gauge> Lattice<gauge>
+  CovShiftIdentityBackward(const Lattice<gauge> &Link, int mu)
+  {
+    return Cshift(adj(Link), mu, -1);
+  }
+
+  template<class gauge> Lattice<gauge>
+  CovShiftIdentityForward(const Lattice<gauge> &Link, int mu)
+  {
+    return Link;
+  }
+
+  template<class gauge> Lattice<gauge>
+  ShiftStaple(const Lattice<gauge> &Link, int mu)
+  {
+    return Cshift(Link, mu, 1);
+  }
+
   template<class gauge,class Expr,typename std::enable_if<is_lattice_expr<Expr>::value,void>::type * = nullptr>
   auto CovShiftForward(const Lattice<gauge> &Link,
                        int mu,
@@ -70,6 +88,7 @@ namespace PeriodicBC {
     return CovShiftBackward(Link,mu,arg);
   }

+
 }

@@ -139,6 +158,38 @@ namespace ConjugateBC {
   //    std::cout<<"Gparity::CovCshiftBackward mu="<<mu<<std::endl;
+  template<class gauge> Lattice<gauge>
+  CovShiftIdentityBackward(const Lattice<gauge> &Link, int mu) {
+    GridBase *grid = Link.Grid();
+    int Lmu = grid->GlobalDimensions()[mu] - 1;
+
+    Lattice<iScalar<vInteger>> coor(grid);
+    LatticeCoordinate(coor, mu);
+
+    Lattice<gauge> tmp(grid);
+    tmp = adj(Link);
+    tmp = where(coor == Lmu, conjugate(tmp), tmp);
+    return Cshift(tmp, mu, -1); // moves towards positive mu
+  }
+  template<class gauge> Lattice<gauge>
+  CovShiftIdentityForward(const Lattice<gauge> &Link, int mu) {
+    return Link;
+  }
+
+  template<class gauge> Lattice<gauge>
+  ShiftStaple(const Lattice<gauge> &Link, int mu)
+  {
+    GridBase *grid = Link.Grid();
+    int Lmu = grid->GlobalDimensions()[mu] - 1;
+
+    Lattice<iScalar<vInteger>> coor(grid);
+    LatticeCoordinate(coor, mu);
+
+    Lattice<gauge> tmp(grid);
+    tmp = Cshift(Link, mu, 1);
+    tmp = where(coor == Lmu, conjugate(tmp), tmp);
+    return tmp;
+  }
   template<class gauge,class Expr,typename std::enable_if<is_lattice_expr<Expr>::value,void>::type * = nullptr>
   auto CovShiftForward(const Lattice<gauge> &Link,
diff --git a/Grid/tensors/Tensor_Ta.h b/Grid/tensors/Tensor_Ta.h
index bbaa4a00..90e57b2b 100644
--- a/Grid/tensors/Tensor_Ta.h
+++ b/Grid/tensors/Tensor_Ta.h
@@ -117,7 +117,19 @@ accelerator_inline iMatrix<vtype,N> ProjectOnGroup(const iMatrix<vtype,N> &arg)
         ret._internal[b][c] -= pr * ret._internal[c1][c];
       }
     }
-
+  }
+
+  // Normalise last row
+  {
+    int c1 = N-1;
+    zeroit(inner);
+    for(int c2=0;c2<N;c2++)
+      inner += innerProduct(ret._internal[c1][c2],ret._internal[c1][c2]);
+
+    nrm = rsqrt(inner);
+    for(int c2=0;c2<N;c2++)
+      ret._internal[c1][c2]*= nrm;
+  }

   return ret;
 }
diff --git a/tests/core/Test_reunitarise.cc b/tests/core/Test_reunitarise.cc
--- a/tests/core/Test_reunitarise.cc
+++ b/tests/core/Test_reunitarise.cc
@@ -93,7 +93,8 @@ int main (int argc, char ** argv)
     auto element = PeekIndex<ColourIndex>(U,Nc-1,i);
     element = element * phase;
     PokeIndex<ColourIndex>(U,element,Nc-1,i);
-  }
+  }
+  U=U*0.1;
   UU=U;

   detU= Determinant(U) ;
diff --git a/tests/hmc/Test_hmc_EODWFRatio_Gparity.cc b/tests/hmc/Test_hmc_EODWFRatio_Gparity.cc
index 3434fccc..9ca0b0a0 100644
--- a/tests/hmc/Test_hmc_EODWFRatio_Gparity.cc
+++ b/tests/hmc/Test_hmc_EODWFRatio_Gparity.cc
@@ -81,6 +81,10 @@ int main(int argc, char **argv) {
   // that have a complex construction
   // standard
   RealD beta = 5.6 ;
+  const int nu = 3;
+  std::vector<int> twists(Nd,0);
+  twists[nu] = 1;
+  ConjugateGimplD::setDirections(twists);
   ConjugateWilsonGaugeActionR Waction(beta);

   const int Ls = 8;
@@ -93,9 +97,6 @@ int main(int argc, char **argv) {
   // temporarily need a gauge field
   LatticeGaugeField U(GridPtr);

-  const int nu = 3;
-  std::vector<int> twists(Nd,0);
-  twists[nu] = 1;
   FermionAction::ImplParams params;
   params.twists = twists;
   Real mass=0.04;
diff --git a/tests/hmc/Test_hmc_GparityIwasakiGauge.cc b/tests/hmc/Test_hmc_GparityIwasakiGauge.cc
index bc47b6c2..7f74d5d8 100644
--- a/tests/hmc/Test_hmc_GparityIwasakiGauge.cc
+++ b/tests/hmc/Test_hmc_GparityIwasakiGauge.cc
@@ -79,6 +79,10 @@ int main(int argc, char **argv) {
   // that have a complex construction
   // standard
   RealD beta = 2.6 ;
+  const int nu = 3;
+  std::vector<int> twists(Nd,0);
+  twists[nu] = 1;
+  ConjugateGimplD::setDirections(twists);
   ConjugateIwasakiGaugeActionR Waction(beta);

diff --git a/tests/hmc/Test_hmc_GparityWilsonGauge.cc b/tests/hmc/Test_hmc_GparityWilsonGauge.cc
index eb057181..b8c078fe 100644
--- a/tests/hmc/Test_hmc_GparityWilsonGauge.cc
+++ b/tests/hmc/Test_hmc_GparityWilsonGauge.cc
@@ -80,6 +80,9 @@ int main(int argc, char **argv) {
   // that have a complex construction
   // standard
   RealD beta = 5.6 ;
+  std::vector<int> twists(Nd,0);
+  twists[3] = 1;
+  ConjugateGimplD::setDirections(twists);
   ConjugateWilsonGaugeActionR Waction(beta);
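The Tensor_Ta.h hunk above finishes the row-wise Gram-Schmidt in ProjectOnGroup with an explicit unit-normalisation of the final row, guaranteeing the last row leaves the projection sweep with unit length. A standalone sketch of the same procedure on a plain complex matrix (self-contained, no Grid types; values illustrative):

    #include <array>
    #include <cassert>
    #include <cmath>
    #include <complex>

    using cplx = std::complex<double>;
    constexpr int N = 3;
    using Mat = std::array<std::array<cplx, N>, N>;

    // Row-wise Gram-Schmidt mirroring ProjectOnGroup: normalise row c1, then
    // project it out of the rows below it.
    void orthonormaliseRows(Mat &m)
    {
      for (int c1 = 0; c1 < N; c1++) {
        double inner = 0;
        for (int c2 = 0; c2 < N; c2++) inner += std::norm(m[c1][c2]);
        double nrm = 1.0 / std::sqrt(inner);
        for (int c2 = 0; c2 < N; c2++) m[c1][c2] *= nrm;

        for (int b = c1 + 1; b < N; b++) {
          cplx pr = 0;
          for (int c = 0; c < N; c++) pr += std::conj(m[c1][c]) * m[b][c];
          for (int c = 0; c < N; c++) m[b][c] -= pr * m[c1][c];
        }
      }
      // Normalise last row: for this scalar loop it is a safeguard (the loop
      // already normalised row N-1 up to rounding); it mirrors the explicit
      // guarantee the patch adds to the tensor code.
      double inner = 0;
      for (int c2 = 0; c2 < N; c2++) inner += std::norm(m[N-1][c2]);
      double nrm = 1.0 / std::sqrt(inner);
      for (int c2 = 0; c2 < N; c2++) m[N-1][c2] *= nrm;
    }

    int main()
    {
      Mat m = {{
        {{ cplx(1.0,0.2), cplx(0.3,0.0), cplx(0.0,0.0) }},
        {{ cplx(0.0,0.0), cplx(2.0,0.0), cplx(0.1,0.0) }},
        {{ cplx(0.5,0.0), cplx(0.0,0.0), cplx(3.0,0.4) }}
      }};
      orthonormaliseRows(m);
      // Check the rows form a unitary set: |<row_a,row_b> - delta_ab| small.
      for (int a = 0; a < N; a++)
        for (int b = 0; b < N; b++) {
          cplx d = 0;
          for (int c = 0; c < N; c++) d += std::conj(m[a][c]) * m[b][c];
          assert(std::abs(d - cplx(a == b ? 1.0 : 0.0)) < 1e-12);
        }
      return 0;
    }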
From 3c23a947cc4e22b6c01afd9eac5d5a4add9035c7 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Fri, 15 Jan 2021 09:16:02 -0500
Subject: [PATCH 15/16] Fixed test for very much non-unit det

---
 tests/core/Test_reunitarise.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/core/Test_reunitarise.cc b/tests/core/Test_reunitarise.cc
index af164a75..6644be1a 100644
--- a/tests/core/Test_reunitarise.cc
+++ b/tests/core/Test_reunitarise.cc
@@ -103,7 +103,7 @@ int main (int argc, char ** argv)
   detU= Determinant(U) ;
   detU=detU-1.0;
-  std::cout << "Determinant before screw up " << norm2(detU)<<std::endl;

Date: Tue, 19 Jan 2021 12:32:48 +0000
Subject: [PATCH 16/16] revert changes

---
 tests/solver/Test_zMADWF_prec.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/solver/Test_zMADWF_prec.cc b/tests/solver/Test_zMADWF_prec.cc
index f18e1d86..d1168764 100644
--- a/tests/solver/Test_zMADWF_prec.cc
+++ b/tests/solver/Test_zMADWF_prec.cc
@@ -52,7 +52,7 @@ struct TestParams{
   bool zmobius_inner;
   double lambda_max; //upper bound of H_T eigenvalue range required to generate zMobius approximation

-  TestParams(): load_config(false), config_file("ckpoint_lat.1000"), mass(0.01),
+  TestParams(): load_config(true), config_file("ckpoint_lat.1000"), mass(0.01),
                Ls_outer(24), b_plus_c_outer(2.0), resid_outer(1e-8),
                Ls_inner(12), b_plus_c_inner(1.0), resid_inner(1e-8), zmobius_inner(true),
                lambda_max(1.42), outer_precon("Standard"), inner_precon("Standard")
   {}
@@ -246,7 +246,7 @@ void run(const TestParams &params){
   typename RunParamsInner::SchurSolverType SchurSolver_inner(CG_inner);

   ZeroGuesser<LatticeFermionD> Guess;
-  MADWF<MobiusFermionD, ZMobiusFermionD, PVtype, typename RunParamsInner::SchurSolverType, ZeroGuesser<LatticeFermionD> > madwf(D_outer, D_inner, PV_outer, SchurSolver_inner, Guess, params.resid_outer, 10000, &update);
+  MADWF<MobiusFermionD, ZMobiusFermionD, PVtype, typename RunParamsInner::SchurSolverType, ZeroGuesser<LatticeFermionD> > madwf(D_outer, D_inner, PV_outer, SchurSolver_inner, Guess, params.resid_outer, 100, &update);

   LatticeFermionD result_MADWF(FGrid_outer);
   result_MADWF = Zero();