1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-04-04 19:25:56 +01:00

Merge pull request #21 from paboyle/develop

Sync
This commit is contained in:
Christoph Lehner 2020-12-22 20:59:15 +01:00 committed by GitHub
commit 299d0de066
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
28 changed files with 497 additions and 4489 deletions

View File

@ -37,7 +37,9 @@ directory
#endif
//disables and intel compiler specific warning (in json.hpp)
#ifdef __ICC
#pragma warning disable 488
#endif
#ifdef __NVCC__
//disables nvcc specific warning in json.hpp

View File

@ -21,6 +21,7 @@ if BUILD_HDF5
extra_headers+=serialisation/Hdf5Type.h
endif
all: version-cache Version.h
version-cache:
@ -53,6 +54,17 @@ Version.h: version-cache
include Make.inc
include Eigen.inc
extra_sources+=$(ZWILS_FERMION_FILES)
extra_sources+=$(WILS_FERMION_FILES)
extra_sources+=$(STAG_FERMION_FILES)
if BUILD_GPARITY
extra_sources+=$(GP_FERMION_FILES)
endif
if BUILD_FERMION_REPS
extra_sources+=$(ADJ_FERMION_FILES)
extra_sources+=$(TWOIND_FERMION_FILES)
endif
lib_LIBRARIES = libGrid.a
CCFILES += $(extra_sources)

View File

@ -1,67 +0,0 @@
#include <Grid/GridCore.h>
#include <fcntl.h>
NAMESPACE_BEGIN(Grid);
MemoryStats *MemoryProfiler::stats = nullptr;
bool MemoryProfiler::debug = false;
void check_huge_pages(void *Buf,uint64_t BYTES)
{
#ifdef __linux__
int fd = open("/proc/self/pagemap", O_RDONLY);
assert(fd >= 0);
const int page_size = 4096;
uint64_t virt_pfn = (uint64_t)Buf / page_size;
off_t offset = sizeof(uint64_t) * virt_pfn;
uint64_t npages = (BYTES + page_size-1) / page_size;
uint64_t pagedata[npages];
uint64_t ret = lseek(fd, offset, SEEK_SET);
assert(ret == offset);
ret = ::read(fd, pagedata, sizeof(uint64_t)*npages);
assert(ret == sizeof(uint64_t) * npages);
int nhugepages = npages / 512;
int n4ktotal, nnothuge;
n4ktotal = 0;
nnothuge = 0;
for (int i = 0; i < nhugepages; ++i) {
uint64_t baseaddr = (pagedata[i*512] & 0x7fffffffffffffULL) * page_size;
for (int j = 0; j < 512; ++j) {
uint64_t pageaddr = (pagedata[i*512+j] & 0x7fffffffffffffULL) * page_size;
++n4ktotal;
if (pageaddr != baseaddr + j * page_size)
++nnothuge;
}
}
int rank = CartesianCommunicator::RankWorld();
printf("rank %d Allocated %d 4k pages, %d not in huge pages\n", rank, n4ktotal, nnothuge);
#endif
}
std::string sizeString(const size_t bytes)
{
constexpr unsigned int bufSize = 256;
const char *suffixes[7] = {"", "K", "M", "G", "T", "P", "E"};
char buf[256];
size_t s = 0;
double count = bytes;
while (count >= 1024 && s < 7)
{
s++;
count /= 1024;
}
if (count - floor(count) == 0.0)
{
snprintf(buf, bufSize, "%d %sB", (int)count, suffixes[s]);
}
else
{
snprintf(buf, bufSize, "%.1f %sB", count, suffixes[s]);
}
return std::string(buf);
}
NAMESPACE_END(Grid);

View File

@ -102,7 +102,7 @@ public:
///////////////////////////////////////////////////
static void SharedMemoryAllocate(uint64_t bytes, int flags);
static void SharedMemoryFree(void);
static void SharedMemoryCopy(void *dest,const void *src,size_t bytes);
static void SharedMemoryCopy(void *dest,void *src,size_t bytes);
static void SharedMemoryZero(void *dest,size_t bytes);
};

View File

@ -715,7 +715,7 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
bzero(dest,bytes);
#endif
}
void GlobalSharedMemory::SharedMemoryCopy(void *dest,const void *src,size_t bytes)
void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
{
#ifdef GRID_CUDA
cudaMemcpy(dest,src,bytes,cudaMemcpyDefault);

View File

@ -29,6 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/GridCore.h>
NAMESPACE_BEGIN(Grid);
#define header "SharedMemoryNone: "
/*Construct from an MPI communicator*/
void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
@ -55,6 +56,38 @@ void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_M
////////////////////////////////////////////////////////////////////////////////////////////
// Hugetlbfs mapping intended, use anonymous mmap
////////////////////////////////////////////////////////////////////////////////////////////
#if 1
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
std::cout << header "SharedMemoryAllocate "<< bytes<< " GPU implementation "<<std::endl;
void * ShmCommBuf ;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Each MPI rank should allocate our own buffer
///////////////////////////////////////////////////////////////////////////////////////////////////////////
ShmCommBuf = acceleratorAllocDevice(bytes);
if (ShmCommBuf == (void *)NULL ) {
std::cerr << " SharedMemoryNone.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
exit(EXIT_FAILURE);
}
if ( WorldRank == 0 ){
std::cout << WorldRank << header " SharedMemoryNone.cc acceleratorAllocDevice "<< bytes
<< "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
}
SharedMemoryZero(ShmCommBuf,bytes);
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Loop over ranks/gpu's on our node
///////////////////////////////////////////////////////////////////////////////////////////////////////////
WorldShmCommBufs[0] = ShmCommBuf;
_ShmAllocBytes=bytes;
_ShmAlloc=1;
}
#else
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
void * ShmCommBuf ;
@ -83,7 +116,15 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
_ShmAllocBytes=bytes;
_ShmAlloc=1;
};
#endif
void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
{
acceleratorMemSet(dest,0,bytes);
}
void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
{
acceleratorCopyToDevice(src,dest,bytes);
}
////////////////////////////////////////////////////////
// Global shared functionality finished
// Now move to per communicator functionality

View File

@ -62,7 +62,7 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
basis_v.push_back(basis[k].View(AcceleratorWrite));
}
#if ( (!defined(GRID_SYCL)) && (!defined(GRID_CUDA)) )
#if ( (!defined(GRID_CUDA)) )
int max_threads = thread_max();
Vector < vobj > Bt(Nm * max_threads);
thread_region
@ -164,7 +164,8 @@ void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,in
auto basis_vp=& basis_v[0];
autoView(result_v,result,AcceleratorWrite);
accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
auto B=coalescedRead(zz);
vobj zzz=Zero();
auto B=coalescedRead(zzz);
for(int k=k0; k<k1; ++k){
B +=Qt_j[k] * coalescedRead(basis_vp[k][ss]);
}

View File

@ -97,42 +97,30 @@ public:
Coordinate icoor;
#ifdef GRID_SIMT
_Spinor tmp;
const int Nsimd =SiteDoubledGaugeField::Nsimd();
int s = acceleratorSIMTlane(Nsimd);
St.iCoorFromIindex(icoor,s);
int mmu = mu % Nd;
if ( SE->_around_the_world && St.parameters.twists[mmu] ) {
int permute_lane = (sl==1)
|| ((distance== 1)&&(icoor[direction]==1))
|| ((distance==-1)&&(icoor[direction]==0));
if ( permute_lane ) {
tmp(0) = chi(1);
tmp(1) = chi(0);
} else {
tmp(0) = chi(0);
tmp(1) = chi(1);
}
auto UU0=coalescedRead(U(0)(mu));
auto UU1=coalescedRead(U(1)(mu));
//Decide whether we do a G-parity flavor twist
//Note: this assumes (but does not check) that sl==1 || sl==2 i.e. max 2 SIMD lanes in G-parity dir
//It also assumes (but does not check) that abs(distance) == 1
int permute_lane = (sl==1)
|| ((distance== 1)&&(icoor[direction]==1))
|| ((distance==-1)&&(icoor[direction]==0));
auto UU0=coalescedRead(U(0)(mu));
auto UU1=coalescedRead(U(1)(mu));
permute_lane = permute_lane && SE->_around_the_world && St.parameters.twists[mmu]; //only if we are going around the world
mult(&phi(0),&UU0,&tmp(0));
mult(&phi(1),&UU1,&tmp(1));
//Apply the links
int f_upper = permute_lane ? 1 : 0;
int f_lower = !f_upper;
} else {
auto UU0=coalescedRead(U(0)(mu));
auto UU1=coalescedRead(U(1)(mu));
mult(&phi(0),&UU0,&chi(0));
mult(&phi(1),&UU1,&chi(1));
}
mult(&phi(0),&UU0,&chi(f_upper));
mult(&phi(1),&UU1,&chi(f_lower));
#else
typedef _Spinor vobj;

View File

@ -642,7 +642,7 @@ void CayleyFermion5D<Impl>::ContractConservedCurrent( PropagatorField &q_in_1,
Current curr_type,
unsigned int mu)
{
#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))
#if (!defined(GRID_HIP))
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
@ -826,7 +826,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
}
#endif
#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))
#if (!defined(GRID_HIP))
int tshift = (mu == Nd-1) ? 1 : 0;
////////////////////////////////////////////////
// GENERAL CAYLEY CASE

View File

@ -38,9 +38,6 @@ Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
// undefine everything related to kernels
#include <simd/Fujitsu_A64FX_undef.h>
// enable A64FX body
#define WILSONKERNELSASMBODYA64FX
//#pragma message("A64FX Dslash: WilsonKernelsAsmBodyA64FX.h")
///////////////////////////////////////////////////////////
// If we are A64FX specialise the single precision routine
@ -63,119 +60,89 @@ Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
#define INTERIOR_AND_EXTERIOR
#undef INTERIOR
#undef EXTERIOR
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<WilsonImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<ZWilsonImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<WilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<ZWilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#undef INTERIOR_AND_EXTERIOR
#define INTERIOR
#undef EXTERIOR
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<WilsonImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<ZWilsonImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<WilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#undef INTERIOR_AND_EXTERIOR
#undef INTERIOR
#define EXTERIOR
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<WilsonImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<ZWilsonImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<WilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
/////////////////////////////////////////////////////////////////
@ -185,119 +152,89 @@ WilsonKernels<ZWilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldV
#define INTERIOR_AND_EXTERIOR
#undef INTERIOR
#undef EXTERIOR
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<WilsonImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<ZWilsonImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<WilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#undef INTERIOR_AND_EXTERIOR
#define INTERIOR
#undef EXTERIOR
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<WilsonImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<WilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#undef INTERIOR_AND_EXTERIOR
#undef INTERIOR
#define EXTERIOR
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<WilsonImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<WilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
// undefine
@ -330,119 +267,89 @@ WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFie
#define INTERIOR_AND_EXTERIOR
#undef INTERIOR
#undef EXTERIOR
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<WilsonImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<ZWilsonImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<WilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<ZWilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#undef INTERIOR_AND_EXTERIOR
#define INTERIOR
#undef EXTERIOR
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<WilsonImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<ZWilsonImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<WilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#undef INTERIOR_AND_EXTERIOR
#undef INTERIOR
#define EXTERIOR
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<WilsonImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<ZWilsonImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<WilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
/////////////////////////////////////////////////////////////////
// XYZT vectorised, dag Kernel, double
@ -451,124 +358,93 @@ WilsonKernels<ZWilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldV
#define INTERIOR_AND_EXTERIOR
#undef INTERIOR
#undef EXTERIOR
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<WilsonImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<ZWilsonImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<WilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#undef INTERIOR_AND_EXTERIOR
#define INTERIOR
#undef EXTERIOR
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<WilsonImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<WilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#undef INTERIOR_AND_EXTERIOR
#undef INTERIOR
#define EXTERIOR
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<WilsonImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<WilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
// undefs
#undef WILSONKERNELSASMBODYA64FX
#include <simd/Fujitsu_A64FX_undef.h>
#endif //A64FXASM

View File

@ -25,6 +25,11 @@ Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
// GCC 10 messes up SVE instruction scheduling using -O3, but
// -O3 -fno-schedule-insns -fno-schedule-insns2 does wonders
// performance now is better than armclang 20.2
#ifdef KERNEL_DAG
#define DIR0_PROJ XP_PROJ
#define DIR1_PROJ YP_PROJ
@ -97,7 +102,7 @@ Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
PROJ; \
MAYBEPERM(PERMUTE_DIR,perm); \
} else { \
LOAD_CHI(base); \
LOAD_CHI(base); \
} \
base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \
MULT_2SPIN_1(Dir); \
@ -110,6 +115,11 @@ Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
} \
RECON; \
/*
NB: picking PREFETCH_GAUGE_L2(Dir+4); here results in performance penalty
though I expected that it would improve on performance
*/
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
PREFETCH1_CHIMU(base); \
@ -126,73 +136,63 @@ Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
basep = st.GetPFInfo(nent,plocal); nent++; \
if ( local ) { \
LOAD_CHIMU(base); \
LOAD_TABLE(PERMUTE_DIR); \
PROJ; \
MAYBEPERM(PERMUTE_DIR,perm); \
}else if ( st.same_node[Dir] ) {LOAD_CHI(base);} \
base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \
if ( local || st.same_node[Dir] ) { \
MULT_2SPIN_1(Dir); \
PREFETCH_CHIMU(base); \
/* PREFETCH_GAUGE_L1(NxtDir); */ \
MULT_2SPIN_2; \
if (s == 0) { \
if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \
} \
RECON; \
PREFETCH_CHIMU_L2(basep); \
} else { PREFETCH_CHIMU(base); } \
if ( local ) { \
LOAD_CHIMU(base); \
LOAD_TABLE(PERMUTE_DIR); \
PROJ; \
MAYBEPERM(PERMUTE_DIR,perm); \
}else if ( st.same_node[Dir] ) {LOAD_CHI(base);} \
if ( local || st.same_node[Dir] ) { \
MULT_2SPIN_1(Dir); \
MULT_2SPIN_2; \
RECON; \
} \
base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \
PREFETCH_CHIMU(base); \
PREFETCH_CHIMU_L2(basep); \
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
PREFETCH1_CHIMU(base); \
{ ZERO_PSI; } \
ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)
#define RESULT(base,basep) SAVE_RESULT(base,basep);
#endif
////////////////////////////////////////////////////////////////////////////////
// Post comms kernel
////////////////////////////////////////////////////////////////////////////////
#ifdef EXTERIOR
#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
if((!local)&&(!st.same_node[Dir]) ) { \
LOAD_CHI(base); \
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
if((!local)&&(!st.same_node[Dir]) ) { \
LOAD_CHI(base); \
MULT_2SPIN_1(Dir); \
PREFETCH_CHIMU(base); \
/* PREFETCH_GAUGE_L1(NxtDir); */ \
MULT_2SPIN_2; \
if (s == 0) { \
if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \
} \
RECON; \
nmu++; \
RECON; \
nmu++; \
}
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
nmu=0; \
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++;\
if((!local)&&(!st.same_node[Dir]) ) { \
LOAD_CHI(base); \
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
nmu=0; \
{ ZERO_PSI;} \
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
if((!local)&&(!st.same_node[Dir]) ) { \
LOAD_CHI(base); \
MULT_2SPIN_1(Dir); \
PREFETCH_CHIMU(base); \
/* PREFETCH_GAUGE_L1(NxtDir); */ \
MULT_2SPIN_2; \
if (s == 0) { \
if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \
} \
RECON; \
nmu++; \
RECON; \
nmu++; \
}
#define RESULT(base,basep) if (nmu){ ADD_RESULT(base,base);}
#endif
{
int nmu;
int local,perm, ptype;
@ -209,7 +209,6 @@ Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
int ssn=ssU+1; if(ssn>=nmax) ssn=0;
// int sUn=lo.Reorder(ssn);
int sUn=ssn;
LOCK_GAUGE(0);
#else
int sU =ssU;
int ssn=ssU+1; if(ssn>=nmax) ssn=0;
@ -295,6 +294,11 @@ Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
std::cout << "----------------------------------------------------" << std::endl;
#endif
// DC ZVA test
// { uint64_t basestore = (uint64_t)&out[ss];
// PREFETCH_RESULT_L2_STORE(basestore); }
ASM_LEG(Ym,Zm,PERMUTE_DIR2,DIR5_PROJ,DIR5_RECON);
#ifdef SHOW
@ -308,6 +312,11 @@ Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
std::cout << "----------------------------------------------------" << std::endl;
#endif
// DC ZVA test
//{ uint64_t basestore = (uint64_t)&out[ss];
// PREFETCH_RESULT_L2_STORE(basestore); }
ASM_LEG(Zm,Tm,PERMUTE_DIR1,DIR6_PROJ,DIR6_RECON);
#ifdef SHOW
@ -321,6 +330,11 @@ Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
std::cout << "----------------------------------------------------" << std::endl;
#endif
// DC ZVA test
//{ uint64_t basestore = (uint64_t)&out[ss];
// PREFETCH_RESULT_L2_STORE(basestore); }
ASM_LEG(Tm,Xp,PERMUTE_DIR0,DIR7_PROJ,DIR7_RECON);
#ifdef SHOW
@ -341,6 +355,7 @@ Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
base = (uint64_t) &out[ss];
basep= st.GetPFInfo(nent,plocal); ent++;
basep = (uint64_t) &out[ssn];
//PREFETCH_RESULT_L1_STORE(base);
RESULT(base,basep);
#ifdef SHOW

View File

@ -154,6 +154,10 @@ public:
return Hsum.real();
}
static inline void Project(Field &U) {
ProjectSUn(U);
}
static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) {
SU<Nc>::HotConfiguration(pRNG, U);
}

View File

@ -54,6 +54,10 @@ public:
static inline void ColdConfiguration(GridParallelRNG &pRNG, Field &U) {
U = 1.0;
}
static inline void Project(Field &U) {
return;
}
static void MomentumSpacePropagator(Field &out, RealD m)
{
@ -234,6 +238,10 @@ public:
#endif //USE_FFT_ACCELERATION
}
static inline void Project(Field &U) {
return;
}
static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) {
Group::GaussianFundamentalLieAlgebraMatrix(pRNG, U);
}

View File

@ -95,7 +95,7 @@ private:
typedef typename IntegratorType::Field Field;
typedef std::vector< HmcObservable<Field> * > ObsListType;
//pass these from the resource manager
GridSerialRNG &sRNG;
GridParallelRNG &pRNG;

View File

@ -313,6 +313,8 @@ public:
std::cout << GridLogIntegrator << " times[" << level << "]= " << t_P[level] << " " << t_U << std::endl;
}
FieldImplementation::Project(U);
// and that we indeed got to the end of the trajectory
assert(fabs(t_U - Params.trajL) < 1.0e-6);

View File

@ -820,7 +820,6 @@ LatticeComplexD Determinant(const Lattice<iScalar<iScalar<iMatrix<vComplexD, N>
}}
ComplexD det = EigenU.determinant();
pokeLocalSite(det,ret_v,lcoor);
std::cout << " site " <<site<<" det " <<det <<std::endl;
});
return ret;
}
@ -830,8 +829,8 @@ static void ProjectSUn(Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > &Umu)
Umu = ProjectOnGroup(Umu);
auto det = Determinant(Umu);
det = pow(det,-1);
det = conjugate(det);
for(int i=0;i<N;i++){
auto element = PeekIndex<ColourIndex>(Umu,N-1,i);
element = element * det;

View File

@ -1,779 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Fujitsu_A64FX_asm_double.h
Copyright (C) 2020
Author: Nils Meyer <nils.meyer@ur.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#define LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_A64FXd(base)
#define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXd(A)
#define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A)
#define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXd(A)
#define PREFETCH_GAUGE_L2(A) PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A)
#define PF_GAUGE(A)
#define PREFETCH_RESULT_L2_STORE(A) PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXd(A)
#define PREFETCH_RESULT_L1_STORE(A) PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXd(A)
#define PREFETCH1_CHIMU(A) PREFETCH_CHIMU_L1(A)
#define PREFETCH_CHIMU(A) PREFETCH_CHIMU_L1(A)
#define LOCK_GAUGE(A)
#define UNLOCK_GAUGE(A)
#define MASK_REGS DECLARATIONS_A64FXd
#define SAVE_RESULT(A,B) RESULT_A64FXd(A); PREFETCH_RESULT_L2_STORE(B)
#define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXd(Dir)
#define MULT_2SPIN_2 MULT_2SPIN_2_A64FXd
#define LOAD_CHI(base) LOAD_CHI_A64FXd(base)
#define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base)
#define XP_PROJ XP_PROJ_A64FXd
#define YP_PROJ YP_PROJ_A64FXd
#define ZP_PROJ ZP_PROJ_A64FXd
#define TP_PROJ TP_PROJ_A64FXd
#define XM_PROJ XM_PROJ_A64FXd
#define YM_PROJ YM_PROJ_A64FXd
#define ZM_PROJ ZM_PROJ_A64FXd
#define TM_PROJ TM_PROJ_A64FXd
#define XP_RECON XP_RECON_A64FXd
#define XM_RECON XM_RECON_A64FXd
#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXd
#define YM_RECON_ACCUM YM_RECON_ACCUM_A64FXd
#define ZM_RECON_ACCUM ZM_RECON_ACCUM_A64FXd
#define TM_RECON_ACCUM TM_RECON_ACCUM_A64FXd
#define XP_RECON_ACCUM XP_RECON_ACCUM_A64FXd
#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXd
#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXd
#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXd
#define PERMUTE_DIR0 0
#define PERMUTE_DIR1 1
#define PERMUTE_DIR2 2
#define PERMUTE_DIR3 3
#define PERMUTE PERMUTE_A64FXd;
#define LOAD_TABLE(Dir) if (Dir == 0) { LOAD_TABLE0; } else if (Dir == 1) { LOAD_TABLE1; } else if (Dir == 2) { LOAD_TABLE2; }
#define MAYBEPERM(Dir,perm) if (Dir != 3) { if (perm) { PERMUTE; } }
// DECLARATIONS
#define DECLARATIONS_A64FXd \
const uint64_t lut[4][8] = { \
{4, 5, 6, 7, 0, 1, 2, 3}, \
{2, 3, 0, 1, 6, 7, 4, 5}, \
{1, 0, 3, 2, 5, 4, 7, 6}, \
{0, 1, 2, 4, 5, 6, 7, 8} };\
asm ( \
"fmov z31.d , 0 \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// RESULT
#define RESULT_A64FXd(base) \
{ \
asm ( \
"str z0, [%[storeptr], -6, mul vl] \n\t" \
"str z1, [%[storeptr], -5, mul vl] \n\t" \
"str z2, [%[storeptr], -4, mul vl] \n\t" \
"str z3, [%[storeptr], -3, mul vl] \n\t" \
"str z4, [%[storeptr], -2, mul vl] \n\t" \
"str z5, [%[storeptr], -1, mul vl] \n\t" \
"str z6, [%[storeptr], 0, mul vl] \n\t" \
"str z7, [%[storeptr], 1, mul vl] \n\t" \
"str z8, [%[storeptr], 2, mul vl] \n\t" \
"str z9, [%[storeptr], 3, mul vl] \n\t" \
"str z10, [%[storeptr], 4, mul vl] \n\t" \
"str z11, [%[storeptr], 5, mul vl] \n\t" \
: \
: [storeptr] "r" (base + 2 * 3 * 64) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// PREFETCH_CHIMU_L2 (prefetch to L2)
#define PREFETCH_CHIMU_L2_INTERNAL_A64FXd(base) \
{ \
asm ( \
"prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
"prfd PLDL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
"prfd PLDL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
: \
: [fetchptr] "r" (base) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// PREFETCH_CHIMU_L1 (prefetch to L1)
#define PREFETCH_CHIMU_L1_INTERNAL_A64FXd(base) \
{ \
asm ( \
"prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
"prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
"prfd PLDL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
: \
: [fetchptr] "r" (base) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// PREFETCH_GAUGE_L2 (prefetch to L2)
#define PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A) \
{ \
const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \
asm ( \
"prfd PLDL2STRM, p5, [%[fetchptr], -4, mul vl] \n\t" \
"prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
"prfd PLDL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
"prfd PLDL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
"prfd PLDL2STRM, p5, [%[fetchptr], 12, mul vl] \n\t" \
"prfd PLDL2STRM, p5, [%[fetchptr], 16, mul vl] \n\t" \
"prfd PLDL2STRM, p5, [%[fetchptr], 20, mul vl] \n\t" \
"prfd PLDL2STRM, p5, [%[fetchptr], 24, mul vl] \n\t" \
"prfd PLDL2STRM, p5, [%[fetchptr], 28, mul vl] \n\t" \
: \
: [fetchptr] "r" (baseU) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// PREFETCH_GAUGE_L1 (prefetch to L1)
#define PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A) \
{ \
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
asm ( \
"prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
"prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
"prfd PLDL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
: \
: [fetchptr] "r" (baseU) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// LOAD_CHI
#define LOAD_CHI_A64FXd(base) \
{ \
asm ( \
"ldr z12, [%[fetchptr], 0, mul vl] \n\t" \
"ldr z13, [%[fetchptr], 1, mul vl] \n\t" \
"ldr z14, [%[fetchptr], 2, mul vl] \n\t" \
"ldr z15, [%[fetchptr], 3, mul vl] \n\t" \
"ldr z16, [%[fetchptr], 4, mul vl] \n\t" \
"ldr z17, [%[fetchptr], 5, mul vl] \n\t" \
: \
: [fetchptr] "r" (base) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// LOAD_CHIMU
#define LOAD_CHIMU_INTERLEAVED_A64FXd(base) \
{ \
asm ( \
"ptrue p5.d \n\t" \
"ldr z12, [%[fetchptr], -6, mul vl] \n\t" \
"ldr z21, [%[fetchptr], 3, mul vl] \n\t" \
"ldr z15, [%[fetchptr], -3, mul vl] \n\t" \
"ldr z18, [%[fetchptr], 0, mul vl] \n\t" \
"ldr z13, [%[fetchptr], -5, mul vl] \n\t" \
"ldr z22, [%[fetchptr], 4, mul vl] \n\t" \
"ldr z16, [%[fetchptr], -2, mul vl] \n\t" \
"ldr z19, [%[fetchptr], 1, mul vl] \n\t" \
"ldr z14, [%[fetchptr], -4, mul vl] \n\t" \
"ldr z23, [%[fetchptr], 5, mul vl] \n\t" \
"ldr z17, [%[fetchptr], -1, mul vl] \n\t" \
"ldr z20, [%[fetchptr], 2, mul vl] \n\t" \
: \
: [fetchptr] "r" (base + 2 * 3 * 64) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// LOAD_CHIMU_0213
#define LOAD_CHIMU_0213_A64FXd \
{ \
const SiteSpinor & ref(in[offset]); \
asm ( \
"ptrue p5.d \n\t" \
"ldr z12, [%[fetchptr], -6, mul vl] \n\t" \
"ldr z18, [%[fetchptr], 0, mul vl] \n\t" \
"ldr z13, [%[fetchptr], -5, mul vl] \n\t" \
"ldr z19, [%[fetchptr], 1, mul vl] \n\t" \
"ldr z14, [%[fetchptr], -4, mul vl] \n\t" \
"ldr z20, [%[fetchptr], 2, mul vl] \n\t" \
"ldr z15, [%[fetchptr], -3, mul vl] \n\t" \
"ldr z21, [%[fetchptr], 3, mul vl] \n\t" \
"ldr z16, [%[fetchptr], -2, mul vl] \n\t" \
"ldr z22, [%[fetchptr], 4, mul vl] \n\t" \
"ldr z17, [%[fetchptr], -1, mul vl] \n\t" \
"ldr z23, [%[fetchptr], 5, mul vl] \n\t" \
: \
: [fetchptr] "r" (&ref[2][0]) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// LOAD_CHIMU_0312
#define LOAD_CHIMU_0312_A64FXd \
{ \
const SiteSpinor & ref(in[offset]); \
asm ( \
"ptrue p5.d \n\t" \
"ldr z12, [%[fetchptr], -6, mul vl] \n\t" \
"ldr z21, [%[fetchptr], 3, mul vl] \n\t" \
"ldr z13, [%[fetchptr], -5, mul vl] \n\t" \
"ldr z22, [%[fetchptr], 4, mul vl] \n\t" \
"ldr z14, [%[fetchptr], -4, mul vl] \n\t" \
"ldr z23, [%[fetchptr], 5, mul vl] \n\t" \
"ldr z15, [%[fetchptr], -3, mul vl] \n\t" \
"ldr z18, [%[fetchptr], 0, mul vl] \n\t" \
"ldr z16, [%[fetchptr], -2, mul vl] \n\t" \
"ldr z19, [%[fetchptr], 1, mul vl] \n\t" \
"ldr z17, [%[fetchptr], -1, mul vl] \n\t" \
"ldr z20, [%[fetchptr], 2, mul vl] \n\t" \
: \
: [fetchptr] "r" (&ref[2][0]) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// LOAD_TABLE0
#define LOAD_TABLE0 \
asm ( \
"ldr z30, [%[tableptr], %[index], mul vl] \n\t" \
: \
: [tableptr] "r" (&lut[0]),[index] "i" (0) \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// LOAD_TABLE1
#define LOAD_TABLE1 \
asm ( \
"ldr z30, [%[tableptr], %[index], mul vl] \n\t" \
: \
: [tableptr] "r" (&lut[0]),[index] "i" (1) \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// LOAD_TABLE2
#define LOAD_TABLE2 \
asm ( \
"ldr z30, [%[tableptr], %[index], mul vl] \n\t" \
: \
: [tableptr] "r" (&lut[0]),[index] "i" (2) \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// LOAD_TABLE3
#define LOAD_TABLE3 \
asm ( \
"ldr z30, [%[tableptr], %[index], mul vl] \n\t" \
: \
: [tableptr] "r" (&lut[0]),[index] "i" (3) \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// PERMUTE
#define PERMUTE_A64FXd \
asm ( \
"tbl z12.d, { z12.d }, z30.d \n\t" \
"tbl z13.d, { z13.d }, z30.d \n\t" \
"tbl z14.d, { z14.d }, z30.d \n\t" \
"tbl z15.d, { z15.d }, z30.d \n\t" \
"tbl z16.d, { z16.d }, z30.d \n\t" \
"tbl z17.d, { z17.d }, z30.d \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// LOAD_GAUGE
#define LOAD_GAUGE \
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
{ \
asm ( \
"ptrue p5.d \n\t" \
"ldr z24, [%[fetchptr], -6, mul vl] \n\t" \
"ldr z25, [%[fetchptr], -3, mul vl] \n\t" \
"ldr z26, [%[fetchptr], 0, mul vl] \n\t" \
"ldr z27, [%[fetchptr], -5, mul vl] \n\t" \
"ldr z28, [%[fetchptr], -2, mul vl] \n\t" \
"ldr z29, [%[fetchptr], 1, mul vl] \n\t" \
: \
: [fetchptr] "r" (baseU + 2 * 3 * 64) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// MULT_2SPIN
#define MULT_2SPIN_1_A64FXd(A) \
{ \
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
asm ( \
"ldr z24, [%[fetchptr], -6, mul vl] \n\t" \
"ldr z25, [%[fetchptr], -3, mul vl] \n\t" \
"ldr z26, [%[fetchptr], 0, mul vl] \n\t" \
"ldr z27, [%[fetchptr], -5, mul vl] \n\t" \
"ldr z28, [%[fetchptr], -2, mul vl] \n\t" \
"ldr z29, [%[fetchptr], 1, mul vl] \n\t" \
"movprfx z18.d, p5/m, z31.d \n\t" \
"fcmla z18.d, p5/m, z24.d, z12.d, 0 \n\t" \
"movprfx z21.d, p5/m, z31.d \n\t" \
"fcmla z21.d, p5/m, z24.d, z15.d, 0 \n\t" \
"movprfx z19.d, p5/m, z31.d \n\t" \
"fcmla z19.d, p5/m, z25.d, z12.d, 0 \n\t" \
"movprfx z22.d, p5/m, z31.d \n\t" \
"fcmla z22.d, p5/m, z25.d, z15.d, 0 \n\t" \
"movprfx z20.d, p5/m, z31.d \n\t" \
"fcmla z20.d, p5/m, z26.d, z12.d, 0 \n\t" \
"movprfx z23.d, p5/m, z31.d \n\t" \
"fcmla z23.d, p5/m, z26.d, z15.d, 0 \n\t" \
"fcmla z18.d, p5/m, z24.d, z12.d, 90 \n\t" \
"fcmla z21.d, p5/m, z24.d, z15.d, 90 \n\t" \
"fcmla z19.d, p5/m, z25.d, z12.d, 90 \n\t" \
"fcmla z22.d, p5/m, z25.d, z15.d, 90 \n\t" \
"fcmla z20.d, p5/m, z26.d, z12.d, 90 \n\t" \
"fcmla z23.d, p5/m, z26.d, z15.d, 90 \n\t" \
"ldr z24, [%[fetchptr], -4, mul vl] \n\t" \
"ldr z25, [%[fetchptr], -1, mul vl] \n\t" \
"ldr z26, [%[fetchptr], 2, mul vl] \n\t" \
: \
: [fetchptr] "r" (baseU + 2 * 3 * 64) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// MULT_2SPIN_BACKEND
#define MULT_2SPIN_2_A64FXd \
{ \
asm ( \
"fcmla z18.d, p5/m, z27.d, z13.d, 0 \n\t" \
"fcmla z21.d, p5/m, z27.d, z16.d, 0 \n\t" \
"fcmla z19.d, p5/m, z28.d, z13.d, 0 \n\t" \
"fcmla z22.d, p5/m, z28.d, z16.d, 0 \n\t" \
"fcmla z20.d, p5/m, z29.d, z13.d, 0 \n\t" \
"fcmla z23.d, p5/m, z29.d, z16.d, 0 \n\t" \
"fcmla z18.d, p5/m, z27.d, z13.d, 90 \n\t" \
"fcmla z21.d, p5/m, z27.d, z16.d, 90 \n\t" \
"fcmla z19.d, p5/m, z28.d, z13.d, 90 \n\t" \
"fcmla z22.d, p5/m, z28.d, z16.d, 90 \n\t" \
"fcmla z20.d, p5/m, z29.d, z13.d, 90 \n\t" \
"fcmla z23.d, p5/m, z29.d, z16.d, 90 \n\t" \
"fcmla z18.d, p5/m, z24.d, z14.d, 0 \n\t" \
"fcmla z21.d, p5/m, z24.d, z17.d, 0 \n\t" \
"fcmla z19.d, p5/m, z25.d, z14.d, 0 \n\t" \
"fcmla z22.d, p5/m, z25.d, z17.d, 0 \n\t" \
"fcmla z20.d, p5/m, z26.d, z14.d, 0 \n\t" \
"fcmla z23.d, p5/m, z26.d, z17.d, 0 \n\t" \
"fcmla z18.d, p5/m, z24.d, z14.d, 90 \n\t" \
"fcmla z21.d, p5/m, z24.d, z17.d, 90 \n\t" \
"fcmla z19.d, p5/m, z25.d, z14.d, 90 \n\t" \
"fcmla z22.d, p5/m, z25.d, z17.d, 90 \n\t" \
"fcmla z20.d, p5/m, z26.d, z14.d, 90 \n\t" \
"fcmla z23.d, p5/m, z26.d, z17.d, 90 \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// XP_PROJ
#define XP_PROJ_A64FXd \
{ \
asm ( \
"fcadd z12.d, p5/m, z12.d, z21.d, 90 \n\t" \
"fcadd z13.d, p5/m, z13.d, z22.d, 90 \n\t" \
"fcadd z14.d, p5/m, z14.d, z23.d, 90 \n\t" \
"fcadd z15.d, p5/m, z15.d, z18.d, 90 \n\t" \
"fcadd z16.d, p5/m, z16.d, z19.d, 90 \n\t" \
"fcadd z17.d, p5/m, z17.d, z20.d, 90 \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// XP_RECON
#define XP_RECON_A64FXd \
asm ( \
"movprfx z6.d, p5/m, z31.d \n\t" \
"fcadd z6.d, p5/m, z6.d, z21.d, 270 \n\t" \
"movprfx z7.d, p5/m, z31.d \n\t" \
"fcadd z7.d, p5/m, z7.d, z22.d, 270 \n\t" \
"movprfx z8.d, p5/m, z31.d \n\t" \
"fcadd z8.d, p5/m, z8.d, z23.d, 270 \n\t" \
"movprfx z9.d, p5/m, z31.d \n\t" \
"fcadd z9.d, p5/m, z9.d, z18.d, 270 \n\t" \
"movprfx z10.d, p5/m, z31.d \n\t" \
"fcadd z10.d, p5/m, z10.d, z19.d, 270 \n\t" \
"movprfx z11.d, p5/m, z31.d \n\t" \
"fcadd z11.d, p5/m, z11.d, z20.d, 270 \n\t" \
"mov z0.d, p5/m, z18.d \n\t" \
"mov z1.d, p5/m, z19.d \n\t" \
"mov z2.d, p5/m, z20.d \n\t" \
"mov z3.d, p5/m, z21.d \n\t" \
"mov z4.d, p5/m, z22.d \n\t" \
"mov z5.d, p5/m, z23.d \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// XP_RECON_ACCUM
#define XP_RECON_ACCUM_A64FXd \
asm ( \
"fcadd z9.d, p5/m, z9.d, z18.d, 270 \n\t" \
"fadd z0.d, p5/m, z0.d, z18.d \n\t" \
"fcadd z10.d, p5/m, z10.d, z19.d, 270 \n\t" \
"fadd z1.d, p5/m, z1.d, z19.d \n\t" \
"fcadd z11.d, p5/m, z11.d, z20.d, 270 \n\t" \
"fadd z2.d, p5/m, z2.d, z20.d \n\t" \
"fcadd z6.d, p5/m, z6.d, z21.d, 270 \n\t" \
"fadd z3.d, p5/m, z3.d, z21.d \n\t" \
"fcadd z7.d, p5/m, z7.d, z22.d, 270 \n\t" \
"fadd z4.d, p5/m, z4.d, z22.d \n\t" \
"fcadd z8.d, p5/m, z8.d, z23.d, 270 \n\t" \
"fadd z5.d, p5/m, z5.d, z23.d \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// YP_PROJ
#define YP_PROJ_A64FXd \
{ \
asm ( \
"fsub z12.d, p5/m, z12.d, z21.d \n\t" \
"fsub z13.d, p5/m, z13.d, z22.d \n\t" \
"fsub z14.d, p5/m, z14.d, z23.d \n\t" \
"fadd z15.d, p5/m, z15.d, z18.d \n\t" \
"fadd z16.d, p5/m, z16.d, z19.d \n\t" \
"fadd z17.d, p5/m, z17.d, z20.d \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// ZP_PROJ
#define ZP_PROJ_A64FXd \
{ \
asm ( \
"fcadd z12.d, p5/m, z12.d, z18.d, 90 \n\t" \
"fcadd z13.d, p5/m, z13.d, z19.d, 90 \n\t" \
"fcadd z14.d, p5/m, z14.d, z20.d, 90 \n\t" \
"fcadd z15.d, p5/m, z15.d, z21.d, 270 \n\t" \
"fcadd z16.d, p5/m, z16.d, z22.d, 270 \n\t" \
"fcadd z17.d, p5/m, z17.d, z23.d, 270 \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// TP_PROJ
#define TP_PROJ_A64FXd \
{ \
asm ( \
"fadd z12.d, p5/m, z12.d, z18.d \n\t" \
"fadd z13.d, p5/m, z13.d, z19.d \n\t" \
"fadd z14.d, p5/m, z14.d, z20.d \n\t" \
"fadd z15.d, p5/m, z15.d, z21.d \n\t" \
"fadd z16.d, p5/m, z16.d, z22.d \n\t" \
"fadd z17.d, p5/m, z17.d, z23.d \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// XM_PROJ
#define XM_PROJ_A64FXd \
{ \
asm ( \
"fcadd z12.d, p5/m, z12.d, z21.d, 270 \n\t" \
"fcadd z13.d, p5/m, z13.d, z22.d, 270 \n\t" \
"fcadd z14.d, p5/m, z14.d, z23.d, 270 \n\t" \
"fcadd z15.d, p5/m, z15.d, z18.d, 270 \n\t" \
"fcadd z16.d, p5/m, z16.d, z19.d, 270 \n\t" \
"fcadd z17.d, p5/m, z17.d, z20.d, 270 \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// XM_RECON
#define XM_RECON_A64FXd \
asm ( \
"movprfx z6.d, p5/m, z31.d \n\t" \
"fcadd z6.d, p5/m, z6.d, z21.d, 90 \n\t" \
"movprfx z7.d, p5/m, z31.d \n\t" \
"fcadd z7.d, p5/m, z7.d, z22.d, 90 \n\t" \
"movprfx z8.d, p5/m, z31.d \n\t" \
"fcadd z8.d, p5/m, z8.d, z23.d, 90 \n\t" \
"movprfx z9.d, p5/m, z31.d \n\t" \
"fcadd z9.d, p5/m, z9.d, z18.d, 90 \n\t" \
"movprfx z10.d, p5/m, z31.d \n\t" \
"fcadd z10.d, p5/m, z10.d, z19.d, 90 \n\t" \
"movprfx z11.d, p5/m, z31.d \n\t" \
"fcadd z11.d, p5/m, z11.d, z20.d, 90 \n\t" \
"mov z0.d, p5/m, z18.d \n\t" \
"mov z1.d, p5/m, z19.d \n\t" \
"mov z2.d, p5/m, z20.d \n\t" \
"mov z3.d, p5/m, z21.d \n\t" \
"mov z4.d, p5/m, z22.d \n\t" \
"mov z5.d, p5/m, z23.d \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// YM_PROJ
#define YM_PROJ_A64FXd \
{ \
asm ( \
"fadd z12.d, p5/m, z12.d, z21.d \n\t" \
"fadd z13.d, p5/m, z13.d, z22.d \n\t" \
"fadd z14.d, p5/m, z14.d, z23.d \n\t" \
"fsub z15.d, p5/m, z15.d, z18.d \n\t" \
"fsub z16.d, p5/m, z16.d, z19.d \n\t" \
"fsub z17.d, p5/m, z17.d, z20.d \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// ZM_PROJ
#define ZM_PROJ_A64FXd \
{ \
asm ( \
"fcadd z12.d, p5/m, z12.d, z18.d, 270 \n\t" \
"fcadd z13.d, p5/m, z13.d, z19.d, 270 \n\t" \
"fcadd z14.d, p5/m, z14.d, z20.d, 270 \n\t" \
"fcadd z15.d, p5/m, z15.d, z21.d, 90 \n\t" \
"fcadd z16.d, p5/m, z16.d, z22.d, 90 \n\t" \
"fcadd z17.d, p5/m, z17.d, z23.d, 90 \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// TM_PROJ
#define TM_PROJ_A64FXd \
{ \
asm ( \
"ptrue p5.d \n\t" \
"fsub z12.d, p5/m, z12.d, z18.d \n\t" \
"fsub z13.d, p5/m, z13.d, z19.d \n\t" \
"fsub z14.d, p5/m, z14.d, z20.d \n\t" \
"fsub z15.d, p5/m, z15.d, z21.d \n\t" \
"fsub z16.d, p5/m, z16.d, z22.d \n\t" \
"fsub z17.d, p5/m, z17.d, z23.d \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// XM_RECON_ACCUM
#define XM_RECON_ACCUM_A64FXd \
asm ( \
"fcadd z9.d, p5/m, z9.d, z18.d, 90 \n\t" \
"fcadd z10.d, p5/m, z10.d, z19.d, 90 \n\t" \
"fcadd z11.d, p5/m, z11.d, z20.d, 90 \n\t" \
"fcadd z6.d, p5/m, z6.d, z21.d, 90 \n\t" \
"fcadd z7.d, p5/m, z7.d, z22.d, 90 \n\t" \
"fcadd z8.d, p5/m, z8.d, z23.d, 90 \n\t" \
"fadd z0.d, p5/m, z0.d, z18.d \n\t" \
"fadd z1.d, p5/m, z1.d, z19.d \n\t" \
"fadd z2.d, p5/m, z2.d, z20.d \n\t" \
"fadd z3.d, p5/m, z3.d, z21.d \n\t" \
"fadd z4.d, p5/m, z4.d, z22.d \n\t" \
"fadd z5.d, p5/m, z5.d, z23.d \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// YP_RECON_ACCUM
#define YP_RECON_ACCUM_A64FXd \
asm ( \
"fadd z0.d, p5/m, z0.d, z18.d \n\t" \
"fsub z9.d, p5/m, z9.d, z18.d \n\t" \
"fadd z1.d, p5/m, z1.d, z19.d \n\t" \
"fsub z10.d, p5/m, z10.d, z19.d \n\t" \
"fadd z2.d, p5/m, z2.d, z20.d \n\t" \
"fsub z11.d, p5/m, z11.d, z20.d \n\t" \
"fadd z3.d, p5/m, z3.d, z21.d \n\t" \
"fadd z6.d, p5/m, z6.d, z21.d \n\t" \
"fadd z4.d, p5/m, z4.d, z22.d \n\t" \
"fadd z7.d, p5/m, z7.d, z22.d \n\t" \
"fadd z5.d, p5/m, z5.d, z23.d \n\t" \
"fadd z8.d, p5/m, z8.d, z23.d \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// YM_RECON_ACCUM
#define YM_RECON_ACCUM_A64FXd \
asm ( \
"fadd z0.d, p5/m, z0.d, z18.d \n\t" \
"fadd z9.d, p5/m, z9.d, z18.d \n\t" \
"fadd z1.d, p5/m, z1.d, z19.d \n\t" \
"fadd z10.d, p5/m, z10.d, z19.d \n\t" \
"fadd z2.d, p5/m, z2.d, z20.d \n\t" \
"fadd z11.d, p5/m, z11.d, z20.d \n\t" \
"fadd z3.d, p5/m, z3.d, z21.d \n\t" \
"fsub z6.d, p5/m, z6.d, z21.d \n\t" \
"fadd z4.d, p5/m, z4.d, z22.d \n\t" \
"fsub z7.d, p5/m, z7.d, z22.d \n\t" \
"fadd z5.d, p5/m, z5.d, z23.d \n\t" \
"fsub z8.d, p5/m, z8.d, z23.d \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// ZP_RECON_ACCUM
#define ZP_RECON_ACCUM_A64FXd \
asm ( \
"fcadd z6.d, p5/m, z6.d, z18.d, 270 \n\t" \
"fadd z0.d, p5/m, z0.d, z18.d \n\t" \
"fcadd z7.d, p5/m, z7.d, z19.d, 270 \n\t" \
"fadd z1.d, p5/m, z1.d, z19.d \n\t" \
"fcadd z8.d, p5/m, z8.d, z20.d, 270 \n\t" \
"fadd z2.d, p5/m, z2.d, z20.d \n\t" \
"fcadd z9.d, p5/m, z9.d, z21.d, 90 \n\t" \
"fadd z3.d, p5/m, z3.d, z21.d \n\t" \
"fcadd z10.d, p5/m, z10.d, z22.d, 90 \n\t" \
"fadd z4.d, p5/m, z4.d, z22.d \n\t" \
"fcadd z11.d, p5/m, z11.d, z23.d, 90 \n\t" \
"fadd z5.d, p5/m, z5.d, z23.d \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// ZM_RECON_ACCUM
#define ZM_RECON_ACCUM_A64FXd \
asm ( \
"fcadd z6.d, p5/m, z6.d, z18.d, 90 \n\t" \
"fadd z0.d, p5/m, z0.d, z18.d \n\t" \
"fcadd z7.d, p5/m, z7.d, z19.d, 90 \n\t" \
"fadd z1.d, p5/m, z1.d, z19.d \n\t" \
"fcadd z8.d, p5/m, z8.d, z20.d, 90 \n\t" \
"fadd z2.d, p5/m, z2.d, z20.d \n\t" \
"fcadd z9.d, p5/m, z9.d, z21.d, 270 \n\t" \
"fadd z3.d, p5/m, z3.d, z21.d \n\t" \
"fcadd z10.d, p5/m, z10.d, z22.d, 270 \n\t" \
"fadd z4.d, p5/m, z4.d, z22.d \n\t" \
"fcadd z11.d, p5/m, z11.d, z23.d, 270 \n\t" \
"fadd z5.d, p5/m, z5.d, z23.d \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// TP_RECON_ACCUM
#define TP_RECON_ACCUM_A64FXd \
asm ( \
"fadd z0.d, p5/m, z0.d, z18.d \n\t" \
"fadd z6.d, p5/m, z6.d, z18.d \n\t" \
"fadd z1.d, p5/m, z1.d, z19.d \n\t" \
"fadd z7.d, p5/m, z7.d, z19.d \n\t" \
"fadd z2.d, p5/m, z2.d, z20.d \n\t" \
"fadd z8.d, p5/m, z8.d, z20.d \n\t" \
"fadd z3.d, p5/m, z3.d, z21.d \n\t" \
"fadd z9.d, p5/m, z9.d, z21.d \n\t" \
"fadd z4.d, p5/m, z4.d, z22.d \n\t" \
"fadd z10.d, p5/m, z10.d, z22.d \n\t" \
"fadd z5.d, p5/m, z5.d, z23.d \n\t" \
"fadd z11.d, p5/m, z11.d, z23.d \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// TM_RECON_ACCUM
#define TM_RECON_ACCUM_A64FXd \
asm ( \
"fadd z0.d, p5/m, z0.d, z18.d \n\t" \
"fsub z6.d, p5/m, z6.d, z18.d \n\t" \
"fadd z1.d, p5/m, z1.d, z19.d \n\t" \
"fsub z7.d, p5/m, z7.d, z19.d \n\t" \
"fadd z2.d, p5/m, z2.d, z20.d \n\t" \
"fsub z8.d, p5/m, z8.d, z20.d \n\t" \
"fadd z3.d, p5/m, z3.d, z21.d \n\t" \
"fsub z9.d, p5/m, z9.d, z21.d \n\t" \
"fadd z4.d, p5/m, z4.d, z22.d \n\t" \
"fsub z10.d, p5/m, z10.d, z22.d \n\t" \
"fadd z5.d, p5/m, z5.d, z23.d \n\t" \
"fsub z11.d, p5/m, z11.d, z23.d \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// ZERO_PSI
#define ZERO_PSI_A64FXd \
asm ( \
"ptrue p5.d \n\t" \
"fmov z0.d , 0 \n\t" \
"fmov z1.d , 0 \n\t" \
"fmov z2.d , 0 \n\t" \
"fmov z3.d , 0 \n\t" \
"fmov z4.d , 0 \n\t" \
"fmov z5.d , 0 \n\t" \
"fmov z6.d , 0 \n\t" \
"fmov z7.d , 0 \n\t" \
"fmov z8.d , 0 \n\t" \
"fmov z9.d , 0 \n\t" \
"fmov z10.d , 0 \n\t" \
"fmov z11.d , 0 \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// PREFETCH_RESULT_L2_STORE (prefetch store to L2)
#define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXd(base) \
{ \
asm ( \
"prfd PSTL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
"prfd PSTL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
"prfd PSTL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
: \
: [fetchptr] "r" (base) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// PREFETCH_RESULT_L1_STORE (prefetch store to L1)
#define PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXd(base) \
{ \
asm ( \
"prfd PSTL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
"prfd PSTL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
"prfd PSTL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
: \
: [fetchptr] "r" (base) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// ADD_RESULT_INTERNAL
#define ADD_RESULT_INTERNAL_A64FXd \
asm ( \
"fadd z0.d, p5/m, z0.d, z12.d \n\t" \
"fadd z1.d, p5/m, z1.d, z13.d \n\t" \
"fadd z2.d, p5/m, z2.d, z14.d \n\t" \
"fadd z3.d, p5/m, z3.d, z15.d \n\t" \
"fadd z4.d, p5/m, z4.d, z16.d \n\t" \
"fadd z5.d, p5/m, z5.d, z17.d \n\t" \
"fadd z6.d, p5/m, z6.d, z18.d \n\t" \
"fadd z7.d, p5/m, z7.d, z19.d \n\t" \
"fadd z8.d, p5/m, z8.d, z20.d \n\t" \
"fadd z9.d, p5/m, z9.d, z21.d \n\t" \
"fadd z10.d, p5/m, z10.d, z22.d \n\t" \
"fadd z11.d, p5/m, z11.d, z23.d \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);

View File

@ -1,779 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Fujitsu_A64FX_asm_single.h
Copyright (C) 2020
Author: Nils Meyer <nils.meyer@ur.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#define LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_A64FXf(base)
#define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXf(A)
#define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A)
#define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXf(A)
#define PREFETCH_GAUGE_L2(A) PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A)
#define PF_GAUGE(A)
#define PREFETCH_RESULT_L2_STORE(A) PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXf(A)
#define PREFETCH_RESULT_L1_STORE(A) PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXf(A)
#define PREFETCH1_CHIMU(A) PREFETCH_CHIMU_L1(A)
#define PREFETCH_CHIMU(A) PREFETCH_CHIMU_L1(A)
#define LOCK_GAUGE(A)
#define UNLOCK_GAUGE(A)
#define MASK_REGS DECLARATIONS_A64FXf
#define SAVE_RESULT(A,B) RESULT_A64FXf(A); PREFETCH_RESULT_L2_STORE(B)
#define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXf(Dir)
#define MULT_2SPIN_2 MULT_2SPIN_2_A64FXf
#define LOAD_CHI(base) LOAD_CHI_A64FXf(base)
#define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base)
#define XP_PROJ XP_PROJ_A64FXf
#define YP_PROJ YP_PROJ_A64FXf
#define ZP_PROJ ZP_PROJ_A64FXf
#define TP_PROJ TP_PROJ_A64FXf
#define XM_PROJ XM_PROJ_A64FXf
#define YM_PROJ YM_PROJ_A64FXf
#define ZM_PROJ ZM_PROJ_A64FXf
#define TM_PROJ TM_PROJ_A64FXf
#define XP_RECON XP_RECON_A64FXf
#define XM_RECON XM_RECON_A64FXf
#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXf
#define YM_RECON_ACCUM YM_RECON_ACCUM_A64FXf
#define ZM_RECON_ACCUM ZM_RECON_ACCUM_A64FXf
#define TM_RECON_ACCUM TM_RECON_ACCUM_A64FXf
#define XP_RECON_ACCUM XP_RECON_ACCUM_A64FXf
#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXf
#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXf
#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXf
#define PERMUTE_DIR0 0
#define PERMUTE_DIR1 1
#define PERMUTE_DIR2 2
#define PERMUTE_DIR3 3
#define PERMUTE PERMUTE_A64FXf;
#define LOAD_TABLE(Dir) if (Dir == 0) { LOAD_TABLE0; } else if (Dir == 1) { LOAD_TABLE1 } else if (Dir == 2) { LOAD_TABLE2; } else if (Dir == 3) { LOAD_TABLE3; }
#define MAYBEPERM(A,perm) if (perm) { PERMUTE; }
// DECLARATIONS
#define DECLARATIONS_A64FXf \
const uint32_t lut[4][16] = { \
{8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}, \
{4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}, \
{2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}, \
{1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14} }; \
asm ( \
"fmov z31.s , 0 \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// RESULT
#define RESULT_A64FXf(base) \
{ \
asm ( \
"str z0, [%[storeptr], -6, mul vl] \n\t" \
"str z1, [%[storeptr], -5, mul vl] \n\t" \
"str z2, [%[storeptr], -4, mul vl] \n\t" \
"str z3, [%[storeptr], -3, mul vl] \n\t" \
"str z4, [%[storeptr], -2, mul vl] \n\t" \
"str z5, [%[storeptr], -1, mul vl] \n\t" \
"str z6, [%[storeptr], 0, mul vl] \n\t" \
"str z7, [%[storeptr], 1, mul vl] \n\t" \
"str z8, [%[storeptr], 2, mul vl] \n\t" \
"str z9, [%[storeptr], 3, mul vl] \n\t" \
"str z10, [%[storeptr], 4, mul vl] \n\t" \
"str z11, [%[storeptr], 5, mul vl] \n\t" \
: \
: [storeptr] "r" (base + 2 * 3 * 64) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// PREFETCH_CHIMU_L2 (prefetch to L2)
#define PREFETCH_CHIMU_L2_INTERNAL_A64FXf(base) \
{ \
asm ( \
"prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
"prfd PLDL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
"prfd PLDL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
: \
: [fetchptr] "r" (base) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// PREFETCH_CHIMU_L1 (prefetch to L1)
#define PREFETCH_CHIMU_L1_INTERNAL_A64FXf(base) \
{ \
asm ( \
"prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
"prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
"prfd PLDL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
: \
: [fetchptr] "r" (base) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// PREFETCH_GAUGE_L2 (prefetch to L2)
#define PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A) \
{ \
const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \
asm ( \
"prfd PLDL2STRM, p5, [%[fetchptr], -4, mul vl] \n\t" \
"prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
"prfd PLDL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
"prfd PLDL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
"prfd PLDL2STRM, p5, [%[fetchptr], 12, mul vl] \n\t" \
"prfd PLDL2STRM, p5, [%[fetchptr], 16, mul vl] \n\t" \
"prfd PLDL2STRM, p5, [%[fetchptr], 20, mul vl] \n\t" \
"prfd PLDL2STRM, p5, [%[fetchptr], 24, mul vl] \n\t" \
"prfd PLDL2STRM, p5, [%[fetchptr], 28, mul vl] \n\t" \
: \
: [fetchptr] "r" (baseU) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// PREFETCH_GAUGE_L1 (prefetch to L1)
#define PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A) \
{ \
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
asm ( \
"prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
"prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
"prfd PLDL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
: \
: [fetchptr] "r" (baseU) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// LOAD_CHI
#define LOAD_CHI_A64FXf(base) \
{ \
asm ( \
"ldr z12, [%[fetchptr], 0, mul vl] \n\t" \
"ldr z13, [%[fetchptr], 1, mul vl] \n\t" \
"ldr z14, [%[fetchptr], 2, mul vl] \n\t" \
"ldr z15, [%[fetchptr], 3, mul vl] \n\t" \
"ldr z16, [%[fetchptr], 4, mul vl] \n\t" \
"ldr z17, [%[fetchptr], 5, mul vl] \n\t" \
: \
: [fetchptr] "r" (base) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// LOAD_CHIMU
#define LOAD_CHIMU_INTERLEAVED_A64FXf(base) \
{ \
asm ( \
"ptrue p5.s \n\t" \
"ldr z12, [%[fetchptr], -6, mul vl] \n\t" \
"ldr z21, [%[fetchptr], 3, mul vl] \n\t" \
"ldr z15, [%[fetchptr], -3, mul vl] \n\t" \
"ldr z18, [%[fetchptr], 0, mul vl] \n\t" \
"ldr z13, [%[fetchptr], -5, mul vl] \n\t" \
"ldr z22, [%[fetchptr], 4, mul vl] \n\t" \
"ldr z16, [%[fetchptr], -2, mul vl] \n\t" \
"ldr z19, [%[fetchptr], 1, mul vl] \n\t" \
"ldr z14, [%[fetchptr], -4, mul vl] \n\t" \
"ldr z23, [%[fetchptr], 5, mul vl] \n\t" \
"ldr z17, [%[fetchptr], -1, mul vl] \n\t" \
"ldr z20, [%[fetchptr], 2, mul vl] \n\t" \
: \
: [fetchptr] "r" (base + 2 * 3 * 64) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// LOAD_CHIMU_0213
#define LOAD_CHIMU_0213_A64FXf \
{ \
const SiteSpinor & ref(in[offset]); \
asm ( \
"ptrue p5.s \n\t" \
"ldr z12, [%[fetchptr], -6, mul vl] \n\t" \
"ldr z18, [%[fetchptr], 0, mul vl] \n\t" \
"ldr z13, [%[fetchptr], -5, mul vl] \n\t" \
"ldr z19, [%[fetchptr], 1, mul vl] \n\t" \
"ldr z14, [%[fetchptr], -4, mul vl] \n\t" \
"ldr z20, [%[fetchptr], 2, mul vl] \n\t" \
"ldr z15, [%[fetchptr], -3, mul vl] \n\t" \
"ldr z21, [%[fetchptr], 3, mul vl] \n\t" \
"ldr z16, [%[fetchptr], -2, mul vl] \n\t" \
"ldr z22, [%[fetchptr], 4, mul vl] \n\t" \
"ldr z17, [%[fetchptr], -1, mul vl] \n\t" \
"ldr z23, [%[fetchptr], 5, mul vl] \n\t" \
: \
: [fetchptr] "r" (&ref[2][0]) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// LOAD_CHIMU_0312
#define LOAD_CHIMU_0312_A64FXf \
{ \
const SiteSpinor & ref(in[offset]); \
asm ( \
"ptrue p5.s \n\t" \
"ldr z12, [%[fetchptr], -6, mul vl] \n\t" \
"ldr z21, [%[fetchptr], 3, mul vl] \n\t" \
"ldr z13, [%[fetchptr], -5, mul vl] \n\t" \
"ldr z22, [%[fetchptr], 4, mul vl] \n\t" \
"ldr z14, [%[fetchptr], -4, mul vl] \n\t" \
"ldr z23, [%[fetchptr], 5, mul vl] \n\t" \
"ldr z15, [%[fetchptr], -3, mul vl] \n\t" \
"ldr z18, [%[fetchptr], 0, mul vl] \n\t" \
"ldr z16, [%[fetchptr], -2, mul vl] \n\t" \
"ldr z19, [%[fetchptr], 1, mul vl] \n\t" \
"ldr z17, [%[fetchptr], -1, mul vl] \n\t" \
"ldr z20, [%[fetchptr], 2, mul vl] \n\t" \
: \
: [fetchptr] "r" (&ref[2][0]) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// LOAD_TABLE0
#define LOAD_TABLE0 \
asm ( \
"ldr z30, [%[tableptr], %[index], mul vl] \n\t" \
: \
: [tableptr] "r" (&lut[0]),[index] "i" (0) \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// LOAD_TABLE1
#define LOAD_TABLE1 \
asm ( \
"ldr z30, [%[tableptr], %[index], mul vl] \n\t" \
: \
: [tableptr] "r" (&lut[0]),[index] "i" (1) \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// LOAD_TABLE2
#define LOAD_TABLE2 \
asm ( \
"ldr z30, [%[tableptr], %[index], mul vl] \n\t" \
: \
: [tableptr] "r" (&lut[0]),[index] "i" (2) \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// LOAD_TABLE3
#define LOAD_TABLE3 \
asm ( \
"ldr z30, [%[tableptr], %[index], mul vl] \n\t" \
: \
: [tableptr] "r" (&lut[0]),[index] "i" (3) \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// PERMUTE
#define PERMUTE_A64FXf \
asm ( \
"tbl z12.s, { z12.s }, z30.s \n\t" \
"tbl z13.s, { z13.s }, z30.s \n\t" \
"tbl z14.s, { z14.s }, z30.s \n\t" \
"tbl z15.s, { z15.s }, z30.s \n\t" \
"tbl z16.s, { z16.s }, z30.s \n\t" \
"tbl z17.s, { z17.s }, z30.s \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// LOAD_GAUGE
#define LOAD_GAUGE \
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
{ \
asm ( \
"ptrue p5.s \n\t" \
"ldr z24, [%[fetchptr], -6, mul vl] \n\t" \
"ldr z25, [%[fetchptr], -3, mul vl] \n\t" \
"ldr z26, [%[fetchptr], 0, mul vl] \n\t" \
"ldr z27, [%[fetchptr], -5, mul vl] \n\t" \
"ldr z28, [%[fetchptr], -2, mul vl] \n\t" \
"ldr z29, [%[fetchptr], 1, mul vl] \n\t" \
: \
: [fetchptr] "r" (baseU + 2 * 3 * 64) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// MULT_2SPIN
#define MULT_2SPIN_1_A64FXf(A) \
{ \
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
asm ( \
"ldr z24, [%[fetchptr], -6, mul vl] \n\t" \
"ldr z25, [%[fetchptr], -3, mul vl] \n\t" \
"ldr z26, [%[fetchptr], 0, mul vl] \n\t" \
"ldr z27, [%[fetchptr], -5, mul vl] \n\t" \
"ldr z28, [%[fetchptr], -2, mul vl] \n\t" \
"ldr z29, [%[fetchptr], 1, mul vl] \n\t" \
"movprfx z18.s, p5/m, z31.s \n\t" \
"fcmla z18.s, p5/m, z24.s, z12.s, 0 \n\t" \
"movprfx z21.s, p5/m, z31.s \n\t" \
"fcmla z21.s, p5/m, z24.s, z15.s, 0 \n\t" \
"movprfx z19.s, p5/m, z31.s \n\t" \
"fcmla z19.s, p5/m, z25.s, z12.s, 0 \n\t" \
"movprfx z22.s, p5/m, z31.s \n\t" \
"fcmla z22.s, p5/m, z25.s, z15.s, 0 \n\t" \
"movprfx z20.s, p5/m, z31.s \n\t" \
"fcmla z20.s, p5/m, z26.s, z12.s, 0 \n\t" \
"movprfx z23.s, p5/m, z31.s \n\t" \
"fcmla z23.s, p5/m, z26.s, z15.s, 0 \n\t" \
"fcmla z18.s, p5/m, z24.s, z12.s, 90 \n\t" \
"fcmla z21.s, p5/m, z24.s, z15.s, 90 \n\t" \
"fcmla z19.s, p5/m, z25.s, z12.s, 90 \n\t" \
"fcmla z22.s, p5/m, z25.s, z15.s, 90 \n\t" \
"fcmla z20.s, p5/m, z26.s, z12.s, 90 \n\t" \
"fcmla z23.s, p5/m, z26.s, z15.s, 90 \n\t" \
"ldr z24, [%[fetchptr], -4, mul vl] \n\t" \
"ldr z25, [%[fetchptr], -1, mul vl] \n\t" \
"ldr z26, [%[fetchptr], 2, mul vl] \n\t" \
: \
: [fetchptr] "r" (baseU + 2 * 3 * 64) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// MULT_2SPIN_BACKEND
#define MULT_2SPIN_2_A64FXf \
{ \
asm ( \
"fcmla z18.s, p5/m, z27.s, z13.s, 0 \n\t" \
"fcmla z21.s, p5/m, z27.s, z16.s, 0 \n\t" \
"fcmla z19.s, p5/m, z28.s, z13.s, 0 \n\t" \
"fcmla z22.s, p5/m, z28.s, z16.s, 0 \n\t" \
"fcmla z20.s, p5/m, z29.s, z13.s, 0 \n\t" \
"fcmla z23.s, p5/m, z29.s, z16.s, 0 \n\t" \
"fcmla z18.s, p5/m, z27.s, z13.s, 90 \n\t" \
"fcmla z21.s, p5/m, z27.s, z16.s, 90 \n\t" \
"fcmla z19.s, p5/m, z28.s, z13.s, 90 \n\t" \
"fcmla z22.s, p5/m, z28.s, z16.s, 90 \n\t" \
"fcmla z20.s, p5/m, z29.s, z13.s, 90 \n\t" \
"fcmla z23.s, p5/m, z29.s, z16.s, 90 \n\t" \
"fcmla z18.s, p5/m, z24.s, z14.s, 0 \n\t" \
"fcmla z21.s, p5/m, z24.s, z17.s, 0 \n\t" \
"fcmla z19.s, p5/m, z25.s, z14.s, 0 \n\t" \
"fcmla z22.s, p5/m, z25.s, z17.s, 0 \n\t" \
"fcmla z20.s, p5/m, z26.s, z14.s, 0 \n\t" \
"fcmla z23.s, p5/m, z26.s, z17.s, 0 \n\t" \
"fcmla z18.s, p5/m, z24.s, z14.s, 90 \n\t" \
"fcmla z21.s, p5/m, z24.s, z17.s, 90 \n\t" \
"fcmla z19.s, p5/m, z25.s, z14.s, 90 \n\t" \
"fcmla z22.s, p5/m, z25.s, z17.s, 90 \n\t" \
"fcmla z20.s, p5/m, z26.s, z14.s, 90 \n\t" \
"fcmla z23.s, p5/m, z26.s, z17.s, 90 \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// XP_PROJ
#define XP_PROJ_A64FXf \
{ \
asm ( \
"fcadd z12.s, p5/m, z12.s, z21.s, 90 \n\t" \
"fcadd z13.s, p5/m, z13.s, z22.s, 90 \n\t" \
"fcadd z14.s, p5/m, z14.s, z23.s, 90 \n\t" \
"fcadd z15.s, p5/m, z15.s, z18.s, 90 \n\t" \
"fcadd z16.s, p5/m, z16.s, z19.s, 90 \n\t" \
"fcadd z17.s, p5/m, z17.s, z20.s, 90 \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// XP_RECON
#define XP_RECON_A64FXf \
asm ( \
"movprfx z6.s, p5/m, z31.s \n\t" \
"fcadd z6.s, p5/m, z6.s, z21.s, 270 \n\t" \
"movprfx z7.s, p5/m, z31.s \n\t" \
"fcadd z7.s, p5/m, z7.s, z22.s, 270 \n\t" \
"movprfx z8.s, p5/m, z31.s \n\t" \
"fcadd z8.s, p5/m, z8.s, z23.s, 270 \n\t" \
"movprfx z9.s, p5/m, z31.s \n\t" \
"fcadd z9.s, p5/m, z9.s, z18.s, 270 \n\t" \
"movprfx z10.s, p5/m, z31.s \n\t" \
"fcadd z10.s, p5/m, z10.s, z19.s, 270 \n\t" \
"movprfx z11.s, p5/m, z31.s \n\t" \
"fcadd z11.s, p5/m, z11.s, z20.s, 270 \n\t" \
"mov z0.s, p5/m, z18.s \n\t" \
"mov z1.s, p5/m, z19.s \n\t" \
"mov z2.s, p5/m, z20.s \n\t" \
"mov z3.s, p5/m, z21.s \n\t" \
"mov z4.s, p5/m, z22.s \n\t" \
"mov z5.s, p5/m, z23.s \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// XP_RECON_ACCUM
#define XP_RECON_ACCUM_A64FXf \
asm ( \
"fcadd z9.s, p5/m, z9.s, z18.s, 270 \n\t" \
"fadd z0.s, p5/m, z0.s, z18.s \n\t" \
"fcadd z10.s, p5/m, z10.s, z19.s, 270 \n\t" \
"fadd z1.s, p5/m, z1.s, z19.s \n\t" \
"fcadd z11.s, p5/m, z11.s, z20.s, 270 \n\t" \
"fadd z2.s, p5/m, z2.s, z20.s \n\t" \
"fcadd z6.s, p5/m, z6.s, z21.s, 270 \n\t" \
"fadd z3.s, p5/m, z3.s, z21.s \n\t" \
"fcadd z7.s, p5/m, z7.s, z22.s, 270 \n\t" \
"fadd z4.s, p5/m, z4.s, z22.s \n\t" \
"fcadd z8.s, p5/m, z8.s, z23.s, 270 \n\t" \
"fadd z5.s, p5/m, z5.s, z23.s \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// YP_PROJ
#define YP_PROJ_A64FXf \
{ \
asm ( \
"fsub z12.s, p5/m, z12.s, z21.s \n\t" \
"fsub z13.s, p5/m, z13.s, z22.s \n\t" \
"fsub z14.s, p5/m, z14.s, z23.s \n\t" \
"fadd z15.s, p5/m, z15.s, z18.s \n\t" \
"fadd z16.s, p5/m, z16.s, z19.s \n\t" \
"fadd z17.s, p5/m, z17.s, z20.s \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// ZP_PROJ
#define ZP_PROJ_A64FXf \
{ \
asm ( \
"fcadd z12.s, p5/m, z12.s, z18.s, 90 \n\t" \
"fcadd z13.s, p5/m, z13.s, z19.s, 90 \n\t" \
"fcadd z14.s, p5/m, z14.s, z20.s, 90 \n\t" \
"fcadd z15.s, p5/m, z15.s, z21.s, 270 \n\t" \
"fcadd z16.s, p5/m, z16.s, z22.s, 270 \n\t" \
"fcadd z17.s, p5/m, z17.s, z23.s, 270 \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// TP_PROJ
#define TP_PROJ_A64FXf \
{ \
asm ( \
"fadd z12.s, p5/m, z12.s, z18.s \n\t" \
"fadd z13.s, p5/m, z13.s, z19.s \n\t" \
"fadd z14.s, p5/m, z14.s, z20.s \n\t" \
"fadd z15.s, p5/m, z15.s, z21.s \n\t" \
"fadd z16.s, p5/m, z16.s, z22.s \n\t" \
"fadd z17.s, p5/m, z17.s, z23.s \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// XM_PROJ
#define XM_PROJ_A64FXf \
{ \
asm ( \
"fcadd z12.s, p5/m, z12.s, z21.s, 270 \n\t" \
"fcadd z13.s, p5/m, z13.s, z22.s, 270 \n\t" \
"fcadd z14.s, p5/m, z14.s, z23.s, 270 \n\t" \
"fcadd z15.s, p5/m, z15.s, z18.s, 270 \n\t" \
"fcadd z16.s, p5/m, z16.s, z19.s, 270 \n\t" \
"fcadd z17.s, p5/m, z17.s, z20.s, 270 \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// XM_RECON
#define XM_RECON_A64FXf \
asm ( \
"movprfx z6.s, p5/m, z31.s \n\t" \
"fcadd z6.s, p5/m, z6.s, z21.s, 90 \n\t" \
"movprfx z7.s, p5/m, z31.s \n\t" \
"fcadd z7.s, p5/m, z7.s, z22.s, 90 \n\t" \
"movprfx z8.s, p5/m, z31.s \n\t" \
"fcadd z8.s, p5/m, z8.s, z23.s, 90 \n\t" \
"movprfx z9.s, p5/m, z31.s \n\t" \
"fcadd z9.s, p5/m, z9.s, z18.s, 90 \n\t" \
"movprfx z10.s, p5/m, z31.s \n\t" \
"fcadd z10.s, p5/m, z10.s, z19.s, 90 \n\t" \
"movprfx z11.s, p5/m, z31.s \n\t" \
"fcadd z11.s, p5/m, z11.s, z20.s, 90 \n\t" \
"mov z0.s, p5/m, z18.s \n\t" \
"mov z1.s, p5/m, z19.s \n\t" \
"mov z2.s, p5/m, z20.s \n\t" \
"mov z3.s, p5/m, z21.s \n\t" \
"mov z4.s, p5/m, z22.s \n\t" \
"mov z5.s, p5/m, z23.s \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// YM_PROJ
#define YM_PROJ_A64FXf \
{ \
asm ( \
"fadd z12.s, p5/m, z12.s, z21.s \n\t" \
"fadd z13.s, p5/m, z13.s, z22.s \n\t" \
"fadd z14.s, p5/m, z14.s, z23.s \n\t" \
"fsub z15.s, p5/m, z15.s, z18.s \n\t" \
"fsub z16.s, p5/m, z16.s, z19.s \n\t" \
"fsub z17.s, p5/m, z17.s, z20.s \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// ZM_PROJ
#define ZM_PROJ_A64FXf \
{ \
asm ( \
"fcadd z12.s, p5/m, z12.s, z18.s, 270 \n\t" \
"fcadd z13.s, p5/m, z13.s, z19.s, 270 \n\t" \
"fcadd z14.s, p5/m, z14.s, z20.s, 270 \n\t" \
"fcadd z15.s, p5/m, z15.s, z21.s, 90 \n\t" \
"fcadd z16.s, p5/m, z16.s, z22.s, 90 \n\t" \
"fcadd z17.s, p5/m, z17.s, z23.s, 90 \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// TM_PROJ
#define TM_PROJ_A64FXf \
{ \
asm ( \
"ptrue p5.s \n\t" \
"fsub z12.s, p5/m, z12.s, z18.s \n\t" \
"fsub z13.s, p5/m, z13.s, z19.s \n\t" \
"fsub z14.s, p5/m, z14.s, z20.s \n\t" \
"fsub z15.s, p5/m, z15.s, z21.s \n\t" \
"fsub z16.s, p5/m, z16.s, z22.s \n\t" \
"fsub z17.s, p5/m, z17.s, z23.s \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// XM_RECON_ACCUM
#define XM_RECON_ACCUM_A64FXf \
asm ( \
"fcadd z9.s, p5/m, z9.s, z18.s, 90 \n\t" \
"fcadd z10.s, p5/m, z10.s, z19.s, 90 \n\t" \
"fcadd z11.s, p5/m, z11.s, z20.s, 90 \n\t" \
"fcadd z6.s, p5/m, z6.s, z21.s, 90 \n\t" \
"fcadd z7.s, p5/m, z7.s, z22.s, 90 \n\t" \
"fcadd z8.s, p5/m, z8.s, z23.s, 90 \n\t" \
"fadd z0.s, p5/m, z0.s, z18.s \n\t" \
"fadd z1.s, p5/m, z1.s, z19.s \n\t" \
"fadd z2.s, p5/m, z2.s, z20.s \n\t" \
"fadd z3.s, p5/m, z3.s, z21.s \n\t" \
"fadd z4.s, p5/m, z4.s, z22.s \n\t" \
"fadd z5.s, p5/m, z5.s, z23.s \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// YP_RECON_ACCUM
#define YP_RECON_ACCUM_A64FXf \
asm ( \
"fadd z0.s, p5/m, z0.s, z18.s \n\t" \
"fsub z9.s, p5/m, z9.s, z18.s \n\t" \
"fadd z1.s, p5/m, z1.s, z19.s \n\t" \
"fsub z10.s, p5/m, z10.s, z19.s \n\t" \
"fadd z2.s, p5/m, z2.s, z20.s \n\t" \
"fsub z11.s, p5/m, z11.s, z20.s \n\t" \
"fadd z3.s, p5/m, z3.s, z21.s \n\t" \
"fadd z6.s, p5/m, z6.s, z21.s \n\t" \
"fadd z4.s, p5/m, z4.s, z22.s \n\t" \
"fadd z7.s, p5/m, z7.s, z22.s \n\t" \
"fadd z5.s, p5/m, z5.s, z23.s \n\t" \
"fadd z8.s, p5/m, z8.s, z23.s \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// YM_RECON_ACCUM
#define YM_RECON_ACCUM_A64FXf \
asm ( \
"fadd z0.s, p5/m, z0.s, z18.s \n\t" \
"fadd z9.s, p5/m, z9.s, z18.s \n\t" \
"fadd z1.s, p5/m, z1.s, z19.s \n\t" \
"fadd z10.s, p5/m, z10.s, z19.s \n\t" \
"fadd z2.s, p5/m, z2.s, z20.s \n\t" \
"fadd z11.s, p5/m, z11.s, z20.s \n\t" \
"fadd z3.s, p5/m, z3.s, z21.s \n\t" \
"fsub z6.s, p5/m, z6.s, z21.s \n\t" \
"fadd z4.s, p5/m, z4.s, z22.s \n\t" \
"fsub z7.s, p5/m, z7.s, z22.s \n\t" \
"fadd z5.s, p5/m, z5.s, z23.s \n\t" \
"fsub z8.s, p5/m, z8.s, z23.s \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// ZP_RECON_ACCUM
#define ZP_RECON_ACCUM_A64FXf \
asm ( \
"fcadd z6.s, p5/m, z6.s, z18.s, 270 \n\t" \
"fadd z0.s, p5/m, z0.s, z18.s \n\t" \
"fcadd z7.s, p5/m, z7.s, z19.s, 270 \n\t" \
"fadd z1.s, p5/m, z1.s, z19.s \n\t" \
"fcadd z8.s, p5/m, z8.s, z20.s, 270 \n\t" \
"fadd z2.s, p5/m, z2.s, z20.s \n\t" \
"fcadd z9.s, p5/m, z9.s, z21.s, 90 \n\t" \
"fadd z3.s, p5/m, z3.s, z21.s \n\t" \
"fcadd z10.s, p5/m, z10.s, z22.s, 90 \n\t" \
"fadd z4.s, p5/m, z4.s, z22.s \n\t" \
"fcadd z11.s, p5/m, z11.s, z23.s, 90 \n\t" \
"fadd z5.s, p5/m, z5.s, z23.s \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// ZM_RECON_ACCUM
#define ZM_RECON_ACCUM_A64FXf \
asm ( \
"fcadd z6.s, p5/m, z6.s, z18.s, 90 \n\t" \
"fadd z0.s, p5/m, z0.s, z18.s \n\t" \
"fcadd z7.s, p5/m, z7.s, z19.s, 90 \n\t" \
"fadd z1.s, p5/m, z1.s, z19.s \n\t" \
"fcadd z8.s, p5/m, z8.s, z20.s, 90 \n\t" \
"fadd z2.s, p5/m, z2.s, z20.s \n\t" \
"fcadd z9.s, p5/m, z9.s, z21.s, 270 \n\t" \
"fadd z3.s, p5/m, z3.s, z21.s \n\t" \
"fcadd z10.s, p5/m, z10.s, z22.s, 270 \n\t" \
"fadd z4.s, p5/m, z4.s, z22.s \n\t" \
"fcadd z11.s, p5/m, z11.s, z23.s, 270 \n\t" \
"fadd z5.s, p5/m, z5.s, z23.s \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// TP_RECON_ACCUM
#define TP_RECON_ACCUM_A64FXf \
asm ( \
"fadd z0.s, p5/m, z0.s, z18.s \n\t" \
"fadd z6.s, p5/m, z6.s, z18.s \n\t" \
"fadd z1.s, p5/m, z1.s, z19.s \n\t" \
"fadd z7.s, p5/m, z7.s, z19.s \n\t" \
"fadd z2.s, p5/m, z2.s, z20.s \n\t" \
"fadd z8.s, p5/m, z8.s, z20.s \n\t" \
"fadd z3.s, p5/m, z3.s, z21.s \n\t" \
"fadd z9.s, p5/m, z9.s, z21.s \n\t" \
"fadd z4.s, p5/m, z4.s, z22.s \n\t" \
"fadd z10.s, p5/m, z10.s, z22.s \n\t" \
"fadd z5.s, p5/m, z5.s, z23.s \n\t" \
"fadd z11.s, p5/m, z11.s, z23.s \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// TM_RECON_ACCUM
#define TM_RECON_ACCUM_A64FXf \
asm ( \
"fadd z0.s, p5/m, z0.s, z18.s \n\t" \
"fsub z6.s, p5/m, z6.s, z18.s \n\t" \
"fadd z1.s, p5/m, z1.s, z19.s \n\t" \
"fsub z7.s, p5/m, z7.s, z19.s \n\t" \
"fadd z2.s, p5/m, z2.s, z20.s \n\t" \
"fsub z8.s, p5/m, z8.s, z20.s \n\t" \
"fadd z3.s, p5/m, z3.s, z21.s \n\t" \
"fsub z9.s, p5/m, z9.s, z21.s \n\t" \
"fadd z4.s, p5/m, z4.s, z22.s \n\t" \
"fsub z10.s, p5/m, z10.s, z22.s \n\t" \
"fadd z5.s, p5/m, z5.s, z23.s \n\t" \
"fsub z11.s, p5/m, z11.s, z23.s \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// ZERO_PSI
#define ZERO_PSI_A64FXf \
asm ( \
"ptrue p5.s \n\t" \
"fmov z0.s , 0 \n\t" \
"fmov z1.s , 0 \n\t" \
"fmov z2.s , 0 \n\t" \
"fmov z3.s , 0 \n\t" \
"fmov z4.s , 0 \n\t" \
"fmov z5.s , 0 \n\t" \
"fmov z6.s , 0 \n\t" \
"fmov z7.s , 0 \n\t" \
"fmov z8.s , 0 \n\t" \
"fmov z9.s , 0 \n\t" \
"fmov z10.s , 0 \n\t" \
"fmov z11.s , 0 \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// PREFETCH_RESULT_L2_STORE (prefetch store to L2)
#define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXf(base) \
{ \
asm ( \
"prfd PSTL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
"prfd PSTL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
"prfd PSTL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
: \
: [fetchptr] "r" (base) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// PREFETCH_RESULT_L1_STORE (prefetch store to L1)
#define PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXf(base) \
{ \
asm ( \
"prfd PSTL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
"prfd PSTL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
"prfd PSTL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
: \
: [fetchptr] "r" (base) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// ADD_RESULT_INTERNAL
#define ADD_RESULT_INTERNAL_A64FXf \
asm ( \
"fadd z0.s, p5/m, z0.s, z12.s \n\t" \
"fadd z1.s, p5/m, z1.s, z13.s \n\t" \
"fadd z2.s, p5/m, z2.s, z14.s \n\t" \
"fadd z3.s, p5/m, z3.s, z15.s \n\t" \
"fadd z4.s, p5/m, z4.s, z16.s \n\t" \
"fadd z5.s, p5/m, z5.s, z17.s \n\t" \
"fadd z6.s, p5/m, z6.s, z18.s \n\t" \
"fadd z7.s, p5/m, z7.s, z19.s \n\t" \
"fadd z8.s, p5/m, z8.s, z20.s \n\t" \
"fadd z9.s, p5/m, z9.s, z21.s \n\t" \
"fadd z10.s, p5/m, z10.s, z22.s \n\t" \
"fadd z11.s, p5/m, z11.s, z23.s \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);

View File

@ -38,10 +38,11 @@ Author: Nils Meyer <nils.meyer@ur.de>
#define LOCK_GAUGE(A)
#define UNLOCK_GAUGE(A)
#define MASK_REGS DECLARATIONS_A64FXd
#define SAVE_RESULT(A,B) RESULT_A64FXd(A); PREFETCH_RESULT_L2_STORE(B)
#define SAVE_RESULT(A,B) RESULT_A64FXd(A);
#define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXd(Dir)
#define MULT_2SPIN_2 MULT_2SPIN_2_A64FXd
#define LOAD_CHI(base) LOAD_CHI_A64FXd(base)
#define ZERO_PSI ZERO_PSI_A64FXd
#define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base)
#define XP_PROJ XP_PROJ_A64FXd
#define YP_PROJ YP_PROJ_A64FXd
@ -70,6 +71,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
#define MAYBEPERM(Dir,perm) if (Dir != 3) { if (perm) { PERMUTE; } }
// DECLARATIONS
#define DECLARATIONS_A64FXd \
uint64_t baseU; \
const uint64_t lut[4][8] = { \
{4, 5, 6, 7, 0, 1, 2, 3}, \
{2, 3, 0, 1, 6, 7, 4, 5}, \
@ -126,114 +128,114 @@ Author: Nils Meyer <nils.meyer@ur.de>
// RESULT
#define RESULT_A64FXd(base) \
{ \
svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64), result_00); \
svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64), result_01); \
svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64), result_02); \
svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64), result_10); \
svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64), result_11); \
svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64), result_12); \
svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64), result_20); \
svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64), result_21); \
svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64), result_22); \
svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64), result_30); \
svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64), result_31); \
svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64), result_32); \
svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-6), result_00); \
svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-5), result_01); \
svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-4), result_02); \
svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-3), result_10); \
svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-2), result_11); \
svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-1), result_12); \
svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(0), result_20); \
svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(1), result_21); \
svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(2), result_22); \
svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(3), result_30); \
svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(4), result_31); \
svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(5), result_32); \
}
// PREFETCH_CHIMU_L2 (prefetch to L2)
#define PREFETCH_CHIMU_L2_INTERNAL_A64FXd(base) \
{ \
svprfd(pg1, (int64_t*)(base + 0), SV_PLDL2STRM); \
svprfd(pg1, (int64_t*)(base + 256), SV_PLDL2STRM); \
svprfd(pg1, (int64_t*)(base + 512), SV_PLDL2STRM); \
svprfd_vnum(pg1, (void*)(base), (int64_t)(0), SV_PLDL2STRM); \
svprfd_vnum(pg1, (void*)(base), (int64_t)(4), SV_PLDL2STRM); \
svprfd_vnum(pg1, (void*)(base), (int64_t)(8), SV_PLDL2STRM); \
}
// PREFETCH_CHIMU_L1 (prefetch to L1)
#define PREFETCH_CHIMU_L1_INTERNAL_A64FXd(base) \
{ \
svprfd(pg1, (int64_t*)(base + 0), SV_PLDL1STRM); \
svprfd(pg1, (int64_t*)(base + 256), SV_PLDL1STRM); \
svprfd(pg1, (int64_t*)(base + 512), SV_PLDL1STRM); \
svprfd_vnum(pg1, (void*)(base), (int64_t)(0), SV_PLDL1STRM); \
svprfd_vnum(pg1, (void*)(base), (int64_t)(4), SV_PLDL1STRM); \
svprfd_vnum(pg1, (void*)(base), (int64_t)(8), SV_PLDL1STRM); \
}
// PREFETCH_GAUGE_L2 (prefetch to L2)
#define PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A) \
{ \
const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \
svprfd(pg1, (int64_t*)(baseU + -256), SV_PLDL2STRM); \
svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL2STRM); \
svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL2STRM); \
svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL2STRM); \
svprfd(pg1, (int64_t*)(baseU + 768), SV_PLDL2STRM); \
svprfd(pg1, (int64_t*)(baseU + 1024), SV_PLDL2STRM); \
svprfd(pg1, (int64_t*)(baseU + 1280), SV_PLDL2STRM); \
svprfd(pg1, (int64_t*)(baseU + 1536), SV_PLDL2STRM); \
svprfd(pg1, (int64_t*)(baseU + 1792), SV_PLDL2STRM); \
const auto & ref(U[sUn](A)); baseU = (uint64_t)&ref + 3 * 3 * 64; \
svprfd_vnum(pg1, (void*)(baseU), (int64_t)(-4), SV_PLDL2STRM); \
svprfd_vnum(pg1, (void*)(baseU), (int64_t)(0), SV_PLDL2STRM); \
svprfd_vnum(pg1, (void*)(baseU), (int64_t)(4), SV_PLDL2STRM); \
svprfd_vnum(pg1, (void*)(baseU), (int64_t)(8), SV_PLDL2STRM); \
svprfd_vnum(pg1, (void*)(baseU), (int64_t)(12), SV_PLDL2STRM); \
svprfd_vnum(pg1, (void*)(baseU), (int64_t)(16), SV_PLDL2STRM); \
svprfd_vnum(pg1, (void*)(baseU), (int64_t)(20), SV_PLDL2STRM); \
svprfd_vnum(pg1, (void*)(baseU), (int64_t)(24), SV_PLDL2STRM); \
svprfd_vnum(pg1, (void*)(baseU), (int64_t)(28), SV_PLDL2STRM); \
}
// PREFETCH_GAUGE_L1 (prefetch to L1)
#define PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A) \
{ \
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL1STRM); \
svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL1STRM); \
svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL1STRM); \
const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \
svprfd_vnum(pg1, (void*)(baseU), (int64_t)(0), SV_PLDL1STRM); \
svprfd_vnum(pg1, (void*)(baseU), (int64_t)(4), SV_PLDL1STRM); \
svprfd_vnum(pg1, (void*)(baseU), (int64_t)(8), SV_PLDL1STRM); \
}
// LOAD_CHI
#define LOAD_CHI_A64FXd(base) \
{ \
Chi_00 = svld1(pg1, (float64_t*)(base + 0 * 64)); \
Chi_01 = svld1(pg1, (float64_t*)(base + 1 * 64)); \
Chi_02 = svld1(pg1, (float64_t*)(base + 2 * 64)); \
Chi_10 = svld1(pg1, (float64_t*)(base + 3 * 64)); \
Chi_11 = svld1(pg1, (float64_t*)(base + 4 * 64)); \
Chi_12 = svld1(pg1, (float64_t*)(base + 5 * 64)); \
Chi_00 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(0)); \
Chi_01 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(1)); \
Chi_02 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(2)); \
Chi_10 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(3)); \
Chi_11 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(4)); \
Chi_12 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(5)); \
}
// LOAD_CHIMU
#define LOAD_CHIMU_INTERLEAVED_A64FXd(base) \
{ \
Chimu_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64)); \
Chimu_30 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64)); \
Chimu_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64)); \
Chimu_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64)); \
Chimu_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64)); \
Chimu_31 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64)); \
Chimu_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64)); \
Chimu_21 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64)); \
Chimu_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64)); \
Chimu_32 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64)); \
Chimu_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64)); \
Chimu_22 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64)); \
Chimu_00 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-6)); \
Chimu_30 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(3)); \
Chimu_10 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-3)); \
Chimu_20 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(0)); \
Chimu_01 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-5)); \
Chimu_31 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(4)); \
Chimu_11 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-2)); \
Chimu_21 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(1)); \
Chimu_02 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-4)); \
Chimu_32 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(5)); \
Chimu_12 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-1)); \
Chimu_22 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(2)); \
}
// LOAD_CHIMU_0213
#define LOAD_CHIMU_0213_A64FXd \
{ \
const SiteSpinor & ref(in[offset]); \
Chimu_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64)); \
Chimu_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64)); \
Chimu_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64)); \
Chimu_21 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64)); \
Chimu_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64)); \
Chimu_22 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64)); \
Chimu_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64)); \
Chimu_30 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64)); \
Chimu_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64)); \
Chimu_31 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64)); \
Chimu_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64)); \
Chimu_32 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64)); \
Chimu_00 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-6)); \
Chimu_20 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(0)); \
Chimu_01 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-5)); \
Chimu_21 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(1)); \
Chimu_02 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-4)); \
Chimu_22 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(2)); \
Chimu_10 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-3)); \
Chimu_30 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(3)); \
Chimu_11 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-2)); \
Chimu_31 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(4)); \
Chimu_12 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-1)); \
Chimu_32 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(5)); \
}
// LOAD_CHIMU_0312
#define LOAD_CHIMU_0312_A64FXd \
{ \
const SiteSpinor & ref(in[offset]); \
Chimu_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64)); \
Chimu_30 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64)); \
Chimu_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64)); \
Chimu_31 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64)); \
Chimu_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64)); \
Chimu_32 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64)); \
Chimu_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64)); \
Chimu_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64)); \
Chimu_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64)); \
Chimu_21 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64)); \
Chimu_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64)); \
Chimu_22 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64)); \
Chimu_00 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-6)); \
Chimu_30 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(3)); \
Chimu_01 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-5)); \
Chimu_31 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(4)); \
Chimu_02 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-4)); \
Chimu_32 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(5)); \
Chimu_10 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-3)); \
Chimu_20 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(0)); \
Chimu_11 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-2)); \
Chimu_21 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(1)); \
Chimu_12 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-1)); \
Chimu_22 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(2)); \
}
// LOAD_TABLE0
#define LOAD_TABLE0 \
@ -261,26 +263,26 @@ Author: Nils Meyer <nils.meyer@ur.de>
Chi_12 = svtbl(Chi_12, table0);
// LOAD_GAUGE
#define LOAD_GAUGE \
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
#define LOAD_GAUGE(A) \
{ \
U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \
U_10 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -3 * 64)); \
U_20 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 0 * 64)); \
U_01 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -5 * 64)); \
U_11 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -2 * 64)); \
U_21 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \
const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \
U_00 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-6)); \
U_10 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-3)); \
U_20 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(0)); \
U_01 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-5)); \
U_11 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-2)); \
U_21 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(1)); \
}
// MULT_2SPIN
#define MULT_2SPIN_1_A64FXd(A) \
{ \
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \
U_10 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -3 * 64)); \
U_20 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 0 * 64)); \
U_01 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -5 * 64)); \
U_11 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -2 * 64)); \
U_21 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \
const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \
U_00 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-6)); \
U_10 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-3)); \
U_20 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(0)); \
U_01 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-5)); \
U_11 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-2)); \
U_21 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(1)); \
UChi_00 = svcmla_x(pg1, zero0, U_00, Chi_00, 0); \
UChi_10 = svcmla_x(pg1, zero0, U_00, Chi_10, 0); \
UChi_01 = svcmla_x(pg1, zero0, U_10, Chi_00, 0); \
@ -293,9 +295,9 @@ Author: Nils Meyer <nils.meyer@ur.de>
UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_10, 90); \
UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_00, 90); \
UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_10, 90); \
U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -4 * 64)); \
U_10 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -1 * 64)); \
U_20 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 2 * 64)); \
U_00 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-4)); \
U_10 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-1)); \
U_20 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(2)); \
}
// MULT_2SPIN_BACKEND
#define MULT_2SPIN_2_A64FXd \
@ -570,12 +572,12 @@ Author: Nils Meyer <nils.meyer@ur.de>
result_31 = svdup_f64(0.); \
result_32 = svdup_f64(0.);
// PREFETCH_RESULT_L2_STORE (prefetch store to L2)
// PREFETCH_RESULT_L2_STORE (uses DC ZVA for cache line zeroing)
#define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXd(base) \
{ \
svprfd(pg1, (int64_t*)(base + 0), SV_PSTL2STRM); \
svprfd(pg1, (int64_t*)(base + 256), SV_PSTL2STRM); \
svprfd(pg1, (int64_t*)(base + 512), SV_PSTL2STRM); \
asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 0) : "memory" ); \
asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 1) : "memory" ); \
asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 2) : "memory" ); \
}
// PREFETCH_RESULT_L1_STORE (prefetch store to L1)
#define PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXd(base) \

View File

@ -38,10 +38,11 @@ Author: Nils Meyer <nils.meyer@ur.de>
#define LOCK_GAUGE(A)
#define UNLOCK_GAUGE(A)
#define MASK_REGS DECLARATIONS_A64FXf
#define SAVE_RESULT(A,B) RESULT_A64FXf(A); PREFETCH_RESULT_L2_STORE(B)
#define SAVE_RESULT(A,B) RESULT_A64FXf(A);
#define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXf(Dir)
#define MULT_2SPIN_2 MULT_2SPIN_2_A64FXf
#define LOAD_CHI(base) LOAD_CHI_A64FXf(base)
#define ZERO_PSI ZERO_PSI_A64FXf
#define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base)
#define XP_PROJ XP_PROJ_A64FXf
#define YP_PROJ YP_PROJ_A64FXf
@ -70,6 +71,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
#define MAYBEPERM(A,perm) if (perm) { PERMUTE; }
// DECLARATIONS
#define DECLARATIONS_A64FXf \
uint64_t baseU; \
const uint32_t lut[4][16] = { \
{8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}, \
{4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}, \
@ -126,114 +128,114 @@ Author: Nils Meyer <nils.meyer@ur.de>
// RESULT
#define RESULT_A64FXf(base) \
{ \
svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64), result_00); \
svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64), result_01); \
svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64), result_02); \
svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64), result_10); \
svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64), result_11); \
svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64), result_12); \
svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64), result_20); \
svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64), result_21); \
svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64), result_22); \
svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64), result_30); \
svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64), result_31); \
svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64), result_32); \
svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-6), result_00); \
svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-5), result_01); \
svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-4), result_02); \
svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-3), result_10); \
svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-2), result_11); \
svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-1), result_12); \
svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(0), result_20); \
svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(1), result_21); \
svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(2), result_22); \
svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(3), result_30); \
svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(4), result_31); \
svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(5), result_32); \
}
// PREFETCH_CHIMU_L2 (prefetch to L2)
#define PREFETCH_CHIMU_L2_INTERNAL_A64FXf(base) \
{ \
svprfd(pg1, (int64_t*)(base + 0), SV_PLDL2STRM); \
svprfd(pg1, (int64_t*)(base + 256), SV_PLDL2STRM); \
svprfd(pg1, (int64_t*)(base + 512), SV_PLDL2STRM); \
svprfd_vnum(pg1, (void*)(base), (int64_t)(0), SV_PLDL2STRM); \
svprfd_vnum(pg1, (void*)(base), (int64_t)(4), SV_PLDL2STRM); \
svprfd_vnum(pg1, (void*)(base), (int64_t)(8), SV_PLDL2STRM); \
}
// PREFETCH_CHIMU_L1 (prefetch to L1)
#define PREFETCH_CHIMU_L1_INTERNAL_A64FXf(base) \
{ \
svprfd(pg1, (int64_t*)(base + 0), SV_PLDL1STRM); \
svprfd(pg1, (int64_t*)(base + 256), SV_PLDL1STRM); \
svprfd(pg1, (int64_t*)(base + 512), SV_PLDL1STRM); \
svprfd_vnum(pg1, (void*)(base), (int64_t)(0), SV_PLDL1STRM); \
svprfd_vnum(pg1, (void*)(base), (int64_t)(4), SV_PLDL1STRM); \
svprfd_vnum(pg1, (void*)(base), (int64_t)(8), SV_PLDL1STRM); \
}
// PREFETCH_GAUGE_L2 (prefetch to L2)
#define PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A) \
{ \
const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \
svprfd(pg1, (int64_t*)(baseU + -256), SV_PLDL2STRM); \
svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL2STRM); \
svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL2STRM); \
svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL2STRM); \
svprfd(pg1, (int64_t*)(baseU + 768), SV_PLDL2STRM); \
svprfd(pg1, (int64_t*)(baseU + 1024), SV_PLDL2STRM); \
svprfd(pg1, (int64_t*)(baseU + 1280), SV_PLDL2STRM); \
svprfd(pg1, (int64_t*)(baseU + 1536), SV_PLDL2STRM); \
svprfd(pg1, (int64_t*)(baseU + 1792), SV_PLDL2STRM); \
const auto & ref(U[sUn](A)); baseU = (uint64_t)&ref + 3 * 3 * 64; \
svprfd_vnum(pg1, (void*)(baseU), (int64_t)(-4), SV_PLDL2STRM); \
svprfd_vnum(pg1, (void*)(baseU), (int64_t)(0), SV_PLDL2STRM); \
svprfd_vnum(pg1, (void*)(baseU), (int64_t)(4), SV_PLDL2STRM); \
svprfd_vnum(pg1, (void*)(baseU), (int64_t)(8), SV_PLDL2STRM); \
svprfd_vnum(pg1, (void*)(baseU), (int64_t)(12), SV_PLDL2STRM); \
svprfd_vnum(pg1, (void*)(baseU), (int64_t)(16), SV_PLDL2STRM); \
svprfd_vnum(pg1, (void*)(baseU), (int64_t)(20), SV_PLDL2STRM); \
svprfd_vnum(pg1, (void*)(baseU), (int64_t)(24), SV_PLDL2STRM); \
svprfd_vnum(pg1, (void*)(baseU), (int64_t)(28), SV_PLDL2STRM); \
}
// PREFETCH_GAUGE_L1 (prefetch to L1)
#define PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A) \
{ \
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL1STRM); \
svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL1STRM); \
svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL1STRM); \
const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \
svprfd_vnum(pg1, (void*)(baseU), (int64_t)(0), SV_PLDL1STRM); \
svprfd_vnum(pg1, (void*)(baseU), (int64_t)(4), SV_PLDL1STRM); \
svprfd_vnum(pg1, (void*)(baseU), (int64_t)(8), SV_PLDL1STRM); \
}
// LOAD_CHI
#define LOAD_CHI_A64FXf(base) \
{ \
Chi_00 = svld1(pg1, (float32_t*)(base + 0 * 64)); \
Chi_01 = svld1(pg1, (float32_t*)(base + 1 * 64)); \
Chi_02 = svld1(pg1, (float32_t*)(base + 2 * 64)); \
Chi_10 = svld1(pg1, (float32_t*)(base + 3 * 64)); \
Chi_11 = svld1(pg1, (float32_t*)(base + 4 * 64)); \
Chi_12 = svld1(pg1, (float32_t*)(base + 5 * 64)); \
Chi_00 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(0)); \
Chi_01 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(1)); \
Chi_02 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(2)); \
Chi_10 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(3)); \
Chi_11 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(4)); \
Chi_12 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(5)); \
}
// LOAD_CHIMU
#define LOAD_CHIMU_INTERLEAVED_A64FXf(base) \
{ \
Chimu_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64)); \
Chimu_30 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64)); \
Chimu_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64)); \
Chimu_20 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64)); \
Chimu_01 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64)); \
Chimu_31 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64)); \
Chimu_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64)); \
Chimu_21 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64)); \
Chimu_02 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64)); \
Chimu_32 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64)); \
Chimu_12 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64)); \
Chimu_22 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64)); \
Chimu_00 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-6)); \
Chimu_30 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(3)); \
Chimu_10 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-3)); \
Chimu_20 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(0)); \
Chimu_01 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-5)); \
Chimu_31 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(4)); \
Chimu_11 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-2)); \
Chimu_21 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(1)); \
Chimu_02 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-4)); \
Chimu_32 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(5)); \
Chimu_12 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-1)); \
Chimu_22 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(2)); \
}
// LOAD_CHIMU_0213
#define LOAD_CHIMU_0213_A64FXf \
{ \
const SiteSpinor & ref(in[offset]); \
Chimu_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64)); \
Chimu_20 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64)); \
Chimu_01 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64)); \
Chimu_21 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64)); \
Chimu_02 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64)); \
Chimu_22 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64)); \
Chimu_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64)); \
Chimu_30 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64)); \
Chimu_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64)); \
Chimu_31 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64)); \
Chimu_12 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64)); \
Chimu_32 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64)); \
Chimu_00 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-6)); \
Chimu_20 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(0)); \
Chimu_01 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-5)); \
Chimu_21 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(1)); \
Chimu_02 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-4)); \
Chimu_22 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(2)); \
Chimu_10 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-3)); \
Chimu_30 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(3)); \
Chimu_11 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-2)); \
Chimu_31 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(4)); \
Chimu_12 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-1)); \
Chimu_32 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(5)); \
}
// LOAD_CHIMU_0312
#define LOAD_CHIMU_0312_A64FXf \
{ \
const SiteSpinor & ref(in[offset]); \
Chimu_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64)); \
Chimu_30 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64)); \
Chimu_01 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64)); \
Chimu_31 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64)); \
Chimu_02 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64)); \
Chimu_32 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64)); \
Chimu_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64)); \
Chimu_20 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64)); \
Chimu_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64)); \
Chimu_21 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64)); \
Chimu_12 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64)); \
Chimu_22 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64)); \
Chimu_00 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-6)); \
Chimu_30 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(3)); \
Chimu_01 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-5)); \
Chimu_31 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(4)); \
Chimu_02 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-4)); \
Chimu_32 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(5)); \
Chimu_10 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-3)); \
Chimu_20 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(0)); \
Chimu_11 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-2)); \
Chimu_21 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(1)); \
Chimu_12 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-1)); \
Chimu_22 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(2)); \
}
// LOAD_TABLE0
#define LOAD_TABLE0 \
@ -261,26 +263,26 @@ Author: Nils Meyer <nils.meyer@ur.de>
Chi_12 = svtbl(Chi_12, table0);
// LOAD_GAUGE
#define LOAD_GAUGE \
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
#define LOAD_GAUGE(A) \
{ \
U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \
U_10 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -3 * 64)); \
U_20 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 0 * 64)); \
U_01 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -5 * 64)); \
U_11 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -2 * 64)); \
U_21 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \
const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \
U_00 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-6)); \
U_10 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-3)); \
U_20 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(0)); \
U_01 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-5)); \
U_11 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-2)); \
U_21 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(1)); \
}
// MULT_2SPIN
#define MULT_2SPIN_1_A64FXf(A) \
{ \
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \
U_10 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -3 * 64)); \
U_20 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 0 * 64)); \
U_01 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -5 * 64)); \
U_11 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -2 * 64)); \
U_21 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \
const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \
U_00 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-6)); \
U_10 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-3)); \
U_20 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(0)); \
U_01 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-5)); \
U_11 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-2)); \
U_21 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(1)); \
UChi_00 = svcmla_x(pg1, zero0, U_00, Chi_00, 0); \
UChi_10 = svcmla_x(pg1, zero0, U_00, Chi_10, 0); \
UChi_01 = svcmla_x(pg1, zero0, U_10, Chi_00, 0); \
@ -293,9 +295,9 @@ Author: Nils Meyer <nils.meyer@ur.de>
UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_10, 90); \
UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_00, 90); \
UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_10, 90); \
U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -4 * 64)); \
U_10 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -1 * 64)); \
U_20 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 2 * 64)); \
U_00 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-4)); \
U_10 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-1)); \
U_20 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(2)); \
}
// MULT_2SPIN_BACKEND
#define MULT_2SPIN_2_A64FXf \
@ -570,12 +572,12 @@ Author: Nils Meyer <nils.meyer@ur.de>
result_31 = svdup_f32(0.); \
result_32 = svdup_f32(0.);
// PREFETCH_RESULT_L2_STORE (prefetch store to L2)
// PREFETCH_RESULT_L2_STORE (uses DC ZVA for cache line zeroing)
#define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXf(base) \
{ \
svprfd(pg1, (int64_t*)(base + 0), SV_PSTL2STRM); \
svprfd(pg1, (int64_t*)(base + 256), SV_PSTL2STRM); \
svprfd(pg1, (int64_t*)(base + 512), SV_PSTL2STRM); \
asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 0) : "memory" ); \
asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 1) : "memory" ); \
asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 2) : "memory" ); \
}
// PREFETCH_RESULT_L1_STORE (prefetch store to L1)
#define PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXf(base) \

View File

@ -46,6 +46,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
#undef MULT_2SPIN_2
#undef MAYBEPERM
#undef LOAD_CHI
#undef ZERO_PSI
#undef XP_PROJ
#undef YP_PROJ
#undef ZP_PROJ

View File

@ -38,12 +38,20 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifdef GRID_HIP
#include <hip/hip_fp16.h>
#endif
#ifdef GRID_SYCL
namespace Grid {
typedef struct { uint16_t x;} half;
typedef struct { half x; half y;} half2;
typedef struct { float x; float y;} float2;
typedef struct { double x; double y;} double2;
}
#endif
namespace Grid {
#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))
typedef struct { uint16_t x;} half;
#endif
typedef struct Half2_t { half x; half y; } Half2;
#define COALESCE_GRANULARITY ( GEN_SIMD_WIDTH )
@ -156,7 +164,7 @@ accelerator_inline float half2float(half h)
f = __half2float(h);
#else
Grid_half hh;
hh.x = hr.x;
hh.x = h.x;
f= sfw_half_to_float(hh);
#endif
return f;

File diff suppressed because it is too large Load Diff

View File

@ -24,7 +24,7 @@ typedef typename GparityDomainWallFermionD::FermionField GparityLatticeFermionD;
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
#ifdef ENABLE_GPARITY
int Ls=16;
for(int i=0;i<argc;i++)
if(std::string(argv[i]) == "-Ls"){
@ -184,7 +184,7 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NN<<std::endl;
DwD.Report();
}
#endif
Grid_finalize();
}

View File

@ -123,6 +123,24 @@ case ${ac_LAPACK} in
AC_DEFINE([USE_LAPACK],[1],[use LAPACK]);;
esac
############### fermions
AC_ARG_ENABLE([fermion-reps],
[AC_HELP_STRING([--fermion-reps=yes|no], [enable extra fermion representation support])],
[ac_FERMION_REPS=${enable_fermion_reps}], [ac_FERMION_REPS=yes])
AM_CONDITIONAL(BUILD_FERMION_REPS, [ test "${ac_FERMION_REPS}X" == "yesX" ])
AC_ARG_ENABLE([gparity],
[AC_HELP_STRING([--enable-gparity=yes|no], [enable G-parity support])],
[ac_GPARITY=${enable_gparity}], [ac_GPARITY=yes])
AM_CONDITIONAL(BUILD_GPARITY, [ test "${ac_GPARITY}X" == "yesX" ])
case ${ac_FERMION_REPS} in
yes) AC_DEFINE([ENABLE_FERMION_REPS],[1],[non QCD fermion reps]);;
esac
case ${ac_GPARITY} in
yes) AC_DEFINE([ENABLE_GPARITY],[1],[fermion actions with GPARITY BCs]);;
esac
############### Nc
AC_ARG_ENABLE([Nc],
[AC_HELP_STRING([--enable-Nc=2|3|4], [enable number of colours])],

View File

@ -6,13 +6,27 @@ home=`pwd`
cd $home/Grid
HFILES=`find . -type f -name '*.h' -not -name '*Hdf5*' -not -path '*/gamma-gen/*' -not -path '*/Old/*' -not -path '*/Eigen/*'`
HFILES="$HFILES"
CCFILES=`find . -name '*.cc' -not -path '*/gamma-gen/*' -not -name '*Communicator*.cc' -not -name '*SharedMemory*.cc' -not -name '*Hdf5*'`
CCFILES=`find . -name '*.cc' -not -path '*/instantiation/*/*' -not -path '*/gamma-gen/*' -not -name '*Communicator*.cc' -not -name '*SharedMemory*.cc' -not -name '*Hdf5*'`
ZWILS_FERMION_FILES=` find . -name '*.cc' -path '*/instantiation/*' -path '*/instantiation/ZWilsonImpl*' `
WILS_FERMION_FILES=` find . -name '*.cc' -path '*/instantiation/*' -path '*/instantiation/WilsonImpl*' `
STAG_FERMION_FILES=` find . -name '*.cc' -path '*/instantiation/*' -path '*/instantiation/Staggered*' `
GP_FERMION_FILES=` find . -name '*.cc' -path '*/instantiation/*' -path '*/instantiation/Gparity*' `
ADJ_FERMION_FILES=` find . -name '*.cc' -path '*/instantiation/*' -path '*/instantiation/WilsonAdj*' `
TWOIND_FERMION_FILES=`find . -name '*.cc' -path '*/instantiation/*' -path '*/instantiation/WilsonTwoIndex*'`
HPPFILES=`find . -type f -name '*.hpp'`
echo HFILES=$HFILES $HPPFILES > Make.inc
echo >> Make.inc
echo CCFILES=$CCFILES >> Make.inc
echo ZWILS_FERMION_FILES=$ZWILS_FERMION_FILES >> Make.inc
echo WILS_FERMION_FILES=$WILS_FERMION_FILES >> Make.inc
echo STAG_FERMION_FILES=$STAG_FERMION_FILES >> Make.inc
echo GP_FERMION_FILES=$GP_FERMION_FILES >> Make.inc
echo ADJ_FERMION_FILES=$ADJ_FERMION_FILES >> Make.inc
echo TWOIND_FERMION_FILES=$TWOIND_FERMION_FILES >> Make.inc
# tests Make.inc
cd $home/tests

View File

@ -102,7 +102,8 @@ int main (int argc, char ** argv)
LatticeComplexD detUU(grid);
detU= Determinant(U) ;
std::cout << "Determinant before screw up " <<detU<<std::endl;
detU=detU-1.0;
std::cout << "Determinant before screw up " << norm2(detU)<<std::endl;
std::cout << " Screwing up determinant " << std::endl;
@ -116,18 +117,24 @@ int main (int argc, char ** argv)
UU=U;
detU= Determinant(U) ;
std::cout << "Determinant after screw up " <<detU<<std::endl;
detU=detU-1.0;
std::cout << "Determinant defect before projection " <<norm2(detU)<<std::endl;
tmp = U*adj(U) - ident;
std::cout << "Unitarity check before projection " << norm2(tmp)<<std::endl;
ProjectSU3(U);
detU= Determinant(U) ;
std::cout << "Determinant ProjectSU3 " <<detU<<std::endl;
detU= detU -1.0;
std::cout << "Determinant ProjectSU3 defect " <<norm2(detU)<<std::endl;
tmp = U*adj(U) - ident;
std::cout << "Unitarity check after projection " << norm2(tmp)<<std::endl;
ProjectSUn<3>(UU);
ProjectSUn(UU);
detUU= Determinant(UU);
std::cout << "Determinant ProjectSUn " <<detUU<<std::endl;
detUU= detUU -1.0;
std::cout << "Determinant ProjectSUn defect " <<norm2(detUU)<<std::endl;
tmp = UU*adj(UU) - ident;
std::cout << "Unitarity check after projection " << norm2(tmp)<<std::endl;
Grid_finalize();
}

View File

@ -108,8 +108,18 @@ int main (int argc, char ** argv)
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
LatticeGaugeField Umu(UGrid);
SU<Nc>::ColdConfiguration(Umu);
// SU<Nc>::HotConfiguration(RNG4,Umu);
if( argc > 1 && argv[1][0] != '-' )
{
std::cout<<GridLogMessage <<"Loading configuration from "<<argv[1]<<std::endl;
FieldMetaData header;
NerscIO::readConfiguration(Umu, header, argv[1]);
}
else
{
std::cout<<GridLogMessage <<"Using cold configuration"<<std::endl;
SU<Nc>::ColdConfiguration(Umu);
// SU<Nc>::HotConfiguration(RNG4,Umu);
}
RealD mass=0.3;
RealD M5 =1.0;