mirror of
https://github.com/paboyle/Grid.git
synced 2025-04-04 19:25:56 +01:00
commit
299d0de066
@ -37,7 +37,9 @@ directory
|
||||
#endif
|
||||
|
||||
//disables an Intel compiler specific warning (in json.hpp)
|
||||
#ifdef __ICC
|
||||
#pragma warning disable 488
|
||||
#endif
|
||||
|
||||
#ifdef __NVCC__
|
||||
//disables nvcc specific warning in json.hpp
|
||||
|
@ -21,6 +21,7 @@ if BUILD_HDF5
|
||||
extra_headers+=serialisation/Hdf5Type.h
|
||||
endif
|
||||
|
||||
|
||||
all: version-cache Version.h
|
||||
|
||||
version-cache:
|
||||
@ -53,6 +54,17 @@ Version.h: version-cache
|
||||
include Make.inc
|
||||
include Eigen.inc
|
||||
|
||||
extra_sources+=$(ZWILS_FERMION_FILES)
|
||||
extra_sources+=$(WILS_FERMION_FILES)
|
||||
extra_sources+=$(STAG_FERMION_FILES)
|
||||
if BUILD_GPARITY
|
||||
extra_sources+=$(GP_FERMION_FILES)
|
||||
endif
|
||||
if BUILD_FERMION_REPS
|
||||
extra_sources+=$(ADJ_FERMION_FILES)
|
||||
extra_sources+=$(TWOIND_FERMION_FILES)
|
||||
endif
|
||||
|
||||
lib_LIBRARIES = libGrid.a
|
||||
|
||||
CCFILES += $(extra_sources)
|
||||
|
@ -1,67 +0,0 @@
|
||||
#include <Grid/GridCore.h>
|
||||
#include <fcntl.h>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
MemoryStats *MemoryProfiler::stats = nullptr;
|
||||
bool MemoryProfiler::debug = false;
|
||||
|
||||
void check_huge_pages(void *Buf,uint64_t BYTES)
|
||||
{
|
||||
#ifdef __linux__
|
||||
int fd = open("/proc/self/pagemap", O_RDONLY);
|
||||
assert(fd >= 0);
|
||||
const int page_size = 4096;
|
||||
uint64_t virt_pfn = (uint64_t)Buf / page_size;
|
||||
off_t offset = sizeof(uint64_t) * virt_pfn;
|
||||
uint64_t npages = (BYTES + page_size-1) / page_size;
|
||||
uint64_t pagedata[npages];
|
||||
uint64_t ret = lseek(fd, offset, SEEK_SET);
|
||||
assert(ret == offset);
|
||||
ret = ::read(fd, pagedata, sizeof(uint64_t)*npages);
|
||||
assert(ret == sizeof(uint64_t) * npages);
|
||||
int nhugepages = npages / 512;
|
||||
int n4ktotal, nnothuge;
|
||||
n4ktotal = 0;
|
||||
nnothuge = 0;
|
||||
for (int i = 0; i < nhugepages; ++i) {
|
||||
uint64_t baseaddr = (pagedata[i*512] & 0x7fffffffffffffULL) * page_size;
|
||||
for (int j = 0; j < 512; ++j) {
|
||||
uint64_t pageaddr = (pagedata[i*512+j] & 0x7fffffffffffffULL) * page_size;
|
||||
++n4ktotal;
|
||||
if (pageaddr != baseaddr + j * page_size)
|
||||
++nnothuge;
|
||||
}
|
||||
}
|
||||
int rank = CartesianCommunicator::RankWorld();
|
||||
printf("rank %d Allocated %d 4k pages, %d not in huge pages\n", rank, n4ktotal, nnothuge);
|
||||
#endif
|
||||
}
|
||||
|
||||
/**
 * Format a byte count as a short human-readable string using binary
 * (1024-based) steps and the suffixes "", K, M, G, T, P, E followed by "B".
 *
 * Whole values are printed without a fractional part ("1 KB"); anything else
 * is printed with one decimal place ("1.5 KB").
 *
 * @param bytes  number of bytes
 * @return       formatted string, e.g. "0 B", "1023 B", "1.5 KB", "1 MB"
 */
std::string sizeString(const size_t bytes)
{
  constexpr unsigned int bufSize   = 256;
  constexpr size_t       nSuffixes = 7;
  const char *suffixes[nSuffixes] = {"", "K", "M", "G", "T", "P", "E"};
  char buf[bufSize];
  size_t s = 0;
  double count = bytes;

  // Never step past the last suffix: the previous guard (s < 7) allowed s to
  // reach 7, which would index one element beyond the end of suffixes[].
  while (count >= 1024 && s + 1 < nSuffixes)
  {
    s++;
    count /= 1024;
  }
  if (count - floor(count) == 0.0)
  {
    snprintf(buf, bufSize, "%d %sB", (int)count, suffixes[s]);
  }
  else
  {
    snprintf(buf, bufSize, "%.1f %sB", count, suffixes[s]);
  }

  return std::string(buf);
}
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
@ -102,7 +102,7 @@ public:
|
||||
///////////////////////////////////////////////////
|
||||
static void SharedMemoryAllocate(uint64_t bytes, int flags);
|
||||
static void SharedMemoryFree(void);
|
||||
static void SharedMemoryCopy(void *dest,const void *src,size_t bytes);
|
||||
static void SharedMemoryCopy(void *dest,void *src,size_t bytes);
|
||||
static void SharedMemoryZero(void *dest,size_t bytes);
|
||||
|
||||
};
|
||||
|
@ -715,7 +715,7 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
|
||||
bzero(dest,bytes);
|
||||
#endif
|
||||
}
|
||||
void GlobalSharedMemory::SharedMemoryCopy(void *dest,const void *src,size_t bytes)
|
||||
void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
|
||||
{
|
||||
#ifdef GRID_CUDA
|
||||
cudaMemcpy(dest,src,bytes,cudaMemcpyDefault);
|
||||
|
@ -29,6 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
#include <Grid/GridCore.h>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
#define header "SharedMemoryNone: "
|
||||
|
||||
/*Construct from an MPI communicator*/
|
||||
void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
|
||||
@ -55,6 +56,38 @@ void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_M
|
||||
////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Hugetlbfs mapping intended, use anonymous mmap
|
||||
////////////////////////////////////////////////////////////////////////////////////////////
|
||||
#if 1
|
||||
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
{
|
||||
std::cout << header "SharedMemoryAllocate "<< bytes<< " GPU implementation "<<std::endl;
|
||||
void * ShmCommBuf ;
|
||||
assert(_ShmSetup==1);
|
||||
assert(_ShmAlloc==0);
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Each MPI rank should allocate our own buffer
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
ShmCommBuf = acceleratorAllocDevice(bytes);
|
||||
|
||||
if (ShmCommBuf == (void *)NULL ) {
|
||||
std::cerr << " SharedMemoryNone.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
if ( WorldRank == 0 ){
|
||||
std::cout << WorldRank << header " SharedMemoryNone.cc acceleratorAllocDevice "<< bytes
|
||||
<< "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
|
||||
}
|
||||
SharedMemoryZero(ShmCommBuf,bytes);
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Loop over ranks/gpu's on our node
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
WorldShmCommBufs[0] = ShmCommBuf;
|
||||
|
||||
_ShmAllocBytes=bytes;
|
||||
_ShmAlloc=1;
|
||||
}
|
||||
#else
|
||||
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
{
|
||||
void * ShmCommBuf ;
|
||||
@ -83,7 +116,15 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
_ShmAllocBytes=bytes;
|
||||
_ShmAlloc=1;
|
||||
};
|
||||
|
||||
#endif
|
||||
// Set `bytes` bytes starting at `dest` to zero by forwarding to
// acceleratorMemSet(dest,0,bytes).
void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
|
||||
{
|
||||
acceleratorMemSet(dest,0,bytes);
|
||||
}
|
||||
// Copy `bytes` bytes from `src` to `dest` by forwarding to
// acceleratorCopyToDevice(src,dest,bytes).
// NOTE(review): the helper is called with the source first and the
// destination second; presumably its signature is (from, to, bytes) and the
// destination is device memory - confirm against the accelerator layer.
void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
|
||||
{
|
||||
acceleratorCopyToDevice(src,dest,bytes);
|
||||
}
|
||||
////////////////////////////////////////////////////////
|
||||
// Global shared functionality finished
|
||||
// Now move to per communicator functionality
|
||||
|
@ -62,7 +62,7 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
|
||||
basis_v.push_back(basis[k].View(AcceleratorWrite));
|
||||
}
|
||||
|
||||
#if ( (!defined(GRID_SYCL)) && (!defined(GRID_CUDA)) )
|
||||
#if ( (!defined(GRID_CUDA)) )
|
||||
int max_threads = thread_max();
|
||||
Vector < vobj > Bt(Nm * max_threads);
|
||||
thread_region
|
||||
@ -164,7 +164,8 @@ void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,in
|
||||
auto basis_vp=& basis_v[0];
|
||||
autoView(result_v,result,AcceleratorWrite);
|
||||
accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
|
||||
auto B=coalescedRead(zz);
|
||||
vobj zzz=Zero();
|
||||
auto B=coalescedRead(zzz);
|
||||
for(int k=k0; k<k1; ++k){
|
||||
B +=Qt_j[k] * coalescedRead(basis_vp[k][ss]);
|
||||
}
|
||||
|
@ -97,42 +97,30 @@ public:
|
||||
Coordinate icoor;
|
||||
|
||||
#ifdef GRID_SIMT
|
||||
_Spinor tmp;
|
||||
|
||||
const int Nsimd =SiteDoubledGaugeField::Nsimd();
|
||||
int s = acceleratorSIMTlane(Nsimd);
|
||||
St.iCoorFromIindex(icoor,s);
|
||||
|
||||
int mmu = mu % Nd;
|
||||
if ( SE->_around_the_world && St.parameters.twists[mmu] ) {
|
||||
|
||||
int permute_lane = (sl==1)
|
||||
|| ((distance== 1)&&(icoor[direction]==1))
|
||||
|| ((distance==-1)&&(icoor[direction]==0));
|
||||
|
||||
if ( permute_lane ) {
|
||||
tmp(0) = chi(1);
|
||||
tmp(1) = chi(0);
|
||||
} else {
|
||||
tmp(0) = chi(0);
|
||||
tmp(1) = chi(1);
|
||||
}
|
||||
auto UU0=coalescedRead(U(0)(mu));
|
||||
auto UU1=coalescedRead(U(1)(mu));
|
||||
|
||||
//Decide whether we do a G-parity flavor twist
|
||||
//Note: this assumes (but does not check) that sl==1 || sl==2 i.e. max 2 SIMD lanes in G-parity dir
|
||||
//It also assumes (but does not check) that abs(distance) == 1
|
||||
int permute_lane = (sl==1)
|
||||
|| ((distance== 1)&&(icoor[direction]==1))
|
||||
|| ((distance==-1)&&(icoor[direction]==0));
|
||||
|
||||
auto UU0=coalescedRead(U(0)(mu));
|
||||
auto UU1=coalescedRead(U(1)(mu));
|
||||
permute_lane = permute_lane && SE->_around_the_world && St.parameters.twists[mmu]; //only if we are going around the world
|
||||
|
||||
mult(&phi(0),&UU0,&tmp(0));
|
||||
mult(&phi(1),&UU1,&tmp(1));
|
||||
//Apply the links
|
||||
int f_upper = permute_lane ? 1 : 0;
|
||||
int f_lower = !f_upper;
|
||||
|
||||
} else {
|
||||
|
||||
auto UU0=coalescedRead(U(0)(mu));
|
||||
auto UU1=coalescedRead(U(1)(mu));
|
||||
|
||||
mult(&phi(0),&UU0,&chi(0));
|
||||
mult(&phi(1),&UU1,&chi(1));
|
||||
|
||||
}
|
||||
mult(&phi(0),&UU0,&chi(f_upper));
|
||||
mult(&phi(1),&UU1,&chi(f_lower));
|
||||
|
||||
#else
|
||||
typedef _Spinor vobj;
|
||||
|
@ -642,7 +642,7 @@ void CayleyFermion5D<Impl>::ContractConservedCurrent( PropagatorField &q_in_1,
|
||||
Current curr_type,
|
||||
unsigned int mu)
|
||||
{
|
||||
#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))
|
||||
#if (!defined(GRID_HIP))
|
||||
Gamma::Algebra Gmu [] = {
|
||||
Gamma::Algebra::GammaX,
|
||||
Gamma::Algebra::GammaY,
|
||||
@ -826,7 +826,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
|
||||
}
|
||||
#endif
|
||||
|
||||
#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))
|
||||
#if (!defined(GRID_HIP))
|
||||
int tshift = (mu == Nd-1) ? 1 : 0;
|
||||
////////////////////////////////////////////////
|
||||
// GENERAL CAYLEY CASE
|
||||
|
@ -38,9 +38,6 @@ Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
|
||||
// undefine everything related to kernels
|
||||
#include <simd/Fujitsu_A64FX_undef.h>
|
||||
|
||||
// enable A64FX body
|
||||
#define WILSONKERNELSASMBODYA64FX
|
||||
//#pragma message("A64FX Dslash: WilsonKernelsAsmBodyA64FX.h")
|
||||
|
||||
///////////////////////////////////////////////////////////
|
||||
// If we are A64FX specialise the single precision routine
|
||||
@ -63,119 +60,89 @@ Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
|
||||
#define INTERIOR_AND_EXTERIOR
|
||||
#undef INTERIOR
|
||||
#undef EXTERIOR
|
||||
|
||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
|
||||
#undef INTERIOR_AND_EXTERIOR
|
||||
#define INTERIOR
|
||||
#undef EXTERIOR
|
||||
|
||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
|
||||
#undef INTERIOR_AND_EXTERIOR
|
||||
#undef INTERIOR
|
||||
#define EXTERIOR
|
||||
|
||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
@ -185,119 +152,89 @@ WilsonKernels<ZWilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldV
|
||||
#define INTERIOR_AND_EXTERIOR
|
||||
#undef INTERIOR
|
||||
#undef EXTERIOR
|
||||
|
||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
|
||||
#undef INTERIOR_AND_EXTERIOR
|
||||
#define INTERIOR
|
||||
#undef EXTERIOR
|
||||
|
||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
|
||||
#undef INTERIOR_AND_EXTERIOR
|
||||
#undef INTERIOR
|
||||
#define EXTERIOR
|
||||
|
||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
// undefine
|
||||
@ -330,119 +267,89 @@ WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFie
|
||||
#define INTERIOR_AND_EXTERIOR
|
||||
#undef INTERIOR
|
||||
#undef EXTERIOR
|
||||
|
||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
|
||||
#undef INTERIOR_AND_EXTERIOR
|
||||
#define INTERIOR
|
||||
#undef EXTERIOR
|
||||
|
||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
|
||||
#undef INTERIOR_AND_EXTERIOR
|
||||
#undef INTERIOR
|
||||
#define EXTERIOR
|
||||
|
||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// XYZT vectorised, dag Kernel, double
|
||||
@ -451,124 +358,93 @@ WilsonKernels<ZWilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldV
|
||||
#define INTERIOR_AND_EXTERIOR
|
||||
#undef INTERIOR
|
||||
#undef EXTERIOR
|
||||
|
||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
|
||||
#undef INTERIOR_AND_EXTERIOR
|
||||
#define INTERIOR
|
||||
#undef EXTERIOR
|
||||
|
||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
|
||||
#undef INTERIOR_AND_EXTERIOR
|
||||
#undef INTERIOR
|
||||
#define EXTERIOR
|
||||
|
||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
|
||||
// undefs
|
||||
#undef WILSONKERNELSASMBODYA64FX
|
||||
#include <simd/Fujitsu_A64FX_undef.h>
|
||||
|
||||
#endif //A64FXASM
|
||||
|
@ -25,6 +25,11 @@ Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
|
||||
// GCC 10 messes up SVE instruction scheduling using -O3, but
|
||||
// -O3 -fno-schedule-insns -fno-schedule-insns2 does wonders
|
||||
// performance now is better than armclang 20.2
|
||||
|
||||
#ifdef KERNEL_DAG
|
||||
#define DIR0_PROJ XP_PROJ
|
||||
#define DIR1_PROJ YP_PROJ
|
||||
@ -97,7 +102,7 @@ Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
|
||||
PROJ; \
|
||||
MAYBEPERM(PERMUTE_DIR,perm); \
|
||||
} else { \
|
||||
LOAD_CHI(base); \
|
||||
LOAD_CHI(base); \
|
||||
} \
|
||||
base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \
|
||||
MULT_2SPIN_1(Dir); \
|
||||
@ -110,6 +115,11 @@ Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
|
||||
} \
|
||||
RECON; \
|
||||
|
||||
/*
|
||||
NB: using PREFETCH_GAUGE_L2(Dir+4); here results in a performance penalty,
|
||||
although it was expected to improve performance
|
||||
*/
|
||||
|
||||
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
|
||||
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
|
||||
PREFETCH1_CHIMU(base); \
|
||||
@ -126,73 +136,63 @@ Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
|
||||
|
||||
#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
|
||||
basep = st.GetPFInfo(nent,plocal); nent++; \
|
||||
if ( local ) { \
|
||||
LOAD_CHIMU(base); \
|
||||
LOAD_TABLE(PERMUTE_DIR); \
|
||||
PROJ; \
|
||||
MAYBEPERM(PERMUTE_DIR,perm); \
|
||||
}else if ( st.same_node[Dir] ) {LOAD_CHI(base);} \
|
||||
base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \
|
||||
if ( local || st.same_node[Dir] ) { \
|
||||
MULT_2SPIN_1(Dir); \
|
||||
PREFETCH_CHIMU(base); \
|
||||
/* PREFETCH_GAUGE_L1(NxtDir); */ \
|
||||
MULT_2SPIN_2; \
|
||||
if (s == 0) { \
|
||||
if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \
|
||||
} \
|
||||
RECON; \
|
||||
PREFETCH_CHIMU_L2(basep); \
|
||||
} else { PREFETCH_CHIMU(base); } \
|
||||
if ( local ) { \
|
||||
LOAD_CHIMU(base); \
|
||||
LOAD_TABLE(PERMUTE_DIR); \
|
||||
PROJ; \
|
||||
MAYBEPERM(PERMUTE_DIR,perm); \
|
||||
}else if ( st.same_node[Dir] ) {LOAD_CHI(base);} \
|
||||
if ( local || st.same_node[Dir] ) { \
|
||||
MULT_2SPIN_1(Dir); \
|
||||
MULT_2SPIN_2; \
|
||||
RECON; \
|
||||
} \
|
||||
base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \
|
||||
PREFETCH_CHIMU(base); \
|
||||
PREFETCH_CHIMU_L2(basep); \
|
||||
|
||||
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
|
||||
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
|
||||
PREFETCH1_CHIMU(base); \
|
||||
{ ZERO_PSI; } \
|
||||
ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)
|
||||
|
||||
#define RESULT(base,basep) SAVE_RESULT(base,basep);
|
||||
|
||||
#endif
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Post comms kernel
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
#ifdef EXTERIOR
|
||||
|
||||
|
||||
#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
|
||||
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
|
||||
if((!local)&&(!st.same_node[Dir]) ) { \
|
||||
LOAD_CHI(base); \
|
||||
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
|
||||
if((!local)&&(!st.same_node[Dir]) ) { \
|
||||
LOAD_CHI(base); \
|
||||
MULT_2SPIN_1(Dir); \
|
||||
PREFETCH_CHIMU(base); \
|
||||
/* PREFETCH_GAUGE_L1(NxtDir); */ \
|
||||
MULT_2SPIN_2; \
|
||||
if (s == 0) { \
|
||||
if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \
|
||||
} \
|
||||
RECON; \
|
||||
nmu++; \
|
||||
RECON; \
|
||||
nmu++; \
|
||||
}
|
||||
|
||||
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
|
||||
nmu=0; \
|
||||
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++;\
|
||||
if((!local)&&(!st.same_node[Dir]) ) { \
|
||||
LOAD_CHI(base); \
|
||||
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
|
||||
nmu=0; \
|
||||
{ ZERO_PSI;} \
|
||||
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
|
||||
if((!local)&&(!st.same_node[Dir]) ) { \
|
||||
LOAD_CHI(base); \
|
||||
MULT_2SPIN_1(Dir); \
|
||||
PREFETCH_CHIMU(base); \
|
||||
/* PREFETCH_GAUGE_L1(NxtDir); */ \
|
||||
MULT_2SPIN_2; \
|
||||
if (s == 0) { \
|
||||
if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \
|
||||
} \
|
||||
RECON; \
|
||||
nmu++; \
|
||||
RECON; \
|
||||
nmu++; \
|
||||
}
|
||||
|
||||
#define RESULT(base,basep) if (nmu){ ADD_RESULT(base,base);}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
{
|
||||
int nmu;
|
||||
int local,perm, ptype;
|
||||
@ -209,7 +209,6 @@ Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
|
||||
int ssn=ssU+1; if(ssn>=nmax) ssn=0;
|
||||
// int sUn=lo.Reorder(ssn);
|
||||
int sUn=ssn;
|
||||
LOCK_GAUGE(0);
|
||||
#else
|
||||
int sU =ssU;
|
||||
int ssn=ssU+1; if(ssn>=nmax) ssn=0;
|
||||
@ -295,6 +294,11 @@ Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
|
||||
std::cout << "----------------------------------------------------" << std::endl;
|
||||
#endif
|
||||
|
||||
// DC ZVA test
|
||||
// { uint64_t basestore = (uint64_t)&out[ss];
|
||||
// PREFETCH_RESULT_L2_STORE(basestore); }
|
||||
|
||||
|
||||
ASM_LEG(Ym,Zm,PERMUTE_DIR2,DIR5_PROJ,DIR5_RECON);
|
||||
|
||||
#ifdef SHOW
|
||||
@ -308,6 +312,11 @@ Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
|
||||
std::cout << "----------------------------------------------------" << std::endl;
|
||||
#endif
|
||||
|
||||
// DC ZVA test
|
||||
//{ uint64_t basestore = (uint64_t)&out[ss];
|
||||
// PREFETCH_RESULT_L2_STORE(basestore); }
|
||||
|
||||
|
||||
ASM_LEG(Zm,Tm,PERMUTE_DIR1,DIR6_PROJ,DIR6_RECON);
|
||||
|
||||
#ifdef SHOW
|
||||
@ -321,6 +330,11 @@ Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
|
||||
std::cout << "----------------------------------------------------" << std::endl;
|
||||
#endif
|
||||
|
||||
// DC ZVA test
|
||||
//{ uint64_t basestore = (uint64_t)&out[ss];
|
||||
// PREFETCH_RESULT_L2_STORE(basestore); }
|
||||
|
||||
|
||||
ASM_LEG(Tm,Xp,PERMUTE_DIR0,DIR7_PROJ,DIR7_RECON);
|
||||
|
||||
#ifdef SHOW
|
||||
@ -341,6 +355,7 @@ Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
|
||||
base = (uint64_t) &out[ss];
|
||||
basep= st.GetPFInfo(nent,plocal); ent++;
|
||||
basep = (uint64_t) &out[ssn];
|
||||
//PREFETCH_RESULT_L1_STORE(base);
|
||||
RESULT(base,basep);
|
||||
|
||||
#ifdef SHOW
|
||||
|
@ -154,6 +154,10 @@ public:
|
||||
return Hsum.real();
|
||||
}
|
||||
|
||||
// Projection hook for this field implementation: forwards U to ProjectSUn, which modifies it in place.
static inline void Project(Field &U) {
|
||||
ProjectSUn(U);
|
||||
}
|
||||
|
||||
static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) {
|
||||
SU<Nc>::HotConfiguration(pRNG, U);
|
||||
}
|
||||
|
@ -54,6 +54,10 @@ public:
|
||||
static inline void ColdConfiguration(GridParallelRNG &pRNG, Field &U) {
|
||||
U = 1.0;
|
||||
}
|
||||
|
||||
// Projection hook for this field implementation: intentionally a no-op, U is left untouched.
static inline void Project(Field &U) {
|
||||
return;
|
||||
}
|
||||
|
||||
static void MomentumSpacePropagator(Field &out, RealD m)
|
||||
{
|
||||
@ -234,6 +238,10 @@ public:
|
||||
#endif //USE_FFT_ACCELERATION
|
||||
}
|
||||
|
||||
// Projection hook for this field implementation: intentionally a no-op, U is left untouched.
static inline void Project(Field &U) {
|
||||
return;
|
||||
}
|
||||
|
||||
static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) {
|
||||
Group::GaussianFundamentalLieAlgebraMatrix(pRNG, U);
|
||||
}
|
||||
|
@ -95,7 +95,7 @@ private:
|
||||
|
||||
typedef typename IntegratorType::Field Field;
|
||||
typedef std::vector< HmcObservable<Field> * > ObsListType;
|
||||
|
||||
|
||||
//pass these from the resource manager
|
||||
GridSerialRNG &sRNG;
|
||||
GridParallelRNG &pRNG;
|
||||
|
@ -313,6 +313,8 @@ public:
|
||||
std::cout << GridLogIntegrator << " times[" << level << "]= " << t_P[level] << " " << t_U << std::endl;
|
||||
}
|
||||
|
||||
FieldImplementation::Project(U);
|
||||
|
||||
// and that we indeed got to the end of the trajectory
|
||||
assert(fabs(t_U - Params.trajL) < 1.0e-6);
|
||||
|
||||
|
@ -820,7 +820,6 @@ LatticeComplexD Determinant(const Lattice<iScalar<iScalar<iMatrix<vComplexD, N>
|
||||
}}
|
||||
ComplexD det = EigenU.determinant();
|
||||
pokeLocalSite(det,ret_v,lcoor);
|
||||
std::cout << " site " <<site<<" det " <<det <<std::endl;
|
||||
});
|
||||
return ret;
|
||||
}
|
||||
@ -830,8 +829,8 @@ static void ProjectSUn(Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > &Umu)
|
||||
Umu = ProjectOnGroup(Umu);
|
||||
auto det = Determinant(Umu);
|
||||
|
||||
det = pow(det,-1);
|
||||
|
||||
det = conjugate(det);
|
||||
|
||||
for(int i=0;i<N;i++){
|
||||
auto element = PeekIndex<ColourIndex>(Umu,N-1,i);
|
||||
element = element * det;
|
||||
|
@ -1,779 +0,0 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: Fujitsu_A64FX_asm_double.h
|
||||
|
||||
Copyright (C) 2020
|
||||
|
||||
Author: Nils Meyer <nils.meyer@ur.de>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#define LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_A64FXd(base)
|
||||
#define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXd(A)
|
||||
#define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A)
|
||||
#define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXd(A)
|
||||
#define PREFETCH_GAUGE_L2(A) PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A)
|
||||
#define PF_GAUGE(A)
|
||||
#define PREFETCH_RESULT_L2_STORE(A) PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXd(A)
|
||||
#define PREFETCH_RESULT_L1_STORE(A) PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXd(A)
|
||||
#define PREFETCH1_CHIMU(A) PREFETCH_CHIMU_L1(A)
|
||||
#define PREFETCH_CHIMU(A) PREFETCH_CHIMU_L1(A)
|
||||
#define LOCK_GAUGE(A)
|
||||
#define UNLOCK_GAUGE(A)
|
||||
#define MASK_REGS DECLARATIONS_A64FXd
|
||||
#define SAVE_RESULT(A,B) RESULT_A64FXd(A); PREFETCH_RESULT_L2_STORE(B)
|
||||
#define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXd(Dir)
|
||||
#define MULT_2SPIN_2 MULT_2SPIN_2_A64FXd
|
||||
#define LOAD_CHI(base) LOAD_CHI_A64FXd(base)
|
||||
#define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base)
|
||||
#define XP_PROJ XP_PROJ_A64FXd
|
||||
#define YP_PROJ YP_PROJ_A64FXd
|
||||
#define ZP_PROJ ZP_PROJ_A64FXd
|
||||
#define TP_PROJ TP_PROJ_A64FXd
|
||||
#define XM_PROJ XM_PROJ_A64FXd
|
||||
#define YM_PROJ YM_PROJ_A64FXd
|
||||
#define ZM_PROJ ZM_PROJ_A64FXd
|
||||
#define TM_PROJ TM_PROJ_A64FXd
|
||||
#define XP_RECON XP_RECON_A64FXd
|
||||
#define XM_RECON XM_RECON_A64FXd
|
||||
#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXd
|
||||
#define YM_RECON_ACCUM YM_RECON_ACCUM_A64FXd
|
||||
#define ZM_RECON_ACCUM ZM_RECON_ACCUM_A64FXd
|
||||
#define TM_RECON_ACCUM TM_RECON_ACCUM_A64FXd
|
||||
#define XP_RECON_ACCUM XP_RECON_ACCUM_A64FXd
|
||||
#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXd
|
||||
#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXd
|
||||
#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXd
|
||||
// Permutation selectors passed as the Dir argument of LOAD_TABLE / MAYBEPERM below.
// Values 0..2 select LOAD_TABLE0..LOAD_TABLE2; value 3 is skipped by both macros.
#define PERMUTE_DIR0 0
|
||||
#define PERMUTE_DIR1 1
|
||||
#define PERMUTE_DIR2 2
|
||||
#define PERMUTE_DIR3 3
|
||||
// Expands to the permute asm statement followed by its own ';'
// (a caller writing "PERMUTE;" therefore produces an extra empty statement).
#define PERMUTE PERMUTE_A64FXd;
|
||||
// Loads the lookup-table row for Dir == 0, 1 or 2; any other value loads nothing.
// NOTE(review): expands to a bare if/else-if chain and uses Dir unparenthesised,
// so it must be used as a full statement inside braces with a simple argument.
#define LOAD_TABLE(Dir) if (Dir == 0) { LOAD_TABLE0; } else if (Dir == 1) { LOAD_TABLE1; } else if (Dir == 2) { LOAD_TABLE2; }
|
||||
// Runs PERMUTE only when Dir != 3 and perm is non-zero.
#define MAYBEPERM(Dir,perm) if (Dir != 3) { if (perm) { PERMUTE; } }
|
||||
// DECLARATIONS
// Expanded once at the top of a kernel body (via MASK_REGS). It
//  - declares `lut`, the table of 64-bit element indices that LOAD_TABLE0..3
//    load into z30 for the tbl-based PERMUTE; each row is 8 x 8 bytes and is
//    addressed as "row, mul vl", which matches only for 64-byte (512-bit) vectors;
//  - sets z31 to zero; MULT_2SPIN_1_A64FXd later copies it with movprfx to
//    clear its accumulators.
// Row 3 is the identity permutation. It used to read {0,1,2,4,5,6,7,8}, which
// skipped index 3 and contained the out-of-range index 8. LOAD_TABLE and
// MAYBEPERM never select row 3, so only a direct LOAD_TABLE3 user is affected.
#define DECLARATIONS_A64FXd \
    const uint64_t lut[4][8] = { \
        {4, 5, 6, 7, 0, 1, 2, 3}, \
        {2, 3, 0, 1, 6, 7, 4, 5}, \
        {1, 0, 3, 2, 5, 4, 7, 6}, \
        {0, 1, 2, 3, 4, 5, 6, 7} };\
asm ( \
    "fmov z31.d , 0 \n\t" \
    : \
    : \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
|
||||
|
||||
// RESULT
// Stores registers z0..z11 to 12 consecutive vector-length slots. The store
// pointer is base + 2*3*64 bytes and the slots are addressed with offsets
// -6..+5 (in units of the vector length), z0 going to the lowest slot.
// With 64-byte vectors the lowest slot is `base` itself.
// NOTE(review): the fixed 2*3*64 bias presumes a 512-bit vector length - confirm.
|
||||
#define RESULT_A64FXd(base) \
|
||||
{ \
|
||||
asm ( \
|
||||
"str z0, [%[storeptr], -6, mul vl] \n\t" \
|
||||
"str z1, [%[storeptr], -5, mul vl] \n\t" \
|
||||
"str z2, [%[storeptr], -4, mul vl] \n\t" \
|
||||
"str z3, [%[storeptr], -3, mul vl] \n\t" \
|
||||
"str z4, [%[storeptr], -2, mul vl] \n\t" \
|
||||
"str z5, [%[storeptr], -1, mul vl] \n\t" \
|
||||
"str z6, [%[storeptr], 0, mul vl] \n\t" \
|
||||
"str z7, [%[storeptr], 1, mul vl] \n\t" \
|
||||
"str z8, [%[storeptr], 2, mul vl] \n\t" \
|
||||
"str z9, [%[storeptr], 3, mul vl] \n\t" \
|
||||
"str z10, [%[storeptr], 4, mul vl] \n\t" \
|
||||
"str z11, [%[storeptr], 5, mul vl] \n\t" \
|
||||
: \
|
||||
: [storeptr] "r" (base + 2 * 3 * 64) \
|
||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
|
||||
); \
|
||||
}
|
||||
// PREFETCH_CHIMU_L2 (prefetch to L2)
|
||||
#define PREFETCH_CHIMU_L2_INTERNAL_A64FXd(base) \
|
||||
{ \
|
||||
asm ( \
|
||||
"prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
|
||||
"prfd PLDL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
|
||||
"prfd PLDL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
|
||||
: \
|
||||
: [fetchptr] "r" (base) \
|
||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
|
||||
); \
|
||||
}
|
||||
// PREFETCH_CHIMU_L1 (prefetch to L1)
|
||||
#define PREFETCH_CHIMU_L1_INTERNAL_A64FXd(base) \
|
||||
{ \
|
||||
asm ( \
|
||||
"prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
|
||||
"prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
|
||||
"prfd PLDL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
|
||||
: \
|
||||
: [fetchptr] "r" (base) \
|
||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
|
||||
); \
|
||||
}
|
||||
// PREFETCH_GAUGE_L2 (prefetch to L2)
// Issues nine PLDL2STRM prefetches for the gauge entry U[sUn](A). Note the site
// index is sUn, not the sU used by the L1 variant and by the multiply macros.
// The base address is biased by 3*3*64 bytes and the prefetches use offsets
// -4, 0, 4, ..., 28 in units of the vector length.
// Uses predicate p5, which must already be set (the LOAD_CHIMU macros do this with ptrue).
// `U` and `sUn` are taken from the scope in which the macro is expanded.
|
||||
#define PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A) \
|
||||
{ \
|
||||
const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \
|
||||
asm ( \
|
||||
"prfd PLDL2STRM, p5, [%[fetchptr], -4, mul vl] \n\t" \
|
||||
"prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
|
||||
"prfd PLDL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
|
||||
"prfd PLDL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
|
||||
"prfd PLDL2STRM, p5, [%[fetchptr], 12, mul vl] \n\t" \
|
||||
"prfd PLDL2STRM, p5, [%[fetchptr], 16, mul vl] \n\t" \
|
||||
"prfd PLDL2STRM, p5, [%[fetchptr], 20, mul vl] \n\t" \
|
||||
"prfd PLDL2STRM, p5, [%[fetchptr], 24, mul vl] \n\t" \
|
||||
"prfd PLDL2STRM, p5, [%[fetchptr], 28, mul vl] \n\t" \
|
||||
: \
|
||||
: [fetchptr] "r" (baseU) \
|
||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
|
||||
); \
|
||||
}
|
||||
// PREFETCH_GAUGE_L1 (prefetch to L1)
|
||||
#define PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A) \
|
||||
{ \
|
||||
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
|
||||
asm ( \
|
||||
"prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
|
||||
"prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
|
||||
"prfd PLDL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
|
||||
: \
|
||||
: [fetchptr] "r" (baseU) \
|
||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
|
||||
); \
|
||||
}
|
||||
// LOAD_CHI
|
||||
#define LOAD_CHI_A64FXd(base) \
|
||||
{ \
|
||||
asm ( \
|
||||
"ldr z12, [%[fetchptr], 0, mul vl] \n\t" \
|
||||
"ldr z13, [%[fetchptr], 1, mul vl] \n\t" \
|
||||
"ldr z14, [%[fetchptr], 2, mul vl] \n\t" \
|
||||
"ldr z15, [%[fetchptr], 3, mul vl] \n\t" \
|
||||
"ldr z16, [%[fetchptr], 4, mul vl] \n\t" \
|
||||
"ldr z17, [%[fetchptr], 5, mul vl] \n\t" \
|
||||
: \
|
||||
: [fetchptr] "r" (base) \
|
||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
|
||||
); \
|
||||
}
|
||||
// LOAD_CHIMU
// Sets predicate p5 to all-true for 64-bit elements, then loads z12..z23 from
// 12 consecutive vector-length slots addressed relative to base + 2*3*64
// bytes: register z(12+k) receives slot (k - 6), for k = 0..11.
// The loads are issued in an interleaved order (z12, z21, z15, z18, ...),
// not in ascending register order; the register/slot mapping is unaffected.
// NOTE(review): the fixed 2*3*64 bias presumes a 512-bit vector length - confirm.
|
||||
#define LOAD_CHIMU_INTERLEAVED_A64FXd(base) \
|
||||
{ \
|
||||
asm ( \
|
||||
"ptrue p5.d \n\t" \
|
||||
"ldr z12, [%[fetchptr], -6, mul vl] \n\t" \
|
||||
"ldr z21, [%[fetchptr], 3, mul vl] \n\t" \
|
||||
"ldr z15, [%[fetchptr], -3, mul vl] \n\t" \
|
||||
"ldr z18, [%[fetchptr], 0, mul vl] \n\t" \
|
||||
"ldr z13, [%[fetchptr], -5, mul vl] \n\t" \
|
||||
"ldr z22, [%[fetchptr], 4, mul vl] \n\t" \
|
||||
"ldr z16, [%[fetchptr], -2, mul vl] \n\t" \
|
||||
"ldr z19, [%[fetchptr], 1, mul vl] \n\t" \
|
||||
"ldr z14, [%[fetchptr], -4, mul vl] \n\t" \
|
||||
"ldr z23, [%[fetchptr], 5, mul vl] \n\t" \
|
||||
"ldr z17, [%[fetchptr], -1, mul vl] \n\t" \
|
||||
"ldr z20, [%[fetchptr], 2, mul vl] \n\t" \
|
||||
: \
|
||||
: [fetchptr] "r" (base + 2 * 3 * 64) \
|
||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
|
||||
); \
|
||||
}
|
||||
// LOAD_CHIMU_0213
|
||||
#define LOAD_CHIMU_0213_A64FXd \
|
||||
{ \
|
||||
const SiteSpinor & ref(in[offset]); \
|
||||
asm ( \
|
||||
"ptrue p5.d \n\t" \
|
||||
"ldr z12, [%[fetchptr], -6, mul vl] \n\t" \
|
||||
"ldr z18, [%[fetchptr], 0, mul vl] \n\t" \
|
||||
"ldr z13, [%[fetchptr], -5, mul vl] \n\t" \
|
||||
"ldr z19, [%[fetchptr], 1, mul vl] \n\t" \
|
||||
"ldr z14, [%[fetchptr], -4, mul vl] \n\t" \
|
||||
"ldr z20, [%[fetchptr], 2, mul vl] \n\t" \
|
||||
"ldr z15, [%[fetchptr], -3, mul vl] \n\t" \
|
||||
"ldr z21, [%[fetchptr], 3, mul vl] \n\t" \
|
||||
"ldr z16, [%[fetchptr], -2, mul vl] \n\t" \
|
||||
"ldr z22, [%[fetchptr], 4, mul vl] \n\t" \
|
||||
"ldr z17, [%[fetchptr], -1, mul vl] \n\t" \
|
||||
"ldr z23, [%[fetchptr], 5, mul vl] \n\t" \
|
||||
: \
|
||||
: [fetchptr] "r" (&ref[2][0]) \
|
||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
|
||||
); \
|
||||
}
|
||||
// LOAD_CHIMU_0312
|
||||
#define LOAD_CHIMU_0312_A64FXd \
|
||||
{ \
|
||||
const SiteSpinor & ref(in[offset]); \
|
||||
asm ( \
|
||||
"ptrue p5.d \n\t" \
|
||||
"ldr z12, [%[fetchptr], -6, mul vl] \n\t" \
|
||||
"ldr z21, [%[fetchptr], 3, mul vl] \n\t" \
|
||||
"ldr z13, [%[fetchptr], -5, mul vl] \n\t" \
|
||||
"ldr z22, [%[fetchptr], 4, mul vl] \n\t" \
|
||||
"ldr z14, [%[fetchptr], -4, mul vl] \n\t" \
|
||||
"ldr z23, [%[fetchptr], 5, mul vl] \n\t" \
|
||||
"ldr z15, [%[fetchptr], -3, mul vl] \n\t" \
|
||||
"ldr z18, [%[fetchptr], 0, mul vl] \n\t" \
|
||||
"ldr z16, [%[fetchptr], -2, mul vl] \n\t" \
|
||||
"ldr z19, [%[fetchptr], 1, mul vl] \n\t" \
|
||||
"ldr z17, [%[fetchptr], -1, mul vl] \n\t" \
|
||||
"ldr z20, [%[fetchptr], 2, mul vl] \n\t" \
|
||||
: \
|
||||
: [fetchptr] "r" (&ref[2][0]) \
|
||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
|
||||
); \
|
||||
}
|
||||
// LOAD_TABLE0
|
||||
#define LOAD_TABLE0 \
|
||||
asm ( \
|
||||
"ldr z30, [%[tableptr], %[index], mul vl] \n\t" \
|
||||
: \
|
||||
: [tableptr] "r" (&lut[0]),[index] "i" (0) \
|
||||
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
|
||||
);
|
||||
|
||||
// LOAD_TABLE1
|
||||
#define LOAD_TABLE1 \
|
||||
asm ( \
|
||||
"ldr z30, [%[tableptr], %[index], mul vl] \n\t" \
|
||||
: \
|
||||
: [tableptr] "r" (&lut[0]),[index] "i" (1) \
|
||||
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
|
||||
);
|
||||
|
||||
// LOAD_TABLE2
|
||||
#define LOAD_TABLE2 \
|
||||
asm ( \
|
||||
"ldr z30, [%[tableptr], %[index], mul vl] \n\t" \
|
||||
: \
|
||||
: [tableptr] "r" (&lut[0]),[index] "i" (2) \
|
||||
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
|
||||
);
|
||||
|
||||
// LOAD_TABLE3
|
||||
#define LOAD_TABLE3 \
|
||||
asm ( \
|
||||
"ldr z30, [%[tableptr], %[index], mul vl] \n\t" \
|
||||
: \
|
||||
: [tableptr] "r" (&lut[0]),[index] "i" (3) \
|
||||
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
|
||||
);
|
||||
|
||||
// PERMUTE
// Permutes the 64-bit elements of z12..z17 in place with tbl, using the index
// vector held in z30 (filled by one of the LOAD_TABLE0..3 macros).
// Unlike most macros in this file the expansion is a bare asm statement that
// already ends in ';' and is not wrapped in braces.
|
||||
#define PERMUTE_A64FXd \
|
||||
asm ( \
|
||||
"tbl z12.d, { z12.d }, z30.d \n\t" \
|
||||
"tbl z13.d, { z13.d }, z30.d \n\t" \
|
||||
"tbl z14.d, { z14.d }, z30.d \n\t" \
|
||||
"tbl z15.d, { z15.d }, z30.d \n\t" \
|
||||
"tbl z16.d, { z16.d }, z30.d \n\t" \
|
||||
"tbl z17.d, { z17.d }, z30.d \n\t" \
|
||||
: \
|
||||
: \
|
||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
|
||||
);
|
||||
|
||||
// LOAD_GAUGE
// Sets predicate p5 to all-true for 64-bit elements and loads z24..z29 from
// the gauge entry U[sU](A), read through a pointer biased by 2*3*64 bytes
// (slots -6, -3, 0, -5, -2, +1 in units of the vector length).
// `ref` and `baseU` are declared inside the braces, as in MULT_2SPIN_1_A64FXd
// and the PREFETCH_GAUGE_* macros, so the macro can be expanded more than once
// in a scope without redeclaring them or clashing with the caller's names.
// NOTE: this is an object-like macro; `A`, `U` and `sU` are not parameters and
// must be visible at the point of expansion.
#define LOAD_GAUGE \
{ \
    const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
asm ( \
    "ptrue p5.d \n\t" \
    "ldr z24, [%[fetchptr], -6, mul vl] \n\t" \
    "ldr z25, [%[fetchptr], -3, mul vl] \n\t" \
    "ldr z26, [%[fetchptr], 0, mul vl] \n\t" \
    "ldr z27, [%[fetchptr], -5, mul vl] \n\t" \
    "ldr z28, [%[fetchptr], -2, mul vl] \n\t" \
    "ldr z29, [%[fetchptr], 1, mul vl] \n\t" \
    : \
    : [fetchptr] "r" (baseU + 2 * 3 * 64) \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
|
||||
// MULT_2SPIN
// First stage of the gauge-link multiply. Takes the address of U[sU](A) and
// reads it through a pointer biased by 2*3*64 bytes:
//  - loads z24..z26 from slots -6, -3, 0 and z27..z29 from slots -5, -2, +1;
//  - for each accumulator z18..z23, copies the zero held in z31 (movprfx) and
//    then applies fcmla with rotations 0 and 90 (together a full complex
//    multiply-accumulate): z24/z25/z26 times z12 go to z18/z19/z20, and
//    z24/z25/z26 times z15 go to z21/z22/z23;
//  - finally reloads z24..z26 from slots -4, -1, +2.
// z27..z29 and the reloaded z24..z26 are not consumed here; they are read by
// MULT_2SPIN_2_A64FXd, which must follow with no intervening write to them.
// Requires p5 (set by ptrue in the LOAD_CHIMU macros) and z31 == 0 (set by
// DECLARATIONS_A64FXd). `U` and `sU` come from the expanding scope.
|
||||
#define MULT_2SPIN_1_A64FXd(A) \
|
||||
{ \
|
||||
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
|
||||
asm ( \
|
||||
"ldr z24, [%[fetchptr], -6, mul vl] \n\t" \
|
||||
"ldr z25, [%[fetchptr], -3, mul vl] \n\t" \
|
||||
"ldr z26, [%[fetchptr], 0, mul vl] \n\t" \
|
||||
"ldr z27, [%[fetchptr], -5, mul vl] \n\t" \
|
||||
"ldr z28, [%[fetchptr], -2, mul vl] \n\t" \
|
||||
"ldr z29, [%[fetchptr], 1, mul vl] \n\t" \
|
||||
"movprfx z18.d, p5/m, z31.d \n\t" \
|
||||
"fcmla z18.d, p5/m, z24.d, z12.d, 0 \n\t" \
|
||||
"movprfx z21.d, p5/m, z31.d \n\t" \
|
||||
"fcmla z21.d, p5/m, z24.d, z15.d, 0 \n\t" \
|
||||
"movprfx z19.d, p5/m, z31.d \n\t" \
|
||||
"fcmla z19.d, p5/m, z25.d, z12.d, 0 \n\t" \
|
||||
"movprfx z22.d, p5/m, z31.d \n\t" \
|
||||
"fcmla z22.d, p5/m, z25.d, z15.d, 0 \n\t" \
|
||||
"movprfx z20.d, p5/m, z31.d \n\t" \
|
||||
"fcmla z20.d, p5/m, z26.d, z12.d, 0 \n\t" \
|
||||
"movprfx z23.d, p5/m, z31.d \n\t" \
|
||||
"fcmla z23.d, p5/m, z26.d, z15.d, 0 \n\t" \
|
||||
"fcmla z18.d, p5/m, z24.d, z12.d, 90 \n\t" \
|
||||
"fcmla z21.d, p5/m, z24.d, z15.d, 90 \n\t" \
|
||||
"fcmla z19.d, p5/m, z25.d, z12.d, 90 \n\t" \
|
||||
"fcmla z22.d, p5/m, z25.d, z15.d, 90 \n\t" \
|
||||
"fcmla z20.d, p5/m, z26.d, z12.d, 90 \n\t" \
|
||||
"fcmla z23.d, p5/m, z26.d, z15.d, 90 \n\t" \
|
||||
"ldr z24, [%[fetchptr], -4, mul vl] \n\t" \
|
||||
"ldr z25, [%[fetchptr], -1, mul vl] \n\t" \
|
||||
"ldr z26, [%[fetchptr], 2, mul vl] \n\t" \
|
||||
: \
|
||||
: [fetchptr] "r" (baseU + 2 * 3 * 64) \
|
||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
|
||||
); \
|
||||
}
|
||||
// MULT_2SPIN_BACKEND
|
||||
#define MULT_2SPIN_2_A64FXd \
|
||||
{ \
|
||||
asm ( \
|
||||
"fcmla z18.d, p5/m, z27.d, z13.d, 0 \n\t" \
|
||||
"fcmla z21.d, p5/m, z27.d, z16.d, 0 \n\t" \
|
||||
"fcmla z19.d, p5/m, z28.d, z13.d, 0 \n\t" \
|
||||
"fcmla z22.d, p5/m, z28.d, z16.d, 0 \n\t" \
|
||||
"fcmla z20.d, p5/m, z29.d, z13.d, 0 \n\t" \
|
||||
"fcmla z23.d, p5/m, z29.d, z16.d, 0 \n\t" \
|
||||
"fcmla z18.d, p5/m, z27.d, z13.d, 90 \n\t" \
|
||||
"fcmla z21.d, p5/m, z27.d, z16.d, 90 \n\t" \
|
||||
"fcmla z19.d, p5/m, z28.d, z13.d, 90 \n\t" \
|
||||
"fcmla z22.d, p5/m, z28.d, z16.d, 90 \n\t" \
|
||||
"fcmla z20.d, p5/m, z29.d, z13.d, 90 \n\t" \
|
||||
"fcmla z23.d, p5/m, z29.d, z16.d, 90 \n\t" \
|
||||
"fcmla z18.d, p5/m, z24.d, z14.d, 0 \n\t" \
|
||||
"fcmla z21.d, p5/m, z24.d, z17.d, 0 \n\t" \
|
||||
"fcmla z19.d, p5/m, z25.d, z14.d, 0 \n\t" \
|
||||
"fcmla z22.d, p5/m, z25.d, z17.d, 0 \n\t" \
|
||||
"fcmla z20.d, p5/m, z26.d, z14.d, 0 \n\t" \
|
||||
"fcmla z23.d, p5/m, z26.d, z17.d, 0 \n\t" \
|
||||
"fcmla z18.d, p5/m, z24.d, z14.d, 90 \n\t" \
|
||||
"fcmla z21.d, p5/m, z24.d, z17.d, 90 \n\t" \
|
||||
"fcmla z19.d, p5/m, z25.d, z14.d, 90 \n\t" \
|
||||
"fcmla z22.d, p5/m, z25.d, z17.d, 90 \n\t" \
|
||||
"fcmla z20.d, p5/m, z26.d, z14.d, 90 \n\t" \
|
||||
"fcmla z23.d, p5/m, z26.d, z17.d, 90 \n\t" \
|
||||
: \
|
||||
: \
|
||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
|
||||
); \
|
||||
}
|
||||
// XP_PROJ
|
||||
#define XP_PROJ_A64FXd \
|
||||
{ \
|
||||
asm ( \
|
||||
"fcadd z12.d, p5/m, z12.d, z21.d, 90 \n\t" \
|
||||
"fcadd z13.d, p5/m, z13.d, z22.d, 90 \n\t" \
|
||||
"fcadd z14.d, p5/m, z14.d, z23.d, 90 \n\t" \
|
||||
"fcadd z15.d, p5/m, z15.d, z18.d, 90 \n\t" \
|
||||
"fcadd z16.d, p5/m, z16.d, z19.d, 90 \n\t" \
|
||||
"fcadd z17.d, p5/m, z17.d, z20.d, 90 \n\t" \
|
||||
: \
|
||||
: \
|
||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
|
||||
); \
|
||||
}
|
||||
// XP_RECON
|
||||
#define XP_RECON_A64FXd \
|
||||
asm ( \
|
||||
"movprfx z6.d, p5/m, z31.d \n\t" \
|
||||
"fcadd z6.d, p5/m, z6.d, z21.d, 270 \n\t" \
|
||||
"movprfx z7.d, p5/m, z31.d \n\t" \
|
||||
"fcadd z7.d, p5/m, z7.d, z22.d, 270 \n\t" \
|
||||
"movprfx z8.d, p5/m, z31.d \n\t" \
|
||||
"fcadd z8.d, p5/m, z8.d, z23.d, 270 \n\t" \
|
||||
"movprfx z9.d, p5/m, z31.d \n\t" \
|
||||
"fcadd z9.d, p5/m, z9.d, z18.d, 270 \n\t" \
|
||||
"movprfx z10.d, p5/m, z31.d \n\t" \
|
||||
"fcadd z10.d, p5/m, z10.d, z19.d, 270 \n\t" \
|
||||
"movprfx z11.d, p5/m, z31.d \n\t" \
|
||||
"fcadd z11.d, p5/m, z11.d, z20.d, 270 \n\t" \
|
||||
"mov z0.d, p5/m, z18.d \n\t" \
|
||||
"mov z1.d, p5/m, z19.d \n\t" \
|
||||
"mov z2.d, p5/m, z20.d \n\t" \
|
||||
"mov z3.d, p5/m, z21.d \n\t" \
|
||||
"mov z4.d, p5/m, z22.d \n\t" \
|
||||
"mov z5.d, p5/m, z23.d \n\t" \
|
||||
: \
|
||||
: \
|
||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
|
||||
);
|
||||
|
||||
// XP_RECON_ACCUM
|
||||
#define XP_RECON_ACCUM_A64FXd \
|
||||
asm ( \
|
||||
"fcadd z9.d, p5/m, z9.d, z18.d, 270 \n\t" \
|
||||
"fadd z0.d, p5/m, z0.d, z18.d \n\t" \
|
||||
"fcadd z10.d, p5/m, z10.d, z19.d, 270 \n\t" \
|
||||
"fadd z1.d, p5/m, z1.d, z19.d \n\t" \
|
||||
"fcadd z11.d, p5/m, z11.d, z20.d, 270 \n\t" \
|
||||
"fadd z2.d, p5/m, z2.d, z20.d \n\t" \
|
||||
"fcadd z6.d, p5/m, z6.d, z21.d, 270 \n\t" \
|
||||
"fadd z3.d, p5/m, z3.d, z21.d \n\t" \
|
||||
"fcadd z7.d, p5/m, z7.d, z22.d, 270 \n\t" \
|
||||
"fadd z4.d, p5/m, z4.d, z22.d \n\t" \
|
||||
"fcadd z8.d, p5/m, z8.d, z23.d, 270 \n\t" \
|
||||
"fadd z5.d, p5/m, z5.d, z23.d \n\t" \
|
||||
: \
|
||||
: \
|
||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
|
||||
);
|
||||
|
||||
// YP_PROJ
|
||||
#define YP_PROJ_A64FXd \
|
||||
{ \
|
||||
asm ( \
|
||||
"fsub z12.d, p5/m, z12.d, z21.d \n\t" \
|
||||
"fsub z13.d, p5/m, z13.d, z22.d \n\t" \
|
||||
"fsub z14.d, p5/m, z14.d, z23.d \n\t" \
|
||||
"fadd z15.d, p5/m, z15.d, z18.d \n\t" \
|
||||
"fadd z16.d, p5/m, z16.d, z19.d \n\t" \
|
||||
"fadd z17.d, p5/m, z17.d, z20.d \n\t" \
|
||||
: \
|
||||
: \
|
||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
|
||||
); \
|
||||
}
|
||||
// ZP_PROJ
|
||||
#define ZP_PROJ_A64FXd \
|
||||
{ \
|
||||
asm ( \
|
||||
"fcadd z12.d, p5/m, z12.d, z18.d, 90 \n\t" \
|
||||
"fcadd z13.d, p5/m, z13.d, z19.d, 90 \n\t" \
|
||||
"fcadd z14.d, p5/m, z14.d, z20.d, 90 \n\t" \
|
||||
"fcadd z15.d, p5/m, z15.d, z21.d, 270 \n\t" \
|
||||
"fcadd z16.d, p5/m, z16.d, z22.d, 270 \n\t" \
|
||||
"fcadd z17.d, p5/m, z17.d, z23.d, 270 \n\t" \
|
||||
: \
|
||||
: \
|
||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
|
||||
); \
|
||||
}
|
||||
// TP_PROJ
|
||||
#define TP_PROJ_A64FXd \
|
||||
{ \
|
||||
asm ( \
|
||||
"fadd z12.d, p5/m, z12.d, z18.d \n\t" \
|
||||
"fadd z13.d, p5/m, z13.d, z19.d \n\t" \
|
||||
"fadd z14.d, p5/m, z14.d, z20.d \n\t" \
|
||||
"fadd z15.d, p5/m, z15.d, z21.d \n\t" \
|
||||
"fadd z16.d, p5/m, z16.d, z22.d \n\t" \
|
||||
"fadd z17.d, p5/m, z17.d, z23.d \n\t" \
|
||||
: \
|
||||
: \
|
||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
|
||||
); \
|
||||
}
|
||||
// XM_PROJ
|
||||
#define XM_PROJ_A64FXd \
|
||||
{ \
|
||||
asm ( \
|
||||
"fcadd z12.d, p5/m, z12.d, z21.d, 270 \n\t" \
|
||||
"fcadd z13.d, p5/m, z13.d, z22.d, 270 \n\t" \
|
||||
"fcadd z14.d, p5/m, z14.d, z23.d, 270 \n\t" \
|
||||
"fcadd z15.d, p5/m, z15.d, z18.d, 270 \n\t" \
|
||||
"fcadd z16.d, p5/m, z16.d, z19.d, 270 \n\t" \
|
||||
"fcadd z17.d, p5/m, z17.d, z20.d, 270 \n\t" \
|
||||
: \
|
||||
: \
|
||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
|
||||
); \
|
||||
}
|
||||
// XM_RECON
|
||||
#define XM_RECON_A64FXd \
|
||||
asm ( \
|
||||
"movprfx z6.d, p5/m, z31.d \n\t" \
|
||||
"fcadd z6.d, p5/m, z6.d, z21.d, 90 \n\t" \
|
||||
"movprfx z7.d, p5/m, z31.d \n\t" \
|
||||
"fcadd z7.d, p5/m, z7.d, z22.d, 90 \n\t" \
|
||||
"movprfx z8.d, p5/m, z31.d \n\t" \
|
||||
"fcadd z8.d, p5/m, z8.d, z23.d, 90 \n\t" \
|
||||
"movprfx z9.d, p5/m, z31.d \n\t" \
|
||||
"fcadd z9.d, p5/m, z9.d, z18.d, 90 \n\t" \
|
||||
"movprfx z10.d, p5/m, z31.d \n\t" \
|
||||
"fcadd z10.d, p5/m, z10.d, z19.d, 90 \n\t" \
|
||||
"movprfx z11.d, p5/m, z31.d \n\t" \
|
||||
"fcadd z11.d, p5/m, z11.d, z20.d, 90 \n\t" \
|
||||
"mov z0.d, p5/m, z18.d \n\t" \
|
||||
"mov z1.d, p5/m, z19.d \n\t" \
|
||||
"mov z2.d, p5/m, z20.d \n\t" \
|
||||
"mov z3.d, p5/m, z21.d \n\t" \
|
||||
"mov z4.d, p5/m, z22.d \n\t" \
|
||||
"mov z5.d, p5/m, z23.d \n\t" \
|
||||
: \
|
||||
: \
|
||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
|
||||
);
|
||||
|
||||
// YM_PROJ
|
||||
#define YM_PROJ_A64FXd \
|
||||
{ \
|
||||
asm ( \
|
||||
"fadd z12.d, p5/m, z12.d, z21.d \n\t" \
|
||||
"fadd z13.d, p5/m, z13.d, z22.d \n\t" \
|
||||
"fadd z14.d, p5/m, z14.d, z23.d \n\t" \
|
||||
"fsub z15.d, p5/m, z15.d, z18.d \n\t" \
|
||||
"fsub z16.d, p5/m, z16.d, z19.d \n\t" \
|
||||
"fsub z17.d, p5/m, z17.d, z20.d \n\t" \
|
||||
: \
|
||||
: \
|
||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
|
||||
); \
|
||||
}
|
||||
// ZM_PROJ
|
||||
#define ZM_PROJ_A64FXd \
|
||||
{ \
|
||||
asm ( \
|
||||
"fcadd z12.d, p5/m, z12.d, z18.d, 270 \n\t" \
|
||||
"fcadd z13.d, p5/m, z13.d, z19.d, 270 \n\t" \
|
||||
"fcadd z14.d, p5/m, z14.d, z20.d, 270 \n\t" \
|
||||
"fcadd z15.d, p5/m, z15.d, z21.d, 90 \n\t" \
|
||||
"fcadd z16.d, p5/m, z16.d, z22.d, 90 \n\t" \
|
||||
"fcadd z17.d, p5/m, z17.d, z23.d, 90 \n\t" \
|
||||
: \
|
||||
: \
|
||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
|
||||
); \
|
||||
}
|
||||
// TM_PROJ
|
||||
#define TM_PROJ_A64FXd \
|
||||
{ \
|
||||
asm ( \
|
||||
"ptrue p5.d \n\t" \
|
||||
"fsub z12.d, p5/m, z12.d, z18.d \n\t" \
|
||||
"fsub z13.d, p5/m, z13.d, z19.d \n\t" \
|
||||
"fsub z14.d, p5/m, z14.d, z20.d \n\t" \
|
||||
"fsub z15.d, p5/m, z15.d, z21.d \n\t" \
|
||||
"fsub z16.d, p5/m, z16.d, z22.d \n\t" \
|
||||
"fsub z17.d, p5/m, z17.d, z23.d \n\t" \
|
||||
: \
|
||||
: \
|
||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
|
||||
); \
|
||||
}
|
||||
// XM_RECON_ACCUM
|
||||
#define XM_RECON_ACCUM_A64FXd \
|
||||
asm ( \
|
||||
"fcadd z9.d, p5/m, z9.d, z18.d, 90 \n\t" \
|
||||
"fcadd z10.d, p5/m, z10.d, z19.d, 90 \n\t" \
|
||||
"fcadd z11.d, p5/m, z11.d, z20.d, 90 \n\t" \
|
||||
"fcadd z6.d, p5/m, z6.d, z21.d, 90 \n\t" \
|
||||
"fcadd z7.d, p5/m, z7.d, z22.d, 90 \n\t" \
|
||||
"fcadd z8.d, p5/m, z8.d, z23.d, 90 \n\t" \
|
||||
"fadd z0.d, p5/m, z0.d, z18.d \n\t" \
|
||||
"fadd z1.d, p5/m, z1.d, z19.d \n\t" \
|
||||
"fadd z2.d, p5/m, z2.d, z20.d \n\t" \
|
||||
"fadd z3.d, p5/m, z3.d, z21.d \n\t" \
|
||||
"fadd z4.d, p5/m, z4.d, z22.d \n\t" \
|
||||
"fadd z5.d, p5/m, z5.d, z23.d \n\t" \
|
||||
: \
|
||||
: \
|
||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
|
||||
);
|
||||
|
||||
// YP_RECON_ACCUM
|
||||
#define YP_RECON_ACCUM_A64FXd \
|
||||
asm ( \
|
||||
"fadd z0.d, p5/m, z0.d, z18.d \n\t" \
|
||||
"fsub z9.d, p5/m, z9.d, z18.d \n\t" \
|
||||
"fadd z1.d, p5/m, z1.d, z19.d \n\t" \
|
||||
"fsub z10.d, p5/m, z10.d, z19.d \n\t" \
|
||||
"fadd z2.d, p5/m, z2.d, z20.d \n\t" \
|
||||
"fsub z11.d, p5/m, z11.d, z20.d \n\t" \
|
||||
"fadd z3.d, p5/m, z3.d, z21.d \n\t" \
|
||||
"fadd z6.d, p5/m, z6.d, z21.d \n\t" \
|
||||
"fadd z4.d, p5/m, z4.d, z22.d \n\t" \
|
||||
"fadd z7.d, p5/m, z7.d, z22.d \n\t" \
|
||||
"fadd z5.d, p5/m, z5.d, z23.d \n\t" \
|
||||
"fadd z8.d, p5/m, z8.d, z23.d \n\t" \
|
||||
: \
|
||||
: \
|
||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
|
||||
);
|
||||
|
||||
// YM_RECON_ACCUM
|
||||
#define YM_RECON_ACCUM_A64FXd \
|
||||
asm ( \
|
||||
"fadd z0.d, p5/m, z0.d, z18.d \n\t" \
|
||||
"fadd z9.d, p5/m, z9.d, z18.d \n\t" \
|
||||
"fadd z1.d, p5/m, z1.d, z19.d \n\t" \
|
||||
"fadd z10.d, p5/m, z10.d, z19.d \n\t" \
|
||||
"fadd z2.d, p5/m, z2.d, z20.d \n\t" \
|
||||
"fadd z11.d, p5/m, z11.d, z20.d \n\t" \
|
||||
"fadd z3.d, p5/m, z3.d, z21.d \n\t" \
|
||||
"fsub z6.d, p5/m, z6.d, z21.d \n\t" \
|
||||
"fadd z4.d, p5/m, z4.d, z22.d \n\t" \
|
||||
"fsub z7.d, p5/m, z7.d, z22.d \n\t" \
|
||||
"fadd z5.d, p5/m, z5.d, z23.d \n\t" \
|
||||
"fsub z8.d, p5/m, z8.d, z23.d \n\t" \
|
||||
: \
|
||||
: \
|
||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
|
||||
);
|
||||
|
||||
// ZP_RECON_ACCUM
|
||||
#define ZP_RECON_ACCUM_A64FXd \
|
||||
asm ( \
|
||||
"fcadd z6.d, p5/m, z6.d, z18.d, 270 \n\t" \
|
||||
"fadd z0.d, p5/m, z0.d, z18.d \n\t" \
|
||||
"fcadd z7.d, p5/m, z7.d, z19.d, 270 \n\t" \
|
||||
"fadd z1.d, p5/m, z1.d, z19.d \n\t" \
|
||||
"fcadd z8.d, p5/m, z8.d, z20.d, 270 \n\t" \
|
||||
"fadd z2.d, p5/m, z2.d, z20.d \n\t" \
|
||||
"fcadd z9.d, p5/m, z9.d, z21.d, 90 \n\t" \
|
||||
"fadd z3.d, p5/m, z3.d, z21.d \n\t" \
|
||||
"fcadd z10.d, p5/m, z10.d, z22.d, 90 \n\t" \
|
||||
"fadd z4.d, p5/m, z4.d, z22.d \n\t" \
|
||||
"fcadd z11.d, p5/m, z11.d, z23.d, 90 \n\t" \
|
||||
"fadd z5.d, p5/m, z5.d, z23.d \n\t" \
|
||||
: \
|
||||
: \
|
||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
|
||||
);
|
||||
|
||||
// ZM_RECON_ACCUM
|
||||
#define ZM_RECON_ACCUM_A64FXd \
|
||||
asm ( \
|
||||
"fcadd z6.d, p5/m, z6.d, z18.d, 90 \n\t" \
|
||||
"fadd z0.d, p5/m, z0.d, z18.d \n\t" \
|
||||
"fcadd z7.d, p5/m, z7.d, z19.d, 90 \n\t" \
|
||||
"fadd z1.d, p5/m, z1.d, z19.d \n\t" \
|
||||
"fcadd z8.d, p5/m, z8.d, z20.d, 90 \n\t" \
|
||||
"fadd z2.d, p5/m, z2.d, z20.d \n\t" \
|
||||
"fcadd z9.d, p5/m, z9.d, z21.d, 270 \n\t" \
|
||||
"fadd z3.d, p5/m, z3.d, z21.d \n\t" \
|
||||
"fcadd z10.d, p5/m, z10.d, z22.d, 270 \n\t" \
|
||||
"fadd z4.d, p5/m, z4.d, z22.d \n\t" \
|
||||
"fcadd z11.d, p5/m, z11.d, z23.d, 270 \n\t" \
|
||||
"fadd z5.d, p5/m, z5.d, z23.d \n\t" \
|
||||
: \
|
||||
: \
|
||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
|
||||
);
|
||||
|
||||
// TP_RECON_ACCUM
|
||||
#define TP_RECON_ACCUM_A64FXd \
|
||||
asm ( \
|
||||
"fadd z0.d, p5/m, z0.d, z18.d \n\t" \
|
||||
"fadd z6.d, p5/m, z6.d, z18.d \n\t" \
|
||||
"fadd z1.d, p5/m, z1.d, z19.d \n\t" \
|
||||
"fadd z7.d, p5/m, z7.d, z19.d \n\t" \
|
||||
"fadd z2.d, p5/m, z2.d, z20.d \n\t" \
|
||||
"fadd z8.d, p5/m, z8.d, z20.d \n\t" \
|
||||
"fadd z3.d, p5/m, z3.d, z21.d \n\t" \
|
||||
"fadd z9.d, p5/m, z9.d, z21.d \n\t" \
|
||||
"fadd z4.d, p5/m, z4.d, z22.d \n\t" \
|
||||
"fadd z10.d, p5/m, z10.d, z22.d \n\t" \
|
||||
"fadd z5.d, p5/m, z5.d, z23.d \n\t" \
|
||||
"fadd z11.d, p5/m, z11.d, z23.d \n\t" \
|
||||
: \
|
||||
: \
|
||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
|
||||
);
|
||||
|
||||
// TM_RECON_ACCUM
|
||||
#define TM_RECON_ACCUM_A64FXd \
|
||||
asm ( \
|
||||
"fadd z0.d, p5/m, z0.d, z18.d \n\t" \
|
||||
"fsub z6.d, p5/m, z6.d, z18.d \n\t" \
|
||||
"fadd z1.d, p5/m, z1.d, z19.d \n\t" \
|
||||
"fsub z7.d, p5/m, z7.d, z19.d \n\t" \
|
||||
"fadd z2.d, p5/m, z2.d, z20.d \n\t" \
|
||||
"fsub z8.d, p5/m, z8.d, z20.d \n\t" \
|
||||
"fadd z3.d, p5/m, z3.d, z21.d \n\t" \
|
||||
"fsub z9.d, p5/m, z9.d, z21.d \n\t" \
|
||||
"fadd z4.d, p5/m, z4.d, z22.d \n\t" \
|
||||
"fsub z10.d, p5/m, z10.d, z22.d \n\t" \
|
||||
"fadd z5.d, p5/m, z5.d, z23.d \n\t" \
|
||||
"fsub z11.d, p5/m, z11.d, z23.d \n\t" \
|
||||
: \
|
||||
: \
|
||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
|
||||
);
|
||||
|
||||
// ZERO_PSI
|
||||
#define ZERO_PSI_A64FXd \
|
||||
asm ( \
|
||||
"ptrue p5.d \n\t" \
|
||||
"fmov z0.d , 0 \n\t" \
|
||||
"fmov z1.d , 0 \n\t" \
|
||||
"fmov z2.d , 0 \n\t" \
|
||||
"fmov z3.d , 0 \n\t" \
|
||||
"fmov z4.d , 0 \n\t" \
|
||||
"fmov z5.d , 0 \n\t" \
|
||||
"fmov z6.d , 0 \n\t" \
|
||||
"fmov z7.d , 0 \n\t" \
|
||||
"fmov z8.d , 0 \n\t" \
|
||||
"fmov z9.d , 0 \n\t" \
|
||||
"fmov z10.d , 0 \n\t" \
|
||||
"fmov z11.d , 0 \n\t" \
|
||||
: \
|
||||
: \
|
||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
|
||||
);
|
||||
|
||||
// PREFETCH_RESULT_L2_STORE (prefetch store to L2)
|
||||
#define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXd(base) \
|
||||
{ \
|
||||
asm ( \
|
||||
"prfd PSTL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
|
||||
"prfd PSTL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
|
||||
"prfd PSTL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
|
||||
: \
|
||||
: [fetchptr] "r" (base) \
|
||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
|
||||
); \
|
||||
}
|
||||
// PREFETCH_RESULT_L1_STORE (prefetch store to L1)
|
||||
#define PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXd(base) \
|
||||
{ \
|
||||
asm ( \
|
||||
"prfd PSTL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
|
||||
"prfd PSTL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
|
||||
"prfd PSTL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
|
||||
: \
|
||||
: [fetchptr] "r" (base) \
|
||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
|
||||
); \
|
||||
}
|
||||
// ADD_RESULT_INTERNAL
|
||||
#define ADD_RESULT_INTERNAL_A64FXd \
|
||||
asm ( \
|
||||
"fadd z0.d, p5/m, z0.d, z12.d \n\t" \
|
||||
"fadd z1.d, p5/m, z1.d, z13.d \n\t" \
|
||||
"fadd z2.d, p5/m, z2.d, z14.d \n\t" \
|
||||
"fadd z3.d, p5/m, z3.d, z15.d \n\t" \
|
||||
"fadd z4.d, p5/m, z4.d, z16.d \n\t" \
|
||||
"fadd z5.d, p5/m, z5.d, z17.d \n\t" \
|
||||
"fadd z6.d, p5/m, z6.d, z18.d \n\t" \
|
||||
"fadd z7.d, p5/m, z7.d, z19.d \n\t" \
|
||||
"fadd z8.d, p5/m, z8.d, z20.d \n\t" \
|
||||
"fadd z9.d, p5/m, z9.d, z21.d \n\t" \
|
||||
"fadd z10.d, p5/m, z10.d, z22.d \n\t" \
|
||||
"fadd z11.d, p5/m, z11.d, z23.d \n\t" \
|
||||
: \
|
||||
: \
|
||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
|
||||
);
|
||||
|
@ -1,779 +0,0 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: Fujitsu_A64FX_asm_single.h
|
||||
|
||||
Copyright (C) 2020
|
||||
|
||||
Author: Nils Meyer <nils.meyer@ur.de>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#define LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_A64FXf(base)
|
||||
#define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXf(A)
|
||||
#define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A)
|
||||
#define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXf(A)
|
||||
#define PREFETCH_GAUGE_L2(A) PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A)
|
||||
#define PF_GAUGE(A)
|
||||
#define PREFETCH_RESULT_L2_STORE(A) PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXf(A)
|
||||
#define PREFETCH_RESULT_L1_STORE(A) PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXf(A)
|
||||
#define PREFETCH1_CHIMU(A) PREFETCH_CHIMU_L1(A)
|
||||
#define PREFETCH_CHIMU(A) PREFETCH_CHIMU_L1(A)
|
||||
#define LOCK_GAUGE(A)
|
||||
#define UNLOCK_GAUGE(A)
|
||||
#define MASK_REGS DECLARATIONS_A64FXf
|
||||
// Write the result registers out through RESULT_A64FXf(A), then issue the
// PREFETCH_RESULT_L2_STORE prefetch for B.
// The two steps are wrapped in one compound statement so that both stay under
// the same `if`/loop when the call site does not supply its own braces.
#define SAVE_RESULT(A,B) { RESULT_A64FXf(A); PREFETCH_RESULT_L2_STORE(B); }
|
||||
#define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXf(Dir)
|
||||
#define MULT_2SPIN_2 MULT_2SPIN_2_A64FXf
|
||||
#define LOAD_CHI(base) LOAD_CHI_A64FXf(base)
|
||||
// Run LOAD_CHIMU(base), then ADD_RESULT_INTERNAL_A64FXf, then RESULT_A64FXf(base),
// in that order. `basep` is accepted but not referenced by the expansion.
// The three steps are wrapped in one compound statement so that all of them stay
// under the same `if`/loop when the call site does not supply its own braces.
#define ADD_RESULT(base,basep) { LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base); }
|
||||
#define XP_PROJ XP_PROJ_A64FXf
|
||||
#define YP_PROJ YP_PROJ_A64FXf
|
||||
#define ZP_PROJ ZP_PROJ_A64FXf
|
||||
#define TP_PROJ TP_PROJ_A64FXf
|
||||
#define XM_PROJ XM_PROJ_A64FXf
|
||||
#define YM_PROJ YM_PROJ_A64FXf
|
||||
#define ZM_PROJ ZM_PROJ_A64FXf
|
||||
#define TM_PROJ TM_PROJ_A64FXf
|
||||
#define XP_RECON XP_RECON_A64FXf
|
||||
#define XM_RECON XM_RECON_A64FXf
|
||||
#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXf
|
||||
#define YM_RECON_ACCUM YM_RECON_ACCUM_A64FXf
|
||||
#define ZM_RECON_ACCUM ZM_RECON_ACCUM_A64FXf
|
||||
#define TM_RECON_ACCUM TM_RECON_ACCUM_A64FXf
|
||||
#define XP_RECON_ACCUM XP_RECON_ACCUM_A64FXf
|
||||
#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXf
|
||||
#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXf
|
||||
#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXf
|
||||
#define PERMUTE_DIR0 0
|
||||
#define PERMUTE_DIR1 1
|
||||
#define PERMUTE_DIR2 2
|
||||
#define PERMUTE_DIR3 3
|
||||
#define PERMUTE PERMUTE_A64FXf;
|
||||
// Select one of LOAD_TABLE0..LOAD_TABLE3 according to Dir (0-3); any other
// value expands to no load at all.
// Dir is parenthesised so an operator expression passed as the argument is
// compared as a whole, every branch uses the same `LOAD_TABLEn;` form, and the
// whole chain is one compound statement.
// Dir may be evaluated up to four times: pass a side-effect-free expression.
#define LOAD_TABLE(Dir) { if ((Dir) == 0) { LOAD_TABLE0; } else if ((Dir) == 1) { LOAD_TABLE1; } else if ((Dir) == 2) { LOAD_TABLE2; } else if ((Dir) == 3) { LOAD_TABLE3; } }
|
||||
#define MAYBEPERM(A,perm) if (perm) { PERMUTE; }
|
||||
// DECLARATIONS
|
||||
#define DECLARATIONS_A64FXf \
|
||||
const uint32_t lut[4][16] = { \
|
||||
{8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}, \
|
||||
{4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}, \
|
||||
{2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}, \
|
||||
{1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14} }; \
|
||||
asm ( \
|
||||
"fmov z31.s , 0 \n\t" \
|
||||
: \
|
||||
: \
|
||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
|
||||
);
|
||||
|
||||
// RESULT
|
||||
// Store the twelve registers z0-z11 to twelve consecutive vector-length slots.
// storeptr is `base` advanced by 2*3*64 bytes, and the slots are addressed with
// the immediates -6..5 (mul vl) relative to it.
// NOTE(review): the 64 is presumably the 512-bit vector length in bytes, which
// would place z0 exactly at `base` - confirm.
// `base` is parenthesised so an operator expression passed as the argument is
// offset as a whole.
#define RESULT_A64FXf(base)  \
{ \
asm ( \
    "str z0, [%[storeptr], -6, mul vl] \n\t" \
    "str z1, [%[storeptr], -5, mul vl] \n\t" \
    "str z2, [%[storeptr], -4, mul vl] \n\t" \
    "str z3, [%[storeptr], -3, mul vl] \n\t" \
    "str z4, [%[storeptr], -2, mul vl] \n\t" \
    "str z5, [%[storeptr], -1, mul vl] \n\t" \
    "str z6, [%[storeptr], 0, mul vl] \n\t" \
    "str z7, [%[storeptr], 1, mul vl] \n\t" \
    "str z8, [%[storeptr], 2, mul vl] \n\t" \
    "str z9, [%[storeptr], 3, mul vl] \n\t" \
    "str z10, [%[storeptr], 4, mul vl] \n\t" \
    "str z11, [%[storeptr], 5, mul vl] \n\t" \
    :  \
    : [storeptr] "r" ((base) + 2 * 3 * 64) \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
|
||||
// PREFETCH_CHIMU_L2 (prefetch to L2)
|
||||
#define PREFETCH_CHIMU_L2_INTERNAL_A64FXf(base) \
|
||||
{ \
|
||||
asm ( \
|
||||
"prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
|
||||
"prfd PLDL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
|
||||
"prfd PLDL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
|
||||
: \
|
||||
: [fetchptr] "r" (base) \
|
||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
|
||||
); \
|
||||
}
|
||||
// PREFETCH_CHIMU_L1 (prefetch to L1)
|
||||
#define PREFETCH_CHIMU_L1_INTERNAL_A64FXf(base) \
|
||||
{ \
|
||||
asm ( \
|
||||
"prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
|
||||
"prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
|
||||
"prfd PLDL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
|
||||
: \
|
||||
: [fetchptr] "r" (base) \
|
||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
|
||||
); \
|
||||
}
|
||||
// PREFETCH_GAUGE_L2 (prefetch to L2)
|
||||
#define PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A) \
|
||||
{ \
|
||||
const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \
|
||||
asm ( \
|
||||
"prfd PLDL2STRM, p5, [%[fetchptr], -4, mul vl] \n\t" \
|
||||
"prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
|
||||
"prfd PLDL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
|
||||
"prfd PLDL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
|
||||
"prfd PLDL2STRM, p5, [%[fetchptr], 12, mul vl] \n\t" \
|
||||
"prfd PLDL2STRM, p5, [%[fetchptr], 16, mul vl] \n\t" \
|
||||
"prfd PLDL2STRM, p5, [%[fetchptr], 20, mul vl] \n\t" \
|
||||
"prfd PLDL2STRM, p5, [%[fetchptr], 24, mul vl] \n\t" \
|
||||
"prfd PLDL2STRM, p5, [%[fetchptr], 28, mul vl] \n\t" \
|
||||
: \
|
||||
: [fetchptr] "r" (baseU) \
|
||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
|
||||
); \
|
||||
}
|
||||
// PREFETCH_GAUGE_L1 (prefetch to L1)
|
||||
#define PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A) \
|
||||
{ \
|
||||
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
|
||||
asm ( \
|
||||
"prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
|
||||
"prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
|
||||
"prfd PLDL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
|
||||
: \
|
||||
: [fetchptr] "r" (baseU) \
|
||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
|
||||
); \
|
||||
}
|
||||
// LOAD_CHI
|
||||
#define LOAD_CHI_A64FXf(base) \
|
||||
{ \
|
||||
asm ( \
|
||||
"ldr z12, [%[fetchptr], 0, mul vl] \n\t" \
|
||||
"ldr z13, [%[fetchptr], 1, mul vl] \n\t" \
|
||||
"ldr z14, [%[fetchptr], 2, mul vl] \n\t" \
|
||||
"ldr z15, [%[fetchptr], 3, mul vl] \n\t" \
|
||||
"ldr z16, [%[fetchptr], 4, mul vl] \n\t" \
|
||||
"ldr z17, [%[fetchptr], 5, mul vl] \n\t" \
|
||||
: \
|
||||
: [fetchptr] "r" (base) \
|
||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
|
||||
); \
|
||||
}
|
||||
// LOAD_CHIMU
|
||||
// Set predicate p5 to all-true for .s lanes, then load the twelve vector-length
// slots at fetchptr -6..5 (mul vl) into z12-z23. fetchptr is `base` advanced by
// 2*3*64 bytes. The loads are issued in an interleaved slot order
// (-6, 3, -3, 0, -5, 4, ...) rather than ascending.
// `base` is parenthesised so an operator expression passed as the argument is
// offset as a whole.
#define LOAD_CHIMU_INTERLEAVED_A64FXf(base)  \
{ \
asm ( \
    "ptrue p5.s \n\t" \
    "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \
    "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \
    "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \
    "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \
    "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \
    "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \
    "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \
    "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \
    "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \
    "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \
    "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \
    "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \
    :  \
    : [fetchptr] "r" ((base) + 2 * 3 * 64) \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
|
||||
// LOAD_CHIMU_0213
|
||||
#define LOAD_CHIMU_0213_A64FXf \
|
||||
{ \
|
||||
const SiteSpinor & ref(in[offset]); \
|
||||
asm ( \
|
||||
"ptrue p5.s \n\t" \
|
||||
"ldr z12, [%[fetchptr], -6, mul vl] \n\t" \
|
||||
"ldr z18, [%[fetchptr], 0, mul vl] \n\t" \
|
||||
"ldr z13, [%[fetchptr], -5, mul vl] \n\t" \
|
||||
"ldr z19, [%[fetchptr], 1, mul vl] \n\t" \
|
||||
"ldr z14, [%[fetchptr], -4, mul vl] \n\t" \
|
||||
"ldr z20, [%[fetchptr], 2, mul vl] \n\t" \
|
||||
"ldr z15, [%[fetchptr], -3, mul vl] \n\t" \
|
||||
"ldr z21, [%[fetchptr], 3, mul vl] \n\t" \
|
||||
"ldr z16, [%[fetchptr], -2, mul vl] \n\t" \
|
||||
"ldr z22, [%[fetchptr], 4, mul vl] \n\t" \
|
||||
"ldr z17, [%[fetchptr], -1, mul vl] \n\t" \
|
||||
"ldr z23, [%[fetchptr], 5, mul vl] \n\t" \
|
||||
: \
|
||||
: [fetchptr] "r" (&ref[2][0]) \
|
||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
|
||||
); \
|
||||
}
|
||||
// LOAD_CHIMU_0312
|
||||
#define LOAD_CHIMU_0312_A64FXf \
|
||||
{ \
|
||||
const SiteSpinor & ref(in[offset]); \
|
||||
asm ( \
|
||||
"ptrue p5.s \n\t" \
|
||||
"ldr z12, [%[fetchptr], -6, mul vl] \n\t" \
|
||||
"ldr z21, [%[fetchptr], 3, mul vl] \n\t" \
|
||||
"ldr z13, [%[fetchptr], -5, mul vl] \n\t" \
|
||||
"ldr z22, [%[fetchptr], 4, mul vl] \n\t" \
|
||||
"ldr z14, [%[fetchptr], -4, mul vl] \n\t" \
|
||||
"ldr z23, [%[fetchptr], 5, mul vl] \n\t" \
|
||||
"ldr z15, [%[fetchptr], -3, mul vl] \n\t" \
|
||||
"ldr z18, [%[fetchptr], 0, mul vl] \n\t" \
|
||||
"ldr z16, [%[fetchptr], -2, mul vl] \n\t" \
|
||||
"ldr z19, [%[fetchptr], 1, mul vl] \n\t" \
|
||||
"ldr z17, [%[fetchptr], -1, mul vl] \n\t" \
|
||||
"ldr z20, [%[fetchptr], 2, mul vl] \n\t" \
|
||||
: \
|
||||
: [fetchptr] "r" (&ref[2][0]) \
|
||||
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
|
||||
); \
|
||||
}
|
||||
// LOAD_TABLE0
|
||||
#define LOAD_TABLE0 \
|
||||
asm ( \
|
||||
"ldr z30, [%[tableptr], %[index], mul vl] \n\t" \
|
||||
: \
|
||||
: [tableptr] "r" (&lut[0]),[index] "i" (0) \
|
||||
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
|
||||
);
|
||||
|
||||
// LOAD_TABLE1
|
||||
#define LOAD_TABLE1 \
|
||||
asm ( \
|
||||
"ldr z30, [%[tableptr], %[index], mul vl] \n\t" \
|
||||
: \
|
||||
: [tableptr] "r" (&lut[0]),[index] "i" (1) \
|
||||
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
|
||||
);
|
||||
|
||||
// LOAD_TABLE2
|
||||
#define LOAD_TABLE2 \
|
||||
asm ( \
|
||||
"ldr z30, [%[tableptr], %[index], mul vl] \n\t" \
|
||||
: \
|
||||
: [tableptr] "r" (&lut[0]),[index] "i" (2) \
|
||||
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
|
||||
);
|
||||
|
||||
// LOAD_TABLE3
|
||||
#define LOAD_TABLE3 \
|
||||
asm ( \
|
||||
"ldr z30, [%[tableptr], %[index], mul vl] \n\t" \
|
||||
: \
|
||||
: [tableptr] "r" (&lut[0]),[index] "i" (3) \
|
||||
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
|
||||
);
|
||||
|
||||
// PERMUTE
// Rearranges the 32-bit lanes of z12..z17 in place with `tbl`, taking the
// source-lane indices from z30. z30 is not written here; it has to be loaded
// beforehand (the LOAD_TABLE0..3 macros write it). No C operands are used.
// The asm lists cc, p5 and z0..z31 as clobbers.
#define PERMUTE_A64FXf \
asm ( \
    "tbl z12.s, { z12.s }, z30.s \n\t" \
    "tbl z13.s, { z13.s }, z30.s \n\t" \
    "tbl z14.s, { z14.s }, z30.s \n\t" \
    "tbl z15.s, { z15.s }, z30.s \n\t" \
    "tbl z16.s, { z16.s }, z30.s \n\t" \
    "tbl z17.s, { z17.s }, z30.s \n\t" \
    : \
    : \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
|
||||
|
||||
// LOAD_GAUGE
// Binds `ref` to U[sU](A), stores its address in `baseU`, then sets p5 to
// all-true for .s lanes and loads six vectors relative to
// baseU + 2*3*64 bytes: z24/z25/z26 from vector offsets -6/-3/0 and
// z27/z28/z29 from vector offsets -5/-2/+1 (`mul vl`).
// NOTE(review): `A` is not a macro parameter, so it must name something in
// scope at the expansion site; `ref` and `baseU` are declared outside the
// braces and therefore land in the caller's scope -- confirm this is intended.
#define LOAD_GAUGE \
    const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
{ \
asm ( \
    "ptrue p5.s \n\t" \
    "ldr z24, [%[fetchptr], -6, mul vl] \n\t" \
    "ldr z25, [%[fetchptr], -3, mul vl] \n\t" \
    "ldr z26, [%[fetchptr], 0, mul vl] \n\t" \
    "ldr z27, [%[fetchptr], -5, mul vl] \n\t" \
    "ldr z28, [%[fetchptr], -2, mul vl] \n\t" \
    "ldr z29, [%[fetchptr], 1, mul vl] \n\t" \
    : \
    : [fetchptr] "r" (baseU + 2 * 3 * 64) \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
|
||||
// MULT_2SPIN
// Loads six vectors of U[sU](A) and starts the complex multiply-accumulate
// that MULT_2SPIN_2_A64FXf continues.
//  1. Loads, relative to (address of U[sU](A)) + 2*3*64 bytes, z24..z26 from
//     vector offsets -6,-3,0 and z27..z29 from vector offsets -5,-2,+1.
//  2. With FCMLA rotation 0 followed by rotation 90 (together one complex
//     multiply-add) forms
//        z18 = z31 + z24*z12     z21 = z31 + z24*z15
//        z19 = z31 + z25*z12     z22 = z31 + z25*z15
//        z20 = z31 + z26*z12     z23 = z31 + z26*z15
//     `movprfx` seeds each accumulator with z31; z31 is presumably zero at
//     this point -- TODO confirm where it is initialised.
//  3. Reloads z24..z26 from vector offsets -4,-1,+2.
// p5 is the governing predicate and is not set here (ZERO_PSI_A64FXf, for
// example, executes `ptrue p5.s`).
#define MULT_2SPIN_1_A64FXf(A) \
{ \
    const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
asm ( \
    "ldr z24, [%[fetchptr], -6, mul vl] \n\t" \
    "ldr z25, [%[fetchptr], -3, mul vl] \n\t" \
    "ldr z26, [%[fetchptr], 0, mul vl] \n\t" \
    "ldr z27, [%[fetchptr], -5, mul vl] \n\t" \
    "ldr z28, [%[fetchptr], -2, mul vl] \n\t" \
    "ldr z29, [%[fetchptr], 1, mul vl] \n\t" \
    "movprfx z18.s, p5/m, z31.s \n\t" \
    "fcmla z18.s, p5/m, z24.s, z12.s, 0 \n\t" \
    "movprfx z21.s, p5/m, z31.s \n\t" \
    "fcmla z21.s, p5/m, z24.s, z15.s, 0 \n\t" \
    "movprfx z19.s, p5/m, z31.s \n\t" \
    "fcmla z19.s, p5/m, z25.s, z12.s, 0 \n\t" \
    "movprfx z22.s, p5/m, z31.s \n\t" \
    "fcmla z22.s, p5/m, z25.s, z15.s, 0 \n\t" \
    "movprfx z20.s, p5/m, z31.s \n\t" \
    "fcmla z20.s, p5/m, z26.s, z12.s, 0 \n\t" \
    "movprfx z23.s, p5/m, z31.s \n\t" \
    "fcmla z23.s, p5/m, z26.s, z15.s, 0 \n\t" \
    "fcmla z18.s, p5/m, z24.s, z12.s, 90 \n\t" \
    "fcmla z21.s, p5/m, z24.s, z15.s, 90 \n\t" \
    "fcmla z19.s, p5/m, z25.s, z12.s, 90 \n\t" \
    "fcmla z22.s, p5/m, z25.s, z15.s, 90 \n\t" \
    "fcmla z20.s, p5/m, z26.s, z12.s, 90 \n\t" \
    "fcmla z23.s, p5/m, z26.s, z15.s, 90 \n\t" \
    "ldr z24, [%[fetchptr], -4, mul vl] \n\t" \
    "ldr z25, [%[fetchptr], -1, mul vl] \n\t" \
    "ldr z26, [%[fetchptr], 2, mul vl] \n\t" \
    : \
    : [fetchptr] "r" (baseU + 2 * 3 * 64) \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
|
||||
// MULT_2SPIN_BACKEND
// Continues the accumulation into z18..z23 without touching memory. Using
// FCMLA rotation 0 followed by rotation 90 (one complex multiply-add each):
//    z18 += z27*z13 + z24*z14     z21 += z27*z16 + z24*z17
//    z19 += z28*z13 + z25*z14     z22 += z28*z16 + z25*z17
//    z20 += z29*z13 + z26*z14     z23 += z29*z16 + z26*z17
// Reads z24..z29 and the predicate p5 as left by earlier code (for example
// MULT_2SPIN_1_A64FXf loads z24..z29); nothing is loaded or set here.
#define MULT_2SPIN_2_A64FXf \
{ \
asm ( \
    "fcmla z18.s, p5/m, z27.s, z13.s, 0 \n\t" \
    "fcmla z21.s, p5/m, z27.s, z16.s, 0 \n\t" \
    "fcmla z19.s, p5/m, z28.s, z13.s, 0 \n\t" \
    "fcmla z22.s, p5/m, z28.s, z16.s, 0 \n\t" \
    "fcmla z20.s, p5/m, z29.s, z13.s, 0 \n\t" \
    "fcmla z23.s, p5/m, z29.s, z16.s, 0 \n\t" \
    "fcmla z18.s, p5/m, z27.s, z13.s, 90 \n\t" \
    "fcmla z21.s, p5/m, z27.s, z16.s, 90 \n\t" \
    "fcmla z19.s, p5/m, z28.s, z13.s, 90 \n\t" \
    "fcmla z22.s, p5/m, z28.s, z16.s, 90 \n\t" \
    "fcmla z20.s, p5/m, z29.s, z13.s, 90 \n\t" \
    "fcmla z23.s, p5/m, z29.s, z16.s, 90 \n\t" \
    "fcmla z18.s, p5/m, z24.s, z14.s, 0 \n\t" \
    "fcmla z21.s, p5/m, z24.s, z17.s, 0 \n\t" \
    "fcmla z19.s, p5/m, z25.s, z14.s, 0 \n\t" \
    "fcmla z22.s, p5/m, z25.s, z17.s, 0 \n\t" \
    "fcmla z20.s, p5/m, z26.s, z14.s, 0 \n\t" \
    "fcmla z23.s, p5/m, z26.s, z17.s, 0 \n\t" \
    "fcmla z18.s, p5/m, z24.s, z14.s, 90 \n\t" \
    "fcmla z21.s, p5/m, z24.s, z17.s, 90 \n\t" \
    "fcmla z19.s, p5/m, z25.s, z14.s, 90 \n\t" \
    "fcmla z22.s, p5/m, z25.s, z17.s, 90 \n\t" \
    "fcmla z20.s, p5/m, z26.s, z14.s, 90 \n\t" \
    "fcmla z23.s, p5/m, z26.s, z17.s, 90 \n\t" \
    : \
    : \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
|
||||
// XP_PROJ
// In-place complex add with rotation 90 (FCADD #90 adds i times the second
// operand), under predicate p5:
//    z12 += i*z21   z13 += i*z22   z14 += i*z23
//    z15 += i*z18   z16 += i*z19   z17 += i*z20
// Works on registers only; p5 and z12..z23 must already hold their values.
#define XP_PROJ_A64FXf \
{ \
asm ( \
    "fcadd z12.s, p5/m, z12.s, z21.s, 90 \n\t" \
    "fcadd z13.s, p5/m, z13.s, z22.s, 90 \n\t" \
    "fcadd z14.s, p5/m, z14.s, z23.s, 90 \n\t" \
    "fcadd z15.s, p5/m, z15.s, z18.s, 90 \n\t" \
    "fcadd z16.s, p5/m, z16.s, z19.s, 90 \n\t" \
    "fcadd z17.s, p5/m, z17.s, z20.s, 90 \n\t" \
    : \
    : \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
|
||||
// XP_RECON
// Initialises z0..z11 from z18..z23, under predicate p5:
//    z6  = z31 - i*z21   z7  = z31 - i*z22   z8  = z31 - i*z23
//    z9  = z31 - i*z18   z10 = z31 - i*z19   z11 = z31 - i*z20
//    z0..z5 = z18..z23   (predicated mov)
// Each `movprfx` seeds the destination with z31 before the FCADD #270
// (which subtracts i times the second operand); z31 is presumably zero --
// TODO confirm where it is initialised.
#define XP_RECON_A64FXf \
asm ( \
    "movprfx z6.s, p5/m, z31.s \n\t" \
    "fcadd z6.s, p5/m, z6.s, z21.s, 270 \n\t" \
    "movprfx z7.s, p5/m, z31.s \n\t" \
    "fcadd z7.s, p5/m, z7.s, z22.s, 270 \n\t" \
    "movprfx z8.s, p5/m, z31.s \n\t" \
    "fcadd z8.s, p5/m, z8.s, z23.s, 270 \n\t" \
    "movprfx z9.s, p5/m, z31.s \n\t" \
    "fcadd z9.s, p5/m, z9.s, z18.s, 270 \n\t" \
    "movprfx z10.s, p5/m, z31.s \n\t" \
    "fcadd z10.s, p5/m, z10.s, z19.s, 270 \n\t" \
    "movprfx z11.s, p5/m, z31.s \n\t" \
    "fcadd z11.s, p5/m, z11.s, z20.s, 270 \n\t" \
    "mov z0.s, p5/m, z18.s \n\t" \
    "mov z1.s, p5/m, z19.s \n\t" \
    "mov z2.s, p5/m, z20.s \n\t" \
    "mov z3.s, p5/m, z21.s \n\t" \
    "mov z4.s, p5/m, z22.s \n\t" \
    "mov z5.s, p5/m, z23.s \n\t" \
    : \
    : \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
|
||||
|
||||
// XP_RECON_ACCUM
// Accumulates z18..z23 into z0..z11, under predicate p5:
//    z0..z5  += z18..z23
//    z9  -= i*z18   z10 -= i*z19   z11 -= i*z20
//    z6  -= i*z21   z7  -= i*z22   z8  -= i*z23
// (FCADD #270 subtracts i times the second operand.) Registers only.
#define XP_RECON_ACCUM_A64FXf \
asm ( \
    "fcadd z9.s, p5/m, z9.s, z18.s, 270 \n\t" \
    "fadd z0.s, p5/m, z0.s, z18.s \n\t" \
    "fcadd z10.s, p5/m, z10.s, z19.s, 270 \n\t" \
    "fadd z1.s, p5/m, z1.s, z19.s \n\t" \
    "fcadd z11.s, p5/m, z11.s, z20.s, 270 \n\t" \
    "fadd z2.s, p5/m, z2.s, z20.s \n\t" \
    "fcadd z6.s, p5/m, z6.s, z21.s, 270 \n\t" \
    "fadd z3.s, p5/m, z3.s, z21.s \n\t" \
    "fcadd z7.s, p5/m, z7.s, z22.s, 270 \n\t" \
    "fadd z4.s, p5/m, z4.s, z22.s \n\t" \
    "fcadd z8.s, p5/m, z8.s, z23.s, 270 \n\t" \
    "fadd z5.s, p5/m, z5.s, z23.s \n\t" \
    : \
    : \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
|
||||
|
||||
// YP_PROJ
// In-place element-wise add/subtract under predicate p5:
//    z12 -= z21   z13 -= z22   z14 -= z23
//    z15 += z18   z16 += z19   z17 += z20
// Works on registers only; p5 and z12..z23 must already hold their values.
#define YP_PROJ_A64FXf \
{ \
asm ( \
    "fsub z12.s, p5/m, z12.s, z21.s \n\t" \
    "fsub z13.s, p5/m, z13.s, z22.s \n\t" \
    "fsub z14.s, p5/m, z14.s, z23.s \n\t" \
    "fadd z15.s, p5/m, z15.s, z18.s \n\t" \
    "fadd z16.s, p5/m, z16.s, z19.s \n\t" \
    "fadd z17.s, p5/m, z17.s, z20.s \n\t" \
    : \
    : \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
|
||||
// ZP_PROJ
// In-place complex add with rotation under predicate p5 (FCADD #90 adds,
// #270 subtracts, i times the second operand):
//    z12 += i*z18   z13 += i*z19   z14 += i*z20
//    z15 -= i*z21   z16 -= i*z22   z17 -= i*z23
// Works on registers only; p5 and z12..z23 must already hold their values.
#define ZP_PROJ_A64FXf \
{ \
asm ( \
    "fcadd z12.s, p5/m, z12.s, z18.s, 90 \n\t" \
    "fcadd z13.s, p5/m, z13.s, z19.s, 90 \n\t" \
    "fcadd z14.s, p5/m, z14.s, z20.s, 90 \n\t" \
    "fcadd z15.s, p5/m, z15.s, z21.s, 270 \n\t" \
    "fcadd z16.s, p5/m, z16.s, z22.s, 270 \n\t" \
    "fcadd z17.s, p5/m, z17.s, z23.s, 270 \n\t" \
    : \
    : \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
|
||||
// TP_PROJ
// In-place element-wise add under predicate p5:
//    z12 += z18   z13 += z19   z14 += z20
//    z15 += z21   z16 += z22   z17 += z23
// Works on registers only; p5 and z12..z23 must already hold their values.
#define TP_PROJ_A64FXf \
{ \
asm ( \
    "fadd z12.s, p5/m, z12.s, z18.s \n\t" \
    "fadd z13.s, p5/m, z13.s, z19.s \n\t" \
    "fadd z14.s, p5/m, z14.s, z20.s \n\t" \
    "fadd z15.s, p5/m, z15.s, z21.s \n\t" \
    "fadd z16.s, p5/m, z16.s, z22.s \n\t" \
    "fadd z17.s, p5/m, z17.s, z23.s \n\t" \
    : \
    : \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
|
||||
// XM_PROJ
// In-place complex add with rotation 270 (FCADD #270 subtracts i times the
// second operand), under predicate p5:
//    z12 -= i*z21   z13 -= i*z22   z14 -= i*z23
//    z15 -= i*z18   z16 -= i*z19   z17 -= i*z20
// Works on registers only; p5 and z12..z23 must already hold their values.
#define XM_PROJ_A64FXf \
{ \
asm ( \
    "fcadd z12.s, p5/m, z12.s, z21.s, 270 \n\t" \
    "fcadd z13.s, p5/m, z13.s, z22.s, 270 \n\t" \
    "fcadd z14.s, p5/m, z14.s, z23.s, 270 \n\t" \
    "fcadd z15.s, p5/m, z15.s, z18.s, 270 \n\t" \
    "fcadd z16.s, p5/m, z16.s, z19.s, 270 \n\t" \
    "fcadd z17.s, p5/m, z17.s, z20.s, 270 \n\t" \
    : \
    : \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
|
||||
// XM_RECON
// Initialises z0..z11 from z18..z23, under predicate p5:
//    z6  = z31 + i*z21   z7  = z31 + i*z22   z8  = z31 + i*z23
//    z9  = z31 + i*z18   z10 = z31 + i*z19   z11 = z31 + i*z20
//    z0..z5 = z18..z23   (predicated mov)
// Each `movprfx` seeds the destination with z31 before the FCADD #90
// (which adds i times the second operand); z31 is presumably zero --
// TODO confirm where it is initialised.
#define XM_RECON_A64FXf \
asm ( \
    "movprfx z6.s, p5/m, z31.s \n\t" \
    "fcadd z6.s, p5/m, z6.s, z21.s, 90 \n\t" \
    "movprfx z7.s, p5/m, z31.s \n\t" \
    "fcadd z7.s, p5/m, z7.s, z22.s, 90 \n\t" \
    "movprfx z8.s, p5/m, z31.s \n\t" \
    "fcadd z8.s, p5/m, z8.s, z23.s, 90 \n\t" \
    "movprfx z9.s, p5/m, z31.s \n\t" \
    "fcadd z9.s, p5/m, z9.s, z18.s, 90 \n\t" \
    "movprfx z10.s, p5/m, z31.s \n\t" \
    "fcadd z10.s, p5/m, z10.s, z19.s, 90 \n\t" \
    "movprfx z11.s, p5/m, z31.s \n\t" \
    "fcadd z11.s, p5/m, z11.s, z20.s, 90 \n\t" \
    "mov z0.s, p5/m, z18.s \n\t" \
    "mov z1.s, p5/m, z19.s \n\t" \
    "mov z2.s, p5/m, z20.s \n\t" \
    "mov z3.s, p5/m, z21.s \n\t" \
    "mov z4.s, p5/m, z22.s \n\t" \
    "mov z5.s, p5/m, z23.s \n\t" \
    : \
    : \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
|
||||
|
||||
// YM_PROJ
// In-place element-wise add/subtract under predicate p5:
//    z12 += z21   z13 += z22   z14 += z23
//    z15 -= z18   z16 -= z19   z17 -= z20
// Works on registers only; p5 and z12..z23 must already hold their values.
#define YM_PROJ_A64FXf \
{ \
asm ( \
    "fadd z12.s, p5/m, z12.s, z21.s \n\t" \
    "fadd z13.s, p5/m, z13.s, z22.s \n\t" \
    "fadd z14.s, p5/m, z14.s, z23.s \n\t" \
    "fsub z15.s, p5/m, z15.s, z18.s \n\t" \
    "fsub z16.s, p5/m, z16.s, z19.s \n\t" \
    "fsub z17.s, p5/m, z17.s, z20.s \n\t" \
    : \
    : \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
|
||||
// ZM_PROJ
// In-place complex add with rotation under predicate p5 (FCADD #90 adds,
// #270 subtracts, i times the second operand):
//    z12 -= i*z18   z13 -= i*z19   z14 -= i*z20
//    z15 += i*z21   z16 += i*z22   z17 += i*z23
// Works on registers only; p5 and z12..z23 must already hold their values.
#define ZM_PROJ_A64FXf \
{ \
asm ( \
    "fcadd z12.s, p5/m, z12.s, z18.s, 270 \n\t" \
    "fcadd z13.s, p5/m, z13.s, z19.s, 270 \n\t" \
    "fcadd z14.s, p5/m, z14.s, z20.s, 270 \n\t" \
    "fcadd z15.s, p5/m, z15.s, z21.s, 90 \n\t" \
    "fcadd z16.s, p5/m, z16.s, z22.s, 90 \n\t" \
    "fcadd z17.s, p5/m, z17.s, z23.s, 90 \n\t" \
    : \
    : \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
|
||||
// TM_PROJ
// Sets p5 to all-true for .s lanes, then subtracts element-wise in place:
//    z12 -= z18   z13 -= z19   z14 -= z20
//    z15 -= z21   z16 -= z22   z17 -= z23
// Unlike the other *_PROJ macros in this file, this one re-executes
// `ptrue p5.s` itself. Works on registers only.
#define TM_PROJ_A64FXf \
{ \
asm ( \
    "ptrue p5.s \n\t" \
    "fsub z12.s, p5/m, z12.s, z18.s \n\t" \
    "fsub z13.s, p5/m, z13.s, z19.s \n\t" \
    "fsub z14.s, p5/m, z14.s, z20.s \n\t" \
    "fsub z15.s, p5/m, z15.s, z21.s \n\t" \
    "fsub z16.s, p5/m, z16.s, z22.s \n\t" \
    "fsub z17.s, p5/m, z17.s, z23.s \n\t" \
    : \
    : \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
|
||||
// XM_RECON_ACCUM
// Accumulates z18..z23 into z0..z11, under predicate p5:
//    z9  += i*z18   z10 += i*z19   z11 += i*z20
//    z6  += i*z21   z7  += i*z22   z8  += i*z23
//    z0..z5 += z18..z23
// (FCADD #90 adds i times the second operand.) Registers only.
#define XM_RECON_ACCUM_A64FXf \
asm ( \
    "fcadd z9.s, p5/m, z9.s, z18.s, 90 \n\t" \
    "fcadd z10.s, p5/m, z10.s, z19.s, 90 \n\t" \
    "fcadd z11.s, p5/m, z11.s, z20.s, 90 \n\t" \
    "fcadd z6.s, p5/m, z6.s, z21.s, 90 \n\t" \
    "fcadd z7.s, p5/m, z7.s, z22.s, 90 \n\t" \
    "fcadd z8.s, p5/m, z8.s, z23.s, 90 \n\t" \
    "fadd z0.s, p5/m, z0.s, z18.s \n\t" \
    "fadd z1.s, p5/m, z1.s, z19.s \n\t" \
    "fadd z2.s, p5/m, z2.s, z20.s \n\t" \
    "fadd z3.s, p5/m, z3.s, z21.s \n\t" \
    "fadd z4.s, p5/m, z4.s, z22.s \n\t" \
    "fadd z5.s, p5/m, z5.s, z23.s \n\t" \
    : \
    : \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
|
||||
|
||||
// YP_RECON_ACCUM
// Accumulates z18..z23 into z0..z11 element-wise, under predicate p5:
//    z0..z5  += z18..z23
//    z9  -= z18   z10 -= z19   z11 -= z20
//    z6  += z21   z7  += z22   z8  += z23
// Works on registers only.
#define YP_RECON_ACCUM_A64FXf \
asm ( \
    "fadd z0.s, p5/m, z0.s, z18.s \n\t" \
    "fsub z9.s, p5/m, z9.s, z18.s \n\t" \
    "fadd z1.s, p5/m, z1.s, z19.s \n\t" \
    "fsub z10.s, p5/m, z10.s, z19.s \n\t" \
    "fadd z2.s, p5/m, z2.s, z20.s \n\t" \
    "fsub z11.s, p5/m, z11.s, z20.s \n\t" \
    "fadd z3.s, p5/m, z3.s, z21.s \n\t" \
    "fadd z6.s, p5/m, z6.s, z21.s \n\t" \
    "fadd z4.s, p5/m, z4.s, z22.s \n\t" \
    "fadd z7.s, p5/m, z7.s, z22.s \n\t" \
    "fadd z5.s, p5/m, z5.s, z23.s \n\t" \
    "fadd z8.s, p5/m, z8.s, z23.s \n\t" \
    : \
    : \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
|
||||
|
||||
// YM_RECON_ACCUM
// Accumulates z18..z23 into z0..z11 element-wise, under predicate p5:
//    z0..z5  += z18..z23
//    z9  += z18   z10 += z19   z11 += z20
//    z6  -= z21   z7  -= z22   z8  -= z23
// Works on registers only.
#define YM_RECON_ACCUM_A64FXf \
asm ( \
    "fadd z0.s, p5/m, z0.s, z18.s \n\t" \
    "fadd z9.s, p5/m, z9.s, z18.s \n\t" \
    "fadd z1.s, p5/m, z1.s, z19.s \n\t" \
    "fadd z10.s, p5/m, z10.s, z19.s \n\t" \
    "fadd z2.s, p5/m, z2.s, z20.s \n\t" \
    "fadd z11.s, p5/m, z11.s, z20.s \n\t" \
    "fadd z3.s, p5/m, z3.s, z21.s \n\t" \
    "fsub z6.s, p5/m, z6.s, z21.s \n\t" \
    "fadd z4.s, p5/m, z4.s, z22.s \n\t" \
    "fsub z7.s, p5/m, z7.s, z22.s \n\t" \
    "fadd z5.s, p5/m, z5.s, z23.s \n\t" \
    "fsub z8.s, p5/m, z8.s, z23.s \n\t" \
    : \
    : \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
|
||||
|
||||
// ZP_RECON_ACCUM
// Accumulates z18..z23 into z0..z11, under predicate p5:
//    z0..z5  += z18..z23
//    z6  -= i*z18   z7  -= i*z19   z8  -= i*z20
//    z9  += i*z21   z10 += i*z22   z11 += i*z23
// (FCADD #90 adds, #270 subtracts, i times the second operand.)
// Works on registers only.
#define ZP_RECON_ACCUM_A64FXf \
asm ( \
    "fcadd z6.s, p5/m, z6.s, z18.s, 270 \n\t" \
    "fadd z0.s, p5/m, z0.s, z18.s \n\t" \
    "fcadd z7.s, p5/m, z7.s, z19.s, 270 \n\t" \
    "fadd z1.s, p5/m, z1.s, z19.s \n\t" \
    "fcadd z8.s, p5/m, z8.s, z20.s, 270 \n\t" \
    "fadd z2.s, p5/m, z2.s, z20.s \n\t" \
    "fcadd z9.s, p5/m, z9.s, z21.s, 90 \n\t" \
    "fadd z3.s, p5/m, z3.s, z21.s \n\t" \
    "fcadd z10.s, p5/m, z10.s, z22.s, 90 \n\t" \
    "fadd z4.s, p5/m, z4.s, z22.s \n\t" \
    "fcadd z11.s, p5/m, z11.s, z23.s, 90 \n\t" \
    "fadd z5.s, p5/m, z5.s, z23.s \n\t" \
    : \
    : \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
|
||||
|
||||
// ZM_RECON_ACCUM
// Accumulates z18..z23 into z0..z11, under predicate p5:
//    z0..z5  += z18..z23
//    z6  += i*z18   z7  += i*z19   z8  += i*z20
//    z9  -= i*z21   z10 -= i*z22   z11 -= i*z23
// (FCADD #90 adds, #270 subtracts, i times the second operand.)
// Works on registers only.
#define ZM_RECON_ACCUM_A64FXf \
asm ( \
    "fcadd z6.s, p5/m, z6.s, z18.s, 90 \n\t" \
    "fadd z0.s, p5/m, z0.s, z18.s \n\t" \
    "fcadd z7.s, p5/m, z7.s, z19.s, 90 \n\t" \
    "fadd z1.s, p5/m, z1.s, z19.s \n\t" \
    "fcadd z8.s, p5/m, z8.s, z20.s, 90 \n\t" \
    "fadd z2.s, p5/m, z2.s, z20.s \n\t" \
    "fcadd z9.s, p5/m, z9.s, z21.s, 270 \n\t" \
    "fadd z3.s, p5/m, z3.s, z21.s \n\t" \
    "fcadd z10.s, p5/m, z10.s, z22.s, 270 \n\t" \
    "fadd z4.s, p5/m, z4.s, z22.s \n\t" \
    "fcadd z11.s, p5/m, z11.s, z23.s, 270 \n\t" \
    "fadd z5.s, p5/m, z5.s, z23.s \n\t" \
    : \
    : \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
|
||||
|
||||
// TP_RECON_ACCUM
// Accumulates z18..z23 into z0..z11 element-wise, under predicate p5:
//    z0..z5  += z18..z23
//    z6  += z18   z7  += z19   z8  += z20
//    z9  += z21   z10 += z22   z11 += z23
// Works on registers only.
#define TP_RECON_ACCUM_A64FXf \
asm ( \
    "fadd z0.s, p5/m, z0.s, z18.s \n\t" \
    "fadd z6.s, p5/m, z6.s, z18.s \n\t" \
    "fadd z1.s, p5/m, z1.s, z19.s \n\t" \
    "fadd z7.s, p5/m, z7.s, z19.s \n\t" \
    "fadd z2.s, p5/m, z2.s, z20.s \n\t" \
    "fadd z8.s, p5/m, z8.s, z20.s \n\t" \
    "fadd z3.s, p5/m, z3.s, z21.s \n\t" \
    "fadd z9.s, p5/m, z9.s, z21.s \n\t" \
    "fadd z4.s, p5/m, z4.s, z22.s \n\t" \
    "fadd z10.s, p5/m, z10.s, z22.s \n\t" \
    "fadd z5.s, p5/m, z5.s, z23.s \n\t" \
    "fadd z11.s, p5/m, z11.s, z23.s \n\t" \
    : \
    : \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
|
||||
|
||||
// TM_RECON_ACCUM
// Accumulates z18..z23 into z0..z11 element-wise, under predicate p5:
//    z0..z5  += z18..z23
//    z6  -= z18   z7  -= z19   z8  -= z20
//    z9  -= z21   z10 -= z22   z11 -= z23
// Works on registers only.
#define TM_RECON_ACCUM_A64FXf \
asm ( \
    "fadd z0.s, p5/m, z0.s, z18.s \n\t" \
    "fsub z6.s, p5/m, z6.s, z18.s \n\t" \
    "fadd z1.s, p5/m, z1.s, z19.s \n\t" \
    "fsub z7.s, p5/m, z7.s, z19.s \n\t" \
    "fadd z2.s, p5/m, z2.s, z20.s \n\t" \
    "fsub z8.s, p5/m, z8.s, z20.s \n\t" \
    "fadd z3.s, p5/m, z3.s, z21.s \n\t" \
    "fsub z9.s, p5/m, z9.s, z21.s \n\t" \
    "fadd z4.s, p5/m, z4.s, z22.s \n\t" \
    "fsub z10.s, p5/m, z10.s, z22.s \n\t" \
    "fadd z5.s, p5/m, z5.s, z23.s \n\t" \
    "fsub z11.s, p5/m, z11.s, z23.s \n\t" \
    : \
    : \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
|
||||
|
||||
// ZERO_PSI
// Sets p5 to all-true for .s lanes and writes floating-point zero to every
// lane of z0..z11. No C operands are used.
// The asm lists cc, p5 and z0..z31 as clobbers.
#define ZERO_PSI_A64FXf \
asm ( \
    "ptrue p5.s \n\t" \
    "fmov z0.s , 0 \n\t" \
    "fmov z1.s , 0 \n\t" \
    "fmov z2.s , 0 \n\t" \
    "fmov z3.s , 0 \n\t" \
    "fmov z4.s , 0 \n\t" \
    "fmov z5.s , 0 \n\t" \
    "fmov z6.s , 0 \n\t" \
    "fmov z7.s , 0 \n\t" \
    "fmov z8.s , 0 \n\t" \
    "fmov z9.s , 0 \n\t" \
    "fmov z10.s , 0 \n\t" \
    "fmov z11.s , 0 \n\t" \
    : \
    : \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
|
||||
|
||||
// PREFETCH_RESULT_L2_STORE (prefetch store to L2)
// Issues three `prfd PSTL2STRM` prefetch hints, governed by p5, at `base`
// plus 0, 4 and 8 vector lengths (`mul vl`).
//   base - address expression passed to the asm in a general register.
// p5 is not set here and must already hold the intended predicate.
#define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXf(base) \
{ \
asm ( \
    "prfd PSTL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
    "prfd PSTL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
    "prfd PSTL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
    : \
    : [fetchptr] "r" (base) \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
|
||||
// PREFETCH_RESULT_L1_STORE (prefetch store to L1)
// Issues three `prfd PSTL1STRM` prefetch hints, governed by p5, at `base`
// plus 0, 4 and 8 vector lengths (`mul vl`).
//   base - address expression passed to the asm in a general register.
// p5 is not set here and must already hold the intended predicate.
#define PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXf(base) \
{ \
asm ( \
    "prfd PSTL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
    "prfd PSTL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
    "prfd PSTL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
    : \
    : [fetchptr] "r" (base) \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
|
||||
// ADD_RESULT_INTERNAL
// Element-wise, under predicate p5, adds z12..z23 into z0..z11:
//    z0 += z12, z1 += z13, ... , z11 += z23
// Works on registers only; p5 and all twenty-four registers must already
// hold their values.
#define ADD_RESULT_INTERNAL_A64FXf \
asm ( \
    "fadd z0.s, p5/m, z0.s, z12.s \n\t" \
    "fadd z1.s, p5/m, z1.s, z13.s \n\t" \
    "fadd z2.s, p5/m, z2.s, z14.s \n\t" \
    "fadd z3.s, p5/m, z3.s, z15.s \n\t" \
    "fadd z4.s, p5/m, z4.s, z16.s \n\t" \
    "fadd z5.s, p5/m, z5.s, z17.s \n\t" \
    "fadd z6.s, p5/m, z6.s, z18.s \n\t" \
    "fadd z7.s, p5/m, z7.s, z19.s \n\t" \
    "fadd z8.s, p5/m, z8.s, z20.s \n\t" \
    "fadd z9.s, p5/m, z9.s, z21.s \n\t" \
    "fadd z10.s, p5/m, z10.s, z22.s \n\t" \
    "fadd z11.s, p5/m, z11.s, z23.s \n\t" \
    : \
    : \
    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
|
||||
|
@ -38,10 +38,11 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
// Generic kernel names mapped onto the A64FXd implementations.
// LOCK_GAUGE / UNLOCK_GAUGE expand to nothing.
// SAVE_RESULT only stores the result; its second argument is accepted but
// unused. An earlier definition that also expanded PREFETCH_RESULT_L2_STORE(B)
// was superseded by this one and is no longer listed, because defining the
// same macro twice with different bodies is ill-formed.
#define LOCK_GAUGE(A)
#define UNLOCK_GAUGE(A)
#define MASK_REGS DECLARATIONS_A64FXd
#define SAVE_RESULT(A,B) RESULT_A64FXd(A);
#define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXd(Dir)
#define MULT_2SPIN_2 MULT_2SPIN_2_A64FXd
#define LOAD_CHI(base) LOAD_CHI_A64FXd(base)
#define ZERO_PSI ZERO_PSI_A64FXd
// ADD_RESULT: load via LOAD_CHIMU(base), add, then store back to `base`;
// `basep` is accepted but unused.
#define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base)
#define XP_PROJ XP_PROJ_A64FXd
#define YP_PROJ YP_PROJ_A64FXd
|
||||
@ -70,6 +71,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
// MAYBEPERM(Dir, perm): expands PERMUTE when `perm` is true and `Dir` is not 3.
// `Dir` is parenthesised so that an expression argument such as `d & 3` is
// compared as a whole; unparenthesised, `d & 3 != 3` would parse as
// `d & (3 != 3)`. PERMUTE must be defined at the expansion site.
#define MAYBEPERM(Dir,perm) if ((Dir) != 3) { if (perm) { PERMUTE; } }
|
||||
// DECLARATIONS
|
||||
#define DECLARATIONS_A64FXd \
|
||||
uint64_t baseU; \
|
||||
const uint64_t lut[4][8] = { \
|
||||
{4, 5, 6, 7, 0, 1, 2, 3}, \
|
||||
{2, 3, 0, 1, 6, 7, 4, 5}, \
|
||||
@ -126,114 +128,114 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
// RESULT
// Stores the twelve vectors result_00 .. result_32 with svst1_vnum under
// predicate pg1. The base pointer is `base` + 2*3*64 bytes and the vector
// index runs from -6 (result_00) to +5 (result_32), so offsets scale with
// the hardware vector length.
//   base - integer address expression; pg1 and the result_* variables must
//          be in scope at the expansion site.
// The earlier fixed-byte-offset svst1 form of each store was superseded by
// these svst1_vnum calls and is no longer listed alongside them.
#define RESULT_A64FXd(base) \
{ \
    svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-6), result_00); \
    svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-5), result_01); \
    svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-4), result_02); \
    svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-3), result_10); \
    svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-2), result_11); \
    svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-1), result_12); \
    svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(0), result_20); \
    svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(1), result_21); \
    svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(2), result_22); \
    svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(3), result_30); \
    svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(4), result_31); \
    svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(5), result_32); \
}
|
||||
// PREFETCH_CHIMU_L2 (prefetch to L2)
// Issues three svprfd_vnum prefetches with hint SV_PLDL2STRM under predicate
// pg1, at `base` plus 0, 4 and 8 vector lengths.
//   base - integer address expression; pg1 must be in scope.
// The earlier svprfd form with fixed byte offsets (0/256/512) was superseded
// by these calls and is no longer listed alongside them.
#define PREFETCH_CHIMU_L2_INTERNAL_A64FXd(base) \
{ \
    svprfd_vnum(pg1, (void*)(base), (int64_t)(0), SV_PLDL2STRM); \
    svprfd_vnum(pg1, (void*)(base), (int64_t)(4), SV_PLDL2STRM); \
    svprfd_vnum(pg1, (void*)(base), (int64_t)(8), SV_PLDL2STRM); \
}
|
||||
// PREFETCH_CHIMU_L1 (prefetch to L1)
// Issues three svprfd_vnum prefetches with hint SV_PLDL1STRM under predicate
// pg1, at `base` plus 0, 4 and 8 vector lengths.
//   base - integer address expression; pg1 must be in scope.
// The earlier svprfd form with fixed byte offsets (0/256/512) was superseded
// by these calls and is no longer listed alongside them.
#define PREFETCH_CHIMU_L1_INTERNAL_A64FXd(base) \
{ \
    svprfd_vnum(pg1, (void*)(base), (int64_t)(0), SV_PLDL1STRM); \
    svprfd_vnum(pg1, (void*)(base), (int64_t)(4), SV_PLDL1STRM); \
    svprfd_vnum(pg1, (void*)(base), (int64_t)(8), SV_PLDL1STRM); \
}
|
||||
// PREFETCH_GAUGE_L2 (prefetch to L2)
// Binds `ref` to U[sUn](A), sets baseU to its address plus 3*3*64 bytes, and
// issues nine svprfd_vnum prefetches with hint SV_PLDL2STRM under pg1 at
// vector offsets -4, 0, 4, ..., 28 from baseU.
//   A - argument forwarded to U[sUn](...).
// `baseU` is assigned, not declared, here: it must already be declared at
// the expansion site (DECLARATIONS_A64FXd declares `uint64_t baseU;`).
// The earlier form, which declared a local baseU and used fixed byte
// offsets, was superseded; keeping both would declare `ref` twice in one
// scope.
#define PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A) \
{ \
    const auto & ref(U[sUn](A)); baseU = (uint64_t)&ref + 3 * 3 * 64; \
    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(-4), SV_PLDL2STRM); \
    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(0), SV_PLDL2STRM); \
    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(4), SV_PLDL2STRM); \
    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(8), SV_PLDL2STRM); \
    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(12), SV_PLDL2STRM); \
    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(16), SV_PLDL2STRM); \
    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(20), SV_PLDL2STRM); \
    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(24), SV_PLDL2STRM); \
    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(28), SV_PLDL2STRM); \
}
|
||||
// PREFETCH_GAUGE_L1 (prefetch to L1)
// Binds `ref` to U[sU](A), sets baseU to its address, and issues three
// svprfd_vnum prefetches with hint SV_PLDL1STRM under pg1 at vector offsets
// 0, 4 and 8 from baseU.
//   A - argument forwarded to U[sU](...).
// `baseU` is assigned, not declared, here: it must already be declared at
// the expansion site (DECLARATIONS_A64FXd declares `uint64_t baseU;`).
// The earlier form, which declared a local baseU and used fixed byte
// offsets, was superseded; keeping both would declare `ref` twice in one
// scope.
#define PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A) \
{ \
    const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \
    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(0), SV_PLDL1STRM); \
    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(4), SV_PLDL1STRM); \
    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(8), SV_PLDL1STRM); \
}
|
||||
// LOAD_CHI
// Loads six consecutive vectors from `base` into Chi_00, Chi_01, Chi_02,
// Chi_10, Chi_11, Chi_12 (vector indices 0..5) with svld1_vnum under pg1.
//   base - integer address expression; pg1 and the Chi_* variables must be
//          in scope at the expansion site.
// The earlier svld1 form with fixed `k * 64` byte offsets was superseded by
// these calls and is no longer listed alongside them.
#define LOAD_CHI_A64FXd(base) \
{ \
    Chi_00 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(0)); \
    Chi_01 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(1)); \
    Chi_02 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(2)); \
    Chi_10 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(3)); \
    Chi_11 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(4)); \
    Chi_12 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(5)); \
}
|
||||
// LOAD_CHIMU
// Loads the twelve vectors Chimu_00 .. Chimu_32 with svld1_vnum under pg1.
// The base pointer is `base` + 2*3*64 bytes; Chimu_sc is read from vector
// index 3*s + c - 6 (so -6 for Chimu_00 up to +5 for Chimu_32). The loads
// are issued in the order 00,30,10,20, 01,31,11,21, 02,32,12,22.
//   base - integer address expression; pg1 and the Chimu_* variables must
//          be in scope at the expansion site.
// The earlier svld1 form with fixed byte offsets was superseded by these
// calls and is no longer listed alongside them.
#define LOAD_CHIMU_INTERLEAVED_A64FXd(base) \
{ \
    Chimu_00 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-6)); \
    Chimu_30 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(3)); \
    Chimu_10 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-3)); \
    Chimu_20 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(0)); \
    Chimu_01 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-5)); \
    Chimu_31 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(4)); \
    Chimu_11 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-2)); \
    Chimu_21 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(1)); \
    Chimu_02 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-4)); \
    Chimu_32 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(5)); \
    Chimu_12 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-1)); \
    Chimu_22 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(2)); \
}
|
||||
// LOAD_CHIMU_0213
// Loads the twelve vectors Chimu_00 .. Chimu_32 with svld1_vnum under pg1,
// from `base` + 2*3*64 bytes at vector index 3*s + c - 6 for Chimu_sc. The
// loads are issued in the order 00,20,01,21,02,22, 10,30,11,31,12,32.
// Takes no parameters: `in`, `offset`, `base`, pg1 and the Chimu_* variables
// must all be in scope at the expansion site.
// NOTE(review): `ref` is bound to in[offset] but the loads read from `base`,
// not from `ref` -- confirm that `base` addresses the same site.
// The earlier svld1 form with fixed byte offsets was superseded by these
// calls and is no longer listed alongside them.
#define LOAD_CHIMU_0213_A64FXd \
{ \
    const SiteSpinor & ref(in[offset]); \
    Chimu_00 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-6)); \
    Chimu_20 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(0)); \
    Chimu_01 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-5)); \
    Chimu_21 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(1)); \
    Chimu_02 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-4)); \
    Chimu_22 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(2)); \
    Chimu_10 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-3)); \
    Chimu_30 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(3)); \
    Chimu_11 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-2)); \
    Chimu_31 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(4)); \
    Chimu_12 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-1)); \
    Chimu_32 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(5)); \
}
|
||||
// LOAD_CHIMU_0312
|
||||
#define LOAD_CHIMU_0312_A64FXd \
|
||||
{ \
|
||||
const SiteSpinor & ref(in[offset]); \
|
||||
Chimu_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64)); \
|
||||
Chimu_30 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64)); \
|
||||
Chimu_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64)); \
|
||||
Chimu_31 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64)); \
|
||||
Chimu_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64)); \
|
||||
Chimu_32 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64)); \
|
||||
Chimu_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64)); \
|
||||
Chimu_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64)); \
|
||||
Chimu_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64)); \
|
||||
Chimu_21 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64)); \
|
||||
Chimu_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64)); \
|
||||
Chimu_22 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64)); \
|
||||
Chimu_00 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-6)); \
|
||||
Chimu_30 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(3)); \
|
||||
Chimu_01 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-5)); \
|
||||
Chimu_31 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(4)); \
|
||||
Chimu_02 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-4)); \
|
||||
Chimu_32 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(5)); \
|
||||
Chimu_10 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-3)); \
|
||||
Chimu_20 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(0)); \
|
||||
Chimu_11 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-2)); \
|
||||
Chimu_21 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(1)); \
|
||||
Chimu_12 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-1)); \
|
||||
Chimu_22 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(2)); \
|
||||
}
|
||||
// LOAD_TABLE0
|
||||
#define LOAD_TABLE0 \
|
||||
@ -261,26 +263,26 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
Chi_12 = svtbl(Chi_12, table0);
|
||||
|
||||
// LOAD_GAUGE
|
||||
#define LOAD_GAUGE \
|
||||
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
|
||||
#define LOAD_GAUGE(A) \
|
||||
{ \
|
||||
U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \
|
||||
U_10 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -3 * 64)); \
|
||||
U_20 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 0 * 64)); \
|
||||
U_01 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -5 * 64)); \
|
||||
U_11 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -2 * 64)); \
|
||||
U_21 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \
|
||||
const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \
|
||||
U_00 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-6)); \
|
||||
U_10 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-3)); \
|
||||
U_20 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(0)); \
|
||||
U_01 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-5)); \
|
||||
U_11 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-2)); \
|
||||
U_21 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(1)); \
|
||||
}
|
||||
// MULT_2SPIN
|
||||
#define MULT_2SPIN_1_A64FXd(A) \
|
||||
{ \
|
||||
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
|
||||
U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \
|
||||
U_10 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -3 * 64)); \
|
||||
U_20 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 0 * 64)); \
|
||||
U_01 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -5 * 64)); \
|
||||
U_11 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -2 * 64)); \
|
||||
U_21 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \
|
||||
const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \
|
||||
U_00 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-6)); \
|
||||
U_10 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-3)); \
|
||||
U_20 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(0)); \
|
||||
U_01 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-5)); \
|
||||
U_11 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-2)); \
|
||||
U_21 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(1)); \
|
||||
UChi_00 = svcmla_x(pg1, zero0, U_00, Chi_00, 0); \
|
||||
UChi_10 = svcmla_x(pg1, zero0, U_00, Chi_10, 0); \
|
||||
UChi_01 = svcmla_x(pg1, zero0, U_10, Chi_00, 0); \
|
||||
@ -293,9 +295,9 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_10, 90); \
|
||||
UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_00, 90); \
|
||||
UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_10, 90); \
|
||||
U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -4 * 64)); \
|
||||
U_10 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -1 * 64)); \
|
||||
U_20 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 2 * 64)); \
|
||||
U_00 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-4)); \
|
||||
U_10 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-1)); \
|
||||
U_20 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(2)); \
|
||||
}
|
||||
// MULT_2SPIN_BACKEND
|
||||
#define MULT_2SPIN_2_A64FXd \
|
||||
@ -570,12 +572,12 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
result_31 = svdup_f64(0.); \
|
||||
result_32 = svdup_f64(0.);
|
||||
|
||||
// PREFETCH_RESULT_L2_STORE (prefetch store to L2)
|
||||
// PREFETCH_RESULT_L2_STORE (uses DC ZVA for cache line zeroing)
|
||||
#define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXd(base) \
|
||||
{ \
|
||||
svprfd(pg1, (int64_t*)(base + 0), SV_PSTL2STRM); \
|
||||
svprfd(pg1, (int64_t*)(base + 256), SV_PSTL2STRM); \
|
||||
svprfd(pg1, (int64_t*)(base + 512), SV_PSTL2STRM); \
|
||||
asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 0) : "memory" ); \
|
||||
asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 1) : "memory" ); \
|
||||
asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 2) : "memory" ); \
|
||||
}
|
||||
// PREFETCH_RESULT_L1_STORE (prefetch store to L1)
|
||||
#define PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXd(base) \
|
||||
|
@ -38,10 +38,11 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
#define LOCK_GAUGE(A)
|
||||
#define UNLOCK_GAUGE(A)
|
||||
#define MASK_REGS DECLARATIONS_A64FXf
|
||||
#define SAVE_RESULT(A,B) RESULT_A64FXf(A); PREFETCH_RESULT_L2_STORE(B)
|
||||
#define SAVE_RESULT(A,B) RESULT_A64FXf(A);
|
||||
#define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXf(Dir)
|
||||
#define MULT_2SPIN_2 MULT_2SPIN_2_A64FXf
|
||||
#define LOAD_CHI(base) LOAD_CHI_A64FXf(base)
|
||||
#define ZERO_PSI ZERO_PSI_A64FXf
|
||||
#define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base)
|
||||
#define XP_PROJ XP_PROJ_A64FXf
|
||||
#define YP_PROJ YP_PROJ_A64FXf
|
||||
@ -70,6 +71,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
#define MAYBEPERM(A,perm) if (perm) { PERMUTE; }
|
||||
// DECLARATIONS
|
||||
#define DECLARATIONS_A64FXf \
|
||||
uint64_t baseU; \
|
||||
const uint32_t lut[4][16] = { \
|
||||
{8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}, \
|
||||
{4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}, \
|
||||
@ -126,114 +128,114 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
// RESULT
|
||||
#define RESULT_A64FXf(base) \
|
||||
{ \
|
||||
svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64), result_00); \
|
||||
svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64), result_01); \
|
||||
svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64), result_02); \
|
||||
svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64), result_10); \
|
||||
svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64), result_11); \
|
||||
svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64), result_12); \
|
||||
svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64), result_20); \
|
||||
svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64), result_21); \
|
||||
svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64), result_22); \
|
||||
svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64), result_30); \
|
||||
svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64), result_31); \
|
||||
svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64), result_32); \
|
||||
svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-6), result_00); \
|
||||
svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-5), result_01); \
|
||||
svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-4), result_02); \
|
||||
svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-3), result_10); \
|
||||
svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-2), result_11); \
|
||||
svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-1), result_12); \
|
||||
svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(0), result_20); \
|
||||
svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(1), result_21); \
|
||||
svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(2), result_22); \
|
||||
svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(3), result_30); \
|
||||
svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(4), result_31); \
|
||||
svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(5), result_32); \
|
||||
}
|
||||
// PREFETCH_CHIMU_L2 (prefetch to L2)
|
||||
#define PREFETCH_CHIMU_L2_INTERNAL_A64FXf(base) \
|
||||
{ \
|
||||
svprfd(pg1, (int64_t*)(base + 0), SV_PLDL2STRM); \
|
||||
svprfd(pg1, (int64_t*)(base + 256), SV_PLDL2STRM); \
|
||||
svprfd(pg1, (int64_t*)(base + 512), SV_PLDL2STRM); \
|
||||
svprfd_vnum(pg1, (void*)(base), (int64_t)(0), SV_PLDL2STRM); \
|
||||
svprfd_vnum(pg1, (void*)(base), (int64_t)(4), SV_PLDL2STRM); \
|
||||
svprfd_vnum(pg1, (void*)(base), (int64_t)(8), SV_PLDL2STRM); \
|
||||
}
|
||||
// PREFETCH_CHIMU_L1 (prefetch to L1)
|
||||
#define PREFETCH_CHIMU_L1_INTERNAL_A64FXf(base) \
|
||||
{ \
|
||||
svprfd(pg1, (int64_t*)(base + 0), SV_PLDL1STRM); \
|
||||
svprfd(pg1, (int64_t*)(base + 256), SV_PLDL1STRM); \
|
||||
svprfd(pg1, (int64_t*)(base + 512), SV_PLDL1STRM); \
|
||||
svprfd_vnum(pg1, (void*)(base), (int64_t)(0), SV_PLDL1STRM); \
|
||||
svprfd_vnum(pg1, (void*)(base), (int64_t)(4), SV_PLDL1STRM); \
|
||||
svprfd_vnum(pg1, (void*)(base), (int64_t)(8), SV_PLDL1STRM); \
|
||||
}
|
||||
// PREFETCH_GAUGE_L2 (prefetch to L2)
|
||||
#define PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A) \
|
||||
{ \
|
||||
const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \
|
||||
svprfd(pg1, (int64_t*)(baseU + -256), SV_PLDL2STRM); \
|
||||
svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL2STRM); \
|
||||
svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL2STRM); \
|
||||
svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL2STRM); \
|
||||
svprfd(pg1, (int64_t*)(baseU + 768), SV_PLDL2STRM); \
|
||||
svprfd(pg1, (int64_t*)(baseU + 1024), SV_PLDL2STRM); \
|
||||
svprfd(pg1, (int64_t*)(baseU + 1280), SV_PLDL2STRM); \
|
||||
svprfd(pg1, (int64_t*)(baseU + 1536), SV_PLDL2STRM); \
|
||||
svprfd(pg1, (int64_t*)(baseU + 1792), SV_PLDL2STRM); \
|
||||
const auto & ref(U[sUn](A)); baseU = (uint64_t)&ref + 3 * 3 * 64; \
|
||||
svprfd_vnum(pg1, (void*)(baseU), (int64_t)(-4), SV_PLDL2STRM); \
|
||||
svprfd_vnum(pg1, (void*)(baseU), (int64_t)(0), SV_PLDL2STRM); \
|
||||
svprfd_vnum(pg1, (void*)(baseU), (int64_t)(4), SV_PLDL2STRM); \
|
||||
svprfd_vnum(pg1, (void*)(baseU), (int64_t)(8), SV_PLDL2STRM); \
|
||||
svprfd_vnum(pg1, (void*)(baseU), (int64_t)(12), SV_PLDL2STRM); \
|
||||
svprfd_vnum(pg1, (void*)(baseU), (int64_t)(16), SV_PLDL2STRM); \
|
||||
svprfd_vnum(pg1, (void*)(baseU), (int64_t)(20), SV_PLDL2STRM); \
|
||||
svprfd_vnum(pg1, (void*)(baseU), (int64_t)(24), SV_PLDL2STRM); \
|
||||
svprfd_vnum(pg1, (void*)(baseU), (int64_t)(28), SV_PLDL2STRM); \
|
||||
}
|
||||
// PREFETCH_GAUGE_L1 (prefetch to L1)
|
||||
#define PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A) \
|
||||
{ \
|
||||
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
|
||||
svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL1STRM); \
|
||||
svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL1STRM); \
|
||||
svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL1STRM); \
|
||||
const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \
|
||||
svprfd_vnum(pg1, (void*)(baseU), (int64_t)(0), SV_PLDL1STRM); \
|
||||
svprfd_vnum(pg1, (void*)(baseU), (int64_t)(4), SV_PLDL1STRM); \
|
||||
svprfd_vnum(pg1, (void*)(baseU), (int64_t)(8), SV_PLDL1STRM); \
|
||||
}
|
||||
// LOAD_CHI
|
||||
#define LOAD_CHI_A64FXf(base) \
|
||||
{ \
|
||||
Chi_00 = svld1(pg1, (float32_t*)(base + 0 * 64)); \
|
||||
Chi_01 = svld1(pg1, (float32_t*)(base + 1 * 64)); \
|
||||
Chi_02 = svld1(pg1, (float32_t*)(base + 2 * 64)); \
|
||||
Chi_10 = svld1(pg1, (float32_t*)(base + 3 * 64)); \
|
||||
Chi_11 = svld1(pg1, (float32_t*)(base + 4 * 64)); \
|
||||
Chi_12 = svld1(pg1, (float32_t*)(base + 5 * 64)); \
|
||||
Chi_00 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(0)); \
|
||||
Chi_01 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(1)); \
|
||||
Chi_02 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(2)); \
|
||||
Chi_10 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(3)); \
|
||||
Chi_11 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(4)); \
|
||||
Chi_12 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(5)); \
|
||||
}
|
||||
// LOAD_CHIMU
|
||||
#define LOAD_CHIMU_INTERLEAVED_A64FXf(base) \
|
||||
{ \
|
||||
Chimu_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64)); \
|
||||
Chimu_30 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64)); \
|
||||
Chimu_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64)); \
|
||||
Chimu_20 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64)); \
|
||||
Chimu_01 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64)); \
|
||||
Chimu_31 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64)); \
|
||||
Chimu_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64)); \
|
||||
Chimu_21 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64)); \
|
||||
Chimu_02 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64)); \
|
||||
Chimu_32 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64)); \
|
||||
Chimu_12 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64)); \
|
||||
Chimu_22 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64)); \
|
||||
Chimu_00 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-6)); \
|
||||
Chimu_30 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(3)); \
|
||||
Chimu_10 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-3)); \
|
||||
Chimu_20 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(0)); \
|
||||
Chimu_01 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-5)); \
|
||||
Chimu_31 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(4)); \
|
||||
Chimu_11 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-2)); \
|
||||
Chimu_21 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(1)); \
|
||||
Chimu_02 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-4)); \
|
||||
Chimu_32 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(5)); \
|
||||
Chimu_12 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-1)); \
|
||||
Chimu_22 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(2)); \
|
||||
}
|
||||
// LOAD_CHIMU_0213
|
||||
#define LOAD_CHIMU_0213_A64FXf \
|
||||
{ \
|
||||
const SiteSpinor & ref(in[offset]); \
|
||||
Chimu_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64)); \
|
||||
Chimu_20 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64)); \
|
||||
Chimu_01 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64)); \
|
||||
Chimu_21 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64)); \
|
||||
Chimu_02 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64)); \
|
||||
Chimu_22 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64)); \
|
||||
Chimu_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64)); \
|
||||
Chimu_30 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64)); \
|
||||
Chimu_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64)); \
|
||||
Chimu_31 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64)); \
|
||||
Chimu_12 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64)); \
|
||||
Chimu_32 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64)); \
|
||||
Chimu_00 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-6)); \
|
||||
Chimu_20 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(0)); \
|
||||
Chimu_01 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-5)); \
|
||||
Chimu_21 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(1)); \
|
||||
Chimu_02 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-4)); \
|
||||
Chimu_22 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(2)); \
|
||||
Chimu_10 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-3)); \
|
||||
Chimu_30 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(3)); \
|
||||
Chimu_11 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-2)); \
|
||||
Chimu_31 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(4)); \
|
||||
Chimu_12 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-1)); \
|
||||
Chimu_32 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(5)); \
|
||||
}
|
||||
// LOAD_CHIMU_0312
|
||||
#define LOAD_CHIMU_0312_A64FXf \
|
||||
{ \
|
||||
const SiteSpinor & ref(in[offset]); \
|
||||
Chimu_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64)); \
|
||||
Chimu_30 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64)); \
|
||||
Chimu_01 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64)); \
|
||||
Chimu_31 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64)); \
|
||||
Chimu_02 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64)); \
|
||||
Chimu_32 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64)); \
|
||||
Chimu_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64)); \
|
||||
Chimu_20 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64)); \
|
||||
Chimu_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64)); \
|
||||
Chimu_21 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64)); \
|
||||
Chimu_12 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64)); \
|
||||
Chimu_22 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64)); \
|
||||
Chimu_00 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-6)); \
|
||||
Chimu_30 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(3)); \
|
||||
Chimu_01 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-5)); \
|
||||
Chimu_31 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(4)); \
|
||||
Chimu_02 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-4)); \
|
||||
Chimu_32 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(5)); \
|
||||
Chimu_10 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-3)); \
|
||||
Chimu_20 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(0)); \
|
||||
Chimu_11 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-2)); \
|
||||
Chimu_21 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(1)); \
|
||||
Chimu_12 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-1)); \
|
||||
Chimu_22 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(2)); \
|
||||
}
|
||||
// LOAD_TABLE0
|
||||
#define LOAD_TABLE0 \
|
||||
@ -261,26 +263,26 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
Chi_12 = svtbl(Chi_12, table0);
|
||||
|
||||
// LOAD_GAUGE
|
||||
#define LOAD_GAUGE \
|
||||
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
|
||||
#define LOAD_GAUGE(A) \
|
||||
{ \
|
||||
U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \
|
||||
U_10 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -3 * 64)); \
|
||||
U_20 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 0 * 64)); \
|
||||
U_01 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -5 * 64)); \
|
||||
U_11 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -2 * 64)); \
|
||||
U_21 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \
|
||||
const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \
|
||||
U_00 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-6)); \
|
||||
U_10 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-3)); \
|
||||
U_20 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(0)); \
|
||||
U_01 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-5)); \
|
||||
U_11 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-2)); \
|
||||
U_21 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(1)); \
|
||||
}
|
||||
// MULT_2SPIN
|
||||
#define MULT_2SPIN_1_A64FXf(A) \
|
||||
{ \
|
||||
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
|
||||
U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \
|
||||
U_10 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -3 * 64)); \
|
||||
U_20 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 0 * 64)); \
|
||||
U_01 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -5 * 64)); \
|
||||
U_11 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -2 * 64)); \
|
||||
U_21 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \
|
||||
const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \
|
||||
U_00 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-6)); \
|
||||
U_10 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-3)); \
|
||||
U_20 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(0)); \
|
||||
U_01 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-5)); \
|
||||
U_11 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-2)); \
|
||||
U_21 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(1)); \
|
||||
UChi_00 = svcmla_x(pg1, zero0, U_00, Chi_00, 0); \
|
||||
UChi_10 = svcmla_x(pg1, zero0, U_00, Chi_10, 0); \
|
||||
UChi_01 = svcmla_x(pg1, zero0, U_10, Chi_00, 0); \
|
||||
@ -293,9 +295,9 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_10, 90); \
|
||||
UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_00, 90); \
|
||||
UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_10, 90); \
|
||||
U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -4 * 64)); \
|
||||
U_10 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -1 * 64)); \
|
||||
U_20 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 2 * 64)); \
|
||||
U_00 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-4)); \
|
||||
U_10 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-1)); \
|
||||
U_20 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(2)); \
|
||||
}
|
||||
// MULT_2SPIN_BACKEND
|
||||
#define MULT_2SPIN_2_A64FXf \
|
||||
@ -570,12 +572,12 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
result_31 = svdup_f32(0.); \
|
||||
result_32 = svdup_f32(0.);
|
||||
|
||||
// PREFETCH_RESULT_L2_STORE (prefetch store to L2)
|
||||
// PREFETCH_RESULT_L2_STORE (uses DC ZVA for cache line zeroing)
|
||||
#define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXf(base) \
|
||||
{ \
|
||||
svprfd(pg1, (int64_t*)(base + 0), SV_PSTL2STRM); \
|
||||
svprfd(pg1, (int64_t*)(base + 256), SV_PSTL2STRM); \
|
||||
svprfd(pg1, (int64_t*)(base + 512), SV_PSTL2STRM); \
|
||||
asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 0) : "memory" ); \
|
||||
asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 1) : "memory" ); \
|
||||
asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 2) : "memory" ); \
|
||||
}
|
||||
// PREFETCH_RESULT_L1_STORE (prefetch store to L1)
|
||||
#define PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXf(base) \
|
||||
|
@ -46,6 +46,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
||||
#undef MULT_2SPIN_2
|
||||
#undef MAYBEPERM
|
||||
#undef LOAD_CHI
|
||||
#undef ZERO_PSI
|
||||
#undef XP_PROJ
|
||||
#undef YP_PROJ
|
||||
#undef ZP_PROJ
|
||||
|
@ -38,12 +38,20 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
#ifdef GRID_HIP
|
||||
#include <hip/hip_fp16.h>
|
||||
#endif
|
||||
#ifdef GRID_SYCL
|
||||
namespace Grid {
|
||||
typedef struct { uint16_t x;} half;
|
||||
typedef struct { half x; half y;} half2;
|
||||
typedef struct { float x; float y;} float2;
|
||||
typedef struct { double x; double y;} double2;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
namespace Grid {
|
||||
|
||||
#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))
|
||||
typedef struct { uint16_t x;} half;
|
||||
#endif
|
||||
|
||||
|
||||
typedef struct Half2_t { half x; half y; } Half2;
|
||||
|
||||
#define COALESCE_GRANULARITY ( GEN_SIMD_WIDTH )
|
||||
@ -156,7 +164,7 @@ accelerator_inline float half2float(half h)
|
||||
f = __half2float(h);
|
||||
#else
|
||||
Grid_half hh;
|
||||
hh.x = hr.x;
|
||||
hh.x = h.x;
|
||||
f= sfw_half_to_float(hh);
|
||||
#endif
|
||||
return f;
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -24,7 +24,7 @@ typedef typename GparityDomainWallFermionD::FermionField GparityLatticeFermionD;
|
||||
int main (int argc, char ** argv)
|
||||
{
|
||||
Grid_init(&argc,&argv);
|
||||
|
||||
#ifdef ENABLE_GPARITY
|
||||
int Ls=16;
|
||||
for(int i=0;i<argc;i++)
|
||||
if(std::string(argv[i]) == "-Ls"){
|
||||
@ -184,7 +184,7 @@ int main (int argc, char ** argv)
|
||||
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NN<<std::endl;
|
||||
DwD.Report();
|
||||
}
|
||||
|
||||
#endif
|
||||
Grid_finalize();
|
||||
}
|
||||
|
||||
|
18
configure.ac
18
configure.ac
@ -123,6 +123,24 @@ case ${ac_LAPACK} in
|
||||
AC_DEFINE([USE_LAPACK],[1],[use LAPACK]);;
|
||||
esac
|
||||
|
||||
############### fermions
# Optional fermion instantiations. Both options default to yes. Each one
# sets an Automake conditional, used by the Makefile to decide which
# instantiation source lists are added to the library, and a preprocessor
# define that guards the matching code.
AC_ARG_ENABLE([fermion-reps],
     [AC_HELP_STRING([--enable-fermion-reps=yes|no], [enable extra fermion representation support])],
     [ac_FERMION_REPS=${enable_fermion_reps}], [ac_FERMION_REPS=yes])

# POSIX test only defines a single = for string equality; the double form
# is a bash extension and is rejected by some /bin/sh implementations.
AM_CONDITIONAL(BUILD_FERMION_REPS, [ test "${ac_FERMION_REPS}X" = "yesX" ])

AC_ARG_ENABLE([gparity],
     [AC_HELP_STRING([--enable-gparity=yes|no], [enable G-parity support])],
     [ac_GPARITY=${enable_gparity}], [ac_GPARITY=yes])

AM_CONDITIONAL(BUILD_GPARITY, [ test "${ac_GPARITY}X" = "yesX" ])

# Export the choices to the sources as ENABLE_FERMION_REPS / ENABLE_GPARITY.
case ${ac_FERMION_REPS} in
   yes) AC_DEFINE([ENABLE_FERMION_REPS],[1],[non QCD fermion reps]);;
esac
case ${ac_GPARITY} in
   yes) AC_DEFINE([ENABLE_GPARITY],[1],[fermion actions with GPARITY BCs]);;
esac
|
||||
############### Nc
|
||||
AC_ARG_ENABLE([Nc],
|
||||
[AC_HELP_STRING([--enable-Nc=2|3|4], [enable number of colours])],
|
||||
|
@ -6,13 +6,27 @@ home=`pwd`
|
||||
cd $home/Grid
|
||||
HFILES=`find . -type f -name '*.h' -not -name '*Hdf5*' -not -path '*/gamma-gen/*' -not -path '*/Old/*' -not -path '*/Eigen/*'`
|
||||
HFILES="$HFILES"
|
||||
CCFILES=`find . -name '*.cc' -not -path '*/gamma-gen/*' -not -name '*Communicator*.cc' -not -name '*SharedMemory*.cc' -not -name '*Hdf5*'`
|
||||
CCFILES=`find . -name '*.cc' -not -path '*/instantiation/*/*' -not -path '*/gamma-gen/*' -not -name '*Communicator*.cc' -not -name '*SharedMemory*.cc' -not -name '*Hdf5*'`
|
||||
|
||||
|
||||
ZWILS_FERMION_FILES=` find . -name '*.cc' -path '*/instantiation/*' -path '*/instantiation/ZWilsonImpl*' `
|
||||
WILS_FERMION_FILES=` find . -name '*.cc' -path '*/instantiation/*' -path '*/instantiation/WilsonImpl*' `
|
||||
STAG_FERMION_FILES=` find . -name '*.cc' -path '*/instantiation/*' -path '*/instantiation/Staggered*' `
|
||||
GP_FERMION_FILES=` find . -name '*.cc' -path '*/instantiation/*' -path '*/instantiation/Gparity*' `
|
||||
ADJ_FERMION_FILES=` find . -name '*.cc' -path '*/instantiation/*' -path '*/instantiation/WilsonAdj*' `
|
||||
TWOIND_FERMION_FILES=`find . -name '*.cc' -path '*/instantiation/*' -path '*/instantiation/WilsonTwoIndex*'`
|
||||
|
||||
HPPFILES=`find . -type f -name '*.hpp'`
|
||||
echo HFILES=$HFILES $HPPFILES > Make.inc
|
||||
echo >> Make.inc
|
||||
echo CCFILES=$CCFILES >> Make.inc
|
||||
|
||||
|
||||
echo ZWILS_FERMION_FILES=$ZWILS_FERMION_FILES >> Make.inc
|
||||
echo WILS_FERMION_FILES=$WILS_FERMION_FILES >> Make.inc
|
||||
echo STAG_FERMION_FILES=$STAG_FERMION_FILES >> Make.inc
|
||||
echo GP_FERMION_FILES=$GP_FERMION_FILES >> Make.inc
|
||||
echo ADJ_FERMION_FILES=$ADJ_FERMION_FILES >> Make.inc
|
||||
echo TWOIND_FERMION_FILES=$TWOIND_FERMION_FILES >> Make.inc
|
||||
|
||||
# tests Make.inc
|
||||
cd $home/tests
|
||||
|
@ -102,7 +102,8 @@ int main (int argc, char ** argv)
|
||||
LatticeComplexD detUU(grid);
|
||||
|
||||
detU= Determinant(U) ;
|
||||
std::cout << "Determinant before screw up " <<detU<<std::endl;
|
||||
detU=detU-1.0;
|
||||
std::cout << "Determinant before screw up " << norm2(detU)<<std::endl;
|
||||
|
||||
std::cout << " Screwing up determinant " << std::endl;
|
||||
|
||||
@ -116,18 +117,24 @@ int main (int argc, char ** argv)
|
||||
UU=U;
|
||||
|
||||
detU= Determinant(U) ;
|
||||
std::cout << "Determinant after screw up " <<detU<<std::endl;
|
||||
|
||||
detU=detU-1.0;
|
||||
std::cout << "Determinant defect before projection " <<norm2(detU)<<std::endl;
|
||||
tmp = U*adj(U) - ident;
|
||||
std::cout << "Unitarity check before projection " << norm2(tmp)<<std::endl;
|
||||
|
||||
ProjectSU3(U);
|
||||
detU= Determinant(U) ;
|
||||
std::cout << "Determinant ProjectSU3 " <<detU<<std::endl;
|
||||
detU= detU -1.0;
|
||||
std::cout << "Determinant ProjectSU3 defect " <<norm2(detU)<<std::endl;
|
||||
tmp = U*adj(U) - ident;
|
||||
std::cout << "Unitarity check after projection " << norm2(tmp)<<std::endl;
|
||||
|
||||
ProjectSUn<3>(UU);
|
||||
ProjectSUn(UU);
|
||||
detUU= Determinant(UU);
|
||||
std::cout << "Determinant ProjectSUn " <<detUU<<std::endl;
|
||||
detUU= detUU -1.0;
|
||||
std::cout << "Determinant ProjectSUn defect " <<norm2(detUU)<<std::endl;
|
||||
tmp = UU*adj(UU) - ident;
|
||||
std::cout << "Unitarity check after projection " << norm2(tmp)<<std::endl;
|
||||
|
||||
Grid_finalize();
|
||||
}
|
||||
|
@ -108,8 +108,18 @@ int main (int argc, char ** argv)
|
||||
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
|
||||
|
||||
LatticeGaugeField Umu(UGrid);
|
||||
SU<Nc>::ColdConfiguration(Umu);
|
||||
// SU<Nc>::HotConfiguration(RNG4,Umu);
|
||||
if( argc > 1 && argv[1][0] != '-' )
|
||||
{
|
||||
std::cout<<GridLogMessage <<"Loading configuration from "<<argv[1]<<std::endl;
|
||||
FieldMetaData header;
|
||||
NerscIO::readConfiguration(Umu, header, argv[1]);
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout<<GridLogMessage <<"Using cold configuration"<<std::endl;
|
||||
SU<Nc>::ColdConfiguration(Umu);
|
||||
// SU<Nc>::HotConfiguration(RNG4,Umu);
|
||||
}
|
||||
|
||||
RealD mass=0.3;
|
||||
RealD M5 =1.0;
|
||||
|
Loading…
x
Reference in New Issue
Block a user