mirror of https://github.com/paboyle/Grid.git synced 2025-06-18 15:57:05 +01:00

Compare commits

..

5 Commits

231 changed files with 1736 additions and 12740 deletions

View File

@@ -9,6 +9,11 @@ matrix:
     - os:        osx
       osx_image: xcode8.3
       compiler: clang
+      env: PREC=single
+    - os:        osx
+      osx_image: xcode8.3
+      compiler: clang
+      env: PREC=double
 
 before_install:
     - export GRIDDIR=`pwd`
@@ -50,7 +55,7 @@ script:
     - make -j4
     - make install
     - cd $CWD/build
-    - ../configure --enable-simd=SSE4 --enable-comms=none --with-lime=$CWD/build/lime/install ${EXTRACONF}
+    - ../configure --enable-precision=$PREC --enable-simd=SSE4 --enable-comms=none --with-lime=$CWD/build/lime/install ${EXTRACONF}
     - make -j4
     - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
    - make check

View File

@@ -34,12 +34,6 @@
 #define __SYCL__REDEFINE__
 #endif
 
-/* HIP save and restore compile environment*/
-#ifdef GRID_HIP
-#pragma push
-#pragma push_macro("__HIP_DEVICE_COMPILE__")
-#endif
-#define EIGEN_NO_HIP
-
 #include <Grid/Eigen/Dense>
 #include <Grid/Eigen/unsupported/CXX11/Tensor>
@@ -48,7 +42,7 @@
 #ifdef __NVCC__REDEFINE__
 #pragma pop_macro("__CUDACC__")
 #pragma pop_macro("__NVCC__")
-#pragma pop_macro("__CUDA_ARCH__")
+#pragma pop_macro("GRID_SIMT")
 #pragma pop
 #endif
@@ -58,12 +52,6 @@
 #pragma pop
 #endif
 
-/*HIP restore*/
-#ifdef __HIP__REDEFINE__
-#pragma pop_macro("__HIP_DEVICE_COMPILE__")
-#pragma pop
-#endif
-
 #if defined __GNUC__
 #pragma GCC diagnostic pop
 #endif

View File

@@ -65,7 +65,8 @@ public:
     MemoryManager::CpuFree((void *)__p,bytes);
   }
 
-  // FIXME: hack for the copy constructor: it must be avoided to avoid single thread loop
+  // FIXME: hack for the copy constructor, eventually it must be avoided
+  //void construct(pointer __p, const _Tp& __val) { new((void *)__p) _Tp(__val); };
   void construct(pointer __p, const _Tp& __val) { assert(0);};
   void construct(pointer __p) { };
   void destroy(pointer __p) { };
@@ -73,9 +74,6 @@ public:
 template<typename _Tp>  inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; }
 template<typename _Tp>  inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; }
 
-//////////////////////////////////////////////////////////////////////////////////////
-// Unified virtual memory
-//////////////////////////////////////////////////////////////////////////////////////
 template<typename _Tp>
 class uvmAllocator {
 public:
@@ -111,63 +109,22 @@ public:
     MemoryManager::SharedFree((void *)__p,bytes);
   }
 
-  // FIXME: hack for the copy constructor, eventually it must be avoided
   void construct(pointer __p, const _Tp& __val) { new((void *)__p) _Tp(__val); };
+  //void construct(pointer __p, const _Tp& __val) { };
   void construct(pointer __p) { };
   void destroy(pointer __p) { };
 };
 template<typename _Tp>  inline bool operator==(const uvmAllocator<_Tp>&, const uvmAllocator<_Tp>&){ return true; }
 template<typename _Tp>  inline bool operator!=(const uvmAllocator<_Tp>&, const uvmAllocator<_Tp>&){ return false; }
 
-////////////////////////////////////////////////////////////////////////////////
-// Device memory
-////////////////////////////////////////////////////////////////////////////////
-template<typename _Tp>
-class devAllocator {
-public:
-  typedef std::size_t     size_type;
-  typedef std::ptrdiff_t  difference_type;
-  typedef _Tp*       pointer;
-  typedef const _Tp* const_pointer;
-  typedef _Tp&       reference;
-  typedef const _Tp& const_reference;
-  typedef _Tp        value_type;
-
-  template<typename _Tp1>  struct rebind { typedef devAllocator<_Tp1> other; };
-  devAllocator() throw() { }
-  devAllocator(const devAllocator&) throw() { }
-  template<typename _Tp1> devAllocator(const devAllocator<_Tp1>&) throw() { }
-  ~devAllocator() throw() { }
-  pointer address(reference __x) const { return &__x; }
-  size_type  max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
-
-  pointer allocate(size_type __n, const void* _p= 0)
-  {
-    size_type bytes = __n*sizeof(_Tp);
-    profilerAllocate(bytes);
-    _Tp *ptr = (_Tp*) MemoryManager::AcceleratorAllocate(bytes);
-    assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
-    return ptr;
-  }
-
-  void deallocate(pointer __p, size_type __n)
-  {
-    size_type bytes = __n * sizeof(_Tp);
-    profilerFree(bytes);
-    MemoryManager::AcceleratorFree((void *)__p,bytes);
-  }
-  void construct(pointer __p, const _Tp& __val) { };
-  void construct(pointer __p) { };
-  void destroy(pointer __p) { };
-};
-template<typename _Tp>  inline bool operator==(const devAllocator<_Tp>&, const devAllocator<_Tp>&){ return true; }
-template<typename _Tp>  inline bool operator!=(const devAllocator<_Tp>&, const devAllocator<_Tp>&){ return false; }
-
 ////////////////////////////////////////////////////////////////////////////////
 // Template typedefs
 ////////////////////////////////////////////////////////////////////////////////
-//template<class T> using commAllocator = devAllocator<T>;
+template<class T> using commAllocator = uvmAllocator<T>;
 template<class T> using Vector     = std::vector<T,uvmAllocator<T> >;
-template<class T> using commVector = std::vector<T,devAllocator<T> >;
+template<class T> using commVector = std::vector<T,uvmAllocator<T> >;
+//template<class T> using Matrix     = std::vector<std::vector<T,alignedAllocator<T> > >;
 
 NAMESPACE_END(Grid);
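
Note (editor's addition, not part of the diff): the Vector/commVector typedefs above work because std::vector accepts any class exposing the standard allocator interface, so the backing storage (host, unified, or device memory) can be switched by typedef alone. A minimal standalone sketch of the pattern, with hypothetical names:

#include <cstdlib>
#include <vector>

template<typename T>
struct toyAllocator {                      // stand-in for uvmAllocator / devAllocator
  typedef T value_type;
  toyAllocator() = default;
  template<typename U> toyAllocator(const toyAllocator<U>&) {}
  T   *allocate(std::size_t n)       { return static_cast<T*>(std::malloc(n*sizeof(T))); }
  void deallocate(T *p, std::size_t) { std::free(p); }  // a real one would return to a cache
};
template<typename T> bool operator==(const toyAllocator<T>&, const toyAllocator<T>&){ return true;  }
template<typename T> bool operator!=(const toyAllocator<T>&, const toyAllocator<T>&){ return false; }

template<class T> using toyVector = std::vector<T, toyAllocator<T> >;  // cf. commVector<T>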

View File

@@ -136,15 +136,6 @@ void MemoryManager::Init(void)
       Ncache[SharedSmall]=Nc;
     }
   }
-}
-
-void MemoryManager::InitMessage(void) {
-
-#ifndef GRID_UVM
-  std::cout << GridLogMessage << "MemoryManager Cache "<< MemoryManager::DeviceMaxBytes <<" bytes "<<std::endl;
-#endif
 
   std::cout << GridLogMessage<< "MemoryManager::Init() setting up"<<std::endl;
 #ifdef ALLOCATION_CACHE
   std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<std::endl;
@@ -173,7 +164,6 @@ void MemoryManager::InitMessage(void) {
   std::cout << GridLogMessage<< "MemoryManager::Init() Using SYCL malloc_device"<<std::endl;
 #endif
 #endif
-
 }
 
 void *MemoryManager::Insert(void *ptr,size_t bytes,int type)

View File

@@ -93,12 +93,11 @@ private:
   static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim) ;
   static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache) ;
 
+  static void *AcceleratorAllocate(size_t bytes);
+  static void  AcceleratorFree    (void *ptr,size_t bytes);
   static void PrintBytes(void);
 public:
   static void Init(void);
-  static void InitMessage(void);
-  static void *AcceleratorAllocate(size_t bytes);
-  static void  AcceleratorFree    (void *ptr,size_t bytes);
   static void *SharedAllocate(size_t bytes);
   static void  SharedFree    (void *ptr,size_t bytes);
   static void *CpuAllocate(size_t bytes);

View File

@@ -138,6 +138,21 @@ public:
                       int recv_from_rank,
                       int bytes);
 
+  void SendRecvPacket(void *xmit,
+                      void *recv,
+                      int xmit_to_rank,
+                      int recv_from_rank,
+                      int bytes);
+
+  void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+                           void *xmit,
+                           int xmit_to_rank,
+                           void *recv,
+                           int recv_from_rank,
+                           int bytes);
+
+  void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
+
   double StencilSendToRecvFrom(void *xmit,
                                int xmit_to_rank,
                                void *recv,

View File

@@ -43,16 +43,8 @@ void CartesianCommunicator::Init(int *argc, char ***argv)
   MPI_Initialized(&flag); // needed to coexist with other libs apparently
   if ( !flag ) {
 
-#if defined (TOFU) // FUGAKU, credits go to Issaku Kanamori
-    nCommThreads=1;
-    // wrong results here too
-    // For now: comms-overlap leads to wrong results in Benchmark_wilson even on single node MPI runs
-    // other comms schemes are ok
-    MPI_Init_thread(argc,argv,MPI_THREAD_SERIALIZED,&provided);
-#else
     MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
-#endif
 
     //If only 1 comms thread we require any threading mode other than SINGLE, but for multiple comms threads we need MULTIPLE
     if( (nCommThreads == 1) && (provided == MPI_THREAD_SINGLE) ) {
       assert(0);
@@ -302,28 +294,60 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
                                            int bytes)
 {
   std::vector<CommsRequest_t> reqs(0);
-  unsigned long  xcrc = crc32(0L, Z_NULL, 0);
-  unsigned long  rcrc = crc32(0L, Z_NULL, 0);
+  //  unsigned long  xcrc = crc32(0L, Z_NULL, 0);
+  //  unsigned long  rcrc = crc32(0L, Z_NULL, 0);
+  //  xcrc = crc32(xcrc,(unsigned char *)xmit,bytes);
+  SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
+  SendToRecvFromComplete(reqs);
+  //  rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
+  //  printf("proc %d SendToRecvFrom %d bytes %lx %lx\n",_processor,bytes,xcrc,rcrc);
+}
+void CartesianCommunicator::SendRecvPacket(void *xmit,
+                                           void *recv,
+                                           int sender,
+                                           int receiver,
+                                           int bytes)
+{
+  MPI_Status stat;
+  assert(sender != receiver);
+  int tag = sender;
+  if ( _processor == sender ) {
+    MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
+  }
+  if ( _processor == receiver ) {
+    MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
+  }
+}
+// Basic Halo comms primitive
+void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+                                                void *xmit,
+                                                int dest,
+                                                void *recv,
+                                                int from,
+                                                int bytes)
+{
   int myrank = _processor;
   int ierr;
 
-  // Enforce no UVM in comms, device or host OK
-  assert(acceleratorIsCommunicable(xmit));
-  assert(acceleratorIsCommunicable(recv));
-
-  // Give the CPU to MPI immediately; can use threads to overlap optionally
-  //  printf("proc %d SendToRecvFrom %d bytes Sendrecv \n",_processor,bytes);
-  ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
-                    recv,bytes,MPI_CHAR,from, from,
-                    communicator,MPI_STATUS_IGNORE);
-  assert(ierr==0);
-
-  //  xcrc = crc32(xcrc,(unsigned char *)xmit,bytes);
-  //  rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
-  //  printf("proc %d SendToRecvFrom %d bytes xcrc %lx rcrc %lx\n",_processor,bytes,xcrc,rcrc); fflush
+  if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {
+    MPI_Request xrq;
+    MPI_Request rrq;
+
+    ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
+    ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
+
+    assert(ierr==0);
+    list.push_back(xrq);
+    list.push_back(rrq);
+  } else {
+    // Give the CPU to MPI immediately; can use threads to overlap optionally
+    ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
+                      recv,bytes,MPI_CHAR,from, from,
+                      communicator,MPI_STATUS_IGNORE);
+    assert(ierr==0);
+  }
 }
-// Basic Halo comms primitive
 double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
                                                      int dest,
                                                      void *recv,
@@ -379,7 +403,15 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
   return off_node_bytes;
 }
-void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
+void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
+{
+  SendToRecvFromComplete(waitall);
+}
+void CartesianCommunicator::StencilBarrier(void)
+{
+  MPI_Barrier  (ShmComm);
+}
+void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
 {
   int nreq=list.size();
@@ -390,13 +422,6 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
   assert(ierr==0);
   list.resize(0);
 }
-void CartesianCommunicator::StencilBarrier(void)
-{
-  MPI_Barrier  (ShmComm);
-}
-//void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
-//{
-//}
 void CartesianCommunicator::Barrier(void)
 {
   int ierr = MPI_Barrier(communicator);
@@ -458,3 +483,5 @@ void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t
 }
 
 NAMESPACE_END(Grid);
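
Note (editor's addition, not part of the diff): the SendToRecvFromBegin/Complete pair restored above is the standard non-blocking MPI halo-exchange idiom: post Irecv/Isend, keep the requests, and wait on them later so computation can overlap communication. A hedged sketch in plain MPI with hypothetical helper names, assuming the tag convention visible in the hunk (tag = sender's rank):

#include <mpi.h>
#include <vector>

void halo_begin(void *xmit, int dest, void *recv, int from, int bytes,
                MPI_Comm comm, int myrank, std::vector<MPI_Request> &list)
{
  MPI_Request xrq, rrq;
  MPI_Irecv(recv, bytes, MPI_CHAR, from, from,   comm, &rrq); // tag matches the sender's rank
  MPI_Isend(xmit, bytes, MPI_CHAR, dest, myrank, comm, &xrq);
  list.push_back(xrq);
  list.push_back(rrq);
}

void halo_complete(std::vector<MPI_Request> &list)
{
  MPI_Waitall((int)list.size(), list.data(), MPI_STATUSES_IGNORE);
  list.resize(0);
}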

View File

@@ -77,6 +77,15 @@ void CartesianCommunicator::GlobalSumVector(uint64_t *,int N){}
 void CartesianCommunicator::GlobalXOR(uint32_t &){}
 void CartesianCommunicator::GlobalXOR(uint64_t &){}
 
+void CartesianCommunicator::SendRecvPacket(void *xmit,
+                                           void *recv,
+                                           int xmit_to_rank,
+                                           int recv_from_rank,
+                                           int bytes)
+{
+  assert(0);
+}
+
 // Basic Halo comms primitive -- should never call in single node
 void CartesianCommunicator::SendToRecvFrom(void *xmit,
@@ -87,6 +96,20 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
 {
   assert(0);
 }
+void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+                                                void *xmit,
+                                                int dest,
+                                                void *recv,
+                                                int from,
+                                                int bytes)
+{
+  assert(0);
+}
+
+void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
+{
+  assert(0);
+}
 
 void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes)
 {
   bcopy(in,out,bytes*words);
@@ -114,6 +137,10 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
                                                      int recv_from_rank,
                                                      int bytes, int dir)
 {
+  std::vector<CommsRequest_t> list;
+  // Discard the "dir"
+  SendToRecvFromBegin   (list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
+  SendToRecvFromComplete(list);
   return 2.0*bytes;
 }
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
@@ -123,10 +150,13 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
                                                          int recv_from_rank,
                                                          int bytes, int dir)
 {
+  // Discard the "dir"
+  SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
   return 2.0*bytes;
 }
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
 {
+  SendToRecvFromComplete(waitall);
 }
 void CartesianCommunicator::StencilBarrier(void){};

View File

@@ -32,9 +32,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifdef GRID_CUDA
 #include <cuda_runtime_api.h>
 #endif
-#ifdef GRID_HIP
-#include <hip/hip_runtime_api.h>
-#endif
 
 NAMESPACE_BEGIN(Grid);
 #define header "SharedMemoryMpi: "
@@ -50,12 +47,7 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
   /////////////////////////////////////////////////////////////////////
   // Split into groups that can share memory
   /////////////////////////////////////////////////////////////////////
-#ifndef GRID_MPI3_SHM_NONE
   MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&WorldShmComm);
-#else
-  MPI_Comm_split(comm, WorldRank, 0, &WorldShmComm);
-#endif
-
   MPI_Comm_rank(WorldShmComm     ,&WorldShmRank);
   MPI_Comm_size(WorldShmComm     ,&WorldShmSize);
@@ -428,7 +420,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 ////////////////////////////////////////////////////////////////////////////////////////////
 // Hugetlbfs mapping intended
 ////////////////////////////////////////////////////////////////////////////////////////////
-#if defined(GRID_CUDA) ||defined(GRID_HIP)
+#ifdef GRID_CUDA
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 {
   void * ShmCommBuf ;
@@ -451,15 +443,17 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
   ///////////////////////////////////////////////////////////////////////////////////////////////////////////
   // Each MPI rank should allocate our own buffer
   ///////////////////////////////////////////////////////////////////////////////////////////////////////////
-  ShmCommBuf = acceleratorAllocDevice(bytes);
-
+  auto err =  cudaMalloc(&ShmCommBuf, bytes);
+  if ( err !=  cudaSuccess) {
+    std::cerr << " SharedMemoryMPI.cc cudaMallocManaged failed for " << bytes<<" bytes " <<cudaGetErrorString(err)<< std::endl;
+    exit(EXIT_FAILURE);
+  }
   if (ShmCommBuf == (void *)NULL ) {
-    std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
+    std::cerr << " SharedMemoryMPI.cc cudaMallocManaged failed NULL pointer for " << bytes<<" bytes " << std::endl;
     exit(EXIT_FAILURE);
   }
   if ( WorldRank == 0 ){
-    std::cout << header " SharedMemoryMPI.cc cudaMalloc "<< bytes
-              << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
+    std::cout << header " SharedMemoryMPI.cc cudaMalloc "<< bytes << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
   }
   SharedMemoryZero(ShmCommBuf,bytes);
@@ -468,30 +462,18 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
   ///////////////////////////////////////////////////////////////////////////////////////////////////////////
   for(int r=0;r<WorldShmSize;r++){
 
-#ifndef GRID_MPI3_SHM_NONE
     //////////////////////////////////////////////////
     // If it is me, pass around the IPC access key
     //////////////////////////////////////////////////
-#ifdef GRID_CUDA
     cudaIpcMemHandle_t handle;
     if ( r==WorldShmRank ) {
-      auto err = cudaIpcGetMemHandle(&handle,ShmCommBuf);
+      err = cudaIpcGetMemHandle(&handle,ShmCommBuf);
       if ( err != cudaSuccess) {
         std::cerr << " SharedMemoryMPI.cc cudaIpcGetMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl;
         exit(EXIT_FAILURE);
       }
     }
-#endif
-#ifdef GRID_HIP
-    hipIpcMemHandle_t handle;
-    if ( r==WorldShmRank ) {
-      auto err = hipIpcGetMemHandle(&handle,ShmCommBuf);
-      if ( err != hipSuccess) {
-        std::cerr << " SharedMemoryMPI.cc hipIpcGetMemHandle failed for rank" << r <<" "<<hipGetErrorString(err)<< std::endl;
-        exit(EXIT_FAILURE);
-      }
-    }
-#endif
     //////////////////////////////////////////////////
     // Share this IPC handle across the Shm Comm
     //////////////////////////////////////////////////
@@ -508,31 +490,17 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
     // If I am not the source, overwrite thisBuf with remote buffer
    ///////////////////////////////////////////////////////////////
     void * thisBuf = ShmCommBuf;
-#ifdef GRID_CUDA
     if ( r!=WorldShmRank ) {
-      auto err = cudaIpcOpenMemHandle(&thisBuf,handle,cudaIpcMemLazyEnablePeerAccess);
+      err = cudaIpcOpenMemHandle(&thisBuf,handle,cudaIpcMemLazyEnablePeerAccess);
       if ( err != cudaSuccess) {
        std::cerr << " SharedMemoryMPI.cc cudaIpcOpenMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl;
        exit(EXIT_FAILURE);
      }
    }
-#endif
-#ifdef GRID_HIP
-    if ( r!=WorldShmRank ) {
-      auto err = hipIpcOpenMemHandle(&thisBuf,handle,hipIpcMemLazyEnablePeerAccess);
-      if ( err != hipSuccess) {
-        std::cerr << " SharedMemoryMPI.cc hipIpcOpenMemHandle failed for rank" << r <<" "<<hipGetErrorString(err)<< std::endl;
-        exit(EXIT_FAILURE);
-      }
-    }
-#endif
     ///////////////////////////////////////////////////////////////
     // Save a copy of the device buffers
     ///////////////////////////////////////////////////////////////
     WorldShmCommBufs[r] = thisBuf;
-#else
-    WorldShmCommBufs[r] = ShmCommBuf;
-#endif
   }
 
   _ShmAllocBytes=bytes;
@@ -737,11 +705,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
   /////////////////////////////////////////////////////////////////////
   // Split into groups that can share memory
   /////////////////////////////////////////////////////////////////////
-#ifndef GRID_MPI3_SHM_NONE
   MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&ShmComm);
-#else
-  MPI_Comm_split(comm, rank, 0, &ShmComm);
-#endif
 
   MPI_Comm_rank(ShmComm      ,&ShmRank);
   MPI_Comm_size(ShmComm      ,&ShmSize);
   ShmCommBufs.resize(ShmSize);

View File

@@ -52,8 +52,23 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 NAMESPACE_BEGIN(Grid);
 
-template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
-auto Cshift(const Expression &expr,int dim,int shift)  -> decltype(closure(expr))
+template<typename Op, typename T1>
+auto Cshift(const LatticeUnaryExpression<Op,T1> &expr,int dim,int shift)
+  -> Lattice<decltype(expr.op.func(eval(0, expr.arg1)))>
+{
+  return Cshift(closure(expr),dim,shift);
+}
+template <class Op, class T1, class T2>
+auto Cshift(const LatticeBinaryExpression<Op,T1,T2> &expr,int dim,int shift)
+  -> Lattice<decltype(expr.op.func(eval(0, expr.arg1),eval(0, expr.arg2)))>
+{
+  return Cshift(closure(expr),dim,shift);
+}
+template <class Op, class T1, class T2, class T3>
+auto Cshift(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr,int dim,int shift)
+  -> Lattice<decltype(expr.op.func(eval(0, expr.arg1),
+                                   eval(0, expr.arg2),
+                                   eval(0, expr.arg3)))>
 {
   return Cshift(closure(expr),dim,shift);
 }

View File

@@ -76,8 +76,8 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimen
     autoView(rhs_v , rhs, AcceleratorRead);
     auto buffer_p = & buffer[0];
     auto table = &Cshift_table[0];
-    accelerator_for(i,ent,vobj::Nsimd(),{
-        coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second]));
+    accelerator_for(i,ent,1,{
+        buffer_p[table[i].first]=rhs_v[table[i].second];
     });
   }
 }
@@ -185,8 +185,8 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
     autoView( rhs_v, rhs, AcceleratorWrite);
     auto buffer_p = & buffer[0];
     auto table = &Cshift_table[0];
-    accelerator_for(i,ent,vobj::Nsimd(),{
-        coalescedWrite(rhs_v[table[i].first],coalescedRead(buffer_p[table[i].second]));
+    accelerator_for(i,ent,1,{
+        rhs_v[table[i].first]=buffer_p[table[i].second];
     });
   }
 }
@@ -209,11 +209,9 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
   if(cbmask ==0x3 ) {
     autoView( rhs_v , rhs, AcceleratorWrite);
-    int _slice_stride = rhs.Grid()->_slice_stride[dimension];
-    int _slice_block = rhs.Grid()->_slice_block[dimension];
     accelerator_for2d(n,e1,b,e2,1,{
-        int o      = n*_slice_stride;
-        int offset = b+n*_slice_block;
+        int o      = n*rhs.Grid()->_slice_stride[dimension];
+        int offset = b+n*rhs.Grid()->_slice_block[dimension];
         merge(rhs_v[so+o+b],pointers,offset);
     });
   } else {
@@ -222,7 +220,6 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
     // Test_cshift_red_black code.
     //    std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME
     std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<<std::endl;
-    assert(0); // This will fail if hit on GPU
     autoView( rhs_v, rhs, CpuWrite);
     for(int n=0;n<e1;n++){
       for(int b=0;b<e2;b++){
@@ -283,8 +280,8 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
     autoView(rhs_v , rhs, AcceleratorRead);
     autoView(lhs_v , lhs, AcceleratorWrite);
     auto table = &Cshift_table[0];
-    accelerator_for(i,ent,vobj::Nsimd(),{
-        coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second]));
+    accelerator_for(i,ent,1,{
+        lhs_v[table[i].first]=rhs_v[table[i].second];
     });
   }
 }

View File

@@ -37,7 +37,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/lattice/Lattice_reduction.h>
 #include <Grid/lattice/Lattice_peekpoke.h>
 //#include <Grid/lattice/Lattice_reality.h>
-#include <Grid/lattice/Lattice_real_imag.h>
 #include <Grid/lattice/Lattice_comparison_utils.h>
 #include <Grid/lattice/Lattice_comparison.h>
 #include <Grid/lattice/Lattice_coordinate.h>

View File

@@ -42,24 +42,9 @@ NAMESPACE_BEGIN(Grid);
 ////////////////////////////////////////////////////
 // Predicated where support
 ////////////////////////////////////////////////////
-#ifdef GRID_SIMT
-// drop to scalar in SIMT; cleaner in fact
 template <class iobj, class vobj, class robj>
-accelerator_inline vobj predicatedWhere(const iobj &predicate,
-                                        const vobj &iftrue,
-                                        const robj &iffalse)
-{
-  Integer mask = TensorRemove(predicate);
-  typename std::remove_const<vobj>::type ret= iffalse;
-  if (mask) ret=iftrue;
-  return ret;
-}
-#else
-template <class iobj, class vobj, class robj>
-accelerator_inline vobj predicatedWhere(const iobj &predicate,
-                                        const vobj &iftrue,
-                                        const robj &iffalse)
-{
+accelerator_inline vobj predicatedWhere(const iobj &predicate, const vobj &iftrue,
+                                        const robj &iffalse) {
   typename std::remove_const<vobj>::type ret;
 
   typedef typename vobj::scalar_object scalar_object;
@@ -83,7 +68,6 @@ accelerator_inline vobj predicatedWhere(const iobj &predicate,
   merge(ret, falsevals);
   return ret;
 }
-#endif
 
 /////////////////////////////////////////////////////
 //Specialization of getVectorType for lattices
@@ -97,62 +81,32 @@ struct getVectorType<Lattice<T> >{
 //-- recursive evaluation of expressions; --
 // handle leaves of syntax tree
 ///////////////////////////////////////////////////
-template<class sobj,
-         typename std::enable_if<!is_lattice<sobj>::value&&!is_lattice_expr<sobj>::value,sobj>::type * = nullptr>
-accelerator_inline
+template<class sobj> accelerator_inline
 sobj eval(const uint64_t ss, const sobj &arg)
 {
   return arg;
 }
-template <class lobj> accelerator_inline
-auto eval(const uint64_t ss, const LatticeView<lobj> &arg) -> decltype(arg(ss))
-{
-  return arg(ss);
-}
 
-////////////////////////////////////////////
-//-- recursive evaluation of expressions; --
-// whole vector return, used only for expression return type inference
-///////////////////////////////////////////////////
-template<class sobj> accelerator_inline
-sobj vecEval(const uint64_t ss, const sobj &arg)
-{
-  return arg;
-}
 template <class lobj> accelerator_inline
-const lobj & vecEval(const uint64_t ss, const LatticeView<lobj> &arg)
+const lobj & eval(const uint64_t ss, const LatticeView<lobj> &arg)
 {
   return arg[ss];
 }
 
-///////////////////////////////////////////////////
-// handle nodes in syntax tree- eval one operand
-// vecEval needed (but never called as all expressions offloaded) to infer the return type
-// in SIMT contexts of closure.
-///////////////////////////////////////////////////
-template <typename Op, typename T1> accelerator_inline
-auto vecEval(const uint64_t ss, const LatticeUnaryExpression<Op, T1> &expr)
-  -> decltype(expr.op.func( vecEval(ss, expr.arg1)))
-{
-  return expr.op.func( vecEval(ss, expr.arg1) );
-}
-// vecEval two operands
-template <typename Op, typename T1, typename T2> accelerator_inline
-auto vecEval(const uint64_t ss, const LatticeBinaryExpression<Op, T1, T2> &expr)
-  -> decltype(expr.op.func( vecEval(ss,expr.arg1),vecEval(ss,expr.arg2)))
-{
-  return expr.op.func( vecEval(ss,expr.arg1), vecEval(ss,expr.arg2) );
-}
-// vecEval three operands
-template <typename Op, typename T1, typename T2, typename T3> accelerator_inline
-auto vecEval(const uint64_t ss, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
-  -> decltype(expr.op.func(vecEval(ss, expr.arg1), vecEval(ss, expr.arg2), vecEval(ss, expr.arg3)))
-{
-  return expr.op.func(vecEval(ss, expr.arg1), vecEval(ss, expr.arg2), vecEval(ss, expr.arg3));
-}
+// What needs this?
+// Cannot be legal on accelerator
+// Comparison must convert
+#if 1
+template <class lobj> accelerator_inline
+const lobj & eval(const uint64_t ss, const Lattice<lobj> &arg)
+{
+  auto view = arg.View(AcceleratorRead);
+  return view[ss];
+}
+#endif
 
 ///////////////////////////////////////////////////
-// handle nodes in syntax tree- eval one operand coalesced
+// handle nodes in syntax tree- eval one operand
 ///////////////////////////////////////////////////
 template <typename Op, typename T1> accelerator_inline
 auto eval(const uint64_t ss, const LatticeUnaryExpression<Op, T1> &expr)
@@ -160,41 +114,23 @@ auto eval(const uint64_t ss, const LatticeUnaryExpression<Op, T1> &expr)
 {
   return expr.op.func( eval(ss, expr.arg1) );
 }
-///////////////////////
 // eval two operands
-///////////////////////
 template <typename Op, typename T1, typename T2> accelerator_inline
 auto eval(const uint64_t ss, const LatticeBinaryExpression<Op, T1, T2> &expr)
   -> decltype(expr.op.func( eval(ss,expr.arg1),eval(ss,expr.arg2)))
 {
   return expr.op.func( eval(ss,expr.arg1), eval(ss,expr.arg2) );
 }
-///////////////////////
 // eval three operands
-///////////////////////
 template <typename Op, typename T1, typename T2, typename T3> accelerator_inline
 auto eval(const uint64_t ss, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
-  -> decltype(expr.op.func(eval(ss, expr.arg1),
-                           eval(ss, expr.arg2),
-                           eval(ss, expr.arg3)))
+  -> decltype(expr.op.func(eval(ss, expr.arg1), eval(ss, expr.arg2), eval(ss, expr.arg3)))
 {
-#ifdef GRID_SIMT
-  // Handles Nsimd (vInteger) != Nsimd(ComplexD)
-  typedef decltype(vecEval(ss, expr.arg2)) rvobj;
-  typedef typename std::remove_reference<rvobj>::type vobj;
-  const int Nsimd = vobj::vector_type::Nsimd();
-
-  auto vpred = vecEval(ss,expr.arg1);
-
-  ExtractBuffer<Integer> mask(Nsimd);
-  extract<vInteger, Integer>(TensorRemove(vpred), mask);
-
-  int s = acceleratorSIMTlane(Nsimd);
-  return expr.op.func(mask[s],
-                      eval(ss, expr.arg2),
-                      eval(ss, expr.arg3));
-#else
-  return expr.op.func(eval(ss, expr.arg1),
-                      eval(ss, expr.arg2),
-                      eval(ss, expr.arg3));
-#endif
+  return expr.op.func(eval(ss, expr.arg1), eval(ss, expr.arg2), eval(ss, expr.arg3));
 }
@@ -292,7 +228,7 @@ template <typename Op, typename T1, typename T2> inline
 void ExpressionViewOpen(LatticeBinaryExpression<Op, T1, T2> &expr)
 {
   ExpressionViewOpen(expr.arg1); // recurse AST
-  ExpressionViewOpen(expr.arg2); // rrecurse AST
+  ExpressionViewOpen(expr.arg2); // recurse AST
 }
 template <typename Op, typename T1, typename T2, typename T3>
 inline void ExpressionViewOpen(LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
@@ -336,8 +272,9 @@ inline void ExpressionViewClose(LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
 // Unary operators and funcs
 ////////////////////////////////////////////
 #define GridUnopClass(name, ret)   \
+  template <class arg>             \
   struct name {                    \
-    template<class _arg> static auto accelerator_inline func(const _arg a) -> decltype(ret) { return ret; } \
+    static auto accelerator_inline func(const arg a) -> decltype(ret) { return ret; } \
   };
 
 GridUnopClass(UnarySub, -a);
@@ -348,6 +285,8 @@ GridUnopClass(UnaryTrace, trace(a));
 GridUnopClass(UnaryTranspose, transpose(a));
 GridUnopClass(UnaryTa, Ta(a));
 GridUnopClass(UnaryProjectOnGroup, ProjectOnGroup(a));
+GridUnopClass(UnaryReal, real(a));
+GridUnopClass(UnaryImag, imag(a));
 GridUnopClass(UnaryToReal, toReal(a));
 GridUnopClass(UnaryToComplex, toComplex(a));
 GridUnopClass(UnaryTimesI, timesI(a));
@@ -366,10 +305,10 @@ GridUnopClass(UnaryExp, exp(a));
 // Binary operators
 ////////////////////////////////////////////
 #define GridBinOpClass(name, combination)       \
+  template <class left, class right>            \
   struct name {                                 \
-    template <class _left, class _right>        \
     static auto accelerator_inline              \
-    func(const _left &lhs, const _right &rhs)   \
+    func(const left &lhs, const right &rhs)     \
       -> decltype(combination) const            \
     {                                           \
       return combination;                       \
@@ -389,10 +328,10 @@ GridBinOpClass(BinaryOrOr, lhs || rhs);
 // Trinary conditional op
 ////////////////////////////////////////////////////
 #define GridTrinOpClass(name, combination)                   \
+  template <class predicate, class left, class right>        \
   struct name {                                              \
-    template <class _predicate,class _left, class _right>    \
     static auto accelerator_inline                           \
-    func(const _predicate &pred, const _left &lhs, const _right &rhs) \
+    func(const predicate &pred, const left &lhs, const right &rhs)    \
       -> decltype(combination) const                         \
     {                                                        \
      return combination;                                    \
@@ -400,17 +339,17 @@ GridBinOpClass(BinaryOrOr, lhs || rhs);
   };
 
 GridTrinOpClass(TrinaryWhere,
-                (predicatedWhere<
-                 typename std::remove_reference<_predicate>::type,
-                 typename std::remove_reference<_left>::type,
-                 typename std::remove_reference<_right>::type>(pred, lhs,rhs)));
+                (predicatedWhere<predicate,
+                 typename std::remove_reference<left>::type,
+                 typename std::remove_reference<right>::type>(pred, lhs,rhs)));
 
 ////////////////////////////////////////////
 // Operator syntactical glue
 ////////////////////////////////////////////
-#define GRID_UNOP(name)   name
-#define GRID_BINOP(name)  name
-#define GRID_TRINOP(name) name
+#define GRID_UNOP(name)   name<decltype(eval(0, arg))>
+#define GRID_BINOP(name)  name<decltype(eval(0, lhs)), decltype(eval(0, rhs))>
+#define GRID_TRINOP(name) name<decltype(eval(0, pred)), decltype(eval(0, lhs)), decltype(eval(0, rhs))>
 
 #define GRID_DEF_UNOP(op, name)                                         \
   template <typename T1, typename std::enable_if<is_lattice<T1>::value||is_lattice_expr<T1>::value,T1>::type * = nullptr> \
@@ -462,6 +401,8 @@ GRID_DEF_UNOP(trace, UnaryTrace);
 GRID_DEF_UNOP(transpose, UnaryTranspose);
 GRID_DEF_UNOP(Ta, UnaryTa);
 GRID_DEF_UNOP(ProjectOnGroup, UnaryProjectOnGroup);
+GRID_DEF_UNOP(real, UnaryReal);
+GRID_DEF_UNOP(imag, UnaryImag);
 GRID_DEF_UNOP(toReal, UnaryToReal);
 GRID_DEF_UNOP(toComplex, UnaryToComplex);
 GRID_DEF_UNOP(timesI, UnaryTimesI);
@@ -494,36 +435,29 @@ GRID_DEF_TRINOP(where, TrinaryWhere);
 /////////////////////////////////////////////////////////////
 template <class Op, class T1>
 auto closure(const LatticeUnaryExpression<Op, T1> &expr)
-  -> Lattice<decltype(expr.op.func(vecEval(0, expr.arg1)))>
+  -> Lattice<decltype(expr.op.func(eval(0, expr.arg1)))>
 {
-  Lattice<decltype(expr.op.func(vecEval(0, expr.arg1)))> ret(expr);
+  Lattice<decltype(expr.op.func(eval(0, expr.arg1)))> ret(expr);
   return ret;
 }
 template <class Op, class T1, class T2>
 auto closure(const LatticeBinaryExpression<Op, T1, T2> &expr)
-  -> Lattice<decltype(expr.op.func(vecEval(0, expr.arg1),vecEval(0, expr.arg2)))>
+  -> Lattice<decltype(expr.op.func(eval(0, expr.arg1),eval(0, expr.arg2)))>
 {
-  Lattice<decltype(expr.op.func(vecEval(0, expr.arg1),vecEval(0, expr.arg2)))> ret(expr);
+  Lattice<decltype(expr.op.func(eval(0, expr.arg1),eval(0, expr.arg2)))> ret(expr);
   return ret;
 }
 template <class Op, class T1, class T2, class T3>
 auto closure(const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
-  -> Lattice<decltype(expr.op.func(vecEval(0, expr.arg1),
-                                   vecEval(0, expr.arg2),
-                                   vecEval(0, expr.arg3)))>
+  -> Lattice<decltype(expr.op.func(eval(0, expr.arg1),
+                                   eval(0, expr.arg2),
+                                   eval(0, expr.arg3)))>
 {
-  Lattice<decltype(expr.op.func(vecEval(0, expr.arg1),
-                                vecEval(0, expr.arg2),
-                                vecEval(0, expr.arg3)))> ret(expr);
+  Lattice<decltype(expr.op.func(eval(0, expr.arg1),
+                                eval(0, expr.arg2),
+                                eval(0, expr.arg3)))> ret(expr);
   return ret;
 }
-#define EXPRESSION_CLOSURE(function)                                    \
-  template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr> \
-  auto function(Expression &expr) -> decltype(function(closure(expr))) \
-  {                                                                     \
-    return function(closure(expr));                                    \
-  }
 
 #undef GRID_UNOP
 #undef GRID_BINOP
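
Note (editor's addition, not part of the diff): closure() above is what forces a lazy expression node into a concrete Lattice, which is why Cshift and friends route expressions through it. A toy, self-contained illustration of the same idiom (not Grid's types):

#include <cstddef>
#include <vector>

template<class Op, class A, class B>
struct BinExpr { Op op; const A &a; const B &b; };  // lazy AST node, holds references

struct Add { double func(double x, double y) const { return x + y; } };

// eval a leaf...
inline double eval(std::size_t i, const std::vector<double> &v) { return v[i]; }
// ...and eval a node by recursing into its operands
template<class Op, class A, class B>
double eval(std::size_t i, const BinExpr<Op,A,B> &e) { return e.op.func(eval(i,e.a), eval(i,e.b)); }

// closure(): evaluate the expression once per element into a concrete container
template<class Op, class A, class B>
std::vector<double> closure(const BinExpr<Op,A,B> &e, std::size_t n)
{
  std::vector<double> out(n);
  for (std::size_t i = 0; i < n; i++) out[i] = eval(i, e);
  return out;
}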

View File

@@ -60,9 +60,9 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
   autoView( lhs_v , lhs, AcceleratorRead);
   autoView( rhs_v , rhs, AcceleratorRead);
   accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
+    decltype(coalescedRead(obj1())) tmp;
     auto lhs_t=lhs_v(ss);
     auto rhs_t=rhs_v(ss);
-    auto tmp =ret_v(ss);
     mac(&tmp,&lhs_t,&rhs_t);
     coalescedWrite(ret_v[ss],tmp);
   });
@@ -124,7 +124,7 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
   autoView( ret_v , ret, AcceleratorWrite);
   autoView( lhs_v , lhs, AcceleratorRead);
   accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
-    auto tmp  =ret_v(ss);
+    decltype(coalescedRead(obj1())) tmp;
     auto lhs_t=lhs_v(ss);
     mac(&tmp,&lhs_t,&rhs);
     coalescedWrite(ret_v[ss],tmp);
@@ -182,7 +182,7 @@ void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
   autoView( ret_v , ret, AcceleratorWrite);
   autoView( rhs_v , lhs, AcceleratorRead);
   accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
-    auto tmp  =ret_v(ss);
+    decltype(coalescedRead(obj1())) tmp;
     auto rhs_t=rhs_v(ss);
     mac(&tmp,&lhs,&rhs_t);
     coalescedWrite(ret_v[ss],tmp);

View File

@@ -123,9 +123,9 @@ public:
     auto exprCopy = expr;
     ExpressionViewOpen(exprCopy);
     auto me  = View(AcceleratorWriteDiscard);
-    accelerator_for(ss,me.size(),vobj::Nsimd(),{
+    accelerator_for(ss,me.size(),1,{
       auto tmp = eval(ss,exprCopy);
-      coalescedWrite(me[ss],tmp);
+      vstream(me[ss],tmp);
     });
     me.ViewClose();
     ExpressionViewClose(exprCopy);
@@ -146,9 +146,9 @@ public:
     auto exprCopy = expr;
     ExpressionViewOpen(exprCopy);
     auto me  = View(AcceleratorWriteDiscard);
-    accelerator_for(ss,me.size(),vobj::Nsimd(),{
+    accelerator_for(ss,me.size(),1,{
       auto tmp = eval(ss,exprCopy);
-      coalescedWrite(me[ss],tmp);
+      vstream(me[ss],tmp);
     });
     me.ViewClose();
     ExpressionViewClose(exprCopy);
@@ -168,9 +168,9 @@ public:
     auto exprCopy = expr;
     ExpressionViewOpen(exprCopy);
     auto me  = View(AcceleratorWriteDiscard);
-    accelerator_for(ss,me.size(),vobj::Nsimd(),{
+    accelerator_for(ss,me.size(),1,{
      auto tmp = eval(ss,exprCopy);
-      coalescedWrite(me[ss],tmp);
+      vstream(me[ss],tmp);
     });
     me.ViewClose();
     ExpressionViewClose(exprCopy);

View File

@@ -54,34 +54,13 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
   typedef decltype(basis[0].View(AcceleratorRead)) View;
 
   Vector<View> basis_v; basis_v.reserve(basis.size());
-  typedef typename std::remove_reference<decltype(basis_v[0][0])>::type vobj;
-  typedef typename std::remove_reference<decltype(Qt(0,0))>::type Coeff_t;
   GridBase* grid = basis[0].Grid();
 
   for(int k=0;k<basis.size();k++){
     basis_v.push_back(basis[k].View(AcceleratorWrite));
   }
 
-#if ( (!defined(GRID_SYCL)) && (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) )
-  int max_threads = thread_max();
-  Vector < vobj > Bt(Nm * max_threads);
-  thread_region
-  {
-    vobj* B = &Bt[Nm * thread_num()];
-    thread_for_in_region(ss, grid->oSites(),{
-      for(int j=j0; j<j1; ++j) B[j]=0.;
-      for(int j=j0; j<j1; ++j){
-        for(int k=k0; k<k1; ++k){
-          B[j] +=Qt(j,k) * basis_v[k][ss];
-        }
-      }
-      for(int j=j0; j<j1; ++j){
-        basis_v[j][ss] = B[j];
-      }
-    });
-  }
-#else
   View *basis_vp = &basis_v[0];
 
   int nrot = j1-j0;
@@ -91,12 +70,14 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
   uint64_t oSites    =grid->oSites();
   uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead
 
+  typedef typename std::remove_reference<decltype(basis_v[0][0])>::type vobj;
+
   Vector <vobj> Bt(siteBlock * nrot);
   auto Bp=&Bt[0];
 
   // GPU readable copy of matrix
-  Vector<Coeff_t> Qt_jv(Nm*Nm);
-  Coeff_t *Qt_p = & Qt_jv[0];
+  Vector<double> Qt_jv(Nm*Nm);
+  double *Qt_p = & Qt_jv[0];
   thread_for(i,Nm*Nm,{
     int j = i/Nm;
     int k = i%Nm;
@@ -137,7 +118,6 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
       coalescedWrite(basis_v[jj][sss],coalescedRead(Bp[ss*nrot+j]));
   });
   }
-#endif
 
   for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
 }
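
Note (editor's addition, not part of the diff): per site, basisRotate applies the rotation new[j] = sum_k Qt(j,k) * old[k] in place, which is exactly what the deleted CPU branch spells out. A scalar sketch of that kernel, with hypothetical names:

#include <vector>

void rotate_site(std::vector<double> &vals,                    // vals[k]: k-th basis vector at one site
                 const std::vector<std::vector<double> > &Qt,  // Nm x Nm rotation matrix
                 int j0, int j1, int k0, int k1)
{
  std::vector<double> B(vals.size(), 0.0);
  for (int j = j0; j < j1; j++)
    for (int k = k0; k < k1; k++)
      B[j] += Qt[j][k] * vals[k];                              // accumulate the rotated component
  for (int j = j0; j < j1; j++) vals[j] = B[j];                // write back over the old vectors
}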

View File

@@ -42,6 +42,34 @@ NAMESPACE_BEGIN(Grid);
 
 typedef iScalar<vInteger> vPredicate ;
 
+/*
+template <class iobj, class vobj, class robj> accelerator_inline
+vobj predicatedWhere(const iobj &predicate, const vobj &iftrue, const robj &iffalse)
+{
+  typename std::remove_const<vobj>::type ret;
+
+  typedef typename vobj::scalar_object scalar_object;
+  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::vector_type vector_type;
+
+  const int Nsimd = vobj::vector_type::Nsimd();
+
+  ExtractBuffer<Integer> mask(Nsimd);
+  ExtractBuffer<scalar_object> truevals(Nsimd);
+  ExtractBuffer<scalar_object> falsevals(Nsimd);
+
+  extract(iftrue, truevals);
+  extract(iffalse, falsevals);
+  extract<vInteger, Integer>(TensorRemove(predicate), mask);
+
+  for (int s = 0; s < Nsimd; s++) {
+    if (mask[s]) falsevals[s] = truevals[s];
+  }
+
+  merge(ret, falsevals);
+
+  return ret;
+}
+*/
+
 //////////////////////////////////////////////////////////////////////////
 // compare lattice to lattice
 //////////////////////////////////////////////////////////////////////////

View File

@@ -182,14 +182,6 @@ inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site)
   return;
 };
 
-template<class vobj,class sobj>
-inline void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate &site)
-{
-  autoView(lv,l,CpuRead);
-  peekLocalSite(s,lv,site);
-  return;
-};
-
 // Must be CPU write view
 template<class vobj,class sobj>
 inline void pokeLocalSite(const sobj &s,LatticeView<vobj> &l,Coordinate &site)
@@ -218,14 +210,6 @@ inline void pokeLocalSite(const sobj &s,LatticeView<vobj> &l,Coordinate &site)
   return;
 };
 
-template<class vobj,class sobj>
-inline void pokeLocalSite(const sobj &s, Lattice<vobj> &l,Coordinate &site)
-{
-  autoView(lv,l,CpuWrite);
-  pokeLocalSite(s,lv,site);
-  return;
-};
-
 NAMESPACE_END(Grid);
 #endif

View File

@@ -1,79 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid
-
-    Source file: ./lib/lattice/Lattice_reality.h
-
-    Copyright (C) 2015
-
-    Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
-    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-    Author: neo <cossu@post.kek.jp>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-#ifndef GRID_LATTICE_REAL_IMAG_H
-#define GRID_LATTICE_REAL_IMAG_H
-
-// FIXME .. this is the sector of the code
-// I am most worried about the directions
-// The choice of burying complex in the SIMD
-// is making the use of "real" and "imag" very cumbersome
-
-NAMESPACE_BEGIN(Grid);
-
-template<class vobj> inline Lattice<vobj> real(const Lattice<vobj> &lhs){
-  Lattice<vobj> ret(lhs.Grid());
-  autoView( lhs_v, lhs, AcceleratorRead);
-  autoView( ret_v, ret, AcceleratorWrite);
-  ret.Checkerboard()=lhs.Checkerboard();
-  accelerator_for( ss, lhs_v.size(), 1, {
-    ret_v[ss] =real(lhs_v[ss]);
-  });
-  return ret;
-};
-template<class vobj> inline Lattice<vobj> imag(const Lattice<vobj> &lhs){
-  Lattice<vobj> ret(lhs.Grid());
-  autoView( lhs_v, lhs, AcceleratorRead);
-  autoView( ret_v, ret, AcceleratorWrite);
-  ret.Checkerboard()=lhs.Checkerboard();
-  accelerator_for( ss, lhs_v.size(), 1, {
-    ret_v[ss] =imag(lhs_v[ss]);
-  });
-  return ret;
-};
-
-template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
-auto real(const Expression &expr) -> decltype(real(closure(expr)))
-{
-  return real(closure(expr));
-}
-template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
-auto imag(const Expression &expr) -> decltype(imag(closure(expr)))
-{
-  return imag(closure(expr));
-}
-
-NAMESPACE_END(Grid);
-#endif

View File

@@ -2,13 +2,12 @@ NAMESPACE_BEGIN(Grid);
 
 #ifdef GRID_HIP
 extern hipDeviceProp_t *gpu_props;
-#define WARP_SIZE 64
 #endif
 #ifdef GRID_CUDA
 extern cudaDeviceProp *gpu_props;
-#define WARP_SIZE 32
 #endif
 
+#define WARP_SIZE 32
 __device__ unsigned int retirementCount = 0;
 
 template <class Iterator>
@@ -65,7 +64,7 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid
 
   // cannot use overloaded operators for sobj as they are not volatile-qualified
   memcpy((void *)&sdata[tid], (void *)&mySum, sizeof(sobj));
-  acceleratorSynchronise();
+  __syncwarp();
 
   const Iterator VEC = WARP_SIZE;
   const Iterator vid = tid & (VEC-1);
@@ -79,9 +78,9 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid
       beta += temp;
       memcpy((void *)&sdata[tid], (void *)&beta, sizeof(sobj));
     }
-    acceleratorSynchronise();
+    __syncwarp();
   }
-  acceleratorSynchroniseAll();
+  __syncthreads();
 
   if (threadIdx.x == 0) {
     beta = Zero();
@@ -91,7 +90,7 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid
     }
     memcpy((void *)&sdata[0], (void *)&beta, sizeof(sobj));
   }
-  acceleratorSynchroniseAll();
+  __syncthreads();
 }

View File

@@ -240,8 +240,6 @@ template<class vobj,class vobj2,class CComplex>
   autoView( fineX_   , fineX,   AcceleratorRead);
   autoView( fineY_   , fineY,   AcceleratorRead);
   autoView( coarseA_ , coarseA, AcceleratorRead);
-  Coordinate fine_rdimensions   = fine->_rdimensions;
-  Coordinate coarse_rdimensions = coarse->_rdimensions;
 
   accelerator_for(sf, fine->oSites(), CComplex::Nsimd(), {
@@ -249,9 +247,9 @@ template<class vobj,class vobj2,class CComplex>
       Coordinate coor_c(_ndimension);
       Coordinate coor_f(_ndimension);
 
-      Lexicographic::CoorFromIndex(coor_f,sf,fine_rdimensions);
+      Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
       for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
-      Lexicographic::IndexFromCoor(coor_c,sc,coarse_rdimensions);
+      Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
 
       // z = A x + y
 #ifdef GRID_SIMT
@@ -355,14 +353,11 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
   autoView( coarseData_ , coarseData, AcceleratorWrite);
   autoView( fineData_   , fineData,   AcceleratorRead);
 
-  Coordinate fine_rdimensions   = fine->_rdimensions;
-  Coordinate coarse_rdimensions = coarse->_rdimensions;
-
   accelerator_for(sc,coarse->oSites(),1,{
 
       // One thread per sub block
       Coordinate coor_c(_ndimension);
-      Lexicographic::CoorFromIndex(coor_c,sc,coarse_rdimensions);  // Block coordinate
+      Lexicographic::CoorFromIndex(coor_c,sc,coarse->_rdimensions);  // Block coordinate
       coarseData_[sc]=Zero();
 
       for(int sb=0;sb<blockVol;sb++){
@@ -372,7 +367,7 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
         Coordinate coor_f(_ndimension);
         Lexicographic::CoorFromIndex(coor_b,sb,block_r);               // Block sub coordinate
         for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d] + coor_b[d];
-        Lexicographic::IndexFromCoor(coor_f,sf,fine_rdimensions);
+        Lexicographic::IndexFromCoor(coor_f,sf,fine->_rdimensions);
 
         coarseData_[sc]=coarseData_[sc]+fineData_[sf];
       }
} }
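
The Coordinate copies deleted in these hunks exist because fine and coarse are host-side GridBase pointers: dereferencing them inside accelerator_for would read host memory from device code. A minimal sketch of the safe pattern, with an illustrative loop body:

  // Snapshot host metadata into stack values before the device loop;
  // the lambda then captures values, not host pointers.
  Coordinate fine_rdimensions = fine->_rdimensions;
  accelerator_for(sf, fine->oSites(), 1, {
    Coordinate coor_f(_ndimension);
    Lexicographic::CoorFromIndex(coor_f,sf,fine_rdimensions); // no host dereference
  });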

View File

@@ -130,8 +130,6 @@ public:
   friend std::ostream& operator<< (std::ostream& stream, Logger& log){
     if ( log.active ) {
-      std::ios_base::fmtflags f(stream.flags());
       stream << log.background()<< std::left;
       if (log.topWidth > 0)
       {
@@ -154,8 +152,6 @@ public:
                << now << log.background() << " : " ;
       }
       stream << log.colour();
-      stream.flags(f);
       return stream;
     } else {
       return devnull;
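
The fmtflags save/restore pair removed here keeps the Logger's std::left and colour decoration from leaking into the caller's stream state. A self-contained illustration of the idiom (function and values are illustrative):

  #include <iostream>

  void decorated_print(std::ostream &stream, int x) {
    std::ios_base::fmtflags f(stream.flags()); // save caller's flags
    stream << std::left << std::hex << std::showbase << x << '\n';
    stream.flags(f);                           // restore: no formatting leak
  }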

View File

@@ -1,4 +1,3 @@
 #include <Grid/GridCore.h>
 int Grid::BinaryIO::latticeWriteMaxRetry = -1;
-Grid::BinaryIO::IoPerf  Grid::BinaryIO::lastPerf;

View File

@@ -79,13 +79,6 @@ inline void removeWhitespace(std::string &key)
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 class BinaryIO {
  public:
-  struct IoPerf
-  {
-    uint64_t size{0},time{0};
-    double   mbytesPerSecond{0.};
-  };
-  static IoPerf lastPerf;
   static int latticeWriteMaxRetry;
   /////////////////////////////////////////////////////////////////////////////
@@ -509,15 +502,12 @@ class BinaryIO {
       timer.Stop();
     }
-    lastPerf.size            = sizeof(fobj)*iodata.size()*nrank;
-    lastPerf.time            = timer.useconds();
-    lastPerf.mbytesPerSecond = lastPerf.size/1024./1024./(lastPerf.time/1.0e6);
     std::cout<<GridLogMessage<<"IOobject: ";
     if ( control & BINARYIO_READ) std::cout << " read  ";
     else                          std::cout << " write ";
     uint64_t bytes = sizeof(fobj)*iodata.size()*nrank;
-    std::cout<< lastPerf.size <<" bytes in "<< timer.Elapsed() <<" "
-             << lastPerf.mbytesPerSecond <<" MB/s "<<std::endl;
+    std::cout<< bytes <<" bytes in "<<timer.Elapsed() <<" "
+             << (double)bytes/ (double)timer.useconds() <<" MB/s "<<std::endl;
     std::cout<<GridLogMessage<<"IOobject: endian and checksum overhead "<<bstimer.Elapsed()  <<std::endl;
@@ -673,15 +663,10 @@ class BinaryIO {
 			  nersc_csum,scidac_csuma,scidac_csumb);
     timer.Start();
-    thread_for(lidx,lsites,{  // FIX ME, suboptimal implementation
+    thread_for(lidx,lsites,{
       std::vector<RngStateType> tmp(RngStateCount);
       std::copy(iodata[lidx].begin(),iodata[lidx].end(),tmp.begin());
-      Coordinate lcoor;
-      grid->LocalIndexToLocalCoor(lidx, lcoor);
-      int o_idx=grid->oIndex(lcoor);
-      int i_idx=grid->iIndex(lcoor);
-      int gidx=parallel_rng.generator_idx(o_idx,i_idx);
-      parallel_rng.SetState(tmp,gidx);
+      parallel_rng.SetState(tmp,lidx);
     });
     timer.Stop();
@@ -738,12 +723,7 @@ class BinaryIO {
     std::vector<RNGstate> iodata(lsites);
     thread_for(lidx,lsites,{
       std::vector<RngStateType> tmp(RngStateCount);
-      Coordinate lcoor;
-      grid->LocalIndexToLocalCoor(lidx, lcoor);
-      int o_idx=grid->oIndex(lcoor);
-      int i_idx=grid->iIndex(lcoor);
-      int gidx=parallel_rng.generator_idx(o_idx,i_idx);
-      parallel_rng.GetState(tmp,gidx);
+      parallel_rng.GetState(tmp,lidx);
       std::copy(tmp.begin(),tmp.end(),iodata[lidx].begin());
     });
     timer.Stop();
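
The longer left-hand variant translates the local site index into the RNG's generator index via the outer/inner SIMD decomposition rather than assuming the two orderings coincide. Reading it step by step (names exactly as in the hunk above):

  Coordinate lcoor;
  grid->LocalIndexToLocalCoor(lidx, lcoor);   // lidx -> local lattice coordinate
  int o_idx = grid->oIndex(lcoor);            // outer (vectorised-site) index
  int i_idx = grid->iIndex(lcoor);            // inner (SIMD-lane) index
  int gidx  = parallel_rng.generator_idx(o_idx,i_idx); // storage slot of this site's engine

The short form parallel_rng.SetState(tmp,lidx) is equivalent only when the generator storage order happens to match local lexicographic site order.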

View File

@@ -47,7 +47,7 @@ static constexpr int Ym = 5;
 static constexpr int Zm = 6;
 static constexpr int Tm = 7;
-static constexpr int Nc=Config_Nc;
+static constexpr int Nc=3;
 static constexpr int Ns=4;
 static constexpr int Nd=4;
 static constexpr int Nhs=2; // half spinor
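
The left-hand column ties the colour count to a configure-time constant instead of a hard-wired 3. A hedged sketch of how such a constant would propagate (the fallback define and the static_assert are illustrative, not the library's exact wiring):

  #ifndef Config_Nc
  #define Config_Nc 3                  // illustrative default
  #endif
  static constexpr int Nc = Config_Nc;
  static_assert(Nc >= 2, "SU(Nc) needs at least two colours");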

View File

@@ -208,7 +208,7 @@ public:
   LebesgueOrder LebesgueEvenOdd;
   // Comms buffer
-  //  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf;
+  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf;
   ///////////////////////////////////////////////////////////////
   // Conserved current utilities

View File

@@ -63,20 +63,17 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
   ///////////////////////////////////////////////////////////////////////////////////////
   // Generic Nc kernels
   ///////////////////////////////////////////////////////////////////////////////////////
-  template<int Naik>
-  static accelerator_inline
+  template<int Naik> accelerator_inline
   void DhopSiteGeneric(StencilView &st,
                        DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
                        SiteSpinor * buf, int LLs, int sU,
                        const FermionFieldView &in, FermionFieldView &out,int dag);
-  template<int Naik>
-  static accelerator_inline
+  template<int Naik> accelerator_inline
   void DhopSiteGenericInt(StencilView &st,
                           DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
                           SiteSpinor * buf, int LLs, int sU,
                           const FermionFieldView &in, FermionFieldView &out,int dag);
-  template<int Naik>
-  static accelerator_inline
+  template<int Naik> accelerator_inline
   void DhopSiteGenericExt(StencilView &st,
                           DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
                           SiteSpinor * buf, int LLs, int sU,
@@ -85,20 +82,17 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
   ///////////////////////////////////////////////////////////////////////////////////////
   // Nc=3 specific kernels
   ///////////////////////////////////////////////////////////////////////////////////////
-  template<int Naik>
-  static accelerator_inline
+  template<int Naik> accelerator_inline
   void DhopSiteHand(StencilView &st,
                     DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
                     SiteSpinor * buf, int LLs, int sU,
                     const FermionFieldView &in, FermionFieldView &out,int dag);
-  template<int Naik>
-  static accelerator_inline
+  template<int Naik> accelerator_inline
   void DhopSiteHandInt(StencilView &st,
                        DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
                        SiteSpinor * buf, int LLs, int sU,
                        const FermionFieldView &in, FermionFieldView &out,int dag);
-  template<int Naik>
-  static accelerator_inline
+  template<int Naik> accelerator_inline
   void DhopSiteHandExt(StencilView &st,
                        DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
                        SiteSpinor * buf, int LLs, int sU,
@@ -107,7 +101,6 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
   ///////////////////////////////////////////////////////////////////////////////////////
   // Asm Nc=3 specific kernels
   ///////////////////////////////////////////////////////////////////////////////////////
   void DhopSiteAsm(StencilView &st,
                    DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
                    SiteSpinor * buf, int LLs, int sU,
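
The static qualifier being toggled in these hunks matters on the device side: a static member can be named from inside an accelerator_for lambda without capturing a kernel object. A toy illustration (types and body are illustrative):

  struct ToyKernels {
    static accelerator_inline int site_term(int s) { return 2*s; }
  };
  void apply(int *out, int n) {
    accelerator_for(ss, n, 1, {
      out[ss] = ToyKernels::site_term(ss);  // no object, no `this` capture
    });
  }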

View File

@@ -74,20 +74,6 @@ public:
   FermionField _tmp;
   FermionField &tmp(void) { return _tmp; }
-  void Report(void);
-  void ZeroCounters(void);
-  double DhopCalls;
-  double DhopCommTime;
-  double DhopComputeTime;
-  double DhopComputeTime2;
-  double DhopFaceTime;
-  double DhopTotalTime;
-  double DerivCalls;
-  double DerivCommTime;
-  double DerivComputeTime;
-  double DerivDhopComputeTime;
   //////////////////////////////////////////////////////////////////
   // override multiply; cut number routines if pass dagger argument
   // and also make interface more uniformly consistent
@@ -210,3 +196,5 @@ typedef WilsonFermion<WilsonImplF> WilsonFermionF;
 typedef WilsonFermion<WilsonImplD> WilsonFermionD;
 NAMESPACE_END(Grid);
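
The deleted counters all follow one accumulation idiom: subtract the wall clock at phase entry, add it at phase exit, so the counter nets out to total elapsed microseconds. A minimal sketch using Grid's usecond() (the surrounding function is illustrative):

  double DhopCommTime = 0, DhopCalls = 0;
  void timed_phase(void) {
    DhopCalls++;
    DhopCommTime -= usecond();   // entry: subtract start stamp
    /* ... halo exchange ... */
    DhopCommTime += usecond();   // exit: net effect is += elapsed
  }
  // report DhopCommTime/DhopCalls -> average microseconds per call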

View File

@@ -215,7 +215,7 @@ public:
   LebesgueOrder LebesgueEvenOdd;
   // Comms buffer
-  //  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf;
+  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf;
 };

View File

@@ -799,7 +799,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
   PropagatorField tmp(UGrid);
   PropagatorField Utmp(UGrid);
-  PropagatorField zz (UGrid);  zz=0.0;
+  LatticeInteger zz (UGrid);   zz=0.0;
   LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1);
   for (int s=0;s<Ls;s++) {
@@ -850,7 +850,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
   PropagatorField tmp(UGrid);
   PropagatorField Utmp(UGrid);
-  PropagatorField zz (UGrid);  zz=0.0;
+  LatticeInteger zz (UGrid);   zz=0.0;
   LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1);
   for(int s=0;s<Ls;s++){
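
The type of zz is the whole point of this hunk: zz serves as the "false" branch of a where() mask over a PropagatorField, and where() requires both branches to match the result type, so a LatticeInteger zero field cannot stand in for it. A hedged sketch of the masking idiom (tmin is illustrative):

  PropagatorField zz(UGrid);    zz = 0.0;
  LatticeInteger  lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1);
  tmp = where(lcoor >= Integer(tmin), tmp, zz);  // keep late time-slices, zero the rest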

View File

@@ -146,7 +146,7 @@ NAMESPACE_BEGIN(Grid);
 template <class Impl>
-template <int Naik> accelerator_inline
+template <int Naik>
 void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st,
                                           DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
                                           SiteSpinor *buf, int sF, int sU,
@@ -221,7 +221,7 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st,
 template <class Impl>
-template <int Naik> accelerator_inline
+template <int Naik>
 void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st,
                                              DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
                                              SiteSpinor *buf, int sF, int sU,
@@ -300,7 +300,7 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st,
 template <class Impl>
-template <int Naik> accelerator_inline
+template <int Naik>
 void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
                                              DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
                                              SiteSpinor *buf, int sF, int sU,

View File

@@ -78,7 +78,7 @@ StaggeredKernels<Impl>::StaggeredKernels(const ImplParams &p) : Base(p){};
 // Int, Ext, Int+Ext cases for comms overlap
 ////////////////////////////////////////////////////////////////////////////////////
 template <class Impl>
-template <int Naik> accelerator_inline
+template <int Naik>
 void StaggeredKernels<Impl>::DhopSiteGeneric(StencilView &st,
                                              DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
                                              SiteSpinor *buf, int sF, int sU,
@@ -126,7 +126,7 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilView &st,
 // Only contributions from interior of our node
 ///////////////////////////////////////////////////
 template <class Impl>
-template <int Naik> accelerator_inline
+template <int Naik>
 void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilView &st,
                                                 DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
                                                 SiteSpinor *buf, int sF, int sU,
@@ -174,7 +174,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilView &st,
 // Only contributions from exterior of our node
 ///////////////////////////////////////////////////
 template <class Impl>
-template <int Naik> accelerator_inline
+template <int Naik>
 void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilView &st,
                                                 DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
                                                 SiteSpinor *buf, int sF, int sU,

View File

@@ -75,91 +75,6 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
   StencilOdd.BuildSurfaceList(1,vol4);
 }
-template<class Impl>
-void WilsonFermion<Impl>::Report(void)
-{
-  RealD NP = _grid->_Nprocessors;
-  RealD NN = _grid->NodeCount();
-  RealD volume = 1;
-  Coordinate latt = _grid->GlobalDimensions();
-  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
-  if ( DhopCalls > 0 ) {
-    std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
-    std::cout << GridLogMessage << "WilsonFermion Number of DhopEO Calls   : " << DhopCalls << std::endl;
-    std::cout << GridLogMessage << "WilsonFermion TotalTime   /Calls       : " << DhopTotalTime / DhopCalls << " us" << std::endl;
-    std::cout << GridLogMessage << "WilsonFermion CommTime    /Calls       : " << DhopCommTime / DhopCalls << " us" << std::endl;
-    std::cout << GridLogMessage << "WilsonFermion FaceTime    /Calls       : " << DhopFaceTime / DhopCalls << " us" << std::endl;
-    std::cout << GridLogMessage << "WilsonFermion ComputeTime1/Calls       : " << DhopComputeTime / DhopCalls << " us" << std::endl;
-    std::cout << GridLogMessage << "WilsonFermion ComputeTime2/Calls       : " << DhopComputeTime2/ DhopCalls << " us" << std::endl;
-    // Average the compute time
-    _grid->GlobalSum(DhopComputeTime);
-    DhopComputeTime/=NP;
-    RealD mflops = 1320*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
-    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
-    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
-    std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NN << std::endl;
-    RealD Fullmflops = 1320*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
-    std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
-    std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
-    std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
-  }
-  if ( DerivCalls > 0 ) {
-    std::cout << GridLogMessage << "#### Deriv calls report "<< std::endl;
-    std::cout << GridLogMessage << "WilsonFermion Number of Deriv Calls    : " <<DerivCalls <<std::endl;
-    std::cout << GridLogMessage << "WilsonFermion CommTime/Calls           : " <<DerivCommTime/DerivCalls<<" us" <<std::endl;
-    std::cout << GridLogMessage << "WilsonFermion ComputeTime/Calls        : " <<DerivComputeTime/DerivCalls<<" us" <<std::endl;
-    std::cout << GridLogMessage << "WilsonFermion Dhop ComputeTime/Calls   : " <<DerivDhopComputeTime/DerivCalls<<" us" <<std::endl;
-    // how to count flops here?
-    RealD mflops = 144*volume*DerivCalls/DerivDhopComputeTime;
-    std::cout << GridLogMessage << "Average mflops/s per call ?            : " << mflops << std::endl;
-    std::cout << GridLogMessage << "Average mflops/s per call per node ?   : " << mflops/NP << std::endl;
-    // how to count flops here?
-    RealD Fullmflops = 144*volume*DerivCalls/(DerivDhopComputeTime+DerivCommTime)/2; // 2 for red black counting
-    std::cout << GridLogMessage << "Average mflops/s per call (full) ?     : " << Fullmflops << std::endl;
-    std::cout << GridLogMessage << "Average mflops/s per call per node (full) ? : " << Fullmflops/NP << std::endl;
-  }
-  if (DerivCalls > 0 || DhopCalls > 0){
-    std::cout << GridLogMessage << "WilsonFermion Stencil"    <<std::endl; Stencil.Report();
-    std::cout << GridLogMessage << "WilsonFermion StencilEven"<<std::endl; StencilEven.Report();
-    std::cout << GridLogMessage << "WilsonFermion StencilOdd" <<std::endl; StencilOdd.Report();
-  }
-  if ( DhopCalls > 0){
-    std::cout << GridLogMessage << "WilsonFermion Stencil     Reporti()"    <<std::endl; Stencil.Reporti(DhopCalls);
-    std::cout << GridLogMessage << "WilsonFermion StencilEven Reporti()"<<std::endl; StencilEven.Reporti(DhopCalls);
-    std::cout << GridLogMessage << "WilsonFermion StencilOdd  Reporti()" <<std::endl; StencilOdd.Reporti(DhopCalls);
-  }
-}
-template<class Impl>
-void WilsonFermion<Impl>::ZeroCounters(void) {
-  DhopCalls       = 0; // ok
-  DhopCommTime    = 0;
-  DhopComputeTime = 0;
-  DhopComputeTime2= 0;
-  DhopFaceTime    = 0;
-  DhopTotalTime   = 0;
-  DerivCalls       = 0; // ok
-  DerivCommTime    = 0;
-  DerivComputeTime = 0;
-  DerivDhopComputeTime = 0;
-  Stencil.ZeroCounters();
-  StencilEven.ZeroCounters();
-  StencilOdd.ZeroCounters();
-  Stencil.ZeroCountersi();
-  StencilEven.ZeroCountersi();
-  StencilOdd.ZeroCountersi();
-}
 template <class Impl>
 void WilsonFermion<Impl>::ImportGauge(const GaugeField &_Umu)
 {
@@ -319,7 +234,6 @@ template <class Impl>
 void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
                                         GaugeField &mat, const FermionField &A,
                                         const FermionField &B, int dag) {
-  DerivCalls++;
   assert((dag == DaggerNo) || (dag == DaggerYes));
   Compressor compressor(dag);
@@ -328,11 +242,8 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
   FermionField Atilde(B.Grid());
   Atilde = A;
-  DerivCommTime-=usecond();
   st.HaloExchange(B, compressor);
-  DerivCommTime+=usecond();
-  DerivComputeTime-=usecond();
   for (int mu = 0; mu < Nd; mu++) {
     ////////////////////////////////////////////////////////////////////////
     // Flip gamma (1+g)<->(1-g) if dag
@@ -340,7 +251,6 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
     int gamma = mu;
     if (!dag) gamma += Nd;
-    DerivDhopComputeTime -= usecond();
     int Ls=1;
     Kernels::DhopDirKernel(st, U, st.CommBuf(), Ls, B.Grid()->oSites(), B, Btilde, mu, gamma);
@@ -348,9 +258,7 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
     // spin trace outer product
     //////////////////////////////////////////////////
     Impl::InsertForce4D(mat, Btilde, Atilde, mu);
-    DerivDhopComputeTime += usecond();
   }
-  DerivComputeTime += usecond();
 }
 template <class Impl>
@@ -484,14 +392,13 @@ void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
                                        const FermionField &in,
                                        FermionField &out, int dag)
 {
-  DhopTotalTime-=usecond();
 #ifdef GRID_OMP
   if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
     DhopInternalOverlappedComms(st,lo,U,in,out,dag);
   else
 #endif
     DhopInternalSerial(st,lo,U,in,out,dag);
-  DhopTotalTime+=usecond();
 }
 template <class Impl>
@@ -510,53 +417,38 @@ void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueO
   /////////////////////////////
   std::vector<std::vector<CommsRequest_t> > requests;
   st.Prepare();
-  DhopFaceTime-=usecond();
   st.HaloGather(in,compressor);
-  DhopFaceTime+=usecond();
-  DhopCommTime -=usecond();
   st.CommunicateBegin(requests);
   /////////////////////////////
   // Overlap with comms
   /////////////////////////////
-  DhopFaceTime-=usecond();
   st.CommsMergeSHM(compressor);
-  DhopFaceTime+=usecond();
   /////////////////////////////
   // do the compute interior
   /////////////////////////////
   int Opt = WilsonKernelsStatic::Opt;
-  DhopComputeTime-=usecond();
   if (dag == DaggerYes) {
     Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0);
   } else {
     Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0);
   }
-  DhopComputeTime+=usecond();
   /////////////////////////////
   // Complete comms
   /////////////////////////////
   st.CommunicateComplete(requests);
-  DhopCommTime +=usecond();
-  DhopFaceTime-=usecond();
   st.CommsMerge(compressor);
-  DhopFaceTime+=usecond();
   /////////////////////////////
   // do the compute exterior
   /////////////////////////////
-  DhopComputeTime2-=usecond();
   if (dag == DaggerYes) {
     Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,0,1);
   } else {
     Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,0,1);
   }
-  DhopComputeTime2+=usecond();
 };
@@ -568,18 +460,14 @@ void WilsonFermion<Impl>::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo,
 {
   assert((dag == DaggerNo) || (dag == DaggerYes));
   Compressor compressor(dag);
-  DhopCommTime-=usecond();
   st.HaloExchange(in, compressor);
-  DhopCommTime+=usecond();
-  DhopComputeTime-=usecond();
   int Opt = WilsonKernelsStatic::Opt;
   if (dag == DaggerYes) {
     Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out);
   } else {
     Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out);
   }
-  DhopComputeTime+=usecond();
 };
 /*Change ends */
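
For reference, the flop accounting in the deleted Report(): a Wilson dslash costs 1320 floating-point operations per site, and the trailing /2 accounts for red-black (checkerboard) calls touching half the sites. A worked toy example with illustrative numbers:

  double volume          = 16.*16.*16.*16.;  // 65536 sites
  double DhopCalls       = 1000;
  double DhopComputeTime = 2.0e6;            // accumulated microseconds
  double mflops = 1320*volume*DhopCalls/DhopComputeTime/2;
  // ops per microsecond == Mflop/s, so ~2.16e4 Mflop/s in this toy case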

View File

@@ -1,574 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernelsAsmA64FX.h
Copyright (C) 2020
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
//#if defined(A64FXASM)
#if defined(A64FX)
// safety include
#include <arm_sve.h>
// undefine everything related to kernels
#include <simd/Fujitsu_A64FX_undef.h>
// enable A64FX body
#define WILSONKERNELSASMBODYA64FX
//#pragma message("A64FX Dslash: WilsonKernelsAsmBodyA64FX.h")
///////////////////////////////////////////////////////////
// If we are A64FX specialise the single precision routine
///////////////////////////////////////////////////////////
#if defined(DSLASHINTRIN)
//#pragma message ("A64FX Dslash: intrin")
#include <simd/Fujitsu_A64FX_intrin_single.h>
#else
#pragma message ("A64FX Dslash: asm")
#include <simd/Fujitsu_A64FX_asm_single.h>
#endif
/// Switch off the 5d vectorised code optimisations
#undef DWFVEC5D
/////////////////////////////////////////////////////////////////
// XYZT vectorised, undag Kernel, single
/////////////////////////////////////////////////////////////////
#undef KERNEL_DAG
#define INTERIOR_AND_EXTERIOR
#undef INTERIOR
#undef EXTERIOR
template<> void
WilsonKernels<WilsonImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
template<> void
WilsonKernels<ZWilsonImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
template<> void
WilsonKernels<WilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
template<> void
WilsonKernels<ZWilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#undef INTERIOR_AND_EXTERIOR
#define INTERIOR
#undef EXTERIOR
template<> void
WilsonKernels<WilsonImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
template<> void
WilsonKernels<ZWilsonImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
template<> void
WilsonKernels<WilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
template<> void
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#undef INTERIOR_AND_EXTERIOR
#undef INTERIOR
#define EXTERIOR
template<> void
WilsonKernels<WilsonImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
template<> void
WilsonKernels<ZWilsonImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
template<> void
WilsonKernels<WilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
template<> void
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
/////////////////////////////////////////////////////////////////
// XYZT vectorised, dag Kernel, single
/////////////////////////////////////////////////////////////////
#define KERNEL_DAG
#define INTERIOR_AND_EXTERIOR
#undef INTERIOR
#undef EXTERIOR
template<> void
WilsonKernels<WilsonImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
template<> void
WilsonKernels<ZWilsonImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
template<> void
WilsonKernels<WilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
template<> void
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#undef INTERIOR_AND_EXTERIOR
#define INTERIOR
#undef EXTERIOR
template<> void
WilsonKernels<WilsonImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
template<> void
WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
template<> void
WilsonKernels<WilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
template<> void
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#undef INTERIOR_AND_EXTERIOR
#undef INTERIOR
#define EXTERIOR
template<> void
WilsonKernels<WilsonImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
template<> void
WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
template<> void
WilsonKernels<WilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
template<> void
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
// undefine
#include <simd/Fujitsu_A64FX_undef.h>
///////////////////////////////////////////////////////////
// If we are A64FX specialise the double precision routine
///////////////////////////////////////////////////////////
#if defined(DSLASHINTRIN)
#include <simd/Fujitsu_A64FX_intrin_double.h>
#else
#include <simd/Fujitsu_A64FX_asm_double.h>
#endif
// former KNL
//#define MAYBEPERM(A,perm) if (perm) { A ; }
//#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
//#define COMPLEX_SIGNS(isigns) vComplexD *isigns = &signsD[0];
#define INTERIOR_AND_EXTERIOR
#undef INTERIOR
#undef EXTERIOR
/////////////////////////////////////////////////////////////////
// XYZT vectorised, undag Kernel, double
/////////////////////////////////////////////////////////////////
#undef KERNEL_DAG
#define INTERIOR_AND_EXTERIOR
#undef INTERIOR
#undef EXTERIOR
template<> void
WilsonKernels<WilsonImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
template<> void
WilsonKernels<ZWilsonImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
template<> void
WilsonKernels<WilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
template<> void
WilsonKernels<ZWilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#undef INTERIOR_AND_EXTERIOR
#define INTERIOR
#undef EXTERIOR
template<> void
WilsonKernels<WilsonImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
template<> void
WilsonKernels<ZWilsonImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
template<> void
WilsonKernels<WilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
template<> void
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#undef INTERIOR_AND_EXTERIOR
#undef INTERIOR
#define EXTERIOR
template<> void
WilsonKernels<WilsonImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
template<> void
WilsonKernels<ZWilsonImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
template<> void
WilsonKernels<WilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
template<> void
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
/////////////////////////////////////////////////////////////////
// XYZT vectorised, dag Kernel, double
/////////////////////////////////////////////////////////////////
#define KERNEL_DAG
#define INTERIOR_AND_EXTERIOR
#undef INTERIOR
#undef EXTERIOR
template<> void
WilsonKernels<WilsonImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
template<> void
WilsonKernels<ZWilsonImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
template<> void
WilsonKernels<WilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
template<> void
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#undef INTERIOR_AND_EXTERIOR
#define INTERIOR
#undef EXTERIOR
template<> void
WilsonKernels<WilsonImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
template<> void
WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
template<> void
WilsonKernels<WilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
template<> void
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#undef INTERIOR_AND_EXTERIOR
#undef INTERIOR
#define EXTERIOR
template<> void
WilsonKernels<WilsonImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
template<> void
WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
template<> void
WilsonKernels<WilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
template<> void
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
// undefs
#undef WILSONKERNELSASMBODYA64FX
#include <simd/Fujitsu_A64FX_undef.h>
#endif //A64FXASM
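
Every specialisation in this file takes its body from a header #include placed directly after the function signature, so one body text serves many (implementation, precision) pairs under different macro settings. A self-contained miniature of the trick, with illustrative names (KernelBody.h holding just a braced compound statement):

  // KernelBody.h would contain:  { for (int i=0;i<n;i++) out[i] = in[i] + SHIFT; }
  struct OptA; struct OptB;
  template<class Opt> struct Kernel { static void apply(const int *in,int *out,int n); };

  #define SHIFT 1
  template<> void Kernel<OptA>::apply(const int *in,int *out,int n)
  #include "KernelBody.h"
  #undef SHIFT
  #define SHIFT 2
  template<> void Kernel<OptB>::apply(const int *in,int *out,int n)
  #include "KernelBody.h"
  #undef SHIFT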

View File

@@ -1,380 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: WilsonKernelsAsmBodyA64FX.h
Copyright (C) 2020
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifdef KERNEL_DAG
#define DIR0_PROJ XP_PROJ
#define DIR1_PROJ YP_PROJ
#define DIR2_PROJ ZP_PROJ
#define DIR3_PROJ TP_PROJ
#define DIR4_PROJ XM_PROJ
#define DIR5_PROJ YM_PROJ
#define DIR6_PROJ ZM_PROJ
#define DIR7_PROJ TM_PROJ
#define DIR0_RECON XP_RECON
#define DIR1_RECON YP_RECON_ACCUM
#define DIR2_RECON ZP_RECON_ACCUM
#define DIR3_RECON TP_RECON_ACCUM
#define DIR4_RECON XM_RECON_ACCUM
#define DIR5_RECON YM_RECON_ACCUM
#define DIR6_RECON ZM_RECON_ACCUM
#define DIR7_RECON TM_RECON_ACCUM
#else
#define DIR0_PROJ XM_PROJ
#define DIR1_PROJ YM_PROJ
#define DIR2_PROJ ZM_PROJ
#define DIR3_PROJ TM_PROJ
#define DIR4_PROJ XP_PROJ
#define DIR5_PROJ YP_PROJ
#define DIR6_PROJ ZP_PROJ
#define DIR7_PROJ TP_PROJ
#define DIR0_RECON XM_RECON
#define DIR1_RECON YM_RECON_ACCUM
#define DIR2_RECON ZM_RECON_ACCUM
#define DIR3_RECON TM_RECON_ACCUM
#define DIR4_RECON XP_RECON_ACCUM
#define DIR5_RECON YP_RECON_ACCUM
#define DIR6_RECON ZP_RECON_ACCUM
#define DIR7_RECON TP_RECON_ACCUM
#endif
//using namespace std;
#undef SHOW
//#define SHOW
#undef WHERE
#ifdef INTERIOR_AND_EXTERIOR
#define WHERE "INT_AND_EXT"
#endif
#ifdef INTERIOR
#define WHERE "INT"
#endif
#ifdef EXTERIOR
#define WHERE "EXT"
#endif
//#pragma message("here")
////////////////////////////////////////////////////////////////////////////////
// Comms then compute kernel
////////////////////////////////////////////////////////////////////////////////
#ifdef INTERIOR_AND_EXTERIOR
#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
basep = st.GetPFInfo(nent,plocal); nent++; \
if ( local ) { \
LOAD_CHIMU(base); \
LOAD_TABLE(PERMUTE_DIR); \
PROJ; \
MAYBEPERM(PERMUTE_DIR,perm); \
} else { \
LOAD_CHI(base); \
} \
base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \
MULT_2SPIN_1(Dir); \
PREFETCH_CHIMU(base); \
PREFETCH_CHIMU_L2(basep); \
/* PREFETCH_GAUGE_L1(NxtDir); */ \
MULT_2SPIN_2; \
if (s == 0) { \
if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \
} \
  RECON; \

#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
PREFETCH1_CHIMU(base); \
ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)
#define RESULT(base,basep) SAVE_RESULT(base,basep);
#endif
////////////////////////////////////////////////////////////////////////////////
// Pre comms kernel -- prefetch like normal because it is mostly right
////////////////////////////////////////////////////////////////////////////////
#ifdef INTERIOR
#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
basep = st.GetPFInfo(nent,plocal); nent++; \
if ( local ) { \
LOAD_CHIMU(base); \
LOAD_TABLE(PERMUTE_DIR); \
PROJ; \
MAYBEPERM(PERMUTE_DIR,perm); \
}else if ( st.same_node[Dir] ) {LOAD_CHI(base);} \
base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \
if ( local || st.same_node[Dir] ) { \
MULT_2SPIN_1(Dir); \
PREFETCH_CHIMU(base); \
/* PREFETCH_GAUGE_L1(NxtDir); */ \
MULT_2SPIN_2; \
if (s == 0) { \
if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \
} \
RECON; \
PREFETCH_CHIMU_L2(basep); \
  } else { PREFETCH_CHIMU(base); } \

#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
PREFETCH1_CHIMU(base); \
ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)
#define RESULT(base,basep) SAVE_RESULT(base,basep);
#endif
////////////////////////////////////////////////////////////////////////////////
// Post comms kernel
////////////////////////////////////////////////////////////////////////////////
#ifdef EXTERIOR
#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
if((!local)&&(!st.same_node[Dir]) ) { \
LOAD_CHI(base); \
MULT_2SPIN_1(Dir); \
PREFETCH_CHIMU(base); \
/* PREFETCH_GAUGE_L1(NxtDir); */ \
MULT_2SPIN_2; \
if (s == 0) { \
if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \
} \
RECON; \
nmu++; \
}
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
nmu=0; \
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++;\
if((!local)&&(!st.same_node[Dir]) ) { \
LOAD_CHI(base); \
MULT_2SPIN_1(Dir); \
PREFETCH_CHIMU(base); \
/* PREFETCH_GAUGE_L1(NxtDir); */ \
MULT_2SPIN_2; \
if (s == 0) { \
if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \
} \
RECON; \
nmu++; \
}
#define RESULT(base,basep) if (nmu){ ADD_RESULT(base,base);}
#endif
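////////////////////////////////////////////////////////////////////////////////
// Summary of the three variants above: INTERIOR_AND_EXTERIOR processes every
// leg unconditionally; INTERIOR takes only legs whose neighbour is local or
// on the same node (the pre-comms pass) and SAVE_RESULTs the sum; EXTERIOR
// takes only legs whose half-spinor arrived from another node, counts them
// in nmu, and RESULT then ADD_RESULTs to the output only when nmu != 0.
////////////////////////////////////////////////////////////////////////////////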
{
int nmu;
int local,perm, ptype;
uint64_t base;
uint64_t basep;
const uint64_t plocal =(uint64_t) & in[0];
MASK_REGS;
int nmax=U.oSites();
for(int site=0;site<Ns;site++) {
#ifndef EXTERIOR
// int sU =lo.Reorder(ssU);
int sU =ssU;
int ssn=ssU+1; if(ssn>=nmax) ssn=0;
// int sUn=lo.Reorder(ssn);
int sUn=ssn;
LOCK_GAUGE(0);
#else
int sU =ssU;
int ssn=ssU+1; if(ssn>=nmax) ssn=0;
int sUn=ssn;
#endif
for(int s=0;s<Ls;s++) {
ss =sU*Ls+s;
ssn=sUn*Ls+s;
int ent=ss*8;// 2*Ndim
int nent=ssn*8;
uint64_t delta_base, delta_base_p;
ASM_LEG_XP(Xp,Yp,PERMUTE_DIR3,DIR0_PROJ,DIR0_RECON);
#ifdef SHOW
float rescale = 64. * 12.;
std::cout << "=================================================================" << std::endl;
std::cout << "ss = " << ss << " ssn = " << ssn << std::endl;
std::cout << "sU = " << sU << " ssU = " << ssU << std::endl;
std::cout << " " << std::endl;
std::cout << "Dir = " << Xp << " " << WHERE<< std::endl;
std::cout << "ent nent local perm = " << ent << " " << nent << " " << local << " " << perm << std::endl;
std::cout << "st.same_node[Dir] = " << st.same_node[Xp] << std::endl;
std::cout << "base = " << (base - plocal)/rescale << std::endl;
std::cout << "Basep = " << (basep - plocal)/rescale << std::endl;
//printf("U = %llu\n", (uint64_t)&[sU](Dir));
std::cout << "----------------------------------------------------" << std::endl;
#endif
ASM_LEG(Yp,Zp,PERMUTE_DIR2,DIR1_PROJ,DIR1_RECON);
#ifdef SHOW
std::cout << "Dir = " << Yp << " " << WHERE<< std::endl;
std::cout << "ent nent local perm = " << ent << " " << nent << " " << local << " " << perm << std::endl;
std::cout << "st.same_node[Dir] = " << st.same_node[Yp] << std::endl;
std::cout << "base = " << (base - plocal)/rescale << std::endl;
std::cout << "Basep = " << (basep - plocal)/rescale << std::endl;
//printf("U = %llu\n", (uint64_t)&[sU](Dir));
std::cout << "----------------------------------------------------" << std::endl;
#endif
ASM_LEG(Zp,Tp,PERMUTE_DIR1,DIR2_PROJ,DIR2_RECON);
#ifdef SHOW
std::cout << "Dir = " << Zp << " " << WHERE<< std::endl;
std::cout << "ent nent local perm = " << ent << " " << nent << " " << local << " " << perm << std::endl;
std::cout << "st.same_node[Dir] = " << st.same_node[Zp] << std::endl;
std::cout << "base = " << (base - plocal)/rescale << std::endl;
std::cout << "Basep = " << (basep - plocal)/rescale << std::endl;
//printf("U = %llu\n", (uint64_t)&[sU](Dir));
std::cout << "----------------------------------------------------" << std::endl;
#endif
ASM_LEG(Tp,Xm,PERMUTE_DIR0,DIR3_PROJ,DIR3_RECON);
#ifdef SHOW
std::cout << "Dir = " << Tp << " " << WHERE<< std::endl;
std::cout << "ent nent local perm = " << ent << " " << nent << " " << local << " " << perm << std::endl;
std::cout << "st.same_node[Dir] = " << st.same_node[Tp] << std::endl;
std::cout << "base = " << (base - plocal)/rescale << std::endl;
std::cout << "Basep = " << (basep - plocal)/rescale << std::endl;
//printf("U = %llu\n", (uint64_t)&[sU](Dir));
std::cout << "----------------------------------------------------" << std::endl;
#endif
ASM_LEG(Xm,Ym,PERMUTE_DIR3,DIR4_PROJ,DIR4_RECON);
#ifdef SHOW
std::cout << "Dir = " << Xm << " " << WHERE<< std::endl;
std::cout << "ent nent local perm = " << ent << " " << nent << " " << local << " " << perm << std::endl;
std::cout << "st.same_node[Dir] = " << st.same_node[Xm] << std::endl;
std::cout << "base = " << (base - plocal)/rescale << std::endl;
std::cout << "Basep = " << (basep - plocal)/rescale << std::endl;
//printf("U = %llu\n", (uint64_t)&[sU](Dir));
std::cout << "----------------------------------------------------" << std::endl;
#endif
ASM_LEG(Ym,Zm,PERMUTE_DIR2,DIR5_PROJ,DIR5_RECON);
#ifdef SHOW
std::cout << "Dir = " << Ym << " " << WHERE<< std::endl;
std::cout << "ent nent local perm = " << ent << " " << nent << " " << local << " " << perm << std::endl;
std::cout << "st.same_node[Dir] = " << st.same_node[Ym] << std::endl;
std::cout << "base = " << (base - plocal)/rescale << std::endl;
std::cout << "Basep = " << (basep - plocal)/rescale << std::endl;
//printf("U = %llu\n", (uint64_t)&[sU](Dir));
std::cout << "----------------------------------------------------" << std::endl;
#endif
ASM_LEG(Zm,Tm,PERMUTE_DIR1,DIR6_PROJ,DIR6_RECON);
#ifdef SHOW
std::cout << "Dir = " << Zm << " " << WHERE<< std::endl;
std::cout << "ent nent local perm = " << ent << " " << nent << " " << local << " " << perm << std::endl;
std::cout << "st.same_node[Dir] = " << st.same_node[Zm] << std::endl;
std::cout << "base = " << (base - plocal)/rescale << std::endl;
std::cout << "Basep = " << (basep - plocal)/rescale << std::endl;
//printf("U = %llu\n", (uint64_t)&[sU](Dir));
std::cout << "----------------------------------------------------" << std::endl;
#endif
ASM_LEG(Tm,Xp,PERMUTE_DIR0,DIR7_PROJ,DIR7_RECON);
#ifdef SHOW
std::cout << "Dir = " << Tm << " " << WHERE<< std::endl;
std::cout << "ent nent local perm = " << ent << " " << nent << " " << local << " " << perm << std::endl;
std::cout << "st.same_node[Dir] = " << st.same_node[Tm] << std::endl;
std::cout << "base = " << (base - plocal)/rescale << std::endl;
std::cout << "Basep = " << (basep - plocal)/rescale << std::endl;
//printf("U = %llu\n", (uint64_t)&[sU](Dir));
std::cout << "----------------------------------------------------" << std::endl;
#endif
#ifdef EXTERIOR
if (nmu==0) break;
// if (nmu!=0) std::cout << "EXT "<<sU<<std::endl;
#endif
base = (uint64_t) &out[ss];
basep= st.GetPFInfo(nent,plocal); ent++;
basep = (uint64_t) &out[ssn];
RESULT(base,basep);
#ifdef SHOW
std::cout << "Dir = FINAL " << WHERE<< std::endl;;
base_ss = base;
std::cout << "base = " << (base - (uint64_t) &out[0])/rescale << std::endl;
std::cout << "Basep = " << (basep - plocal)/rescale << std::endl;
//printf("U = %llu\n", (uint64_t)&[sU](Dir));
std::cout << "----------------------------------------------------" << std::endl;
#endif
}
ssU++;
UNLOCK_GAUGE(0);
}
}
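// Exactly one of INTERIOR_AND_EXTERIOR, INTERIOR or EXTERIOR selects which
// variant of the kernel body above is compiled; the #undefs below clear the
// per-direction macros so another variant can redefine them.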
#undef DIR0_PROJ
#undef DIR1_PROJ
#undef DIR2_PROJ
#undef DIR3_PROJ
#undef DIR4_PROJ
#undef DIR5_PROJ
#undef DIR6_PROJ
#undef DIR7_PROJ
#undef DIR0_RECON
#undef DIR1_RECON
#undef DIR2_RECON
#undef DIR3_RECON
#undef DIR4_RECON
#undef DIR5_RECON
#undef DIR6_RECON
#undef DIR7_RECON
#undef ASM_LEG
#undef ASM_LEG_XP
#undef RESULT

View File

@@ -646,7 +646,7 @@ NAMESPACE_BEGIN(Grid);
   HAND_RESULT_EXT(ss,F)
 #define HAND_SPECIALISE_GPARITY(IMPL) \
-  template<> accelerator_inline void \
+  template<> void \
   WilsonKernels<IMPL>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
                                     int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
   { \
@@ -662,7 +662,7 @@ NAMESPACE_BEGIN(Grid);
   HAND_DOP_SITE(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
   } \
   \
-  template<> accelerator_inline void \
+  template<> void \
   WilsonKernels<IMPL>::HandDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
                                        int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
   { \
@@ -678,7 +678,7 @@ NAMESPACE_BEGIN(Grid);
   HAND_DOP_SITE_DAG(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
   } \
   \
-  template<> accelerator_inline void \
+  template<> void \
   WilsonKernels<IMPL>::HandDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
                                        int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
   { \
@@ -694,7 +694,7 @@ NAMESPACE_BEGIN(Grid);
   HAND_DOP_SITE_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
   } \
   \
-  template<> accelerator_inline void \
+  template<> void \
   WilsonKernels<IMPL>::HandDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
                                           int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
   { \
@@ -710,7 +710,7 @@ NAMESPACE_BEGIN(Grid);
   HAND_DOP_SITE_DAG_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
   } \
   \
-  template<> accelerator_inline void \
+  template<> void \
   WilsonKernels<IMPL>::HandDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
                                        int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
   { \
@@ -727,7 +727,7 @@ NAMESPACE_BEGIN(Grid);
   nmu = 0; \
   HAND_DOP_SITE_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
   } \
-  template<> accelerator_inline void \
+  template<> void \
   WilsonKernels<IMPL>::HandDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
                                           int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
   { \

View File

@@ -495,7 +495,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 NAMESPACE_BEGIN(Grid);
-template<class Impl> accelerator_inline void
+template<class Impl> void
 WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
                                   int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
@@ -519,7 +519,7 @@ WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,Site
   HAND_RESULT(ss);
 }
-template<class Impl> accelerator_inline
+template<class Impl>
 void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
                                           int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
@@ -542,7 +542,7 @@ void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView
   HAND_RESULT(ss);
 }
-template<class Impl> accelerator_inline void
+template<class Impl> void
 WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
                                      int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
@@ -566,7 +566,7 @@ WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,Si
   HAND_RESULT(ss);
 }
-template<class Impl> accelerator_inline
+template<class Impl>
 void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
                                              int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
@@ -589,7 +589,7 @@ void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldVi
   HAND_RESULT(ss);
 }
-template<class Impl> accelerator_inline void
+template<class Impl> void
 WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
                                      int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
@@ -614,7 +614,7 @@ WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,Si
   HAND_RESULT_EXT(ss);
 }
-template<class Impl> accelerator_inline
+template<class Impl>
 void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
                                              int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {

View File

@@ -1,943 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernelsHand.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <Grid/qcd/action/fermion/FermionCore.h>
#undef LOAD_CHIMU
#undef LOAD_CHI
#undef MULT_2SPIN
#undef PERMUTE_DIR
#undef XP_PROJ
#undef YP_PROJ
#undef ZP_PROJ
#undef TP_PROJ
#undef XM_PROJ
#undef YM_PROJ
#undef ZM_PROJ
#undef TM_PROJ
#undef XP_RECON
#undef XP_RECON_ACCUM
#undef XM_RECON
#undef XM_RECON_ACCUM
#undef YP_RECON_ACCUM
#undef YM_RECON_ACCUM
#undef ZP_RECON_ACCUM
#undef ZM_RECON_ACCUM
#undef TP_RECON_ACCUM
#undef TM_RECON_ACCUM
#undef ZERO_RESULT
#undef Chimu_00
#undef Chimu_01
#undef Chimu_02
#undef Chimu_10
#undef Chimu_11
#undef Chimu_12
#undef Chimu_20
#undef Chimu_21
#undef Chimu_22
#undef Chimu_30
#undef Chimu_31
#undef Chimu_32
#undef HAND_STENCIL_LEG
#undef HAND_STENCIL_LEG_INT
#undef HAND_STENCIL_LEG_EXT
#undef HAND_RESULT
#undef HAND_RESULT_INT
#undef HAND_RESULT_EXT
#define REGISTER
#define LOAD_CHIMU \
{const SiteSpinor & ref (in[offset]); \
Chimu_00=ref()(0)(0);\
Chimu_01=ref()(0)(1);\
Chimu_02=ref()(0)(2);\
Chimu_10=ref()(1)(0);\
Chimu_11=ref()(1)(1);\
Chimu_12=ref()(1)(2);\
Chimu_20=ref()(2)(0);\
Chimu_21=ref()(2)(1);\
Chimu_22=ref()(2)(2);\
Chimu_30=ref()(3)(0);\
Chimu_31=ref()(3)(1);\
Chimu_32=ref()(3)(2);\
std::cout << std::endl << "DEBUG -- LOAD_CHIMU" << std::endl; \
std::cout << "Chimu_00 -- " << Chimu_00 << std::endl; \
std::cout << "Chimu_01 -- " << Chimu_01 << std::endl; \
std::cout << "Chimu_02 -- " << Chimu_02 << std::endl; \
std::cout << "Chimu_10 -- " << Chimu_10 << std::endl; \
std::cout << "Chimu_11 -- " << Chimu_11 << std::endl; \
std::cout << "Chimu_12 -- " << Chimu_12 << std::endl; \
std::cout << "Chimu_20 -- " << Chimu_20 << std::endl; \
std::cout << "Chimu_21 -- " << Chimu_21 << std::endl; \
std::cout << "Chimu_22 -- " << Chimu_22 << std::endl; \
std::cout << "Chimu_30 -- " << Chimu_30 << std::endl; \
std::cout << "Chimu_31 -- " << Chimu_31 << std::endl; \
std::cout << "Chimu_32 -- " << Chimu_32 << std::endl; \
}
#define LOAD_CHI\
{const SiteHalfSpinor &ref(buf[offset]); \
Chi_00 = ref()(0)(0);\
Chi_01 = ref()(0)(1);\
Chi_02 = ref()(0)(2);\
Chi_10 = ref()(1)(0);\
Chi_11 = ref()(1)(1);\
Chi_12 = ref()(1)(2);\
std::cout << std::endl << "DEBUG -- LOAD_CHI" << std::endl; \
std::cout << "Chi_00 -- " << Chi_00 << std::endl; \
std::cout << "Chi_01 -- " << Chi_01 << std::endl; \
std::cout << "Chi_02 -- " << Chi_02 << std::endl; \
std::cout << "Chi_10 -- " << Chi_10 << std::endl; \
std::cout << "Chi_11 -- " << Chi_11 << std::endl; \
std::cout << "Chi_12 -- " << Chi_12 << std::endl; \
}
// To splat or not to splat depends on the implementation
#define MULT_2SPIN(A)\
{auto & ref(U[sU](A)); \
Impl::loadLinkElement(U_00,ref()(0,0)); \
Impl::loadLinkElement(U_10,ref()(1,0)); \
Impl::loadLinkElement(U_20,ref()(2,0)); \
Impl::loadLinkElement(U_01,ref()(0,1)); \
Impl::loadLinkElement(U_11,ref()(1,1)); \
Impl::loadLinkElement(U_21,ref()(2,1)); \
UChi_00 = U_00*Chi_00;\
UChi_10 = U_00*Chi_10;\
UChi_01 = U_10*Chi_00;\
UChi_11 = U_10*Chi_10;\
UChi_02 = U_20*Chi_00;\
UChi_12 = U_20*Chi_10;\
UChi_00+= U_01*Chi_01;\
UChi_10+= U_01*Chi_11;\
UChi_01+= U_11*Chi_01;\
UChi_11+= U_11*Chi_11;\
UChi_02+= U_21*Chi_01;\
UChi_12+= U_21*Chi_11;\
Impl::loadLinkElement(U_00,ref()(0,2)); \
Impl::loadLinkElement(U_10,ref()(1,2)); \
Impl::loadLinkElement(U_20,ref()(2,2)); \
UChi_00+= U_00*Chi_02;\
UChi_10+= U_00*Chi_12;\
UChi_01+= U_10*Chi_02;\
UChi_11+= U_10*Chi_12;\
UChi_02+= U_20*Chi_02;\
UChi_12+= U_20*Chi_12;\
std::cout << std::endl << "DEBUG -- MULT_2SPIN" << std::endl; \
std::cout << "UChi_00 -- " << UChi_00 << std::endl; \
std::cout << "UChi_01 -- " << UChi_01 << std::endl; \
std::cout << "UChi_02 -- " << UChi_02 << std::endl; \
std::cout << "UChi_10 -- " << UChi_10 << std::endl; \
std::cout << "UChi_11 -- " << UChi_11 << std::endl; \
std::cout << "UChi_12 -- " << UChi_12 << std::endl; \
}
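// In matrix form, the macro above computes the colour multiply
//   UChi_r = U(sU,A) * Chi_r   for each half-spinor row r = 0,1,
// i.e. UChi_{r,a} = sum_b U_{a,b} Chi_{r,b}; the third column of U is loaded
// in a second pass so only six link registers are live at a time.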
#define PERMUTE_DIR(dir) \
std::cout << std::endl << "DEBUG -- PERM PRE" << std::endl; \
std::cout << "Chi_00 -- " << Chi_00 << std::endl; \
std::cout << "Chi_01 -- " << Chi_01 << std::endl; \
std::cout << "Chi_02 -- " << Chi_02 << std::endl; \
std::cout << "Chi_10 -- " << Chi_10 << std::endl; \
std::cout << "Chi_11 -- " << Chi_11 << std::endl; \
std::cout << "Chi_12 -- " << Chi_12 << std::endl; \
permute##dir(Chi_00,Chi_00);\
permute##dir(Chi_01,Chi_01);\
permute##dir(Chi_02,Chi_02);\
permute##dir(Chi_10,Chi_10);\
permute##dir(Chi_11,Chi_11);\
permute##dir(Chi_12,Chi_12);\
std::cout << std::endl << "DEBUG -- PERM POST" << std::endl; \
std::cout << "Chi_00 -- " << Chi_00 << std::endl; \
std::cout << "Chi_01 -- " << Chi_01 << std::endl; \
std::cout << "Chi_02 -- " << Chi_02 << std::endl; \
std::cout << "Chi_10 -- " << Chi_10 << std::endl; \
std::cout << "Chi_11 -- " << Chi_11 << std::endl; \
std::cout << "Chi_12 -- " << Chi_12 << std::endl;
// hspin(0)=fspin(0)+timesI(fspin(3));
// hspin(1)=fspin(1)+timesI(fspin(2));
#define XP_PROJ \
Chi_00 = Chimu_00+timesI(Chimu_30);\
Chi_01 = Chimu_01+timesI(Chimu_31);\
Chi_02 = Chimu_02+timesI(Chimu_32);\
Chi_10 = Chimu_10+timesI(Chimu_20);\
Chi_11 = Chimu_11+timesI(Chimu_21);\
Chi_12 = Chimu_12+timesI(Chimu_22);\
std::cout << std::endl << "DEBUG -- XP_PROJ" << std::endl; \
std::cout << "Chi_00 -- " << Chi_00 << std::endl; \
std::cout << "Chi_01 -- " << Chi_01 << std::endl; \
std::cout << "Chi_02 -- " << Chi_02 << std::endl; \
std::cout << "Chi_10 -- " << Chi_10 << std::endl; \
std::cout << "Chi_11 -- " << Chi_11 << std::endl; \
std::cout << "Chi_12 -- " << Chi_12 << std::endl;
#define YP_PROJ \
Chi_00 = Chimu_00-Chimu_30;\
Chi_01 = Chimu_01-Chimu_31;\
Chi_02 = Chimu_02-Chimu_32;\
Chi_10 = Chimu_10+Chimu_20;\
Chi_11 = Chimu_11+Chimu_21;\
Chi_12 = Chimu_12+Chimu_22;\
std::cout << std::endl << "DEBUG -- YP_PROJ" << std::endl; \
std::cout << "Chi_00 -- " << Chi_00 << std::endl; \
std::cout << "Chi_01 -- " << Chi_01 << std::endl; \
std::cout << "Chi_02 -- " << Chi_02 << std::endl; \
std::cout << "Chi_10 -- " << Chi_10 << std::endl; \
std::cout << "Chi_11 -- " << Chi_11 << std::endl; \
std::cout << "Chi_12 -- " << Chi_12 << std::endl;
#define ZP_PROJ \
Chi_00 = Chimu_00+timesI(Chimu_20); \
Chi_01 = Chimu_01+timesI(Chimu_21); \
Chi_02 = Chimu_02+timesI(Chimu_22); \
Chi_10 = Chimu_10-timesI(Chimu_30); \
Chi_11 = Chimu_11-timesI(Chimu_31); \
Chi_12 = Chimu_12-timesI(Chimu_32);\
std::cout << std::endl << "DEBUG -- ZP_PROJ" << std::endl; \
std::cout << "Chi_00 -- " << Chi_00 << std::endl; \
std::cout << "Chi_01 -- " << Chi_01 << std::endl; \
std::cout << "Chi_02 -- " << Chi_02 << std::endl; \
std::cout << "Chi_10 -- " << Chi_10 << std::endl; \
std::cout << "Chi_11 -- " << Chi_11 << std::endl; \
std::cout << "Chi_12 -- " << Chi_12 << std::endl;
#define TP_PROJ \
Chi_00 = Chimu_00+Chimu_20; \
Chi_01 = Chimu_01+Chimu_21; \
Chi_02 = Chimu_02+Chimu_22; \
Chi_10 = Chimu_10+Chimu_30; \
Chi_11 = Chimu_11+Chimu_31; \
Chi_12 = Chimu_12+Chimu_32;\
std::cout << std::endl << "DEBUG -- TP_PROJ" << std::endl; \
std::cout << "Chi_00 -- " << Chi_00 << std::endl; \
std::cout << "Chi_01 -- " << Chi_01 << std::endl; \
std::cout << "Chi_02 -- " << Chi_02 << std::endl; \
std::cout << "Chi_10 -- " << Chi_10 << std::endl; \
std::cout << "Chi_11 -- " << Chi_11 << std::endl; \
std::cout << "Chi_12 -- " << Chi_12 << std::endl;
// hspin(0)=fspin(0)-timesI(fspin(3));
// hspin(1)=fspin(1)-timesI(fspin(2));
#define XM_PROJ \
Chi_00 = Chimu_00-timesI(Chimu_30);\
Chi_01 = Chimu_01-timesI(Chimu_31);\
Chi_02 = Chimu_02-timesI(Chimu_32);\
Chi_10 = Chimu_10-timesI(Chimu_20);\
Chi_11 = Chimu_11-timesI(Chimu_21);\
Chi_12 = Chimu_12-timesI(Chimu_22);\
std::cout << std::endl << "DEBUG -- XM_PROJ" << std::endl; \
std::cout << "Chi_00 -- " << Chi_00 << std::endl; \
std::cout << "Chi_01 -- " << Chi_01 << std::endl; \
std::cout << "Chi_02 -- " << Chi_02 << std::endl; \
std::cout << "Chi_10 -- " << Chi_10 << std::endl; \
std::cout << "Chi_11 -- " << Chi_11 << std::endl; \
std::cout << "Chi_12 -- " << Chi_12 << std::endl;
#define YM_PROJ \
Chi_00 = Chimu_00+Chimu_30;\
Chi_01 = Chimu_01+Chimu_31;\
Chi_02 = Chimu_02+Chimu_32;\
Chi_10 = Chimu_10-Chimu_20;\
Chi_11 = Chimu_11-Chimu_21;\
Chi_12 = Chimu_12-Chimu_22;\
std::cout << std::endl << "DEBUG -- YM_PROJ" << std::endl; \
std::cout << "Chi_00 -- " << Chi_00 << std::endl; \
std::cout << "Chi_01 -- " << Chi_01 << std::endl; \
std::cout << "Chi_02 -- " << Chi_02 << std::endl; \
std::cout << "Chi_10 -- " << Chi_10 << std::endl; \
std::cout << "Chi_11 -- " << Chi_11 << std::endl; \
std::cout << "Chi_12 -- " << Chi_12 << std::endl;
#define ZM_PROJ \
Chi_00 = Chimu_00-timesI(Chimu_20); \
Chi_01 = Chimu_01-timesI(Chimu_21); \
Chi_02 = Chimu_02-timesI(Chimu_22); \
Chi_10 = Chimu_10+timesI(Chimu_30); \
Chi_11 = Chimu_11+timesI(Chimu_31); \
Chi_12 = Chimu_12+timesI(Chimu_32);\
std::cout << std::endl << "DEBUG -- ZM_PROJ" << std::endl; \
std::cout << "Chi_00 -- " << Chi_00 << std::endl; \
std::cout << "Chi_01 -- " << Chi_01 << std::endl; \
std::cout << "Chi_02 -- " << Chi_02 << std::endl; \
std::cout << "Chi_10 -- " << Chi_10 << std::endl; \
std::cout << "Chi_11 -- " << Chi_11 << std::endl; \
std::cout << "Chi_12 -- " << Chi_12 << std::endl;
#define TM_PROJ \
Chi_00 = Chimu_00-Chimu_20; \
Chi_01 = Chimu_01-Chimu_21; \
Chi_02 = Chimu_02-Chimu_22; \
Chi_10 = Chimu_10-Chimu_30; \
Chi_11 = Chimu_11-Chimu_31; \
Chi_12 = Chimu_12-Chimu_32;\
std::cout << std::endl << "DEBUG -- TM_PROJ" << std::endl; \
std::cout << "Chi_00 -- " << Chi_00 << std::endl; \
std::cout << "Chi_01 -- " << Chi_01 << std::endl; \
std::cout << "Chi_02 -- " << Chi_02 << std::endl; \
std::cout << "Chi_10 -- " << Chi_10 << std::endl; \
std::cout << "Chi_11 -- " << Chi_11 << std::endl; \
std::cout << "Chi_12 -- " << Chi_12 << std::endl;
// fspin(0)=hspin(0);
// fspin(1)=hspin(1);
// fspin(2)=timesMinusI(hspin(1));
// fspin(3)=timesMinusI(hspin(0));
#define XP_RECON\
result_00 = UChi_00;\
result_01 = UChi_01;\
result_02 = UChi_02;\
result_10 = UChi_10;\
result_11 = UChi_11;\
result_12 = UChi_12;\
result_20 = timesMinusI(UChi_10);\
result_21 = timesMinusI(UChi_11);\
result_22 = timesMinusI(UChi_12);\
result_30 = timesMinusI(UChi_00);\
result_31 = timesMinusI(UChi_01);\
result_32 = timesMinusI(UChi_02);\
std::cout << std::endl << "DEBUG -- XP_RECON" << std::endl; \
std::cout << "result_00 -- " << result_00 << std::endl; \
std::cout << "result_01 -- " << result_01 << std::endl; \
std::cout << "result_02 -- " << result_02 << std::endl; \
std::cout << "result_10 -- " << result_10 << std::endl; \
std::cout << "result_11 -- " << result_11 << std::endl; \
std::cout << "result_12 -- " << result_12 << std::endl; \
std::cout << "result_20 -- " << result_20 << std::endl; \
std::cout << "result_21 -- " << result_21 << std::endl; \
std::cout << "result_22 -- " << result_22 << std::endl; \
std::cout << "result_30 -- " << result_30 << std::endl; \
std::cout << "result_31 -- " << result_31 << std::endl; \
std::cout << "result_32 -- " << result_32 << std::endl;
#define XP_RECON_ACCUM\
result_00+=UChi_00;\
result_01+=UChi_01;\
result_02+=UChi_02;\
result_10+=UChi_10;\
result_11+=UChi_11;\
result_12+=UChi_12;\
result_20-=timesI(UChi_10);\
result_21-=timesI(UChi_11);\
result_22-=timesI(UChi_12);\
result_30-=timesI(UChi_00);\
result_31-=timesI(UChi_01);\
result_32-=timesI(UChi_02);\
std::cout << std::endl << "DEBUG -- XP_RECON_ACCUM" << std::endl; \
std::cout << "result_00 -- " << result_00 << std::endl; \
std::cout << "result_01 -- " << result_01 << std::endl; \
std::cout << "result_02 -- " << result_02 << std::endl; \
std::cout << "result_10 -- " << result_10 << std::endl; \
std::cout << "result_11 -- " << result_11 << std::endl; \
std::cout << "result_12 -- " << result_12 << std::endl; \
std::cout << "result_20 -- " << result_20 << std::endl; \
std::cout << "result_21 -- " << result_21 << std::endl; \
std::cout << "result_22 -- " << result_22 << std::endl; \
std::cout << "result_30 -- " << result_30 << std::endl; \
std::cout << "result_31 -- " << result_31 << std::endl; \
std::cout << "result_32 -- " << result_32 << std::endl;
#define XM_RECON\
result_00 = UChi_00;\
result_01 = UChi_01;\
result_02 = UChi_02;\
result_10 = UChi_10;\
result_11 = UChi_11;\
result_12 = UChi_12;\
result_20 = timesI(UChi_10);\
result_21 = timesI(UChi_11);\
result_22 = timesI(UChi_12);\
result_30 = timesI(UChi_00);\
result_31 = timesI(UChi_01);\
result_32 = timesI(UChi_02);\
std::cout << std::endl << "DEBUG -- XM_RECON" << std::endl; \
std::cout << "result_00 -- " << result_00 << std::endl; \
std::cout << "result_01 -- " << result_01 << std::endl; \
std::cout << "result_02 -- " << result_02 << std::endl; \
std::cout << "result_10 -- " << result_10 << std::endl; \
std::cout << "result_11 -- " << result_11 << std::endl; \
std::cout << "result_12 -- " << result_12 << std::endl; \
std::cout << "result_20 -- " << result_20 << std::endl; \
std::cout << "result_21 -- " << result_21 << std::endl; \
std::cout << "result_22 -- " << result_22 << std::endl; \
std::cout << "result_30 -- " << result_30 << std::endl; \
std::cout << "result_31 -- " << result_31 << std::endl; \
std::cout << "result_32 -- " << result_32 << std::endl;
#define XM_RECON_ACCUM\
result_00+= UChi_00;\
result_01+= UChi_01;\
result_02+= UChi_02;\
result_10+= UChi_10;\
result_11+= UChi_11;\
result_12+= UChi_12;\
result_20+= timesI(UChi_10);\
result_21+= timesI(UChi_11);\
result_22+= timesI(UChi_12);\
result_30+= timesI(UChi_00);\
result_31+= timesI(UChi_01);\
result_32+= timesI(UChi_02);\
std::cout << std::endl << "DEBUG -- XM_RECON_ACCUM" << std::endl; \
std::cout << "result_00 -- " << result_00 << std::endl; \
std::cout << "result_01 -- " << result_01 << std::endl; \
std::cout << "result_02 -- " << result_02 << std::endl; \
std::cout << "result_10 -- " << result_10 << std::endl; \
std::cout << "result_11 -- " << result_11 << std::endl; \
std::cout << "result_12 -- " << result_12 << std::endl; \
std::cout << "result_20 -- " << result_20 << std::endl; \
std::cout << "result_21 -- " << result_21 << std::endl; \
std::cout << "result_22 -- " << result_22 << std::endl; \
std::cout << "result_30 -- " << result_30 << std::endl; \
std::cout << "result_31 -- " << result_31 << std::endl; \
std::cout << "result_32 -- " << result_32 << std::endl;
#define YP_RECON_ACCUM\
result_00+= UChi_00;\
result_01+= UChi_01;\
result_02+= UChi_02;\
result_10+= UChi_10;\
result_11+= UChi_11;\
result_12+= UChi_12;\
result_20+= UChi_10;\
result_21+= UChi_11;\
result_22+= UChi_12;\
result_30-= UChi_00;\
result_31-= UChi_01;\
result_32-= UChi_02;\
std::cout << std::endl << "DEBUG -- YP_RECON_ACCUM" << std::endl; \
std::cout << "result_00 -- " << result_00 << std::endl; \
std::cout << "result_01 -- " << result_01 << std::endl; \
std::cout << "result_02 -- " << result_02 << std::endl; \
std::cout << "result_10 -- " << result_10 << std::endl; \
std::cout << "result_11 -- " << result_11 << std::endl; \
std::cout << "result_12 -- " << result_12 << std::endl; \
std::cout << "result_20 -- " << result_20 << std::endl; \
std::cout << "result_21 -- " << result_21 << std::endl; \
std::cout << "result_22 -- " << result_22 << std::endl; \
std::cout << "result_30 -- " << result_30 << std::endl; \
std::cout << "result_31 -- " << result_31 << std::endl; \
std::cout << "result_32 -- " << result_32 << std::endl;
#define YM_RECON_ACCUM\
result_00+= UChi_00;\
result_01+= UChi_01;\
result_02+= UChi_02;\
result_10+= UChi_10;\
result_11+= UChi_11;\
result_12+= UChi_12;\
result_20-= UChi_10;\
result_21-= UChi_11;\
result_22-= UChi_12;\
result_30+= UChi_00;\
result_31+= UChi_01;\
result_32+= UChi_02;\
std::cout << std::endl << "DEBUG -- YM_RECON_ACCUM" << std::endl; \
std::cout << "result_00 -- " << result_00 << std::endl; \
std::cout << "result_01 -- " << result_01 << std::endl; \
std::cout << "result_02 -- " << result_02 << std::endl; \
std::cout << "result_10 -- " << result_10 << std::endl; \
std::cout << "result_11 -- " << result_11 << std::endl; \
std::cout << "result_12 -- " << result_12 << std::endl; \
std::cout << "result_20 -- " << result_20 << std::endl; \
std::cout << "result_21 -- " << result_21 << std::endl; \
std::cout << "result_22 -- " << result_22 << std::endl; \
std::cout << "result_30 -- " << result_30 << std::endl; \
std::cout << "result_31 -- " << result_31 << std::endl; \
std::cout << "result_32 -- " << result_32 << std::endl;
#define ZP_RECON_ACCUM\
result_00+= UChi_00;\
result_01+= UChi_01;\
result_02+= UChi_02;\
result_10+= UChi_10;\
result_11+= UChi_11;\
result_12+= UChi_12;\
result_20-= timesI(UChi_00); \
result_21-= timesI(UChi_01); \
result_22-= timesI(UChi_02); \
result_30+= timesI(UChi_10); \
result_31+= timesI(UChi_11); \
result_32+= timesI(UChi_12);\
std::cout << std::endl << "DEBUG -- ZP_RECON_ACCUM" << std::endl; \
std::cout << "result_00 -- " << result_00 << std::endl; \
std::cout << "result_01 -- " << result_01 << std::endl; \
std::cout << "result_02 -- " << result_02 << std::endl; \
std::cout << "result_10 -- " << result_10 << std::endl; \
std::cout << "result_11 -- " << result_11 << std::endl; \
std::cout << "result_12 -- " << result_12 << std::endl; \
std::cout << "result_20 -- " << result_20 << std::endl; \
std::cout << "result_21 -- " << result_21 << std::endl; \
std::cout << "result_22 -- " << result_22 << std::endl; \
std::cout << "result_30 -- " << result_30 << std::endl; \
std::cout << "result_31 -- " << result_31 << std::endl; \
std::cout << "result_32 -- " << result_32 << std::endl;
#define ZM_RECON_ACCUM\
result_00+= UChi_00;\
result_01+= UChi_01;\
result_02+= UChi_02;\
result_10+= UChi_10;\
result_11+= UChi_11;\
result_12+= UChi_12;\
result_20+= timesI(UChi_00); \
result_21+= timesI(UChi_01); \
result_22+= timesI(UChi_02); \
result_30-= timesI(UChi_10); \
result_31-= timesI(UChi_11); \
result_32-= timesI(UChi_12);\
std::cout << std::endl << "DEBUG -- ZM_RECON_ACCUM" << std::endl; \
std::cout << "result_00 -- " << result_00 << std::endl; \
std::cout << "result_01 -- " << result_01 << std::endl; \
std::cout << "result_02 -- " << result_02 << std::endl; \
std::cout << "result_10 -- " << result_10 << std::endl; \
std::cout << "result_11 -- " << result_11 << std::endl; \
std::cout << "result_12 -- " << result_12 << std::endl; \
std::cout << "result_20 -- " << result_20 << std::endl; \
std::cout << "result_21 -- " << result_21 << std::endl; \
std::cout << "result_22 -- " << result_22 << std::endl; \
std::cout << "result_30 -- " << result_30 << std::endl; \
std::cout << "result_31 -- " << result_31 << std::endl; \
std::cout << "result_32 -- " << result_32 << std::endl;
#define TP_RECON_ACCUM\
result_00+= UChi_00;\
result_01+= UChi_01;\
result_02+= UChi_02;\
result_10+= UChi_10;\
result_11+= UChi_11;\
result_12+= UChi_12;\
result_20+= UChi_00; \
result_21+= UChi_01; \
result_22+= UChi_02; \
result_30+= UChi_10; \
result_31+= UChi_11; \
result_32+= UChi_12;\
std::cout << std::endl << "DEBUG -- TP_RECON_ACCUM" << std::endl; \
std::cout << "result_00 -- " << result_00 << std::endl; \
std::cout << "result_01 -- " << result_01 << std::endl; \
std::cout << "result_02 -- " << result_02 << std::endl; \
std::cout << "result_10 -- " << result_10 << std::endl; \
std::cout << "result_11 -- " << result_11 << std::endl; \
std::cout << "result_12 -- " << result_12 << std::endl; \
std::cout << "result_20 -- " << result_20 << std::endl; \
std::cout << "result_21 -- " << result_21 << std::endl; \
std::cout << "result_22 -- " << result_22 << std::endl; \
std::cout << "result_30 -- " << result_30 << std::endl; \
std::cout << "result_31 -- " << result_31 << std::endl; \
std::cout << "result_32 -- " << result_32 << std::endl;
#define TM_RECON_ACCUM\
result_00+= UChi_00;\
result_01+= UChi_01;\
result_02+= UChi_02;\
result_10+= UChi_10;\
result_11+= UChi_11;\
result_12+= UChi_12;\
result_20-= UChi_00; \
result_21-= UChi_01; \
result_22-= UChi_02; \
result_30-= UChi_10; \
result_31-= UChi_11; \
result_32-= UChi_12;\
std::cout << std::endl << "DEBUG -- TM_RECON_ACCUM" << std::endl; \
std::cout << "result_00 -- " << result_00 << std::endl; \
std::cout << "result_01 -- " << result_01 << std::endl; \
std::cout << "result_02 -- " << result_02 << std::endl; \
std::cout << "result_10 -- " << result_10 << std::endl; \
std::cout << "result_11 -- " << result_11 << std::endl; \
std::cout << "result_12 -- " << result_12 << std::endl; \
std::cout << "result_20 -- " << result_20 << std::endl; \
std::cout << "result_21 -- " << result_21 << std::endl; \
std::cout << "result_22 -- " << result_22 << std::endl; \
std::cout << "result_30 -- " << result_30 << std::endl; \
std::cout << "result_31 -- " << result_31 << std::endl; \
std::cout << "result_32 -- " << result_32 << std::endl;
#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON) \
SE=st.GetEntry(ptype,DIR,ss); \
offset = SE->_offset; \
local = SE->_is_local; \
perm = SE->_permute; \
if ( local ) { \
LOAD_CHIMU; \
PROJ; \
if ( perm) { \
PERMUTE_DIR(PERM); \
} \
} else { \
LOAD_CHI; \
} \
MULT_2SPIN(DIR); \
RECON;
#define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON) \
SE=st.GetEntry(ptype,DIR,ss); \
offset = SE->_offset; \
local = SE->_is_local; \
perm = SE->_permute; \
if ( local ) { \
LOAD_CHIMU; \
PROJ; \
if ( perm) { \
PERMUTE_DIR(PERM); \
} \
} else if ( st.same_node[DIR] ) { \
LOAD_CHI; \
} \
if (local || st.same_node[DIR] ) { \
MULT_2SPIN(DIR); \
RECON; \
}
#define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON) \
SE=st.GetEntry(ptype,DIR,ss); \
offset = SE->_offset; \
if((!SE->_is_local)&&(!st.same_node[DIR]) ) { \
LOAD_CHI; \
MULT_2SPIN(DIR); \
RECON; \
nmu++; \
}
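// The three leg variants above differ only in their guards: the plain leg
// always multiplies and reconstructs, the _INT leg does so only for local or
// same-node neighbours, and the _EXT leg only for off-node neighbours, so one
// interior plus one exterior sweep touches every leg exactly once.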
#define HAND_RESULT(ss) \
{ \
SiteSpinor & ref (out[ss]); \
vstream(ref()(0)(0),result_00); \
vstream(ref()(0)(1),result_01); \
vstream(ref()(0)(2),result_02); \
vstream(ref()(1)(0),result_10); \
vstream(ref()(1)(1),result_11); \
vstream(ref()(1)(2),result_12); \
vstream(ref()(2)(0),result_20); \
vstream(ref()(2)(1),result_21); \
vstream(ref()(2)(2),result_22); \
vstream(ref()(3)(0),result_30); \
vstream(ref()(3)(1),result_31); \
vstream(ref()(3)(2),result_32); \
std::cout << std::endl << "DEBUG -- RESULT" << std::endl; \
std::cout << "result_00 -- " << result_00 << std::endl; \
std::cout << "result_01 -- " << result_01 << std::endl; \
std::cout << "result_02 -- " << result_02 << std::endl; \
std::cout << "result_10 -- " << result_10 << std::endl; \
std::cout << "result_11 -- " << result_11 << std::endl; \
std::cout << "result_12 -- " << result_12 << std::endl; \
std::cout << "result_20 -- " << result_20 << std::endl; \
std::cout << "result_21 -- " << result_21 << std::endl; \
std::cout << "result_22 -- " << result_22 << std::endl; \
std::cout << "result_30 -- " << result_30 << std::endl; \
std::cout << "result_31 -- " << result_31 << std::endl; \
std::cout << "result_32 -- " << result_32 << std::endl;\
}
#define HAND_RESULT_EXT(ss) \
if (nmu){ \
SiteSpinor & ref (out[ss]); \
ref()(0)(0)+=result_00; \
ref()(0)(1)+=result_01; \
ref()(0)(2)+=result_02; \
ref()(1)(0)+=result_10; \
ref()(1)(1)+=result_11; \
ref()(1)(2)+=result_12; \
ref()(2)(0)+=result_20; \
ref()(2)(1)+=result_21; \
ref()(2)(2)+=result_22; \
ref()(3)(0)+=result_30; \
ref()(3)(1)+=result_31; \
ref()(3)(2)+=result_32; \
std::cout << std::endl << "DEBUG -- RESULT EXT" << std::endl; \
std::cout << "result_00 -- " << result_00 << std::endl; \
std::cout << "result_01 -- " << result_01 << std::endl; \
std::cout << "result_02 -- " << result_02 << std::endl; \
std::cout << "result_10 -- " << result_10 << std::endl; \
std::cout << "result_11 -- " << result_11 << std::endl; \
std::cout << "result_12 -- " << result_12 << std::endl; \
std::cout << "result_20 -- " << result_20 << std::endl; \
std::cout << "result_21 -- " << result_21 << std::endl; \
std::cout << "result_22 -- " << result_22 << std::endl; \
std::cout << "result_30 -- " << result_30 << std::endl; \
std::cout << "result_31 -- " << result_31 << std::endl; \
std::cout << "result_32 -- " << result_32 << std::endl;\
}
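// Unlike HAND_RESULT, which streams a complete new value into out[ss], the
// _EXT variant accumulates on top of the interior contribution already stored
// there, and only when at least one off-node leg fired (nmu != 0).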
#define HAND_DECLARATIONS(a) \
Simd result_00; \
Simd result_01; \
Simd result_02; \
Simd result_10; \
Simd result_11; \
Simd result_12; \
Simd result_20; \
Simd result_21; \
Simd result_22; \
Simd result_30; \
Simd result_31; \
Simd result_32; \
Simd Chi_00; \
Simd Chi_01; \
Simd Chi_02; \
Simd Chi_10; \
Simd Chi_11; \
Simd Chi_12; \
Simd UChi_00; \
Simd UChi_01; \
Simd UChi_02; \
Simd UChi_10; \
Simd UChi_11; \
Simd UChi_12; \
Simd U_00; \
Simd U_10; \
Simd U_20; \
Simd U_01; \
Simd U_11; \
Simd U_21;\
Simd debugreg;\
svbool_t pg1; \
pg1 = svptrue_b64();
#define ZERO_RESULT \
result_00=Zero(); \
result_01=Zero(); \
result_02=Zero(); \
result_10=Zero(); \
result_11=Zero(); \
result_12=Zero(); \
result_20=Zero(); \
result_21=Zero(); \
result_22=Zero(); \
result_30=Zero(); \
result_31=Zero(); \
result_32=Zero();
#define Chimu_00 Chi_00
#define Chimu_01 Chi_01
#define Chimu_02 Chi_02
#define Chimu_10 Chi_10
#define Chimu_11 Chi_11
#define Chimu_12 Chi_12
#define Chimu_20 UChi_00
#define Chimu_21 UChi_01
#define Chimu_22 UChi_02
#define Chimu_30 UChi_10
#define Chimu_31 UChi_11
#define Chimu_32 UChi_12
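// Register aliasing: LOAD_CHIMU places the twelve full-spinor components in
// the six Chi and six UChi registers via the Chimu_* aliases above; the
// projection then overwrites Chi in place, which is safe because the lower
// spin components are dead once projected, keeping the Simd working set small.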
NAMESPACE_BEGIN(Grid);
template<class Impl> void
WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
{
// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
HAND_DECLARATIONS(ignore);
int offset,local,perm, ptype;
StencilEntry *SE;
HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON);
HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM);
HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM);
HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM);
HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM);
HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM);
HAND_RESULT(ss);
}
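// Leg ordering: the non-dagger kernel pairs the XM projector with the Xp
// neighbour because Dhop gathers the (1 - gamma_mu)-projected spinor from the
// +mu direction; HandDhopSiteDag below makes the opposite pairing.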
template<class Impl>
void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
{
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
HAND_DECLARATIONS(ignore);
StencilEntry *SE;
int offset,local,perm, ptype;
HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON);
HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM);
HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
HAND_STENCIL_LEG(TP_PROJ,0,Tp,TP_RECON_ACCUM);
HAND_STENCIL_LEG(XM_PROJ,3,Xm,XM_RECON_ACCUM);
HAND_STENCIL_LEG(YM_PROJ,2,Ym,YM_RECON_ACCUM);
HAND_STENCIL_LEG(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
HAND_STENCIL_LEG(TM_PROJ,0,Tm,TM_RECON_ACCUM);
HAND_RESULT(ss);
}
template<class Impl> void
WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
{
// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
HAND_DECLARATIONS(ignore);
int offset,local,perm, ptype;
StencilEntry *SE;
ZERO_RESULT;
HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM);
HAND_STENCIL_LEG_INT(YM_PROJ,2,Yp,YM_RECON_ACCUM);
HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
HAND_STENCIL_LEG_INT(TM_PROJ,0,Tp,TM_RECON_ACCUM);
HAND_STENCIL_LEG_INT(XP_PROJ,3,Xm,XP_RECON_ACCUM);
HAND_STENCIL_LEG_INT(YP_PROJ,2,Ym,YP_RECON_ACCUM);
HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
HAND_STENCIL_LEG_INT(TP_PROJ,0,Tm,TP_RECON_ACCUM);
HAND_RESULT(ss);
}
template<class Impl>
void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
{
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
HAND_DECLARATIONS(ignore);
StencilEntry *SE;
int offset,local,perm, ptype;
ZERO_RESULT;
HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM);
HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
HAND_STENCIL_LEG_INT(TP_PROJ,0,Tp,TP_RECON_ACCUM);
HAND_STENCIL_LEG_INT(XM_PROJ,3,Xm,XM_RECON_ACCUM);
HAND_STENCIL_LEG_INT(YM_PROJ,2,Ym,YM_RECON_ACCUM);
HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
HAND_STENCIL_LEG_INT(TM_PROJ,0,Tm,TM_RECON_ACCUM);
HAND_RESULT(ss);
}
template<class Impl> void
WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
{
// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
HAND_DECLARATIONS(ignore);
int offset, ptype;
StencilEntry *SE;
int nmu=0;
ZERO_RESULT;
HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xp,XM_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(YM_PROJ,2,Yp,YM_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tp,TM_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xm,XP_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(YP_PROJ,2,Ym,YP_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tm,TP_RECON_ACCUM);
HAND_RESULT_EXT(ss);
}
template<class Impl>
void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
{
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
HAND_DECLARATIONS(ignore);
StencilEntry *SE;
int offset, ptype;
int nmu=0;
ZERO_RESULT;
HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(YP_PROJ,2,Yp,YP_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tp,TP_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xm,XM_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(YM_PROJ,2,Ym,YM_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tm,TM_RECON_ACCUM);
HAND_RESULT_EXT(ss);
}
////////////// Wilson ; uses this implementation /////////////////////
NAMESPACE_END(Grid);
#undef LOAD_CHIMU
#undef LOAD_CHI
#undef MULT_2SPIN
#undef PERMUTE_DIR
#undef XP_PROJ
#undef YP_PROJ
#undef ZP_PROJ
#undef TP_PROJ
#undef XM_PROJ
#undef YM_PROJ
#undef ZM_PROJ
#undef TM_PROJ
#undef XP_RECON
#undef XP_RECON_ACCUM
#undef XM_RECON
#undef XM_RECON_ACCUM
#undef YP_RECON_ACCUM
#undef YM_RECON_ACCUM
#undef ZP_RECON_ACCUM
#undef ZM_RECON_ACCUM
#undef TP_RECON_ACCUM
#undef TM_RECON_ACCUM
#undef ZERO_RESULT
#undef Chimu_00
#undef Chimu_01
#undef Chimu_02
#undef Chimu_10
#undef Chimu_11
#undef Chimu_12
#undef Chimu_20
#undef Chimu_21
#undef Chimu_22
#undef Chimu_30
#undef Chimu_31
#undef Chimu_32
#undef HAND_STENCIL_LEG
#undef HAND_STENCIL_LEG_INT
#undef HAND_STENCIL_LEG_EXT
#undef HAND_RESULT
#undef HAND_RESULT_INT
#undef HAND_RESULT_EXT

View File

@@ -114,7 +114,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
 ////////////////////////////////////////////////////////////////////
 // All legs kernels ; comms then compute
 ////////////////////////////////////////////////////////////////////
-template <class Impl> accelerator_inline
+template <class Impl>
 void WilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,
                                              SiteHalfSpinor *buf, int sF,
                                              int sU, const FermionFieldView &in, FermionFieldView &out)
@@ -140,7 +140,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldV
   coalescedWrite(out[sF],result,lane);
 };
-template <class Impl> accelerator_inline
+template <class Impl>
 void WilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGaugeFieldView &U,
                                           SiteHalfSpinor *buf, int sF,
                                           int sU, const FermionFieldView &in, FermionFieldView &out)
@@ -169,7 +169,7 @@ void WilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGaugeFieldView
 ////////////////////////////////////////////////////////////////////
 // Interior kernels
 ////////////////////////////////////////////////////////////////////
-template <class Impl> accelerator_inline
+template <class Impl>
 void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,
                                                 SiteHalfSpinor *buf, int sF,
                                                 int sU, const FermionFieldView &in, FermionFieldView &out)
@@ -197,7 +197,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st, DoubledGaugeFi
   coalescedWrite(out[sF], result,lane);
 };
-template <class Impl> accelerator_inline
+template <class Impl>
 void WilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,
                                              SiteHalfSpinor *buf, int sF,
                                              int sU, const FermionFieldView &in, FermionFieldView &out)
@@ -227,7 +227,7 @@ void WilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st, DoubledGaugeField
 ////////////////////////////////////////////////////////////////////
 // Exterior kernels
 ////////////////////////////////////////////////////////////////////
-template <class Impl> accelerator_inline
+template <class Impl>
 void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,
                                                 SiteHalfSpinor *buf, int sF,
                                                 int sU, const FermionFieldView &in, FermionFieldView &out)
@@ -258,7 +258,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st, DoubledGaugeFi
   }
 };
-template <class Impl> accelerator_inline
+template <class Impl>
 void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,
                                              SiteHalfSpinor *buf, int sF,
                                              int sU, const FermionFieldView &in, FermionFieldView &out)
@@ -290,7 +290,7 @@ void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st, DoubledGaugeField
 };
 #define DhopDirMacro(Dir,spProj,spRecon) \
-template <class Impl> accelerator_inline \
+template <class Impl> \
 void WilsonKernels<Impl>::DhopDir##Dir(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF, \
                                        int sU, const FermionFieldView &in, FermionFieldView &out, int dir) \
 { \
@@ -318,7 +318,7 @@ DhopDirMacro(Ym,spProjYm,spReconYm);
 DhopDirMacro(Zm,spProjZm,spReconZm);
 DhopDirMacro(Tm,spProjTm,spReconTm);
-template <class Impl> accelerator_inline
+template <class Impl>
 void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF,
                                     int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int gamma)
 {
@@ -501,3 +501,4 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
 #undef ASM_CALL
 NAMESPACE_END(Grid);
+

View File

@@ -1,51 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Copyright (C) 2015, 2020
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
#ifndef AVX512
#ifndef QPX
#ifndef A64FX
#ifndef A64FXFIXEDSIZE
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
#endif
#endif
#endif
#endif
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class WilsonKernels<IMPLEMENTATION>;
NAMESPACE_END(Grid);

View File

@@ -0,0 +1 @@
../WilsonKernelsInstantiation.cc.master

View File

@@ -1,51 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Copyright (C) 2015, 2020
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
#ifndef AVX512
#ifndef QPX
#ifndef A64FX
#ifndef A64FXFIXEDSIZE
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
#endif
#endif
#endif
#endif
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class WilsonKernels<IMPLEMENTATION>;
NAMESPACE_END(Grid);

View File

@@ -0,0 +1 @@
../WilsonKernelsInstantiation.cc.master

View File

@@ -1,51 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Copyright (C) 2015, 2020
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
#ifndef AVX512
#ifndef QPX
#ifndef A64FX
#ifndef A64FXFIXEDSIZE
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
#endif
#endif
#endif
#endif
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class WilsonKernels<IMPLEMENTATION>;
NAMESPACE_END(Grid);

View File

@@ -0,0 +1 @@
../WilsonKernelsInstantiation.cc.master

View File

@@ -1,51 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Copyright (C) 2015, 2020
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
#ifndef AVX512
#ifndef QPX
#ifndef A64FX
#ifndef A64FXFIXEDSIZE
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
#endif
#endif
#endif
#endif
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class WilsonKernels<IMPLEMENTATION>;
NAMESPACE_END(Grid);

View File

@ -0,0 +1 @@
../WilsonKernelsInstantiation.cc.master

View File

@ -1,51 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Copyright (C) 2015, 2020
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
#ifndef AVX512
#ifndef QPX
#ifndef A64FX
#ifndef A64FXFIXEDSIZE
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
#endif
#endif
#endif
#endif
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class WilsonKernels<IMPLEMENTATION>;
NAMESPACE_END(Grid);

View File

@ -0,0 +1 @@
../WilsonKernelsInstantiation.cc.master

View File

@ -1,51 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Copyright (C) 2015, 2020
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
#ifndef AVX512
#ifndef QPX
#ifndef A64FX
#ifndef A64FXFIXEDSIZE
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
#endif
#endif
#endif
#endif
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class WilsonKernels<IMPLEMENTATION>;
NAMESPACE_END(Grid);

View File

@ -0,0 +1 @@
../WilsonKernelsInstantiation.cc.master

View File

@ -4,12 +4,11 @@ Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Copyright (C) 2015, 2020 Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk> Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local> Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk> Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
This program is free software; you can redistribute it and/or modify This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
@ -35,13 +34,9 @@ directory
#ifndef AVX512 #ifndef AVX512
#ifndef QPX #ifndef QPX
#ifndef A64FX
#ifndef A64FXFIXEDSIZE
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h> #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
#endif #endif
#endif #endif
#endif
#endif
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
@ -49,3 +44,4 @@ NAMESPACE_BEGIN(Grid);
template class WilsonKernels<IMPLEMENTATION>; template class WilsonKernels<IMPLEMENTATION>;
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@ -37,7 +37,6 @@ directory
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmAvx512.h> #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmAvx512.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmQPX.h> #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmQPX.h>
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@ -1,51 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Copyright (C) 2015, 2020
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
#ifndef AVX512
#ifndef QPX
#ifndef A64FX
#ifndef A64FXFIXEDSIZE
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
#endif
#endif
#endif
#endif
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class WilsonKernels<IMPLEMENTATION>;
NAMESPACE_END(Grid);

View File

@ -1,51 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Copyright (C) 2015, 2020
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
#ifndef AVX512
#ifndef QPX
#ifndef A64FX
#ifndef A64FXFIXEDSIZE
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
#endif
#endif
#endif
#endif
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class WilsonKernels<IMPLEMENTATION>;
NAMESPACE_END(Grid);

View File

@ -1,51 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Copyright (C) 2015, 2020
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
#ifndef AVX512
#ifndef QPX
#ifndef A64FX
#ifndef A64FXFIXEDSIZE
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
#endif
#endif
#endif
#endif
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class WilsonKernels<IMPLEMENTATION>;
NAMESPACE_END(Grid);

View File

@ -1,51 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Copyright (C) 2015, 2020
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
#ifndef AVX512
#ifndef QPX
#ifndef A64FX
#ifndef A64FXFIXEDSIZE
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
#endif
#endif
#endif
#endif
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class WilsonKernels<IMPLEMENTATION>;
NAMESPACE_END(Grid);

View File

@ -1,51 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Copyright (C) 2015, 2020
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
#ifndef AVX512
#ifndef QPX
#ifndef A64FX
#ifndef A64FXFIXEDSIZE
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
#endif
#endif
#endif
#endif
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class WilsonKernels<IMPLEMENTATION>;
NAMESPACE_END(Grid);

View File

@ -0,0 +1 @@
../WilsonKernelsInstantiation.cc.master

View File

@ -1,51 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Copyright (C) 2015, 2020
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
#ifndef AVX512
#ifndef QPX
#ifndef A64FX
#ifndef A64FXFIXEDSIZE
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
#endif
#endif
#endif
#endif
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class WilsonKernels<IMPLEMENTATION>;
NAMESPACE_END(Grid);

View File

@ -0,0 +1 @@
../WilsonKernelsInstantiation.cc.master

View File

@ -1,51 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Copyright (C) 2015, 2020
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
#ifndef AVX512
#ifndef QPX
#ifndef A64FX
#ifndef A64FXFIXEDSIZE
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
#endif
#endif
#endif
#endif
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class WilsonKernels<IMPLEMENTATION>;
NAMESPACE_END(Grid);

View File

@ -0,0 +1 @@
../WilsonKernelsInstantiation.cc.master

View File

@ -1,51 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Copyright (C) 2015, 2020
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
#ifndef AVX512
#ifndef QPX
#ifndef A64FX
#ifndef A64FXFIXEDSIZE
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
#endif
#endif
#endif
#endif
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class WilsonKernels<IMPLEMENTATION>;
NAMESPACE_END(Grid);

View File

@ -0,0 +1 @@
../WilsonKernelsInstantiation.cc.master

View File

@ -59,7 +59,7 @@ public:
} }
static inline GaugeLinkField static inline GaugeLinkField
CovShiftIdentityBackward(const GaugeLinkField &Link, int mu) { CovShiftIdentityBackward(const GaugeLinkField &Link, int mu) {
return Cshift(adj(Link), mu, -1); return Cshift(closure(adj(Link)), mu, -1);
} }
static inline GaugeLinkField static inline GaugeLinkField
CovShiftIdentityForward(const GaugeLinkField &Link, int mu) { CovShiftIdentityForward(const GaugeLinkField &Link, int mu) {

View File

@ -140,7 +140,17 @@ private:
// Can move this outside? // Can move this outside?
typedef IntegratorType<SmearingPolicy> TheIntegrator; typedef IntegratorType<SmearingPolicy> TheIntegrator;
TheIntegrator MDynamics(UGrid, Parameters.MD, TheAction, Smearing); // Metric
//TrivialMetric<typename Implementation::Field> Mtr;
ConjugateGradient<LatticeGaugeField> CG(1.0e-8,10000);
LaplacianParams LapPar(0.0001, 1.0, 10000, 1e-8, 12, 64);
// RealD Kappa = 1.2;
RealD Kappa = Parameters.Kappa;
std::cout << GridLogMessage << "Kappa = " << Kappa << std::endl;
// Better to pass the generalised momenta to the integrator
LaplacianAdjointField<PeriodicGimplR> Laplacian(UGrid, CG, LapPar, Kappa);
TheIntegrator MDynamics(UGrid, Parameters.MD, TheAction, Smearing, Laplacian);
if (Parameters.StartingType == "HotStart") { if (Parameters.StartingType == "HotStart") {
// Hot start // Hot start
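For orientation: the Laplacian constructed in this hunk supplies a field-dependent metric M(U) for the momenta, so the Hamiltonian being simulated is, schematically (the exact normalisation lives inside LaplacianAdjointField and is not shown here),

\[ H(U,P) \;=\; \tfrac{1}{2}\,\mathrm{Tr}\!\left[\,P^\dagger\, M(U)^{-1} P\,\right] \;+\; S[U], \]

with the Kappa read from HMCparameters setting the strength of the Laplacian term in M(U); the trivial metric recovers standard HMC.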

View File

@ -53,6 +53,7 @@ struct HMCparameters: Serializable {
bool, MetropolisTest, bool, MetropolisTest,
Integer, NoMetropolisUntil, Integer, NoMetropolisUntil,
std::string, StartingType, std::string, StartingType,
RealD, Kappa,
IntegratorParameters, MD) IntegratorParameters, MD)
HMCparameters() { HMCparameters() {

View File

@ -73,7 +73,8 @@ protected:
double t_U; // Track time passing on each level and for U and for P double t_U; // Track time passing on each level and for U and for P
std::vector<double> t_P; std::vector<double> t_P;
MomentaField P; // MomentaField P;
GeneralisedMomenta<FieldImplementation > P;
SmearingPolicy& Smearer; SmearingPolicy& Smearer;
RepresentationPolicy Representations; RepresentationPolicy Representations;
IntegratorParameters Params; IntegratorParameters Params;
@ -83,7 +84,7 @@ protected:
void update_P(Field& U, int level, double ep) void update_P(Field& U, int level, double ep)
{ {
t_P[level] += ep; t_P[level] += ep;
update_P(P, U, level, ep); update_P(P.Mom, U, level, ep);
std::cout << GridLogIntegrator << "[" << level << "] P " << " dt " << ep << " : t_P " << t_P[level] << std::endl; std::cout << GridLogIntegrator << "[" << level << "] P " << " dt " << ep << " : t_P " << t_P[level] << std::endl;
} }
@ -111,6 +112,21 @@ protected:
// input U actually not used in the fundamental case // input U actually not used in the fundamental case
// Fundamental updates, include smearing // Fundamental updates, include smearing
// Generalised momenta
// Derivative of the kinetic term must be computed before
// Mom is the momenta and gets updated by the
// actions' derivatives
MomentaField MomDer(P.Mom.Grid());
P.M.ImportGauge(U);
P.DerivativeU(P.Mom, MomDer);
Mom -= MomDer * ep;
// Auxiliary fields
P.update_auxiliary_momenta(ep*0.5);
P.AuxiliaryFieldsDerivative(MomDer);
Mom -= MomDer * ep;
P.update_auxiliary_momenta(ep*0.5);
for (int a = 0; a < as[level].actions.size(); ++a) { for (int a = 0; a < as[level].actions.size(); ++a) {
double start_full = usecond(); double start_full = usecond();
Field force(U.Grid()); Field force(U.Grid());
@ -137,9 +153,83 @@ protected:
as[level].apply(update_P_hireps, Representations, Mom, U, ep); as[level].apply(update_P_hireps, Representations, Mom, U, ep);
} }
void implicit_update_P(Field& U, int level, double ep, bool intermediate = false) {
t_P[level] += ep;
std::cout << GridLogIntegrator << "[" << level << "] P "
<< " dt " << ep << " : t_P " << t_P[level] << std::endl;
// Fundamental updates, include smearing
MomentaField Msum(P.Mom.Grid());
Msum = Zero();
for (int a = 0; a < as[level].actions.size(); ++a) {
// Compute the force terms for the Lagrangian part
// We need to compute the derivative of the actions
// only once
Field force(U.Grid());
conformable(U.Grid(), P.Mom.Grid());
Field& Us = Smearer.get_U(as[level].actions.at(a)->is_smeared);
as[level].actions.at(a)->deriv(Us, force); // deriv should NOT include Ta
std::cout << GridLogIntegrator << "Smearing (on/off): " << as[level].actions.at(a)->is_smeared << std::endl;
if (as[level].actions.at(a)->is_smeared) Smearer.smeared_force(force);
force = FieldImplementation::projectForce(force); // Ta for gauge fields
Real force_abs = std::sqrt(norm2(force) / U.Grid()->gSites());
std::cout << GridLogIntegrator << "|Force| site average: " << force_abs
<< std::endl;
Msum += force;
}
MomentaField NewMom = P.Mom;
MomentaField OldMom = P.Mom;
double threshold = 1e-8;
P.M.ImportGauge(U);
MomentaField MomDer(P.Mom.Grid());
MomentaField MomDer1(P.Mom.Grid());
MomentaField AuxDer(P.Mom.Grid());
MomDer1 = Zero();
MomentaField diff(P.Mom.Grid());
double factor = 2.0;
if (intermediate){
P.DerivativeU(P.Mom, MomDer1);
factor = 1.0;
}
// Auxiliary fields
P.update_auxiliary_momenta(ep*0.5);
P.AuxiliaryFieldsDerivative(AuxDer);
Msum += AuxDer;
// Here run recursively
int counter = 1;
RealD RelativeError;
do {
std::cout << GridLogIntegrator << "UpdateP implicit step "<< counter << std::endl;
// Compute the derivative of the kinetic term
// with respect to the gauge field
P.DerivativeU(NewMom, MomDer);
Real force_abs = std::sqrt(norm2(MomDer) / U.Grid()->gSites());
std::cout << GridLogIntegrator << "|Force| laplacian site average: " << force_abs
<< std::endl;
NewMom = P.Mom - ep* 0.5 * (2.0*Msum + factor*MomDer + MomDer1);// simplify
diff = NewMom - OldMom;
counter++;
RelativeError = std::sqrt(norm2(diff))/std::sqrt(norm2(NewMom));
std::cout << GridLogIntegrator << "UpdateP RelativeError: " << RelativeError << std::endl;
OldMom = NewMom;
} while (RelativeError > threshold);
P.Mom = NewMom;
// update the auxiliary fields momenta
P.update_auxiliary_momenta(ep*0.5);
}
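The do/while above is a plain fixed-point solve of the implicit half-kick \(P' = P - \tfrac{\epsilon}{2}\,[\,F(U) + K(U,P')\,]\), where K is the momentum-dependent derivative of the generalised kinetic term. A minimal self-contained sketch of the same pattern, with hypothetical scalar stand-ins dS and dK (not Grid API):

#include <cmath>

// Sketch: solve p' = p - (eps/2) * ( dS(u) + dK(u, p') ) by fixed-point
// iteration, re-evaluating the kinetic derivative at the latest iterate,
// in the same way implicit_update_P re-evaluates MomDer on NewMom.
double solve_implicit_kick(double p, double u, double eps,
                           double (*dS)(double), double (*dK)(double, double)) {
  double pOld = p, pNew = p, rel;
  const double threshold = 1e-8;   // same style of stopping criterion as above
  int counter = 0;
  do {
    pNew = p - 0.5 * eps * (dS(u) + dK(u, pOld));
    rel  = std::fabs(pNew - pOld) / std::fabs(pNew);
    pOld = pNew;
  } while (rel > threshold && ++counter < 100);
  return pNew;
}

Convergence is monitored exactly as in the loop above, through the relative change between successive iterates.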
void update_U(Field& U, double ep) void update_U(Field& U, double ep)
{ {
update_U(P, U, ep); update_U(P.Mom, U, ep);
t_U += ep; t_U += ep;
int fl = levels - 1; int fl = levels - 1;
@ -158,15 +248,64 @@ protected:
Representations.update(U); // void functions if fundamental representation Representations.update(U); // void functions if fundamental representation
} }
void implicit_update_U(Field&U, double ep){
t_U += ep;
int fl = levels - 1;
std::cout << GridLogIntegrator << " " << "[" << fl << "] U " << " dt " << ep << " : t_U " << t_U << std::endl;
MomentaField Mom1(P.Mom.Grid());
MomentaField Mom2(P.Mom.Grid());
RealD RelativeError;
Field diff(U.Grid());
Real threshold = 1e-8;
int counter = 1;
int MaxCounter = 100;
Field OldU = U;
Field NewU = U;
P.M.ImportGauge(U);
P.DerivativeP(Mom1); // first term in the derivative
P.update_auxiliary_fields(ep*0.5);
MomentaField sum=Mom1;
do {
std::cout << GridLogIntegrator << "UpdateU implicit step "<< counter << std::endl;
P.DerivativeP(Mom2); // second term in the derivative, on the updated U
sum = (Mom1 + Mom2);
for (int mu = 0; mu < Nd; mu++) {
auto Umu = PeekIndex<LorentzIndex>(U, mu);
auto Pmu = PeekIndex<LorentzIndex>(sum, mu);
Umu = expMat(Pmu, ep * 0.5, 12) * Umu;
PokeIndex<LorentzIndex>(NewU, ProjectOnGroup(Umu), mu);
}
diff = NewU - OldU;
RelativeError = std::sqrt(norm2(diff))/std::sqrt(norm2(NewU));
std::cout << GridLogIntegrator << "UpdateU RelativeError: " << RelativeError << std::endl;
P.M.ImportGauge(NewU);
OldU = NewU; // some redundancy to be eliminated
counter++;
} while (RelativeError > threshold && counter < MaxCounter);
U = NewU;
P.update_auxiliary_fields(ep*0.5);
}
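The drift solved by this loop is, schematically, the implicit midpoint (trapezoidal) rule

\[ U' \;=\; \exp\!\Big(\tfrac{\epsilon}{2}\,\big[\,v(U) + v(U')\,\big]\Big)\, U, \qquad v \equiv \partial H/\partial P, \]

iterated (DerivativeP re-evaluated on the running NewU) until the relative change drops below the threshold; this symmetric form is what keeps the update reversible when the kinetic term depends on U.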
virtual void step(Field& U, int level, int first, int last) = 0; virtual void step(Field& U, int level, int first, int last) = 0;
public: public:
Integrator(GridBase* grid, IntegratorParameters Par, Integrator(GridBase* grid, IntegratorParameters Par,
ActionSet<Field, RepresentationPolicy>& Aset, ActionSet<Field, RepresentationPolicy>& Aset,
SmearingPolicy& Sm) SmearingPolicy& Sm, Metric<MomentaField>& M)
: Params(Par), : Params(Par),
as(Aset), as(Aset),
P(grid), P(grid, M),
levels(Aset.size()), levels(Aset.size()),
Smearer(Sm), Smearer(Sm),
Representations(grid) Representations(grid)
@ -203,7 +342,9 @@ public:
void reverse_momenta() void reverse_momenta()
{ {
P *= -1.0; // P *= -1.0;
P.Mom *= -1.0;
P.AuxMom *= -1.0;
} }
// to be used by the actionlevel class to iterate // to be used by the actionlevel class to iterate
@ -223,10 +364,13 @@ public:
// Initialization of momenta and actions // Initialization of momenta and actions
void refresh(Field& U, GridParallelRNG& pRNG) void refresh(Field& U, GridParallelRNG& pRNG)
{ {
assert(P.Grid() == U.Grid()); assert(P.Mom.Grid() == U.Grid());
std::cout << GridLogIntegrator << "Integrator refresh\n"; std::cout << GridLogIntegrator << "Integrator refresh\n";
FieldImplementation::generate_momenta(P, pRNG); // FieldImplementation::generate_momenta(P.Mom, pRNG);
P.M.ImportGauge(U);
P.MomentaDistribution(pRNG);
// Update the smeared fields, can be implemented as observer // Update the smeared fields, can be implemented as observer
// necessary to keep the fields updated even after a reject // necessary to keep the fields updated even after a reject
@ -272,9 +416,11 @@ public:
std::cout << GridLogIntegrator << "Integrator action\n"; std::cout << GridLogIntegrator << "Integrator action\n";
RealD H = - FieldImplementation::FieldSquareNorm(P)/HMC_MOMENTUM_DENOMINATOR; // - trace (P*P)/denom // RealD H = - FieldImplementation::FieldSquareNorm(P)/HMC_MOMENTUM_DENOMINATOR; // - trace (P*P)/denom
P.M.ImportGauge(U);
RealD H = - P.MomentaAction();
RealD Hterm; RealD Hterm;
std::cout << GridLogMessage << "Momentum action H_p = " << H << "\n";
// Actions // Actions
for (int level = 0; level < as.size(); ++level) { for (int level = 0; level < as.size(); ++level) {
@ -301,9 +447,9 @@ public:
t_P[level] = 0; t_P[level] = 0;
} }
for (int stp = 0; stp < Params.MDsteps; ++stp) { // MD step for (int step = 0; step < Params.MDsteps; ++step) { // MD step
int first_step = (stp == 0); int first_step = (step == 0);
int last_step = (stp == Params.MDsteps - 1); int last_step = (step == Params.MDsteps - 1);
this->step(U, 0, first_step, last_step); this->step(U, 0, first_step, last_step);
} }

View File

@ -101,8 +101,8 @@ public:
std::string integrator_name(){return "LeapFrog";} std::string integrator_name(){return "LeapFrog";}
LeapFrog(GridBase* grid, IntegratorParameters Par, ActionSet<Field, RepresentationPolicy>& Aset, SmearingPolicy& Sm) LeapFrog(GridBase* grid, IntegratorParameters Par, ActionSet<Field, RepresentationPolicy>& Aset, SmearingPolicy& Sm, Metric<Field>& M)
: Integrator<FieldImplementation, SmearingPolicy, RepresentationPolicy>(grid, Par, Aset, Sm){}; : Integrator<FieldImplementation, SmearingPolicy, RepresentationPolicy>(grid, Par, Aset, Sm,M){};
void step(Field& U, int level, int _first, int _last) { void step(Field& U, int level, int _first, int _last) {
int fl = this->as.size() - 1; int fl = this->as.size() - 1;
@ -144,8 +144,8 @@ private:
public: public:
INHERIT_FIELD_TYPES(FieldImplementation); INHERIT_FIELD_TYPES(FieldImplementation);
MinimumNorm2(GridBase* grid, IntegratorParameters Par, ActionSet<Field, RepresentationPolicy>& Aset, SmearingPolicy& Sm) MinimumNorm2(GridBase* grid, IntegratorParameters Par, ActionSet<Field, RepresentationPolicy>& Aset, SmearingPolicy& Sm, Metric<Field>& M)
: Integrator<FieldImplementation, SmearingPolicy, RepresentationPolicy>(grid, Par, Aset, Sm){}; : Integrator<FieldImplementation, SmearingPolicy, RepresentationPolicy>(grid, Par, Aset, Sm,M){};
std::string integrator_name(){return "MininumNorm2";} std::string integrator_name(){return "MininumNorm2";}
@ -207,9 +207,9 @@ public:
// Looks like dH scales as dt^4. tested wilson/wilson 2 level. // Looks like dH scales as dt^4. tested wilson/wilson 2 level.
ForceGradient(GridBase* grid, IntegratorParameters Par, ForceGradient(GridBase* grid, IntegratorParameters Par,
ActionSet<Field, RepresentationPolicy>& Aset, ActionSet<Field, RepresentationPolicy>& Aset,
SmearingPolicy& Sm) SmearingPolicy& Sm, Metric<Field>& M)
: Integrator<FieldImplementation, SmearingPolicy, RepresentationPolicy>( : Integrator<FieldImplementation, SmearingPolicy, RepresentationPolicy>(
grid, Par, Aset, Sm){}; grid, Par, Aset, Sm,M){};
std::string integrator_name(){return "ForceGradient";} std::string integrator_name(){return "ForceGradient";}
@ -271,6 +271,139 @@ public:
} }
}; };
////////////////////////////////
// Riemannian Manifold HMC
// Girolami et al
////////////////////////////////
// correct
template <class FieldImplementation, class SmearingPolicy,
class RepresentationPolicy =
Representations<FundamentalRepresentation> >
class ImplicitLeapFrog : public Integrator<FieldImplementation, SmearingPolicy,
RepresentationPolicy> {
public:
typedef ImplicitLeapFrog<FieldImplementation, SmearingPolicy, RepresentationPolicy>
Algorithm;
INHERIT_FIELD_TYPES(FieldImplementation);
// Riemannian manifold metric operator
// Hermitian operator Fisher
std::string integrator_name(){return "ImplicitLeapFrog";}
ImplicitLeapFrog(GridBase* grid, IntegratorParameters Par,
ActionSet<Field, RepresentationPolicy>& Aset, SmearingPolicy& Sm, Metric<Field>& M)
: Integrator<FieldImplementation, SmearingPolicy, RepresentationPolicy>(
grid, Par, Aset, Sm, M){};
void step(Field& U, int level, int _first, int _last) {
int fl = this->as.size() - 1;
// level : current level
// fl : final level
// eps : current step size
// Get current level step size
RealD eps = this->Params.trajL/this->Params.MDsteps;
for (int l = 0; l <= level; ++l) eps /= this->as[l].multiplier;
int multiplier = this->as[level].multiplier;
for (int e = 0; e < multiplier; ++e) {
int first_step = _first && (e == 0);
int last_step = _last && (e == multiplier - 1);
if (first_step) { // initial half step
this->implicit_update_P(U, level, eps / 2.0);
}
if (level == fl) { // lowest level
this->implicit_update_U(U, eps);
} else { // recursive function call
this->step(U, level + 1, first_step, last_step);
}
//int mm = last_step ? 1 : 2;
if (last_step){
this->update_P(U, level, eps / 2.0);
} else {
this->implicit_update_P(U, level, eps, true);// works intermediate step
// this->update_P(U, level, eps); // looks not reversible
}
}
}
};
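Per level, the scheme above is the generalised leapfrog of Girolami et al., schematically

\begin{align*}
P_{1/2} &= P_0 - \tfrac{\epsilon}{2}\,\partial_U H(U_0, P_{1/2}) & &\text{(implicit\_update\_P)}\\
U_1 &= \exp\!\Big(\tfrac{\epsilon}{2}\big[\partial_P H(U_0,P_{1/2}) + \partial_P H(U_1,P_{1/2})\big]\Big)\,U_0 & &\text{(implicit\_update\_U)}\\
P_1 &= P_{1/2} - \tfrac{\epsilon}{2}\,\partial_U H(U_1, P_{1/2}) & &\text{(explicit)}
\end{align*}

The closing half-kick is explicit in P, which is why the code falls back to plain update_P on the last step; on intermediate steps the trailing and leading half-kicks are fused into one implicit full kick (the intermediate flag).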
// This is not completely tested
template <class FieldImplementation, class SmearingPolicy,
class RepresentationPolicy =
Representations<FundamentalRepresentation> >
class ImplicitMinimumNorm2 : public Integrator<FieldImplementation, SmearingPolicy,
RepresentationPolicy> {
private:
const RealD lambda = 0.1931833275037836;
public:
INHERIT_FIELD_TYPES(FieldImplementation);
ImplicitMinimumNorm2(GridBase* grid, IntegratorParameters Par,
ActionSet<Field, RepresentationPolicy>& Aset, SmearingPolicy& Sm, Metric<Field>& M)
: Integrator<FieldImplementation, SmearingPolicy, RepresentationPolicy>(
grid, Par, Aset, Sm, M){};
std::string integrator_name(){return "ImplicitMininumNorm2";}
void step(Field& U, int level, int _first, int _last) {
// level : current level
// fl : final level
// eps : current step size
int fl = this->as.size() - 1;
RealD eps = this->Params.trajL/this->Params.MDsteps * 2.0;
for (int l = 0; l <= level; ++l) eps /= 2.0 * this->as[l].multiplier;
// Nesting: 2xupdate_U of size eps/2
// Next level is eps/2/multiplier
int multiplier = this->as[level].multiplier;
for (int e = 0; e < multiplier; ++e) { // steps per step
int first_step = _first && (e == 0);
int last_step = _last && (e == multiplier - 1);
if (first_step) { // initial half step
this->implicit_update_P(U, level, lambda * eps);
}
if (level == fl) { // lowest level
this->implicit_update_U(U, 0.5 * eps);
} else { // recursive function call
this->step(U, level + 1, first_step, 0);
}
this->implicit_update_P(U, level, (1.0 - 2.0 * lambda) * eps, true);
if (level == fl) { // lowest level
this->implicit_update_U(U, 0.5 * eps);
} else { // recursive function call
this->step(U, level + 1, 0, last_step);
}
//int mm = (last_step) ? 1 : 2;
//this->update_P(U, level, lambda * eps * mm);
if (last_step) {
this->update_P(U, level, eps * lambda);
} else {
this->implicit_update_P(U, level, lambda * eps*2.0, true);
}
}
}
};
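For reference, the underlying second-order minimum-norm (Omelyan) splitting realised by step() at each level is

\[ e^{\epsilon\hat H} \;\approx\; e^{\lambda\epsilon\hat K}\, e^{\frac{\epsilon}{2}\hat D}\, e^{(1-2\lambda)\epsilon\hat K}\, e^{\frac{\epsilon}{2}\hat D}\, e^{\lambda\epsilon\hat K}, \qquad \lambda = 0.1931833275037836, \]

with \(\hat K\) the momentum kick and \(\hat D\) the field drift, matching the lambda*eps / 0.5*eps / (1-2lambda)*eps / 0.5*eps / lambda*eps sequence in the code (implicit variants substituted where noted).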
NAMESPACE_END(Grid); NAMESPACE_END(Grid);
#endif // INTEGRATOR_INCLUDED #endif // INTEGRATOR_INCLUDED

View File

@ -53,21 +53,23 @@ namespace PeriodicBC {
return Cshift(tmp,mu,-1);// moves towards positive mu return Cshift(tmp,mu,-1);// moves towards positive mu
} }
template<class gauge,class Expr,typename std::enable_if<is_lattice_expr<Expr>::value,void>::type * = nullptr> template<class gauge,typename Op, typename T1> auto
auto CovShiftForward(const Lattice<gauge> &Link, CovShiftForward(const Lattice<gauge> &Link,
int mu, int mu,
const Expr &expr) -> decltype(closure(expr)) const LatticeUnaryExpression<Op,T1> &expr)
-> Lattice<decltype(expr.op.func(eval(0, expr.arg1)))>
{ {
auto arg = closure(expr); Lattice<decltype(expr.op.func(eval(0, expr.arg1)))> arg(expr);
return CovShiftForward(Link,mu,arg); return CovShiftForward(Link,mu,arg);
} }
template<class gauge,class Expr,typename std::enable_if<is_lattice_expr<Expr>::value,void>::type * = nullptr> template<class gauge,typename Op, typename T1> auto
auto CovShiftBackward(const Lattice<gauge> &Link, CovShiftBackward(const Lattice<gauge> &Link,
int mu, int mu,
const Expr &expr) -> decltype(closure(expr)) const LatticeUnaryExpression<Op,T1> &expr)
-> Lattice<decltype(expr.op.func(eval(0, expr.arg1)))>
{ {
auto arg = closure(expr); Lattice<decltype(expr.op.func(eval(0, expr.arg1)))> arg(expr);
return CovShiftBackward(Link,mu,arg); return CovShiftForward(Link,mu,arg);
} }
} }
@ -140,23 +142,26 @@ namespace ConjugateBC {
return Cshift(tmp,mu,-1);// moves towards positive mu return Cshift(tmp,mu,-1);// moves towards positive mu
} }
template<class gauge,class Expr,typename std::enable_if<is_lattice_expr<Expr>::value,void>::type * = nullptr> template<class gauge,typename Op, typename T1> auto
auto CovShiftForward(const Lattice<gauge> &Link, CovShiftForward(const Lattice<gauge> &Link,
int mu, int mu,
const Expr &expr) -> decltype(closure(expr)) const LatticeUnaryExpression<Op,T1> &expr)
-> Lattice<decltype(expr.op.func(eval(0, expr.arg1)))>
{ {
auto arg = closure(expr); Lattice<decltype(expr.op.func(eval(0, expr.arg1)))> arg(expr);
return CovShiftForward(Link,mu,arg); return CovShiftForward(Link,mu,arg);
} }
template<class gauge,class Expr,typename std::enable_if<is_lattice_expr<Expr>::value,void>::type * = nullptr> template<class gauge,typename Op, typename T1> auto
auto CovShiftBackward(const Lattice<gauge> &Link, CovShiftBackward(const Lattice<gauge> &Link,
int mu, int mu,
const Expr &expr) -> decltype(closure(expr)) const LatticeUnaryExpression<Op,T1> &expr)
-> Lattice<decltype(expr.op.func(eval(0, expr.arg1)))>
{ {
auto arg = closure(expr); Lattice<decltype(expr.op.func(eval(0, expr.arg1)))> arg(expr);
return CovShiftBackward(Link,mu,arg); return CovShiftForward(Link,mu,arg);
} }
} }
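This hunk swaps between per-expression-type overloads (LatticeUnaryExpression) and a single SFINAE-constrained template that materialises any lazy lattice expression with closure() before dispatching; note that the LatticeUnaryExpression variants of CovShiftBackward shown here return CovShiftForward(Link,mu,arg), an apparent copy-paste slip the generic form corrects. A minimal sketch of the SFINAE-plus-closure idiom outside Grid (Expr, is_expr, closure and shift_impl are hypothetical stand-ins, not Grid API):

#include <type_traits>

// Constrain a template to lazy-expression types, materialise the expression
// once, then dispatch to the concrete overload.
template <class T> struct is_expr : std::false_type {};

template <class Arg> struct Expr { Arg arg; };    // lazy expression node
template <class Arg> struct is_expr<Expr<Arg> > : std::true_type {};

template <class Arg>
Arg closure(const Expr<Arg>& e) { return e.arg; } // force evaluation

template <class Field,
          typename std::enable_if<!is_expr<Field>::value, void>::type* = nullptr>
Field shift_impl(const Field& f, int mu) { (void)mu; return f; } // concrete case

template <class E,
          typename std::enable_if<is_expr<E>::value, void>::type* = nullptr>
auto shift_impl(const E& e, int mu) -> decltype(closure(e)) {
  auto arg = closure(e);        // evaluate the expression template once
  return shift_impl(arg, mu);   // forward to the concrete overload
}

int main() {
  Expr<double> e{2.0};
  double shifted = shift_impl(e, 1); // expression path: closure, then dispatch
  (void)shifted;
  return 0;
}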

View File

@ -39,7 +39,7 @@ public:
typedef iSUnAdjointMatrix<ComplexF> AMatrixF; typedef iSUnAdjointMatrix<ComplexF> AMatrixF;
typedef iSUnAdjointMatrix<ComplexD> AMatrixD; typedef iSUnAdjointMatrix<ComplexD> AMatrixD;
typedef iSUnAdjointMatrix<vComplex> vAMatrix; typedef iSUnAdjointMatrix<vComplex> vAMatrix;
typedef iSUnAdjointMatrix<vComplexF> vAMatrixF; typedef iSUnAdjointMatrix<vComplexF> vAMatrixF;
typedef iSUnAdjointMatrix<vComplexD> vAMatrixD; typedef iSUnAdjointMatrix<vComplexD> vAMatrixD;
@ -47,9 +47,14 @@ public:
typedef Lattice<vAMatrixF> LatticeAdjMatrixF; typedef Lattice<vAMatrixF> LatticeAdjMatrixF;
typedef Lattice<vAMatrixD> LatticeAdjMatrixD; typedef Lattice<vAMatrixD> LatticeAdjMatrixD;
typedef Lattice<iVector<iScalar<iMatrix<vComplex, Dimension> >, Nd> > LatticeAdjField; typedef Lattice<iVector<iScalar<iMatrix<vComplex, Dimension> >, Nd> > LatticeAdjField;
typedef Lattice<iVector<iScalar<iMatrix<vComplexF, Dimension> >, Nd> > LatticeAdjFieldF; typedef Lattice<iVector<iScalar<iMatrix<vComplexF, Dimension> >, Nd> > LatticeAdjFieldF;
typedef Lattice<iVector<iScalar<iMatrix<vComplexD, Dimension> >, Nd> > LatticeAdjFieldD; typedef Lattice<iVector<iScalar<iMatrix<vComplexD, Dimension> >, Nd> > LatticeAdjFieldD;
template <class cplx> template <class cplx>
@ -123,9 +128,7 @@ public:
} }
// Projects the algebra components a lattice matrix (of dimension ncol*ncol -1 ) // Projects the algebra components a lattice matrix (of dimension ncol*ncol -1 )
static void projectOnAlgebra(typename SU<ncolour>::LatticeAlgebraVector &h_out, const LatticeAdjMatrix &in, Real scale = 1.0) static void projectOnAlgebra(typename SU<ncolour>::LatticeAlgebraVector &h_out, const LatticeAdjMatrix &in, Real scale = 1.0) {
{
conformable(h_out, in); conformable(h_out, in);
h_out = Zero(); h_out = Zero();
AMatrix iTa; AMatrix iTa;
@ -133,7 +136,7 @@ public:
for (int a = 0; a < Dimension; a++) { for (int a = 0; a < Dimension; a++) {
generator(a, iTa); generator(a, iTa);
LatticeComplex tmp = real(trace(iTa * in)) * coefficient; auto tmp = real(trace(iTa * in)) * coefficient;
pokeColour(h_out, tmp, a); pokeColour(h_out, tmp, a);
} }
} }

View File

@ -485,7 +485,7 @@ public:
// Up staple ___ ___ // Up staple ___ ___
// | | // | |
tmp = Cshift(adj(U[nu]), nu, -1); tmp = Cshift(closure(adj(U[nu])), nu, -1);
tmp = adj(U2[mu]) * tmp; tmp = adj(U2[mu]) * tmp;
tmp = Cshift(tmp, mu, -2); tmp = Cshift(tmp, mu, -2);
@ -519,7 +519,7 @@ public:
// //
// | | // | |
tmp = Cshift(adj(U2[nu]), nu, -2); tmp = Cshift(closure(adj(U2[nu])), nu, -2);
tmp = Gimpl::CovShiftBackward(U[mu], mu, tmp); tmp = Gimpl::CovShiftBackward(U[mu], mu, tmp);
tmp = U2[nu] * Cshift(tmp, nu, 2); tmp = U2[nu] * Cshift(tmp, nu, 2);
Stap += Cshift(tmp, mu, 1); Stap += Cshift(tmp, mu, 1);

View File

@ -1,779 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Fujitsu_A64FX_asm_double.h
Copyright (C) 2020
Author: Nils Meyer <nils.meyer@ur.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#define LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_A64FXd(base)
#define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXd(A)
#define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A)
#define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXd(A)
#define PREFETCH_GAUGE_L2(A) PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A)
#define PF_GAUGE(A)
#define PREFETCH_RESULT_L2_STORE(A) PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXd(A)
#define PREFETCH_RESULT_L1_STORE(A) PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXd(A)
#define PREFETCH1_CHIMU(A) PREFETCH_CHIMU_L1(A)
#define PREFETCH_CHIMU(A) PREFETCH_CHIMU_L1(A)
#define LOCK_GAUGE(A)
#define UNLOCK_GAUGE(A)
#define MASK_REGS DECLARATIONS_A64FXd
#define SAVE_RESULT(A,B) RESULT_A64FXd(A); PREFETCH_RESULT_L2_STORE(B)
#define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXd(Dir)
#define MULT_2SPIN_2 MULT_2SPIN_2_A64FXd
#define LOAD_CHI(base) LOAD_CHI_A64FXd(base)
#define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base)
#define XP_PROJ XP_PROJ_A64FXd
#define YP_PROJ YP_PROJ_A64FXd
#define ZP_PROJ ZP_PROJ_A64FXd
#define TP_PROJ TP_PROJ_A64FXd
#define XM_PROJ XM_PROJ_A64FXd
#define YM_PROJ YM_PROJ_A64FXd
#define ZM_PROJ ZM_PROJ_A64FXd
#define TM_PROJ TM_PROJ_A64FXd
#define XP_RECON XP_RECON_A64FXd
#define XM_RECON XM_RECON_A64FXd
#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXd
#define YM_RECON_ACCUM YM_RECON_ACCUM_A64FXd
#define ZM_RECON_ACCUM ZM_RECON_ACCUM_A64FXd
#define TM_RECON_ACCUM TM_RECON_ACCUM_A64FXd
#define XP_RECON_ACCUM XP_RECON_ACCUM_A64FXd
#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXd
#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXd
#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXd
#define PERMUTE_DIR0 0
#define PERMUTE_DIR1 1
#define PERMUTE_DIR2 2
#define PERMUTE_DIR3 3
#define PERMUTE PERMUTE_A64FXd;
#define LOAD_TABLE(Dir) if (Dir == 0) { LOAD_TABLE0; } else if (Dir == 1) { LOAD_TABLE1; } else if (Dir == 2) { LOAD_TABLE2; }
#define MAYBEPERM(Dir,perm) if (Dir != 3) { if (perm) { PERMUTE; } }
// DECLARATIONS
#define DECLARATIONS_A64FXd \
const uint64_t lut[4][8] = { \
{4, 5, 6, 7, 0, 1, 2, 3}, \
{2, 3, 0, 1, 6, 7, 4, 5}, \
{1, 0, 3, 2, 5, 4, 7, 6}, \
{0, 1, 2, 4, 5, 6, 7, 8} };\
asm ( \
"fmov z31.d , 0 \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// RESULT
#define RESULT_A64FXd(base) \
{ \
asm ( \
"str z0, [%[storeptr], -6, mul vl] \n\t" \
"str z1, [%[storeptr], -5, mul vl] \n\t" \
"str z2, [%[storeptr], -4, mul vl] \n\t" \
"str z3, [%[storeptr], -3, mul vl] \n\t" \
"str z4, [%[storeptr], -2, mul vl] \n\t" \
"str z5, [%[storeptr], -1, mul vl] \n\t" \
"str z6, [%[storeptr], 0, mul vl] \n\t" \
"str z7, [%[storeptr], 1, mul vl] \n\t" \
"str z8, [%[storeptr], 2, mul vl] \n\t" \
"str z9, [%[storeptr], 3, mul vl] \n\t" \
"str z10, [%[storeptr], 4, mul vl] \n\t" \
"str z11, [%[storeptr], 5, mul vl] \n\t" \
: \
: [storeptr] "r" (base + 2 * 3 * 64) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// PREFETCH_CHIMU_L2 (prefetch to L2)
#define PREFETCH_CHIMU_L2_INTERNAL_A64FXd(base) \
{ \
asm ( \
"prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
"prfd PLDL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
"prfd PLDL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
: \
: [fetchptr] "r" (base) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// PREFETCH_CHIMU_L1 (prefetch to L1)
#define PREFETCH_CHIMU_L1_INTERNAL_A64FXd(base) \
{ \
asm ( \
"prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
"prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
"prfd PLDL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
: \
: [fetchptr] "r" (base) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// PREFETCH_GAUGE_L2 (prefetch to L2)
#define PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A) \
{ \
const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \
asm ( \
"prfd PLDL2STRM, p5, [%[fetchptr], -4, mul vl] \n\t" \
"prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
"prfd PLDL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
"prfd PLDL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
"prfd PLDL2STRM, p5, [%[fetchptr], 12, mul vl] \n\t" \
"prfd PLDL2STRM, p5, [%[fetchptr], 16, mul vl] \n\t" \
"prfd PLDL2STRM, p5, [%[fetchptr], 20, mul vl] \n\t" \
"prfd PLDL2STRM, p5, [%[fetchptr], 24, mul vl] \n\t" \
"prfd PLDL2STRM, p5, [%[fetchptr], 28, mul vl] \n\t" \
: \
: [fetchptr] "r" (baseU) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// PREFETCH_GAUGE_L1 (prefetch to L1)
#define PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A) \
{ \
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
asm ( \
"prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
"prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
"prfd PLDL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
: \
: [fetchptr] "r" (baseU) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// LOAD_CHI
#define LOAD_CHI_A64FXd(base) \
{ \
asm ( \
"ldr z12, [%[fetchptr], 0, mul vl] \n\t" \
"ldr z13, [%[fetchptr], 1, mul vl] \n\t" \
"ldr z14, [%[fetchptr], 2, mul vl] \n\t" \
"ldr z15, [%[fetchptr], 3, mul vl] \n\t" \
"ldr z16, [%[fetchptr], 4, mul vl] \n\t" \
"ldr z17, [%[fetchptr], 5, mul vl] \n\t" \
: \
: [fetchptr] "r" (base) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// LOAD_CHIMU
#define LOAD_CHIMU_INTERLEAVED_A64FXd(base) \
{ \
asm ( \
"ptrue p5.d \n\t" \
"ldr z12, [%[fetchptr], -6, mul vl] \n\t" \
"ldr z21, [%[fetchptr], 3, mul vl] \n\t" \
"ldr z15, [%[fetchptr], -3, mul vl] \n\t" \
"ldr z18, [%[fetchptr], 0, mul vl] \n\t" \
"ldr z13, [%[fetchptr], -5, mul vl] \n\t" \
"ldr z22, [%[fetchptr], 4, mul vl] \n\t" \
"ldr z16, [%[fetchptr], -2, mul vl] \n\t" \
"ldr z19, [%[fetchptr], 1, mul vl] \n\t" \
"ldr z14, [%[fetchptr], -4, mul vl] \n\t" \
"ldr z23, [%[fetchptr], 5, mul vl] \n\t" \
"ldr z17, [%[fetchptr], -1, mul vl] \n\t" \
"ldr z20, [%[fetchptr], 2, mul vl] \n\t" \
: \
: [fetchptr] "r" (base + 2 * 3 * 64) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// LOAD_CHIMU_0213
#define LOAD_CHIMU_0213_A64FXd \
{ \
const SiteSpinor & ref(in[offset]); \
asm ( \
"ptrue p5.d \n\t" \
"ldr z12, [%[fetchptr], -6, mul vl] \n\t" \
"ldr z18, [%[fetchptr], 0, mul vl] \n\t" \
"ldr z13, [%[fetchptr], -5, mul vl] \n\t" \
"ldr z19, [%[fetchptr], 1, mul vl] \n\t" \
"ldr z14, [%[fetchptr], -4, mul vl] \n\t" \
"ldr z20, [%[fetchptr], 2, mul vl] \n\t" \
"ldr z15, [%[fetchptr], -3, mul vl] \n\t" \
"ldr z21, [%[fetchptr], 3, mul vl] \n\t" \
"ldr z16, [%[fetchptr], -2, mul vl] \n\t" \
"ldr z22, [%[fetchptr], 4, mul vl] \n\t" \
"ldr z17, [%[fetchptr], -1, mul vl] \n\t" \
"ldr z23, [%[fetchptr], 5, mul vl] \n\t" \
: \
: [fetchptr] "r" (&ref[2][0]) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// LOAD_CHIMU_0312
#define LOAD_CHIMU_0312_A64FXd \
{ \
const SiteSpinor & ref(in[offset]); \
asm ( \
"ptrue p5.d \n\t" \
"ldr z12, [%[fetchptr], -6, mul vl] \n\t" \
"ldr z21, [%[fetchptr], 3, mul vl] \n\t" \
"ldr z13, [%[fetchptr], -5, mul vl] \n\t" \
"ldr z22, [%[fetchptr], 4, mul vl] \n\t" \
"ldr z14, [%[fetchptr], -4, mul vl] \n\t" \
"ldr z23, [%[fetchptr], 5, mul vl] \n\t" \
"ldr z15, [%[fetchptr], -3, mul vl] \n\t" \
"ldr z18, [%[fetchptr], 0, mul vl] \n\t" \
"ldr z16, [%[fetchptr], -2, mul vl] \n\t" \
"ldr z19, [%[fetchptr], 1, mul vl] \n\t" \
"ldr z17, [%[fetchptr], -1, mul vl] \n\t" \
"ldr z20, [%[fetchptr], 2, mul vl] \n\t" \
: \
: [fetchptr] "r" (&ref[2][0]) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// LOAD_TABLE0
#define LOAD_TABLE0 \
asm ( \
"ldr z30, [%[tableptr], %[index], mul vl] \n\t" \
: \
: [tableptr] "r" (&lut[0]),[index] "i" (0) \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// LOAD_TABLE1
#define LOAD_TABLE1 \
asm ( \
"ldr z30, [%[tableptr], %[index], mul vl] \n\t" \
: \
: [tableptr] "r" (&lut[0]),[index] "i" (1) \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// LOAD_TABLE2
#define LOAD_TABLE2 \
asm ( \
"ldr z30, [%[tableptr], %[index], mul vl] \n\t" \
: \
: [tableptr] "r" (&lut[0]),[index] "i" (2) \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// LOAD_TABLE3
#define LOAD_TABLE3 \
asm ( \
"ldr z30, [%[tableptr], %[index], mul vl] \n\t" \
: \
: [tableptr] "r" (&lut[0]),[index] "i" (3) \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// PERMUTE
#define PERMUTE_A64FXd \
asm ( \
"tbl z12.d, { z12.d }, z30.d \n\t" \
"tbl z13.d, { z13.d }, z30.d \n\t" \
"tbl z14.d, { z14.d }, z30.d \n\t" \
"tbl z15.d, { z15.d }, z30.d \n\t" \
"tbl z16.d, { z16.d }, z30.d \n\t" \
"tbl z17.d, { z17.d }, z30.d \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
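The tbl instruction above gathers doubles by index from the lut row that LOAD_TABLE placed in z30. A scalar model of the operation (plain C++, exposition only; not Grid API):

#include <array>
#include <cstdint>

// Scalar model of the SVE "tbl" permute: out[i] = in[lut[i]].
// A 512-bit register holds 8 doubles; each lut row encodes one of the
// lane swaps needed when a lattice-direction shift crosses SIMD lanes.
std::array<double, 8> tbl(const std::array<double, 8>& in,
                          const std::array<uint64_t, 8>& lut) {
  std::array<double, 8> out{};
  for (int i = 0; i < 8; ++i) out[i] = in[lut[i]];
  return out;
}

For instance the row {4, 5, 6, 7, 0, 1, 2, 3} (PERMUTE_DIR0) exchanges the two 256-bit halves of the register.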
// LOAD_GAUGE
#define LOAD_GAUGE \
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
{ \
asm ( \
"ptrue p5.d \n\t" \
"ldr z24, [%[fetchptr], -6, mul vl] \n\t" \
"ldr z25, [%[fetchptr], -3, mul vl] \n\t" \
"ldr z26, [%[fetchptr], 0, mul vl] \n\t" \
"ldr z27, [%[fetchptr], -5, mul vl] \n\t" \
"ldr z28, [%[fetchptr], -2, mul vl] \n\t" \
"ldr z29, [%[fetchptr], 1, mul vl] \n\t" \
: \
: [fetchptr] "r" (baseU + 2 * 3 * 64) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// MULT_2SPIN
#define MULT_2SPIN_1_A64FXd(A) \
{ \
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
asm ( \
"ldr z24, [%[fetchptr], -6, mul vl] \n\t" \
"ldr z25, [%[fetchptr], -3, mul vl] \n\t" \
"ldr z26, [%[fetchptr], 0, mul vl] \n\t" \
"ldr z27, [%[fetchptr], -5, mul vl] \n\t" \
"ldr z28, [%[fetchptr], -2, mul vl] \n\t" \
"ldr z29, [%[fetchptr], 1, mul vl] \n\t" \
"movprfx z18.d, p5/m, z31.d \n\t" \
"fcmla z18.d, p5/m, z24.d, z12.d, 0 \n\t" \
"movprfx z21.d, p5/m, z31.d \n\t" \
"fcmla z21.d, p5/m, z24.d, z15.d, 0 \n\t" \
"movprfx z19.d, p5/m, z31.d \n\t" \
"fcmla z19.d, p5/m, z25.d, z12.d, 0 \n\t" \
"movprfx z22.d, p5/m, z31.d \n\t" \
"fcmla z22.d, p5/m, z25.d, z15.d, 0 \n\t" \
"movprfx z20.d, p5/m, z31.d \n\t" \
"fcmla z20.d, p5/m, z26.d, z12.d, 0 \n\t" \
"movprfx z23.d, p5/m, z31.d \n\t" \
"fcmla z23.d, p5/m, z26.d, z15.d, 0 \n\t" \
"fcmla z18.d, p5/m, z24.d, z12.d, 90 \n\t" \
"fcmla z21.d, p5/m, z24.d, z15.d, 90 \n\t" \
"fcmla z19.d, p5/m, z25.d, z12.d, 90 \n\t" \
"fcmla z22.d, p5/m, z25.d, z15.d, 90 \n\t" \
"fcmla z20.d, p5/m, z26.d, z12.d, 90 \n\t" \
"fcmla z23.d, p5/m, z26.d, z15.d, 90 \n\t" \
"ldr z24, [%[fetchptr], -4, mul vl] \n\t" \
"ldr z25, [%[fetchptr], -1, mul vl] \n\t" \
"ldr z26, [%[fetchptr], 2, mul vl] \n\t" \
: \
: [fetchptr] "r" (baseU + 2 * 3 * 64) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
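// The movprfx/fcmla pairs above seed each accumulator from z31 (zero) and
// multiply the first color column of U by color component 0 of each spin
// half of Chi: the rotation-0 fcmla contributes re(U)*Chi, the rotation-90
// fcmla contributes i*im(U)*Chi, so each pair is one full complex
// multiply-accumulate. The trailing loads fetch the third gauge column so
// they overlap the arithmetic.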
// MULT_2SPIN_BACKEND
#define MULT_2SPIN_2_A64FXd \
{ \
asm ( \
"fcmla z18.d, p5/m, z27.d, z13.d, 0 \n\t" \
"fcmla z21.d, p5/m, z27.d, z16.d, 0 \n\t" \
"fcmla z19.d, p5/m, z28.d, z13.d, 0 \n\t" \
"fcmla z22.d, p5/m, z28.d, z16.d, 0 \n\t" \
"fcmla z20.d, p5/m, z29.d, z13.d, 0 \n\t" \
"fcmla z23.d, p5/m, z29.d, z16.d, 0 \n\t" \
"fcmla z18.d, p5/m, z27.d, z13.d, 90 \n\t" \
"fcmla z21.d, p5/m, z27.d, z16.d, 90 \n\t" \
"fcmla z19.d, p5/m, z28.d, z13.d, 90 \n\t" \
"fcmla z22.d, p5/m, z28.d, z16.d, 90 \n\t" \
"fcmla z20.d, p5/m, z29.d, z13.d, 90 \n\t" \
"fcmla z23.d, p5/m, z29.d, z16.d, 90 \n\t" \
"fcmla z18.d, p5/m, z24.d, z14.d, 0 \n\t" \
"fcmla z21.d, p5/m, z24.d, z17.d, 0 \n\t" \
"fcmla z19.d, p5/m, z25.d, z14.d, 0 \n\t" \
"fcmla z22.d, p5/m, z25.d, z17.d, 0 \n\t" \
"fcmla z20.d, p5/m, z26.d, z14.d, 0 \n\t" \
"fcmla z23.d, p5/m, z26.d, z17.d, 0 \n\t" \
"fcmla z18.d, p5/m, z24.d, z14.d, 90 \n\t" \
"fcmla z21.d, p5/m, z24.d, z17.d, 90 \n\t" \
"fcmla z19.d, p5/m, z25.d, z14.d, 90 \n\t" \
"fcmla z22.d, p5/m, z25.d, z17.d, 90 \n\t" \
"fcmla z20.d, p5/m, z26.d, z14.d, 90 \n\t" \
"fcmla z23.d, p5/m, z26.d, z17.d, 90 \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
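// The second and third color columns (z27-z29 and the reloaded z24-z26)
// complete UChi = U * Chi for both spin halves, again as rotation-0 /
// rotation-90 fcmla pairs.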
// XP_PROJ
#define XP_PROJ_A64FXd \
{ \
asm ( \
"fcadd z12.d, p5/m, z12.d, z21.d, 90 \n\t" \
"fcadd z13.d, p5/m, z13.d, z22.d, 90 \n\t" \
"fcadd z14.d, p5/m, z14.d, z23.d, 90 \n\t" \
"fcadd z15.d, p5/m, z15.d, z18.d, 90 \n\t" \
"fcadd z16.d, p5/m, z16.d, z19.d, 90 \n\t" \
"fcadd z17.d, p5/m, z17.d, z20.d, 90 \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
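// fcadd with rotation #90 computes a + i*b elementwise on complex pairs and
// rotation #270 computes a - i*b, so the PROJ family here realizes the
// Wilson spin projectors with one predicated add per register and no extra
// multiplies.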
// XP_RECON
#define XP_RECON_A64FXd \
asm ( \
"movprfx z6.d, p5/m, z31.d \n\t" \
"fcadd z6.d, p5/m, z6.d, z21.d, 270 \n\t" \
"movprfx z7.d, p5/m, z31.d \n\t" \
"fcadd z7.d, p5/m, z7.d, z22.d, 270 \n\t" \
"movprfx z8.d, p5/m, z31.d \n\t" \
"fcadd z8.d, p5/m, z8.d, z23.d, 270 \n\t" \
"movprfx z9.d, p5/m, z31.d \n\t" \
"fcadd z9.d, p5/m, z9.d, z18.d, 270 \n\t" \
"movprfx z10.d, p5/m, z31.d \n\t" \
"fcadd z10.d, p5/m, z10.d, z19.d, 270 \n\t" \
"movprfx z11.d, p5/m, z31.d \n\t" \
"fcadd z11.d, p5/m, z11.d, z20.d, 270 \n\t" \
"mov z0.d, p5/m, z18.d \n\t" \
"mov z1.d, p5/m, z19.d \n\t" \
"mov z2.d, p5/m, z20.d \n\t" \
"mov z3.d, p5/m, z21.d \n\t" \
"mov z4.d, p5/m, z22.d \n\t" \
"mov z5.d, p5/m, z23.d \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
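// Reconstruction: the lower two spins are -i times the spin-swapped upper
// half of UChi (fcadd #270 seeded from the zero register via movprfx), while
// the upper two spins are a straight predicated copy.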
// XP_RECON_ACCUM
#define XP_RECON_ACCUM_A64FXd \
asm ( \
"fcadd z9.d, p5/m, z9.d, z18.d, 270 \n\t" \
"fadd z0.d, p5/m, z0.d, z18.d \n\t" \
"fcadd z10.d, p5/m, z10.d, z19.d, 270 \n\t" \
"fadd z1.d, p5/m, z1.d, z19.d \n\t" \
"fcadd z11.d, p5/m, z11.d, z20.d, 270 \n\t" \
"fadd z2.d, p5/m, z2.d, z20.d \n\t" \
"fcadd z6.d, p5/m, z6.d, z21.d, 270 \n\t" \
"fadd z3.d, p5/m, z3.d, z21.d \n\t" \
"fcadd z7.d, p5/m, z7.d, z22.d, 270 \n\t" \
"fadd z4.d, p5/m, z4.d, z22.d \n\t" \
"fcadd z8.d, p5/m, z8.d, z23.d, 270 \n\t" \
"fadd z5.d, p5/m, z5.d, z23.d \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// YP_PROJ
#define YP_PROJ_A64FXd \
{ \
asm ( \
"fsub z12.d, p5/m, z12.d, z21.d \n\t" \
"fsub z13.d, p5/m, z13.d, z22.d \n\t" \
"fsub z14.d, p5/m, z14.d, z23.d \n\t" \
"fadd z15.d, p5/m, z15.d, z18.d \n\t" \
"fadd z16.d, p5/m, z16.d, z19.d \n\t" \
"fadd z17.d, p5/m, z17.d, z20.d \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// ZP_PROJ
#define ZP_PROJ_A64FXd \
{ \
asm ( \
"fcadd z12.d, p5/m, z12.d, z18.d, 90 \n\t" \
"fcadd z13.d, p5/m, z13.d, z19.d, 90 \n\t" \
"fcadd z14.d, p5/m, z14.d, z20.d, 90 \n\t" \
"fcadd z15.d, p5/m, z15.d, z21.d, 270 \n\t" \
"fcadd z16.d, p5/m, z16.d, z22.d, 270 \n\t" \
"fcadd z17.d, p5/m, z17.d, z23.d, 270 \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// TP_PROJ
#define TP_PROJ_A64FXd \
{ \
asm ( \
"fadd z12.d, p5/m, z12.d, z18.d \n\t" \
"fadd z13.d, p5/m, z13.d, z19.d \n\t" \
"fadd z14.d, p5/m, z14.d, z20.d \n\t" \
"fadd z15.d, p5/m, z15.d, z21.d \n\t" \
"fadd z16.d, p5/m, z16.d, z22.d \n\t" \
"fadd z17.d, p5/m, z17.d, z23.d \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// XM_PROJ
#define XM_PROJ_A64FXd \
{ \
asm ( \
"fcadd z12.d, p5/m, z12.d, z21.d, 270 \n\t" \
"fcadd z13.d, p5/m, z13.d, z22.d, 270 \n\t" \
"fcadd z14.d, p5/m, z14.d, z23.d, 270 \n\t" \
"fcadd z15.d, p5/m, z15.d, z18.d, 270 \n\t" \
"fcadd z16.d, p5/m, z16.d, z19.d, 270 \n\t" \
"fcadd z17.d, p5/m, z17.d, z20.d, 270 \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// XM_RECON
#define XM_RECON_A64FXd \
asm ( \
"movprfx z6.d, p5/m, z31.d \n\t" \
"fcadd z6.d, p5/m, z6.d, z21.d, 90 \n\t" \
"movprfx z7.d, p5/m, z31.d \n\t" \
"fcadd z7.d, p5/m, z7.d, z22.d, 90 \n\t" \
"movprfx z8.d, p5/m, z31.d \n\t" \
"fcadd z8.d, p5/m, z8.d, z23.d, 90 \n\t" \
"movprfx z9.d, p5/m, z31.d \n\t" \
"fcadd z9.d, p5/m, z9.d, z18.d, 90 \n\t" \
"movprfx z10.d, p5/m, z31.d \n\t" \
"fcadd z10.d, p5/m, z10.d, z19.d, 90 \n\t" \
"movprfx z11.d, p5/m, z31.d \n\t" \
"fcadd z11.d, p5/m, z11.d, z20.d, 90 \n\t" \
"mov z0.d, p5/m, z18.d \n\t" \
"mov z1.d, p5/m, z19.d \n\t" \
"mov z2.d, p5/m, z20.d \n\t" \
"mov z3.d, p5/m, z21.d \n\t" \
"mov z4.d, p5/m, z22.d \n\t" \
"mov z5.d, p5/m, z23.d \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// YM_PROJ
#define YM_PROJ_A64FXd \
{ \
asm ( \
"fadd z12.d, p5/m, z12.d, z21.d \n\t" \
"fadd z13.d, p5/m, z13.d, z22.d \n\t" \
"fadd z14.d, p5/m, z14.d, z23.d \n\t" \
"fsub z15.d, p5/m, z15.d, z18.d \n\t" \
"fsub z16.d, p5/m, z16.d, z19.d \n\t" \
"fsub z17.d, p5/m, z17.d, z20.d \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// ZM_PROJ
#define ZM_PROJ_A64FXd \
{ \
asm ( \
"fcadd z12.d, p5/m, z12.d, z18.d, 270 \n\t" \
"fcadd z13.d, p5/m, z13.d, z19.d, 270 \n\t" \
"fcadd z14.d, p5/m, z14.d, z20.d, 270 \n\t" \
"fcadd z15.d, p5/m, z15.d, z21.d, 90 \n\t" \
"fcadd z16.d, p5/m, z16.d, z22.d, 90 \n\t" \
"fcadd z17.d, p5/m, z17.d, z23.d, 90 \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// TM_PROJ
#define TM_PROJ_A64FXd \
{ \
asm ( \
"ptrue p5.d \n\t" \
"fsub z12.d, p5/m, z12.d, z18.d \n\t" \
"fsub z13.d, p5/m, z13.d, z19.d \n\t" \
"fsub z14.d, p5/m, z14.d, z20.d \n\t" \
"fsub z15.d, p5/m, z15.d, z21.d \n\t" \
"fsub z16.d, p5/m, z16.d, z22.d \n\t" \
"fsub z17.d, p5/m, z17.d, z23.d \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// XM_RECON_ACCUM
#define XM_RECON_ACCUM_A64FXd \
asm ( \
"fcadd z9.d, p5/m, z9.d, z18.d, 90 \n\t" \
"fcadd z10.d, p5/m, z10.d, z19.d, 90 \n\t" \
"fcadd z11.d, p5/m, z11.d, z20.d, 90 \n\t" \
"fcadd z6.d, p5/m, z6.d, z21.d, 90 \n\t" \
"fcadd z7.d, p5/m, z7.d, z22.d, 90 \n\t" \
"fcadd z8.d, p5/m, z8.d, z23.d, 90 \n\t" \
"fadd z0.d, p5/m, z0.d, z18.d \n\t" \
"fadd z1.d, p5/m, z1.d, z19.d \n\t" \
"fadd z2.d, p5/m, z2.d, z20.d \n\t" \
"fadd z3.d, p5/m, z3.d, z21.d \n\t" \
"fadd z4.d, p5/m, z4.d, z22.d \n\t" \
"fadd z5.d, p5/m, z5.d, z23.d \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// YP_RECON_ACCUM
#define YP_RECON_ACCUM_A64FXd \
asm ( \
"fadd z0.d, p5/m, z0.d, z18.d \n\t" \
"fsub z9.d, p5/m, z9.d, z18.d \n\t" \
"fadd z1.d, p5/m, z1.d, z19.d \n\t" \
"fsub z10.d, p5/m, z10.d, z19.d \n\t" \
"fadd z2.d, p5/m, z2.d, z20.d \n\t" \
"fsub z11.d, p5/m, z11.d, z20.d \n\t" \
"fadd z3.d, p5/m, z3.d, z21.d \n\t" \
"fadd z6.d, p5/m, z6.d, z21.d \n\t" \
"fadd z4.d, p5/m, z4.d, z22.d \n\t" \
"fadd z7.d, p5/m, z7.d, z22.d \n\t" \
"fadd z5.d, p5/m, z5.d, z23.d \n\t" \
"fadd z8.d, p5/m, z8.d, z23.d \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// YM_RECON_ACCUM
#define YM_RECON_ACCUM_A64FXd \
asm ( \
"fadd z0.d, p5/m, z0.d, z18.d \n\t" \
"fadd z9.d, p5/m, z9.d, z18.d \n\t" \
"fadd z1.d, p5/m, z1.d, z19.d \n\t" \
"fadd z10.d, p5/m, z10.d, z19.d \n\t" \
"fadd z2.d, p5/m, z2.d, z20.d \n\t" \
"fadd z11.d, p5/m, z11.d, z20.d \n\t" \
"fadd z3.d, p5/m, z3.d, z21.d \n\t" \
"fsub z6.d, p5/m, z6.d, z21.d \n\t" \
"fadd z4.d, p5/m, z4.d, z22.d \n\t" \
"fsub z7.d, p5/m, z7.d, z22.d \n\t" \
"fadd z5.d, p5/m, z5.d, z23.d \n\t" \
"fsub z8.d, p5/m, z8.d, z23.d \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// ZP_RECON_ACCUM
#define ZP_RECON_ACCUM_A64FXd \
asm ( \
"fcadd z6.d, p5/m, z6.d, z18.d, 270 \n\t" \
"fadd z0.d, p5/m, z0.d, z18.d \n\t" \
"fcadd z7.d, p5/m, z7.d, z19.d, 270 \n\t" \
"fadd z1.d, p5/m, z1.d, z19.d \n\t" \
"fcadd z8.d, p5/m, z8.d, z20.d, 270 \n\t" \
"fadd z2.d, p5/m, z2.d, z20.d \n\t" \
"fcadd z9.d, p5/m, z9.d, z21.d, 90 \n\t" \
"fadd z3.d, p5/m, z3.d, z21.d \n\t" \
"fcadd z10.d, p5/m, z10.d, z22.d, 90 \n\t" \
"fadd z4.d, p5/m, z4.d, z22.d \n\t" \
"fcadd z11.d, p5/m, z11.d, z23.d, 90 \n\t" \
"fadd z5.d, p5/m, z5.d, z23.d \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// ZM_RECON_ACCUM
#define ZM_RECON_ACCUM_A64FXd \
asm ( \
"fcadd z6.d, p5/m, z6.d, z18.d, 90 \n\t" \
"fadd z0.d, p5/m, z0.d, z18.d \n\t" \
"fcadd z7.d, p5/m, z7.d, z19.d, 90 \n\t" \
"fadd z1.d, p5/m, z1.d, z19.d \n\t" \
"fcadd z8.d, p5/m, z8.d, z20.d, 90 \n\t" \
"fadd z2.d, p5/m, z2.d, z20.d \n\t" \
"fcadd z9.d, p5/m, z9.d, z21.d, 270 \n\t" \
"fadd z3.d, p5/m, z3.d, z21.d \n\t" \
"fcadd z10.d, p5/m, z10.d, z22.d, 270 \n\t" \
"fadd z4.d, p5/m, z4.d, z22.d \n\t" \
"fcadd z11.d, p5/m, z11.d, z23.d, 270 \n\t" \
"fadd z5.d, p5/m, z5.d, z23.d \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// TP_RECON_ACCUM
#define TP_RECON_ACCUM_A64FXd \
asm ( \
"fadd z0.d, p5/m, z0.d, z18.d \n\t" \
"fadd z6.d, p5/m, z6.d, z18.d \n\t" \
"fadd z1.d, p5/m, z1.d, z19.d \n\t" \
"fadd z7.d, p5/m, z7.d, z19.d \n\t" \
"fadd z2.d, p5/m, z2.d, z20.d \n\t" \
"fadd z8.d, p5/m, z8.d, z20.d \n\t" \
"fadd z3.d, p5/m, z3.d, z21.d \n\t" \
"fadd z9.d, p5/m, z9.d, z21.d \n\t" \
"fadd z4.d, p5/m, z4.d, z22.d \n\t" \
"fadd z10.d, p5/m, z10.d, z22.d \n\t" \
"fadd z5.d, p5/m, z5.d, z23.d \n\t" \
"fadd z11.d, p5/m, z11.d, z23.d \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// TM_RECON_ACCUM
#define TM_RECON_ACCUM_A64FXd \
asm ( \
"fadd z0.d, p5/m, z0.d, z18.d \n\t" \
"fsub z6.d, p5/m, z6.d, z18.d \n\t" \
"fadd z1.d, p5/m, z1.d, z19.d \n\t" \
"fsub z7.d, p5/m, z7.d, z19.d \n\t" \
"fadd z2.d, p5/m, z2.d, z20.d \n\t" \
"fsub z8.d, p5/m, z8.d, z20.d \n\t" \
"fadd z3.d, p5/m, z3.d, z21.d \n\t" \
"fsub z9.d, p5/m, z9.d, z21.d \n\t" \
"fadd z4.d, p5/m, z4.d, z22.d \n\t" \
"fsub z10.d, p5/m, z10.d, z22.d \n\t" \
"fadd z5.d, p5/m, z5.d, z23.d \n\t" \
"fsub z11.d, p5/m, z11.d, z23.d \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// ZERO_PSI
#define ZERO_PSI_A64FXd \
asm ( \
"ptrue p5.d \n\t" \
"fmov z0.d , 0 \n\t" \
"fmov z1.d , 0 \n\t" \
"fmov z2.d , 0 \n\t" \
"fmov z3.d , 0 \n\t" \
"fmov z4.d , 0 \n\t" \
"fmov z5.d , 0 \n\t" \
"fmov z6.d , 0 \n\t" \
"fmov z7.d , 0 \n\t" \
"fmov z8.d , 0 \n\t" \
"fmov z9.d , 0 \n\t" \
"fmov z10.d , 0 \n\t" \
"fmov z11.d , 0 \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// PREFETCH_RESULT_L2_STORE (prefetch store to L2)
#define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXd(base) \
{ \
asm ( \
"prfd PSTL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
"prfd PSTL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
"prfd PSTL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
: \
: [fetchptr] "r" (base) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// PREFETCH_RESULT_L1_STORE (prefetch store to L1)
#define PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXd(base) \
{ \
asm ( \
"prfd PSTL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
"prfd PSTL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
"prfd PSTL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
: \
: [fetchptr] "r" (base) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// ADD_RESULT_INTERNAL
#define ADD_RESULT_INTERNAL_A64FXd \
asm ( \
"fadd z0.d, p5/m, z0.d, z12.d \n\t" \
"fadd z1.d, p5/m, z1.d, z13.d \n\t" \
"fadd z2.d, p5/m, z2.d, z14.d \n\t" \
"fadd z3.d, p5/m, z3.d, z15.d \n\t" \
"fadd z4.d, p5/m, z4.d, z16.d \n\t" \
"fadd z5.d, p5/m, z5.d, z17.d \n\t" \
"fadd z6.d, p5/m, z6.d, z18.d \n\t" \
"fadd z7.d, p5/m, z7.d, z19.d \n\t" \
"fadd z8.d, p5/m, z8.d, z20.d \n\t" \
"fadd z9.d, p5/m, z9.d, z21.d \n\t" \
"fadd z10.d, p5/m, z10.d, z22.d \n\t" \
"fadd z11.d, p5/m, z11.d, z23.d \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);

View File

@ -1,779 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Fujitsu_A64FX_asm_single.h
Copyright (C) 2020
Author: Nils Meyer <nils.meyer@ur.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
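// Single-precision variant: mirrors the double-precision asm file above with
// .s (32-bit) element suffixes and a 16-entry permute table per direction;
// the register allocation is otherwise identical.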
#define LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_A64FXf(base)
#define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXf(A)
#define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A)
#define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXf(A)
#define PREFETCH_GAUGE_L2(A) PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A)
#define PF_GAUGE(A)
#define PREFETCH_RESULT_L2_STORE(A) PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXf(A)
#define PREFETCH_RESULT_L1_STORE(A) PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXf(A)
#define PREFETCH1_CHIMU(A) PREFETCH_CHIMU_L1(A)
#define PREFETCH_CHIMU(A) PREFETCH_CHIMU_L1(A)
#define LOCK_GAUGE(A)
#define UNLOCK_GAUGE(A)
#define MASK_REGS DECLARATIONS_A64FXf
#define SAVE_RESULT(A,B) RESULT_A64FXf(A); PREFETCH_RESULT_L2_STORE(B)
#define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXf(Dir)
#define MULT_2SPIN_2 MULT_2SPIN_2_A64FXf
#define LOAD_CHI(base) LOAD_CHI_A64FXf(base)
#define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base)
#define XP_PROJ XP_PROJ_A64FXf
#define YP_PROJ YP_PROJ_A64FXf
#define ZP_PROJ ZP_PROJ_A64FXf
#define TP_PROJ TP_PROJ_A64FXf
#define XM_PROJ XM_PROJ_A64FXf
#define YM_PROJ YM_PROJ_A64FXf
#define ZM_PROJ ZM_PROJ_A64FXf
#define TM_PROJ TM_PROJ_A64FXf
#define XP_RECON XP_RECON_A64FXf
#define XM_RECON XM_RECON_A64FXf
#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXf
#define YM_RECON_ACCUM YM_RECON_ACCUM_A64FXf
#define ZM_RECON_ACCUM ZM_RECON_ACCUM_A64FXf
#define TM_RECON_ACCUM TM_RECON_ACCUM_A64FXf
#define XP_RECON_ACCUM XP_RECON_ACCUM_A64FXf
#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXf
#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXf
#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXf
#define PERMUTE_DIR0 0
#define PERMUTE_DIR1 1
#define PERMUTE_DIR2 2
#define PERMUTE_DIR3 3
#define PERMUTE PERMUTE_A64FXf;
#define LOAD_TABLE(Dir) if (Dir == 0) { LOAD_TABLE0; } else if (Dir == 1) { LOAD_TABLE1; } else if (Dir == 2) { LOAD_TABLE2; } else if (Dir == 3) { LOAD_TABLE3; }
#define MAYBEPERM(A,perm) if (perm) { PERMUTE; }
// DECLARATIONS
#define DECLARATIONS_A64FXf \
const uint32_t lut[4][16] = { \
{8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}, \
{4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}, \
{2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}, \
{1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14} }; \
asm ( \
"fmov z31.s , 0 \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
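// With 16 single-precision lanes per 512-bit vector, the four lut rows above
// are the index patterns for the four permute levels: row 0 swaps 256-bit
// halves, row 1 swaps 128-bit quarters, row 2 swaps 64-bit pairs, and row 3
// swaps adjacent 32-bit elements. z31 is zeroed here and reused as the
// additive identity for the movprfx/fcmla and movprfx/fcadd sequences.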
// RESULT
#define RESULT_A64FXf(base) \
{ \
asm ( \
"str z0, [%[storeptr], -6, mul vl] \n\t" \
"str z1, [%[storeptr], -5, mul vl] \n\t" \
"str z2, [%[storeptr], -4, mul vl] \n\t" \
"str z3, [%[storeptr], -3, mul vl] \n\t" \
"str z4, [%[storeptr], -2, mul vl] \n\t" \
"str z5, [%[storeptr], -1, mul vl] \n\t" \
"str z6, [%[storeptr], 0, mul vl] \n\t" \
"str z7, [%[storeptr], 1, mul vl] \n\t" \
"str z8, [%[storeptr], 2, mul vl] \n\t" \
"str z9, [%[storeptr], 3, mul vl] \n\t" \
"str z10, [%[storeptr], 4, mul vl] \n\t" \
"str z11, [%[storeptr], 5, mul vl] \n\t" \
: \
: [storeptr] "r" (base + 2 * 3 * 64) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// PREFETCH_CHIMU_L2 (prefetch to L2)
#define PREFETCH_CHIMU_L2_INTERNAL_A64FXf(base) \
{ \
asm ( \
"prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
"prfd PLDL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
"prfd PLDL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
: \
: [fetchptr] "r" (base) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// PREFETCH_CHIMU_L1 (prefetch to L1)
#define PREFETCH_CHIMU_L1_INTERNAL_A64FXf(base) \
{ \
asm ( \
"prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
"prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
"prfd PLDL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
: \
: [fetchptr] "r" (base) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// PREFETCH_GAUGE_L2 (prefetch to L2)
#define PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A) \
{ \
const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \
asm ( \
"prfd PLDL2STRM, p5, [%[fetchptr], -4, mul vl] \n\t" \
"prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
"prfd PLDL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
"prfd PLDL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
"prfd PLDL2STRM, p5, [%[fetchptr], 12, mul vl] \n\t" \
"prfd PLDL2STRM, p5, [%[fetchptr], 16, mul vl] \n\t" \
"prfd PLDL2STRM, p5, [%[fetchptr], 20, mul vl] \n\t" \
"prfd PLDL2STRM, p5, [%[fetchptr], 24, mul vl] \n\t" \
"prfd PLDL2STRM, p5, [%[fetchptr], 28, mul vl] \n\t" \
: \
: [fetchptr] "r" (baseU) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// PREFETCH_GAUGE_L1 (prefetch to L1)
#define PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A) \
{ \
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
asm ( \
"prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
"prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
"prfd PLDL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
: \
: [fetchptr] "r" (baseU) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// LOAD_CHI
#define LOAD_CHI_A64FXf(base) \
{ \
asm ( \
"ldr z12, [%[fetchptr], 0, mul vl] \n\t" \
"ldr z13, [%[fetchptr], 1, mul vl] \n\t" \
"ldr z14, [%[fetchptr], 2, mul vl] \n\t" \
"ldr z15, [%[fetchptr], 3, mul vl] \n\t" \
"ldr z16, [%[fetchptr], 4, mul vl] \n\t" \
"ldr z17, [%[fetchptr], 5, mul vl] \n\t" \
: \
: [fetchptr] "r" (base) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// LOAD_CHIMU
#define LOAD_CHIMU_INTERLEAVED_A64FXf(base) \
{ \
asm ( \
"ptrue p5.s \n\t" \
"ldr z12, [%[fetchptr], -6, mul vl] \n\t" \
"ldr z21, [%[fetchptr], 3, mul vl] \n\t" \
"ldr z15, [%[fetchptr], -3, mul vl] \n\t" \
"ldr z18, [%[fetchptr], 0, mul vl] \n\t" \
"ldr z13, [%[fetchptr], -5, mul vl] \n\t" \
"ldr z22, [%[fetchptr], 4, mul vl] \n\t" \
"ldr z16, [%[fetchptr], -2, mul vl] \n\t" \
"ldr z19, [%[fetchptr], 1, mul vl] \n\t" \
"ldr z14, [%[fetchptr], -4, mul vl] \n\t" \
"ldr z23, [%[fetchptr], 5, mul vl] \n\t" \
"ldr z17, [%[fetchptr], -1, mul vl] \n\t" \
"ldr z20, [%[fetchptr], 2, mul vl] \n\t" \
: \
: [fetchptr] "r" (base + 2 * 3 * 64) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// LOAD_CHIMU_0213
#define LOAD_CHIMU_0213_A64FXf \
{ \
const SiteSpinor & ref(in[offset]); \
asm ( \
"ptrue p5.s \n\t" \
"ldr z12, [%[fetchptr], -6, mul vl] \n\t" \
"ldr z18, [%[fetchptr], 0, mul vl] \n\t" \
"ldr z13, [%[fetchptr], -5, mul vl] \n\t" \
"ldr z19, [%[fetchptr], 1, mul vl] \n\t" \
"ldr z14, [%[fetchptr], -4, mul vl] \n\t" \
"ldr z20, [%[fetchptr], 2, mul vl] \n\t" \
"ldr z15, [%[fetchptr], -3, mul vl] \n\t" \
"ldr z21, [%[fetchptr], 3, mul vl] \n\t" \
"ldr z16, [%[fetchptr], -2, mul vl] \n\t" \
"ldr z22, [%[fetchptr], 4, mul vl] \n\t" \
"ldr z17, [%[fetchptr], -1, mul vl] \n\t" \
"ldr z23, [%[fetchptr], 5, mul vl] \n\t" \
: \
: [fetchptr] "r" (&ref[2][0]) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// LOAD_CHIMU_0312
#define LOAD_CHIMU_0312_A64FXf \
{ \
const SiteSpinor & ref(in[offset]); \
asm ( \
"ptrue p5.s \n\t" \
"ldr z12, [%[fetchptr], -6, mul vl] \n\t" \
"ldr z21, [%[fetchptr], 3, mul vl] \n\t" \
"ldr z13, [%[fetchptr], -5, mul vl] \n\t" \
"ldr z22, [%[fetchptr], 4, mul vl] \n\t" \
"ldr z14, [%[fetchptr], -4, mul vl] \n\t" \
"ldr z23, [%[fetchptr], 5, mul vl] \n\t" \
"ldr z15, [%[fetchptr], -3, mul vl] \n\t" \
"ldr z18, [%[fetchptr], 0, mul vl] \n\t" \
"ldr z16, [%[fetchptr], -2, mul vl] \n\t" \
"ldr z19, [%[fetchptr], 1, mul vl] \n\t" \
"ldr z17, [%[fetchptr], -1, mul vl] \n\t" \
"ldr z20, [%[fetchptr], 2, mul vl] \n\t" \
: \
: [fetchptr] "r" (&ref[2][0]) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// LOAD_TABLE0
#define LOAD_TABLE0 \
asm ( \
"ldr z30, [%[tableptr], %[index], mul vl] \n\t" \
: \
: [tableptr] "r" (&lut[0]),[index] "i" (0) \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// LOAD_TABLE1
#define LOAD_TABLE1 \
asm ( \
"ldr z30, [%[tableptr], %[index], mul vl] \n\t" \
: \
: [tableptr] "r" (&lut[0]),[index] "i" (1) \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// LOAD_TABLE2
#define LOAD_TABLE2 \
asm ( \
"ldr z30, [%[tableptr], %[index], mul vl] \n\t" \
: \
: [tableptr] "r" (&lut[0]),[index] "i" (2) \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// LOAD_TABLE3
#define LOAD_TABLE3 \
asm ( \
"ldr z30, [%[tableptr], %[index], mul vl] \n\t" \
: \
: [tableptr] "r" (&lut[0]),[index] "i" (3) \
: "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// PERMUTE
#define PERMUTE_A64FXf \
asm ( \
"tbl z12.s, { z12.s }, z30.s \n\t" \
"tbl z13.s, { z13.s }, z30.s \n\t" \
"tbl z14.s, { z14.s }, z30.s \n\t" \
"tbl z15.s, { z15.s }, z30.s \n\t" \
"tbl z16.s, { z16.s }, z30.s \n\t" \
"tbl z17.s, { z17.s }, z30.s \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// LOAD_GAUGE
#define LOAD_GAUGE \
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
{ \
asm ( \
"ptrue p5.s \n\t" \
"ldr z24, [%[fetchptr], -6, mul vl] \n\t" \
"ldr z25, [%[fetchptr], -3, mul vl] \n\t" \
"ldr z26, [%[fetchptr], 0, mul vl] \n\t" \
"ldr z27, [%[fetchptr], -5, mul vl] \n\t" \
"ldr z28, [%[fetchptr], -2, mul vl] \n\t" \
"ldr z29, [%[fetchptr], 1, mul vl] \n\t" \
: \
: [fetchptr] "r" (baseU + 2 * 3 * 64) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// MULT_2SPIN
#define MULT_2SPIN_1_A64FXf(A) \
{ \
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
asm ( \
"ldr z24, [%[fetchptr], -6, mul vl] \n\t" \
"ldr z25, [%[fetchptr], -3, mul vl] \n\t" \
"ldr z26, [%[fetchptr], 0, mul vl] \n\t" \
"ldr z27, [%[fetchptr], -5, mul vl] \n\t" \
"ldr z28, [%[fetchptr], -2, mul vl] \n\t" \
"ldr z29, [%[fetchptr], 1, mul vl] \n\t" \
"movprfx z18.s, p5/m, z31.s \n\t" \
"fcmla z18.s, p5/m, z24.s, z12.s, 0 \n\t" \
"movprfx z21.s, p5/m, z31.s \n\t" \
"fcmla z21.s, p5/m, z24.s, z15.s, 0 \n\t" \
"movprfx z19.s, p5/m, z31.s \n\t" \
"fcmla z19.s, p5/m, z25.s, z12.s, 0 \n\t" \
"movprfx z22.s, p5/m, z31.s \n\t" \
"fcmla z22.s, p5/m, z25.s, z15.s, 0 \n\t" \
"movprfx z20.s, p5/m, z31.s \n\t" \
"fcmla z20.s, p5/m, z26.s, z12.s, 0 \n\t" \
"movprfx z23.s, p5/m, z31.s \n\t" \
"fcmla z23.s, p5/m, z26.s, z15.s, 0 \n\t" \
"fcmla z18.s, p5/m, z24.s, z12.s, 90 \n\t" \
"fcmla z21.s, p5/m, z24.s, z15.s, 90 \n\t" \
"fcmla z19.s, p5/m, z25.s, z12.s, 90 \n\t" \
"fcmla z22.s, p5/m, z25.s, z15.s, 90 \n\t" \
"fcmla z20.s, p5/m, z26.s, z12.s, 90 \n\t" \
"fcmla z23.s, p5/m, z26.s, z15.s, 90 \n\t" \
"ldr z24, [%[fetchptr], -4, mul vl] \n\t" \
"ldr z25, [%[fetchptr], -1, mul vl] \n\t" \
"ldr z26, [%[fetchptr], 2, mul vl] \n\t" \
: \
: [fetchptr] "r" (baseU + 2 * 3 * 64) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// MULT_2SPIN_BACKEND
#define MULT_2SPIN_2_A64FXf \
{ \
asm ( \
"fcmla z18.s, p5/m, z27.s, z13.s, 0 \n\t" \
"fcmla z21.s, p5/m, z27.s, z16.s, 0 \n\t" \
"fcmla z19.s, p5/m, z28.s, z13.s, 0 \n\t" \
"fcmla z22.s, p5/m, z28.s, z16.s, 0 \n\t" \
"fcmla z20.s, p5/m, z29.s, z13.s, 0 \n\t" \
"fcmla z23.s, p5/m, z29.s, z16.s, 0 \n\t" \
"fcmla z18.s, p5/m, z27.s, z13.s, 90 \n\t" \
"fcmla z21.s, p5/m, z27.s, z16.s, 90 \n\t" \
"fcmla z19.s, p5/m, z28.s, z13.s, 90 \n\t" \
"fcmla z22.s, p5/m, z28.s, z16.s, 90 \n\t" \
"fcmla z20.s, p5/m, z29.s, z13.s, 90 \n\t" \
"fcmla z23.s, p5/m, z29.s, z16.s, 90 \n\t" \
"fcmla z18.s, p5/m, z24.s, z14.s, 0 \n\t" \
"fcmla z21.s, p5/m, z24.s, z17.s, 0 \n\t" \
"fcmla z19.s, p5/m, z25.s, z14.s, 0 \n\t" \
"fcmla z22.s, p5/m, z25.s, z17.s, 0 \n\t" \
"fcmla z20.s, p5/m, z26.s, z14.s, 0 \n\t" \
"fcmla z23.s, p5/m, z26.s, z17.s, 0 \n\t" \
"fcmla z18.s, p5/m, z24.s, z14.s, 90 \n\t" \
"fcmla z21.s, p5/m, z24.s, z17.s, 90 \n\t" \
"fcmla z19.s, p5/m, z25.s, z14.s, 90 \n\t" \
"fcmla z22.s, p5/m, z25.s, z17.s, 90 \n\t" \
"fcmla z20.s, p5/m, z26.s, z14.s, 90 \n\t" \
"fcmla z23.s, p5/m, z26.s, z17.s, 90 \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// XP_PROJ
#define XP_PROJ_A64FXf \
{ \
asm ( \
"fcadd z12.s, p5/m, z12.s, z21.s, 90 \n\t" \
"fcadd z13.s, p5/m, z13.s, z22.s, 90 \n\t" \
"fcadd z14.s, p5/m, z14.s, z23.s, 90 \n\t" \
"fcadd z15.s, p5/m, z15.s, z18.s, 90 \n\t" \
"fcadd z16.s, p5/m, z16.s, z19.s, 90 \n\t" \
"fcadd z17.s, p5/m, z17.s, z20.s, 90 \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// XP_RECON
#define XP_RECON_A64FXf \
asm ( \
"movprfx z6.s, p5/m, z31.s \n\t" \
"fcadd z6.s, p5/m, z6.s, z21.s, 270 \n\t" \
"movprfx z7.s, p5/m, z31.s \n\t" \
"fcadd z7.s, p5/m, z7.s, z22.s, 270 \n\t" \
"movprfx z8.s, p5/m, z31.s \n\t" \
"fcadd z8.s, p5/m, z8.s, z23.s, 270 \n\t" \
"movprfx z9.s, p5/m, z31.s \n\t" \
"fcadd z9.s, p5/m, z9.s, z18.s, 270 \n\t" \
"movprfx z10.s, p5/m, z31.s \n\t" \
"fcadd z10.s, p5/m, z10.s, z19.s, 270 \n\t" \
"movprfx z11.s, p5/m, z31.s \n\t" \
"fcadd z11.s, p5/m, z11.s, z20.s, 270 \n\t" \
"mov z0.s, p5/m, z18.s \n\t" \
"mov z1.s, p5/m, z19.s \n\t" \
"mov z2.s, p5/m, z20.s \n\t" \
"mov z3.s, p5/m, z21.s \n\t" \
"mov z4.s, p5/m, z22.s \n\t" \
"mov z5.s, p5/m, z23.s \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// XP_RECON_ACCUM
#define XP_RECON_ACCUM_A64FXf \
asm ( \
"fcadd z9.s, p5/m, z9.s, z18.s, 270 \n\t" \
"fadd z0.s, p5/m, z0.s, z18.s \n\t" \
"fcadd z10.s, p5/m, z10.s, z19.s, 270 \n\t" \
"fadd z1.s, p5/m, z1.s, z19.s \n\t" \
"fcadd z11.s, p5/m, z11.s, z20.s, 270 \n\t" \
"fadd z2.s, p5/m, z2.s, z20.s \n\t" \
"fcadd z6.s, p5/m, z6.s, z21.s, 270 \n\t" \
"fadd z3.s, p5/m, z3.s, z21.s \n\t" \
"fcadd z7.s, p5/m, z7.s, z22.s, 270 \n\t" \
"fadd z4.s, p5/m, z4.s, z22.s \n\t" \
"fcadd z8.s, p5/m, z8.s, z23.s, 270 \n\t" \
"fadd z5.s, p5/m, z5.s, z23.s \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// YP_PROJ
#define YP_PROJ_A64FXf \
{ \
asm ( \
"fsub z12.s, p5/m, z12.s, z21.s \n\t" \
"fsub z13.s, p5/m, z13.s, z22.s \n\t" \
"fsub z14.s, p5/m, z14.s, z23.s \n\t" \
"fadd z15.s, p5/m, z15.s, z18.s \n\t" \
"fadd z16.s, p5/m, z16.s, z19.s \n\t" \
"fadd z17.s, p5/m, z17.s, z20.s \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// ZP_PROJ
#define ZP_PROJ_A64FXf \
{ \
asm ( \
"fcadd z12.s, p5/m, z12.s, z18.s, 90 \n\t" \
"fcadd z13.s, p5/m, z13.s, z19.s, 90 \n\t" \
"fcadd z14.s, p5/m, z14.s, z20.s, 90 \n\t" \
"fcadd z15.s, p5/m, z15.s, z21.s, 270 \n\t" \
"fcadd z16.s, p5/m, z16.s, z22.s, 270 \n\t" \
"fcadd z17.s, p5/m, z17.s, z23.s, 270 \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// TP_PROJ
#define TP_PROJ_A64FXf \
{ \
asm ( \
"fadd z12.s, p5/m, z12.s, z18.s \n\t" \
"fadd z13.s, p5/m, z13.s, z19.s \n\t" \
"fadd z14.s, p5/m, z14.s, z20.s \n\t" \
"fadd z15.s, p5/m, z15.s, z21.s \n\t" \
"fadd z16.s, p5/m, z16.s, z22.s \n\t" \
"fadd z17.s, p5/m, z17.s, z23.s \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// XM_PROJ
#define XM_PROJ_A64FXf \
{ \
asm ( \
"fcadd z12.s, p5/m, z12.s, z21.s, 270 \n\t" \
"fcadd z13.s, p5/m, z13.s, z22.s, 270 \n\t" \
"fcadd z14.s, p5/m, z14.s, z23.s, 270 \n\t" \
"fcadd z15.s, p5/m, z15.s, z18.s, 270 \n\t" \
"fcadd z16.s, p5/m, z16.s, z19.s, 270 \n\t" \
"fcadd z17.s, p5/m, z17.s, z20.s, 270 \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// XM_RECON
#define XM_RECON_A64FXf \
asm ( \
"movprfx z6.s, p5/m, z31.s \n\t" \
"fcadd z6.s, p5/m, z6.s, z21.s, 90 \n\t" \
"movprfx z7.s, p5/m, z31.s \n\t" \
"fcadd z7.s, p5/m, z7.s, z22.s, 90 \n\t" \
"movprfx z8.s, p5/m, z31.s \n\t" \
"fcadd z8.s, p5/m, z8.s, z23.s, 90 \n\t" \
"movprfx z9.s, p5/m, z31.s \n\t" \
"fcadd z9.s, p5/m, z9.s, z18.s, 90 \n\t" \
"movprfx z10.s, p5/m, z31.s \n\t" \
"fcadd z10.s, p5/m, z10.s, z19.s, 90 \n\t" \
"movprfx z11.s, p5/m, z31.s \n\t" \
"fcadd z11.s, p5/m, z11.s, z20.s, 90 \n\t" \
"mov z0.s, p5/m, z18.s \n\t" \
"mov z1.s, p5/m, z19.s \n\t" \
"mov z2.s, p5/m, z20.s \n\t" \
"mov z3.s, p5/m, z21.s \n\t" \
"mov z4.s, p5/m, z22.s \n\t" \
"mov z5.s, p5/m, z23.s \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// YM_PROJ
#define YM_PROJ_A64FXf \
{ \
asm ( \
"fadd z12.s, p5/m, z12.s, z21.s \n\t" \
"fadd z13.s, p5/m, z13.s, z22.s \n\t" \
"fadd z14.s, p5/m, z14.s, z23.s \n\t" \
"fsub z15.s, p5/m, z15.s, z18.s \n\t" \
"fsub z16.s, p5/m, z16.s, z19.s \n\t" \
"fsub z17.s, p5/m, z17.s, z20.s \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// ZM_PROJ
#define ZM_PROJ_A64FXf \
{ \
asm ( \
"fcadd z12.s, p5/m, z12.s, z18.s, 270 \n\t" \
"fcadd z13.s, p5/m, z13.s, z19.s, 270 \n\t" \
"fcadd z14.s, p5/m, z14.s, z20.s, 270 \n\t" \
"fcadd z15.s, p5/m, z15.s, z21.s, 90 \n\t" \
"fcadd z16.s, p5/m, z16.s, z22.s, 90 \n\t" \
"fcadd z17.s, p5/m, z17.s, z23.s, 90 \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// TM_PROJ
#define TM_PROJ_A64FXf \
{ \
asm ( \
"ptrue p5.s \n\t" \
"fsub z12.s, p5/m, z12.s, z18.s \n\t" \
"fsub z13.s, p5/m, z13.s, z19.s \n\t" \
"fsub z14.s, p5/m, z14.s, z20.s \n\t" \
"fsub z15.s, p5/m, z15.s, z21.s \n\t" \
"fsub z16.s, p5/m, z16.s, z22.s \n\t" \
"fsub z17.s, p5/m, z17.s, z23.s \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
); \
}
// XM_RECON_ACCUM
#define XM_RECON_ACCUM_A64FXf \
asm ( \
"fcadd z9.s, p5/m, z9.s, z18.s, 90 \n\t" \
"fcadd z10.s, p5/m, z10.s, z19.s, 90 \n\t" \
"fcadd z11.s, p5/m, z11.s, z20.s, 90 \n\t" \
"fcadd z6.s, p5/m, z6.s, z21.s, 90 \n\t" \
"fcadd z7.s, p5/m, z7.s, z22.s, 90 \n\t" \
"fcadd z8.s, p5/m, z8.s, z23.s, 90 \n\t" \
"fadd z0.s, p5/m, z0.s, z18.s \n\t" \
"fadd z1.s, p5/m, z1.s, z19.s \n\t" \
"fadd z2.s, p5/m, z2.s, z20.s \n\t" \
"fadd z3.s, p5/m, z3.s, z21.s \n\t" \
"fadd z4.s, p5/m, z4.s, z22.s \n\t" \
"fadd z5.s, p5/m, z5.s, z23.s \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// YP_RECON_ACCUM
#define YP_RECON_ACCUM_A64FXf \
asm ( \
"fadd z0.s, p5/m, z0.s, z18.s \n\t" \
"fsub z9.s, p5/m, z9.s, z18.s \n\t" \
"fadd z1.s, p5/m, z1.s, z19.s \n\t" \
"fsub z10.s, p5/m, z10.s, z19.s \n\t" \
"fadd z2.s, p5/m, z2.s, z20.s \n\t" \
"fsub z11.s, p5/m, z11.s, z20.s \n\t" \
"fadd z3.s, p5/m, z3.s, z21.s \n\t" \
"fadd z6.s, p5/m, z6.s, z21.s \n\t" \
"fadd z4.s, p5/m, z4.s, z22.s \n\t" \
"fadd z7.s, p5/m, z7.s, z22.s \n\t" \
"fadd z5.s, p5/m, z5.s, z23.s \n\t" \
"fadd z8.s, p5/m, z8.s, z23.s \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// YM_RECON_ACCUM
#define YM_RECON_ACCUM_A64FXf \
asm ( \
"fadd z0.s, p5/m, z0.s, z18.s \n\t" \
"fadd z9.s, p5/m, z9.s, z18.s \n\t" \
"fadd z1.s, p5/m, z1.s, z19.s \n\t" \
"fadd z10.s, p5/m, z10.s, z19.s \n\t" \
"fadd z2.s, p5/m, z2.s, z20.s \n\t" \
"fadd z11.s, p5/m, z11.s, z20.s \n\t" \
"fadd z3.s, p5/m, z3.s, z21.s \n\t" \
"fsub z6.s, p5/m, z6.s, z21.s \n\t" \
"fadd z4.s, p5/m, z4.s, z22.s \n\t" \
"fsub z7.s, p5/m, z7.s, z22.s \n\t" \
"fadd z5.s, p5/m, z5.s, z23.s \n\t" \
"fsub z8.s, p5/m, z8.s, z23.s \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// ZP_RECON_ACCUM
#define ZP_RECON_ACCUM_A64FXf \
asm ( \
"fcadd z6.s, p5/m, z6.s, z18.s, 270 \n\t" \
"fadd z0.s, p5/m, z0.s, z18.s \n\t" \
"fcadd z7.s, p5/m, z7.s, z19.s, 270 \n\t" \
"fadd z1.s, p5/m, z1.s, z19.s \n\t" \
"fcadd z8.s, p5/m, z8.s, z20.s, 270 \n\t" \
"fadd z2.s, p5/m, z2.s, z20.s \n\t" \
"fcadd z9.s, p5/m, z9.s, z21.s, 90 \n\t" \
"fadd z3.s, p5/m, z3.s, z21.s \n\t" \
"fcadd z10.s, p5/m, z10.s, z22.s, 90 \n\t" \
"fadd z4.s, p5/m, z4.s, z22.s \n\t" \
"fcadd z11.s, p5/m, z11.s, z23.s, 90 \n\t" \
"fadd z5.s, p5/m, z5.s, z23.s \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// ZM_RECON_ACCUM
#define ZM_RECON_ACCUM_A64FXf \
asm ( \
"fcadd z6.s, p5/m, z6.s, z18.s, 90 \n\t" \
"fadd z0.s, p5/m, z0.s, z18.s \n\t" \
"fcadd z7.s, p5/m, z7.s, z19.s, 90 \n\t" \
"fadd z1.s, p5/m, z1.s, z19.s \n\t" \
"fcadd z8.s, p5/m, z8.s, z20.s, 90 \n\t" \
"fadd z2.s, p5/m, z2.s, z20.s \n\t" \
"fcadd z9.s, p5/m, z9.s, z21.s, 270 \n\t" \
"fadd z3.s, p5/m, z3.s, z21.s \n\t" \
"fcadd z10.s, p5/m, z10.s, z22.s, 270 \n\t" \
"fadd z4.s, p5/m, z4.s, z22.s \n\t" \
"fcadd z11.s, p5/m, z11.s, z23.s, 270 \n\t" \
"fadd z5.s, p5/m, z5.s, z23.s \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// TP_RECON_ACCUM
#define TP_RECON_ACCUM_A64FXf \
asm ( \
"fadd z0.s, p5/m, z0.s, z18.s \n\t" \
"fadd z6.s, p5/m, z6.s, z18.s \n\t" \
"fadd z1.s, p5/m, z1.s, z19.s \n\t" \
"fadd z7.s, p5/m, z7.s, z19.s \n\t" \
"fadd z2.s, p5/m, z2.s, z20.s \n\t" \
"fadd z8.s, p5/m, z8.s, z20.s \n\t" \
"fadd z3.s, p5/m, z3.s, z21.s \n\t" \
"fadd z9.s, p5/m, z9.s, z21.s \n\t" \
"fadd z4.s, p5/m, z4.s, z22.s \n\t" \
"fadd z10.s, p5/m, z10.s, z22.s \n\t" \
"fadd z5.s, p5/m, z5.s, z23.s \n\t" \
"fadd z11.s, p5/m, z11.s, z23.s \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// TM_RECON_ACCUM
#define TM_RECON_ACCUM_A64FXf \
asm ( \
"fadd z0.s, p5/m, z0.s, z18.s \n\t" \
"fsub z6.s, p5/m, z6.s, z18.s \n\t" \
"fadd z1.s, p5/m, z1.s, z19.s \n\t" \
"fsub z7.s, p5/m, z7.s, z19.s \n\t" \
"fadd z2.s, p5/m, z2.s, z20.s \n\t" \
"fsub z8.s, p5/m, z8.s, z20.s \n\t" \
"fadd z3.s, p5/m, z3.s, z21.s \n\t" \
"fsub z9.s, p5/m, z9.s, z21.s \n\t" \
"fadd z4.s, p5/m, z4.s, z22.s \n\t" \
"fsub z10.s, p5/m, z10.s, z22.s \n\t" \
"fadd z5.s, p5/m, z5.s, z23.s \n\t" \
"fsub z11.s, p5/m, z11.s, z23.s \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// ZERO_PSI
#define ZERO_PSI_A64FXf \
asm ( \
"ptrue p5.s \n\t" \
"fmov z0.s , 0 \n\t" \
"fmov z1.s , 0 \n\t" \
"fmov z2.s , 0 \n\t" \
"fmov z3.s , 0 \n\t" \
"fmov z4.s , 0 \n\t" \
"fmov z5.s , 0 \n\t" \
"fmov z6.s , 0 \n\t" \
"fmov z7.s , 0 \n\t" \
"fmov z8.s , 0 \n\t" \
"fmov z9.s , 0 \n\t" \
"fmov z10.s , 0 \n\t" \
"fmov z11.s , 0 \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);
// PREFETCH_RESULT_L2_STORE (prefetch store to L2)
#define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXf(base) \
{ \
asm ( \
"prfd PSTL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
"prfd PSTL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
"prfd PSTL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
: \
: [fetchptr] "r" (base) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// PREFETCH_RESULT_L1_STORE (prefetch store to L1)
#define PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXf(base) \
{ \
asm ( \
"prfd PSTL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
"prfd PSTL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
"prfd PSTL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
: \
: [fetchptr] "r" (base) \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
); \
}
// ADD_RESULT_INTERNAL
#define ADD_RESULT_INTERNAL_A64FXf \
asm ( \
"fadd z0.s, p5/m, z0.s, z12.s \n\t" \
"fadd z1.s, p5/m, z1.s, z13.s \n\t" \
"fadd z2.s, p5/m, z2.s, z14.s \n\t" \
"fadd z3.s, p5/m, z3.s, z15.s \n\t" \
"fadd z4.s, p5/m, z4.s, z16.s \n\t" \
"fadd z5.s, p5/m, z5.s, z17.s \n\t" \
"fadd z6.s, p5/m, z6.s, z18.s \n\t" \
"fadd z7.s, p5/m, z7.s, z19.s \n\t" \
"fadd z8.s, p5/m, z8.s, z20.s \n\t" \
"fadd z9.s, p5/m, z9.s, z21.s \n\t" \
"fadd z10.s, p5/m, z10.s, z22.s \n\t" \
"fadd z11.s, p5/m, z11.s, z23.s \n\t" \
: \
: \
: "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
);

View File

@ -1,601 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Fujitsu_A64FX_intrin_double.h
Copyright (C) 2020
Author: Nils Meyer <nils.meyer@ur.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#define LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_A64FXd(base)
#define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXd(A)
#define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A)
#define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXd(A)
#define PREFETCH_GAUGE_L2(A) PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A)
#define PF_GAUGE(A)
#define PREFETCH_RESULT_L2_STORE(A) PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXd(A)
#define PREFETCH_RESULT_L1_STORE(A) PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXd(A)
#define PREFETCH1_CHIMU(A) PREFETCH_CHIMU_L1(A)
#define PREFETCH_CHIMU(A) PREFETCH_CHIMU_L1(A)
#define LOCK_GAUGE(A)
#define UNLOCK_GAUGE(A)
#define MASK_REGS DECLARATIONS_A64FXd
#define SAVE_RESULT(A,B) RESULT_A64FXd(A); PREFETCH_RESULT_L2_STORE(B)
#define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXd(Dir)
#define MULT_2SPIN_2 MULT_2SPIN_2_A64FXd
#define LOAD_CHI(base) LOAD_CHI_A64FXd(base)
#define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base)
#define XP_PROJ XP_PROJ_A64FXd
#define YP_PROJ YP_PROJ_A64FXd
#define ZP_PROJ ZP_PROJ_A64FXd
#define TP_PROJ TP_PROJ_A64FXd
#define XM_PROJ XM_PROJ_A64FXd
#define YM_PROJ YM_PROJ_A64FXd
#define ZM_PROJ ZM_PROJ_A64FXd
#define TM_PROJ TM_PROJ_A64FXd
#define XP_RECON XP_RECON_A64FXd
#define XM_RECON XM_RECON_A64FXd
#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXd
#define YM_RECON_ACCUM YM_RECON_ACCUM_A64FXd
#define ZM_RECON_ACCUM ZM_RECON_ACCUM_A64FXd
#define TM_RECON_ACCUM TM_RECON_ACCUM_A64FXd
#define XP_RECON_ACCUM XP_RECON_ACCUM_A64FXd
#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXd
#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXd
#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXd
#define PERMUTE_DIR0 0
#define PERMUTE_DIR1 1
#define PERMUTE_DIR2 2
#define PERMUTE_DIR3 3
#define PERMUTE PERMUTE_A64FXd;
#define LOAD_TABLE(Dir) if (Dir == 0) { LOAD_TABLE0; } else if (Dir == 1) { LOAD_TABLE1; } else if (Dir == 2) { LOAD_TABLE2; }
#define MAYBEPERM(Dir,perm) if (Dir != 3) { if (perm) { PERMUTE; } }
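// Note the double-precision kernel never permutes in direction 3: MAYBEPERM
// skips Dir == 3 and LOAD_TABLE has no Dir == 3 branch, so LOAD_TABLE3 and
// lut row 3 below are unused placeholders (the row is just the identity
// pattern).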
// DECLARATIONS
#define DECLARATIONS_A64FXd \
const uint64_t lut[4][8] = { \
{4, 5, 6, 7, 0, 1, 2, 3}, \
{2, 3, 0, 1, 6, 7, 4, 5}, \
{1, 0, 3, 2, 5, 4, 7, 6}, \
{0, 1, 2, 3, 4, 5, 6, 7} };\
svfloat64_t result_00; \
svfloat64_t result_01; \
svfloat64_t result_02; \
svfloat64_t result_10; \
svfloat64_t result_11; \
svfloat64_t result_12; \
svfloat64_t result_20; \
svfloat64_t result_21; \
svfloat64_t result_22; \
svfloat64_t result_30; \
svfloat64_t result_31; \
svfloat64_t result_32; \
svfloat64_t Chi_00; \
svfloat64_t Chi_01; \
svfloat64_t Chi_02; \
svfloat64_t Chi_10; \
svfloat64_t Chi_11; \
svfloat64_t Chi_12; \
svfloat64_t UChi_00; \
svfloat64_t UChi_01; \
svfloat64_t UChi_02; \
svfloat64_t UChi_10; \
svfloat64_t UChi_11; \
svfloat64_t UChi_12; \
svfloat64_t U_00; \
svfloat64_t U_10; \
svfloat64_t U_20; \
svfloat64_t U_01; \
svfloat64_t U_11; \
svfloat64_t U_21; \
svbool_t pg1; \
pg1 = svptrue_b64(); \
svuint64_t table0; \
svfloat64_t zero0; \
zero0 = svdup_f64(0.);
#define Chimu_00 Chi_00
#define Chimu_01 Chi_01
#define Chimu_02 Chi_02
#define Chimu_10 Chi_10
#define Chimu_11 Chi_11
#define Chimu_12 Chi_12
#define Chimu_20 UChi_00
#define Chimu_21 UChi_01
#define Chimu_22 UChi_02
#define Chimu_30 UChi_10
#define Chimu_31 UChi_11
#define Chimu_32 UChi_12
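// The svfloat64_t variables correspond one-to-one to the fixed registers of
// the asm variant: result_* to z0-z11, Chi_* to z12-z17, UChi_* to z18-z23,
// U_* to z24-z29, table0 to z30 and zero0 to z31. The Chimu_2*/Chimu_3*
// aliases overlay the lower two spins on UChi_*, keeping the working set
// within the 32 SVE vector registers.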
// RESULT
#define RESULT_A64FXd(base) \
{ \
svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64), result_00); \
svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64), result_01); \
svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64), result_02); \
svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64), result_10); \
svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64), result_11); \
svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64), result_12); \
svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64), result_20); \
svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64), result_21); \
svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64), result_22); \
svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64), result_30); \
svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64), result_31); \
svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64), result_32); \
}
// PREFETCH_CHIMU_L2 (prefetch to L2)
#define PREFETCH_CHIMU_L2_INTERNAL_A64FXd(base) \
{ \
svprfd(pg1, (int64_t*)(base + 0), SV_PLDL2STRM); \
svprfd(pg1, (int64_t*)(base + 256), SV_PLDL2STRM); \
svprfd(pg1, (int64_t*)(base + 512), SV_PLDL2STRM); \
}
// PREFETCH_CHIMU_L1 (prefetch to L1)
#define PREFETCH_CHIMU_L1_INTERNAL_A64FXd(base) \
{ \
svprfd(pg1, (int64_t*)(base + 0), SV_PLDL1STRM); \
svprfd(pg1, (int64_t*)(base + 256), SV_PLDL1STRM); \
svprfd(pg1, (int64_t*)(base + 512), SV_PLDL1STRM); \
}
// PREFETCH_GAUGE_L2 (prefetch to L2)
#define PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A) \
{ \
const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \
svprfd(pg1, (int64_t*)(baseU + -256), SV_PLDL2STRM); \
svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL2STRM); \
svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL2STRM); \
svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL2STRM); \
svprfd(pg1, (int64_t*)(baseU + 768), SV_PLDL2STRM); \
svprfd(pg1, (int64_t*)(baseU + 1024), SV_PLDL2STRM); \
svprfd(pg1, (int64_t*)(baseU + 1280), SV_PLDL2STRM); \
svprfd(pg1, (int64_t*)(baseU + 1536), SV_PLDL2STRM); \
svprfd(pg1, (int64_t*)(baseU + 1792), SV_PLDL2STRM); \
}
// PREFETCH_GAUGE_L1 (prefetch to L1)
#define PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A) \
{ \
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL1STRM); \
svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL1STRM); \
svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL1STRM); \
}
// LOAD_CHI
#define LOAD_CHI_A64FXd(base) \
{ \
Chi_00 = svld1(pg1, (float64_t*)(base + 0 * 64)); \
Chi_01 = svld1(pg1, (float64_t*)(base + 1 * 64)); \
Chi_02 = svld1(pg1, (float64_t*)(base + 2 * 64)); \
Chi_10 = svld1(pg1, (float64_t*)(base + 3 * 64)); \
Chi_11 = svld1(pg1, (float64_t*)(base + 4 * 64)); \
Chi_12 = svld1(pg1, (float64_t*)(base + 5 * 64)); \
}
// LOAD_CHIMU
#define LOAD_CHIMU_INTERLEAVED_A64FXd(base) \
{ \
Chimu_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64)); \
Chimu_30 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64)); \
Chimu_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64)); \
Chimu_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64)); \
Chimu_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64)); \
Chimu_31 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64)); \
Chimu_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64)); \
Chimu_21 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64)); \
Chimu_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64)); \
Chimu_32 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64)); \
Chimu_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64)); \
Chimu_22 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64)); \
}
// LOAD_CHIMU_0213
#define LOAD_CHIMU_0213_A64FXd \
{ \
const SiteSpinor & ref(in[offset]); uint64_t base = (uint64_t)&ref; \
Chimu_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64)); \
Chimu_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64)); \
Chimu_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64)); \
Chimu_21 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64)); \
Chimu_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64)); \
Chimu_22 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64)); \
Chimu_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64)); \
Chimu_30 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64)); \
Chimu_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64)); \
Chimu_31 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64)); \
Chimu_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64)); \
Chimu_32 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64)); \
}
// LOAD_CHIMU_0312
#define LOAD_CHIMU_0312_A64FXd \
{ \
const SiteSpinor & ref(in[offset]); uint64_t base = (uint64_t)&ref; \
Chimu_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64)); \
Chimu_30 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64)); \
Chimu_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64)); \
Chimu_31 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64)); \
Chimu_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64)); \
Chimu_32 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64)); \
Chimu_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64)); \
Chimu_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64)); \
Chimu_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64)); \
Chimu_21 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64)); \
Chimu_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64)); \
Chimu_22 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64)); \
}
// LOAD_TABLE0
#define LOAD_TABLE0 \
table0 = svld1(pg1, (uint64_t*)&lut[0]);
// LOAD_TABLE1
#define LOAD_TABLE1 \
table0 = svld1(pg1, (uint64_t*)&lut[1]);
// LOAD_TABLE2
#define LOAD_TABLE2 \
table0 = svld1(pg1, (uint64_t*)&lut[2]);
// LOAD_TABLE3
#define LOAD_TABLE3 \
table0 = svld1(pg1, (uint64_t*)&lut[3]);
// PERMUTE
#define PERMUTE_A64FXd \
Chi_00 = svtbl(Chi_00, table0); \
Chi_01 = svtbl(Chi_01, table0); \
Chi_02 = svtbl(Chi_02, table0); \
Chi_10 = svtbl(Chi_10, table0); \
Chi_11 = svtbl(Chi_11, table0); \
Chi_12 = svtbl(Chi_12, table0);
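// svtbl is the intrinsic form of the tbl permute: each Chi vector is
// gathered through the index vector table0 loaded by LOAD_TABLE*.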
// LOAD_GAUGE
#define LOAD_GAUGE \
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
{ \
U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \
U_10 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -3 * 64)); \
U_20 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 0 * 64)); \
U_01 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -5 * 64)); \
U_11 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -2 * 64)); \
U_21 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \
}
// MULT_2SPIN
#define MULT_2SPIN_1_A64FXd(A) \
{ \
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \
U_10 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -3 * 64)); \
U_20 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 0 * 64)); \
U_01 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -5 * 64)); \
U_11 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -2 * 64)); \
U_21 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \
UChi_00 = svcmla_x(pg1, zero0, U_00, Chi_00, 0); \
UChi_10 = svcmla_x(pg1, zero0, U_00, Chi_10, 0); \
UChi_01 = svcmla_x(pg1, zero0, U_10, Chi_00, 0); \
UChi_11 = svcmla_x(pg1, zero0, U_10, Chi_10, 0); \
UChi_02 = svcmla_x(pg1, zero0, U_20, Chi_00, 0); \
UChi_12 = svcmla_x(pg1, zero0, U_20, Chi_10, 0); \
UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_00, 90); \
UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_10, 90); \
UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_00, 90); \
UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_10, 90); \
UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_00, 90); \
UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_10, 90); \
U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -4 * 64)); \
U_10 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -1 * 64)); \
U_20 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 2 * 64)); \
}
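// Annotation (not in the original file): each FCMLA pair with rotations
// 0 and 90 accumulates one full complex product, since
//   svcmla_x(pg, acc, a, b,  0) : acc.re += a.re*b.re, acc.im += a.re*b.im
//   svcmla_x(pg, acc, a, b, 90) : acc.re -= a.im*b.im, acc.im += a.im*b.re
// so the twelve FCMLAs above apply colour column 0 of the gauge link to
// colour component 0 of both half-spinor rows; the trailing loads stage
// colour column 2, and MULT_2SPIN_2 below adds columns 1 and 2.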
// MULT_2SPIN_BACKEND
#define MULT_2SPIN_2_A64FXd \
{ \
UChi_00 = svcmla_x(pg1, UChi_00, U_01, Chi_01, 0); \
UChi_10 = svcmla_x(pg1, UChi_10, U_01, Chi_11, 0); \
UChi_01 = svcmla_x(pg1, UChi_01, U_11, Chi_01, 0); \
UChi_11 = svcmla_x(pg1, UChi_11, U_11, Chi_11, 0); \
UChi_02 = svcmla_x(pg1, UChi_02, U_21, Chi_01, 0); \
UChi_12 = svcmla_x(pg1, UChi_12, U_21, Chi_11, 0); \
UChi_00 = svcmla_x(pg1, UChi_00, U_01, Chi_01, 90); \
UChi_10 = svcmla_x(pg1, UChi_10, U_01, Chi_11, 90); \
UChi_01 = svcmla_x(pg1, UChi_01, U_11, Chi_01, 90); \
UChi_11 = svcmla_x(pg1, UChi_11, U_11, Chi_11, 90); \
UChi_02 = svcmla_x(pg1, UChi_02, U_21, Chi_01, 90); \
UChi_12 = svcmla_x(pg1, UChi_12, U_21, Chi_11, 90); \
UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_02, 0); \
UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_12, 0); \
UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_02, 0); \
UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_12, 0); \
UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_02, 0); \
UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_12, 0); \
UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_02, 90); \
UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_12, 90); \
UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_02, 90); \
UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_12, 90); \
UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_02, 90); \
UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_12, 90); \
}
// XP_PROJ
#define XP_PROJ_A64FXd \
{ \
Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_30, 90); \
Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_31, 90); \
Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_32, 90); \
Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_20, 90); \
Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_21, 90); \
Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_22, 90); \
}
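// Annotation (not in the original file): svcadd_x(pg, a, b, 90) returns
// a + i*b on interleaved complex lanes (rotation 270 gives a - i*b), so
// XP_PROJ evaluates the x-direction two-spinor projection
//   chi_0c = psi_0c + i*psi_3c,  chi_1c = psi_1c + i*psi_2c  (c = 0,1,2),
// assuming Grid's chiral gamma-matrix basis.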
// XP_RECON
#define XP_RECON_A64FXd \
result_20 = svcadd_x(pg1, zero0, UChi_10, 270); \
result_21 = svcadd_x(pg1, zero0, UChi_11, 270); \
result_22 = svcadd_x(pg1, zero0, UChi_12, 270); \
result_30 = svcadd_x(pg1, zero0, UChi_00, 270); \
result_31 = svcadd_x(pg1, zero0, UChi_01, 270); \
result_32 = svcadd_x(pg1, zero0, UChi_02, 270); \
result_00 = UChi_00; \
result_01 = UChi_01; \
result_02 = UChi_02; \
result_10 = UChi_10; \
result_11 = UChi_11; \
result_12 = UChi_12;
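// Annotation (not in the original file): with rotation 270 acting as
// multiplication by -i (accumulated here onto zero0), XP_RECON rebuilds
// the full spinor from the projected half-spinor as
//   r_0 = UChi_0,  r_1 = UChi_1,  r_2 = -i*UChi_1,  r_3 = -i*UChi_0,
// the inverse of the XP_PROJ decomposition above.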
// XP_RECON_ACCUM
#define XP_RECON_ACCUM_A64FXd \
result_30 = svcadd_x(pg1, result_30, UChi_00, 270); \
result_00 = svadd_x(pg1, result_00, UChi_00); \
result_31 = svcadd_x(pg1, result_31, UChi_01, 270); \
result_01 = svadd_x(pg1, result_01, UChi_01); \
result_32 = svcadd_x(pg1, result_32, UChi_02, 270); \
result_02 = svadd_x(pg1, result_02, UChi_02); \
result_20 = svcadd_x(pg1, result_20, UChi_10, 270); \
result_10 = svadd_x(pg1, result_10, UChi_10); \
result_21 = svcadd_x(pg1, result_21, UChi_11, 270); \
result_11 = svadd_x(pg1, result_11, UChi_11); \
result_22 = svcadd_x(pg1, result_22, UChi_12, 270); \
result_12 = svadd_x(pg1, result_12, UChi_12);
// YP_PROJ
#define YP_PROJ_A64FXd \
{ \
Chi_00 = svsub_x(pg1, Chimu_00, Chimu_30); \
Chi_01 = svsub_x(pg1, Chimu_01, Chimu_31); \
Chi_02 = svsub_x(pg1, Chimu_02, Chimu_32); \
Chi_10 = svadd_x(pg1, Chimu_10, Chimu_20); \
Chi_11 = svadd_x(pg1, Chimu_11, Chimu_21); \
Chi_12 = svadd_x(pg1, Chimu_12, Chimu_22); \
}
// ZP_PROJ
#define ZP_PROJ_A64FXd \
{ \
Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_20, 90); \
Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_21, 90); \
Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_22, 90); \
Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_30, 270); \
Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_31, 270); \
Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_32, 270); \
}
// TP_PROJ
#define TP_PROJ_A64FXd \
{ \
Chi_00 = svadd_x(pg1, Chimu_00, Chimu_20); \
Chi_01 = svadd_x(pg1, Chimu_01, Chimu_21); \
Chi_02 = svadd_x(pg1, Chimu_02, Chimu_22); \
Chi_10 = svadd_x(pg1, Chimu_10, Chimu_30); \
Chi_11 = svadd_x(pg1, Chimu_11, Chimu_31); \
Chi_12 = svadd_x(pg1, Chimu_12, Chimu_32); \
}
// XM_PROJ
#define XM_PROJ_A64FXd \
{ \
Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_30, 270); \
Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_31, 270); \
Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_32, 270); \
Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_20, 270); \
Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_21, 270); \
Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_22, 270); \
}
// XM_RECON
#define XM_RECON_A64FXd \
result_20 = svcadd_x(pg1, zero0, UChi_10, 90); \
result_21 = svcadd_x(pg1, zero0, UChi_11, 90); \
result_22 = svcadd_x(pg1, zero0, UChi_12, 90); \
result_30 = svcadd_x(pg1, zero0, UChi_00, 90); \
result_31 = svcadd_x(pg1, zero0, UChi_01, 90); \
result_32 = svcadd_x(pg1, zero0, UChi_02, 90); \
result_00 = UChi_00; \
result_01 = UChi_01; \
result_02 = UChi_02; \
result_10 = UChi_10; \
result_11 = UChi_11; \
result_12 = UChi_12;
// YM_PROJ
#define YM_PROJ_A64FXd \
{ \
Chi_00 = svadd_x(pg1, Chimu_00, Chimu_30); \
Chi_01 = svadd_x(pg1, Chimu_01, Chimu_31); \
Chi_02 = svadd_x(pg1, Chimu_02, Chimu_32); \
Chi_10 = svsub_x(pg1, Chimu_10, Chimu_20); \
Chi_11 = svsub_x(pg1, Chimu_11, Chimu_21); \
Chi_12 = svsub_x(pg1, Chimu_12, Chimu_22); \
}
// ZM_PROJ
#define ZM_PROJ_A64FXd \
{ \
Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_20, 270); \
Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_21, 270); \
Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_22, 270); \
Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_30, 90); \
Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_31, 90); \
Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_32, 90); \
}
// TM_PROJ
#define TM_PROJ_A64FXd \
{ \
Chi_00 = svsub_x(pg1, Chimu_00, Chimu_20); \
Chi_01 = svsub_x(pg1, Chimu_01, Chimu_21); \
Chi_02 = svsub_x(pg1, Chimu_02, Chimu_22); \
Chi_10 = svsub_x(pg1, Chimu_10, Chimu_30); \
Chi_11 = svsub_x(pg1, Chimu_11, Chimu_31); \
Chi_12 = svsub_x(pg1, Chimu_12, Chimu_32); \
}
// XM_RECON_ACCUM
#define XM_RECON_ACCUM_A64FXd \
result_30 = svcadd_x(pg1, result_30, UChi_00, 90); \
result_31 = svcadd_x(pg1, result_31, UChi_01, 90); \
result_32 = svcadd_x(pg1, result_32, UChi_02, 90); \
result_20 = svcadd_x(pg1, result_20, UChi_10, 90); \
result_21 = svcadd_x(pg1, result_21, UChi_11, 90); \
result_22 = svcadd_x(pg1, result_22, UChi_12, 90); \
result_00 = svadd_x(pg1, result_00, UChi_00); \
result_01 = svadd_x(pg1, result_01, UChi_01); \
result_02 = svadd_x(pg1, result_02, UChi_02); \
result_10 = svadd_x(pg1, result_10, UChi_10); \
result_11 = svadd_x(pg1, result_11, UChi_11); \
result_12 = svadd_x(pg1, result_12, UChi_12);
// YP_RECON_ACCUM
#define YP_RECON_ACCUM_A64FXd \
result_00 = svadd_x(pg1, result_00, UChi_00); \
result_30 = svsub_x(pg1, result_30, UChi_00); \
result_01 = svadd_x(pg1, result_01, UChi_01); \
result_31 = svsub_x(pg1, result_31, UChi_01); \
result_02 = svadd_x(pg1, result_02, UChi_02); \
result_32 = svsub_x(pg1, result_32, UChi_02); \
result_10 = svadd_x(pg1, result_10, UChi_10); \
result_20 = svadd_x(pg1, result_20, UChi_10); \
result_11 = svadd_x(pg1, result_11, UChi_11); \
result_21 = svadd_x(pg1, result_21, UChi_11); \
result_12 = svadd_x(pg1, result_12, UChi_12); \
result_22 = svadd_x(pg1, result_22, UChi_12);
// YM_RECON_ACCUM
#define YM_RECON_ACCUM_A64FXd \
result_00 = svadd_x(pg1, result_00, UChi_00); \
result_30 = svadd_x(pg1, result_30, UChi_00); \
result_01 = svadd_x(pg1, result_01, UChi_01); \
result_31 = svadd_x(pg1, result_31, UChi_01); \
result_02 = svadd_x(pg1, result_02, UChi_02); \
result_32 = svadd_x(pg1, result_32, UChi_02); \
result_10 = svadd_x(pg1, result_10, UChi_10); \
result_20 = svsub_x(pg1, result_20, UChi_10); \
result_11 = svadd_x(pg1, result_11, UChi_11); \
result_21 = svsub_x(pg1, result_21, UChi_11); \
result_12 = svadd_x(pg1, result_12, UChi_12); \
result_22 = svsub_x(pg1, result_22, UChi_12);
// ZP_RECON_ACCUM
#define ZP_RECON_ACCUM_A64FXd \
result_20 = svcadd_x(pg1, result_20, UChi_00, 270); \
result_00 = svadd_x(pg1, result_00, UChi_00); \
result_21 = svcadd_x(pg1, result_21, UChi_01, 270); \
result_01 = svadd_x(pg1, result_01, UChi_01); \
result_22 = svcadd_x(pg1, result_22, UChi_02, 270); \
result_02 = svadd_x(pg1, result_02, UChi_02); \
result_30 = svcadd_x(pg1, result_30, UChi_10, 90); \
result_10 = svadd_x(pg1, result_10, UChi_10); \
result_31 = svcadd_x(pg1, result_31, UChi_11, 90); \
result_11 = svadd_x(pg1, result_11, UChi_11); \
result_32 = svcadd_x(pg1, result_32, UChi_12, 90); \
result_12 = svadd_x(pg1, result_12, UChi_12);
// ZM_RECON_ACCUM
#define ZM_RECON_ACCUM_A64FXd \
result_20 = svcadd_x(pg1, result_20, UChi_00, 90); \
result_00 = svadd_x(pg1, result_00, UChi_00); \
result_21 = svcadd_x(pg1, result_21, UChi_01, 90); \
result_01 = svadd_x(pg1, result_01, UChi_01); \
result_22 = svcadd_x(pg1, result_22, UChi_02, 90); \
result_02 = svadd_x(pg1, result_02, UChi_02); \
result_30 = svcadd_x(pg1, result_30, UChi_10, 270); \
result_10 = svadd_x(pg1, result_10, UChi_10); \
result_31 = svcadd_x(pg1, result_31, UChi_11, 270); \
result_11 = svadd_x(pg1, result_11, UChi_11); \
result_32 = svcadd_x(pg1, result_32, UChi_12, 270); \
result_12 = svadd_x(pg1, result_12, UChi_12);
// TP_RECON_ACCUM
#define TP_RECON_ACCUM_A64FXd \
result_00 = svadd_x(pg1, result_00, UChi_00); \
result_20 = svadd_x(pg1, result_20, UChi_00); \
result_01 = svadd_x(pg1, result_01, UChi_01); \
result_21 = svadd_x(pg1, result_21, UChi_01); \
result_02 = svadd_x(pg1, result_02, UChi_02); \
result_22 = svadd_x(pg1, result_22, UChi_02); \
result_10 = svadd_x(pg1, result_10, UChi_10); \
result_30 = svadd_x(pg1, result_30, UChi_10); \
result_11 = svadd_x(pg1, result_11, UChi_11); \
result_31 = svadd_x(pg1, result_31, UChi_11); \
result_12 = svadd_x(pg1, result_12, UChi_12); \
result_32 = svadd_x(pg1, result_32, UChi_12);
// TM_RECON_ACCUM
#define TM_RECON_ACCUM_A64FXd \
result_00 = svadd_x(pg1, result_00, UChi_00); \
result_20 = svsub_x(pg1, result_20, UChi_00); \
result_01 = svadd_x(pg1, result_01, UChi_01); \
result_21 = svsub_x(pg1, result_21, UChi_01); \
result_02 = svadd_x(pg1, result_02, UChi_02); \
result_22 = svsub_x(pg1, result_22, UChi_02); \
result_10 = svadd_x(pg1, result_10, UChi_10); \
result_30 = svsub_x(pg1, result_30, UChi_10); \
result_11 = svadd_x(pg1, result_11, UChi_11); \
result_31 = svsub_x(pg1, result_31, UChi_11); \
result_12 = svadd_x(pg1, result_12, UChi_12); \
result_32 = svsub_x(pg1, result_32, UChi_12);
// ZERO_PSI
#define ZERO_PSI_A64FXd \
result_00 = svdup_f64(0.); \
result_01 = svdup_f64(0.); \
result_02 = svdup_f64(0.); \
result_10 = svdup_f64(0.); \
result_11 = svdup_f64(0.); \
result_12 = svdup_f64(0.); \
result_20 = svdup_f64(0.); \
result_21 = svdup_f64(0.); \
result_22 = svdup_f64(0.); \
result_30 = svdup_f64(0.); \
result_31 = svdup_f64(0.); \
result_32 = svdup_f64(0.);
// PREFETCH_RESULT_L2_STORE (prefetch store to L2)
#define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXd(base) \
{ \
svprfd(pg1, (int64_t*)(base + 0), SV_PSTL2STRM); \
svprfd(pg1, (int64_t*)(base + 256), SV_PSTL2STRM); \
svprfd(pg1, (int64_t*)(base + 512), SV_PSTL2STRM); \
}
// PREFETCH_RESULT_L1_STORE (prefetch store to L1)
#define PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXd(base) \
{ \
svprfd(pg1, (int64_t*)(base + 0), SV_PSTL1STRM); \
svprfd(pg1, (int64_t*)(base + 256), SV_PSTL1STRM); \
svprfd(pg1, (int64_t*)(base + 512), SV_PSTL1STRM); \
}
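// Annotation (not in the original file): the three svprfd calls at
// 256-byte strides touch the three cache lines spanned by one result
// spinor (12 vectors x 64 bytes = 768 bytes), assuming the 256-byte
// cache-line size of the A64FX.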
// ADD_RESULT_INTERNAL
#define ADD_RESULT_INTERNAL_A64FXd \
result_00 = svadd_x(pg1, result_00, Chimu_00); \
result_01 = svadd_x(pg1, result_01, Chimu_01); \
result_02 = svadd_x(pg1, result_02, Chimu_02); \
result_10 = svadd_x(pg1, result_10, Chimu_10); \
result_11 = svadd_x(pg1, result_11, Chimu_11); \
result_12 = svadd_x(pg1, result_12, Chimu_12); \
result_20 = svadd_x(pg1, result_20, Chimu_20); \
result_21 = svadd_x(pg1, result_21, Chimu_21); \
result_22 = svadd_x(pg1, result_22, Chimu_22); \
result_30 = svadd_x(pg1, result_30, Chimu_30); \
result_31 = svadd_x(pg1, result_31, Chimu_31); \
result_32 = svadd_x(pg1, result_32, Chimu_32);

View File

@ -1,601 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Fujitsu_A64FX_intrin_single.h
Copyright (C) 2020
Author: Nils Meyer <nils.meyer@ur.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#define LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_A64FXf(base)
#define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXf(A)
#define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A)
#define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXf(A)
#define PREFETCH_GAUGE_L2(A) PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A)
#define PF_GAUGE(A)
#define PREFETCH_RESULT_L2_STORE(A) PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXf(A)
#define PREFETCH_RESULT_L1_STORE(A) PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXf(A)
#define PREFETCH1_CHIMU(A) PREFETCH_CHIMU_L1(A)
#define PREFETCH_CHIMU(A) PREFETCH_CHIMU_L1(A)
#define LOCK_GAUGE(A)
#define UNLOCK_GAUGE(A)
#define MASK_REGS DECLARATIONS_A64FXf
#define SAVE_RESULT(A,B) RESULT_A64FXf(A); PREFETCH_RESULT_L2_STORE(B)
#define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXf(Dir)
#define MULT_2SPIN_2 MULT_2SPIN_2_A64FXf
#define LOAD_CHI(base) LOAD_CHI_A64FXf(base)
#define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base)
#define XP_PROJ XP_PROJ_A64FXf
#define YP_PROJ YP_PROJ_A64FXf
#define ZP_PROJ ZP_PROJ_A64FXf
#define TP_PROJ TP_PROJ_A64FXf
#define XM_PROJ XM_PROJ_A64FXf
#define YM_PROJ YM_PROJ_A64FXf
#define ZM_PROJ ZM_PROJ_A64FXf
#define TM_PROJ TM_PROJ_A64FXf
#define XP_RECON XP_RECON_A64FXf
#define XM_RECON XM_RECON_A64FXf
#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXf
#define YM_RECON_ACCUM YM_RECON_ACCUM_A64FXf
#define ZM_RECON_ACCUM ZM_RECON_ACCUM_A64FXf
#define TM_RECON_ACCUM TM_RECON_ACCUM_A64FXf
#define XP_RECON_ACCUM XP_RECON_ACCUM_A64FXf
#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXf
#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXf
#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXf
#define PERMUTE_DIR0 0
#define PERMUTE_DIR1 1
#define PERMUTE_DIR2 2
#define PERMUTE_DIR3 3
#define PERMUTE PERMUTE_A64FXf;
#define LOAD_TABLE(Dir) if (Dir == 0) { LOAD_TABLE0; } else if (Dir == 1) { LOAD_TABLE1; } else if (Dir == 2) { LOAD_TABLE2; } else if (Dir == 3) { LOAD_TABLE3; }
#define MAYBEPERM(A,perm) if (perm) { PERMUTE; }
// DECLARATIONS
#define DECLARATIONS_A64FXf \
const uint32_t lut[4][16] = { \
{8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}, \
{4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}, \
{2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}, \
{1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14} }; \
svfloat32_t result_00; \
svfloat32_t result_01; \
svfloat32_t result_02; \
svfloat32_t result_10; \
svfloat32_t result_11; \
svfloat32_t result_12; \
svfloat32_t result_20; \
svfloat32_t result_21; \
svfloat32_t result_22; \
svfloat32_t result_30; \
svfloat32_t result_31; \
svfloat32_t result_32; \
svfloat32_t Chi_00; \
svfloat32_t Chi_01; \
svfloat32_t Chi_02; \
svfloat32_t Chi_10; \
svfloat32_t Chi_11; \
svfloat32_t Chi_12; \
svfloat32_t UChi_00; \
svfloat32_t UChi_01; \
svfloat32_t UChi_02; \
svfloat32_t UChi_10; \
svfloat32_t UChi_11; \
svfloat32_t UChi_12; \
svfloat32_t U_00; \
svfloat32_t U_10; \
svfloat32_t U_20; \
svfloat32_t U_01; \
svfloat32_t U_11; \
svfloat32_t U_21; \
svbool_t pg1; \
pg1 = svptrue_b32(); \
svuint32_t table0; \
svfloat32_t zero0; \
zero0 = svdup_f32(0.);
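// Annotation (not in the original file): each lut row is the svtbl index
// pattern for one permute direction on the 16 single-precision lanes:
// row 0 swaps the two vector halves, row 1 swaps quarters within each
// half, row 2 swaps pairs within each quarter, and row 3 swaps adjacent
// elements, i.e. the real/imaginary lanes of each complex number.
// LOAD_TABLE(Dir) loads row Dir into table0 for PERMUTE.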
#define Chimu_00 Chi_00
#define Chimu_01 Chi_01
#define Chimu_02 Chi_02
#define Chimu_10 Chi_10
#define Chimu_11 Chi_11
#define Chimu_12 Chi_12
#define Chimu_20 UChi_00
#define Chimu_21 UChi_01
#define Chimu_22 UChi_02
#define Chimu_30 UChi_10
#define Chimu_31 UChi_11
#define Chimu_32 UChi_12
// RESULT
#define RESULT_A64FXf(base) \
{ \
svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64), result_00); \
svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64), result_01); \
svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64), result_02); \
svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64), result_10); \
svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64), result_11); \
svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64), result_12); \
svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64), result_20); \
svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64), result_21); \
svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64), result_22); \
svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64), result_30); \
svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64), result_31); \
svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64), result_32); \
}
// PREFETCH_CHIMU_L2 (prefetch to L2)
#define PREFETCH_CHIMU_L2_INTERNAL_A64FXf(base) \
{ \
svprfd(pg1, (int64_t*)(base + 0), SV_PLDL2STRM); \
svprfd(pg1, (int64_t*)(base + 256), SV_PLDL2STRM); \
svprfd(pg1, (int64_t*)(base + 512), SV_PLDL2STRM); \
}
// PREFETCH_CHIMU_L1 (prefetch to L1)
#define PREFETCH_CHIMU_L1_INTERNAL_A64FXf(base) \
{ \
svprfd(pg1, (int64_t*)(base + 0), SV_PLDL1STRM); \
svprfd(pg1, (int64_t*)(base + 256), SV_PLDL1STRM); \
svprfd(pg1, (int64_t*)(base + 512), SV_PLDL1STRM); \
}
// PREFETCH_GAUGE_L2 (prefetch to L2)
#define PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A) \
{ \
const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \
svprfd(pg1, (int64_t*)(baseU + -256), SV_PLDL2STRM); \
svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL2STRM); \
svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL2STRM); \
svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL2STRM); \
svprfd(pg1, (int64_t*)(baseU + 768), SV_PLDL2STRM); \
svprfd(pg1, (int64_t*)(baseU + 1024), SV_PLDL2STRM); \
svprfd(pg1, (int64_t*)(baseU + 1280), SV_PLDL2STRM); \
svprfd(pg1, (int64_t*)(baseU + 1536), SV_PLDL2STRM); \
svprfd(pg1, (int64_t*)(baseU + 1792), SV_PLDL2STRM); \
}
// PREFETCH_GAUGE_L1 (prefetch to L1)
#define PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A) \
{ \
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL1STRM); \
svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL1STRM); \
svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL1STRM); \
}
// LOAD_CHI
#define LOAD_CHI_A64FXf(base) \
{ \
Chi_00 = svld1(pg1, (float32_t*)(base + 0 * 64)); \
Chi_01 = svld1(pg1, (float32_t*)(base + 1 * 64)); \
Chi_02 = svld1(pg1, (float32_t*)(base + 2 * 64)); \
Chi_10 = svld1(pg1, (float32_t*)(base + 3 * 64)); \
Chi_11 = svld1(pg1, (float32_t*)(base + 4 * 64)); \
Chi_12 = svld1(pg1, (float32_t*)(base + 5 * 64)); \
}
// LOAD_CHIMU
#define LOAD_CHIMU_INTERLEAVED_A64FXf(base) \
{ \
Chimu_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64)); \
Chimu_30 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64)); \
Chimu_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64)); \
Chimu_20 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64)); \
Chimu_01 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64)); \
Chimu_31 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64)); \
Chimu_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64)); \
Chimu_21 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64)); \
Chimu_02 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64)); \
Chimu_32 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64)); \
Chimu_12 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64)); \
Chimu_22 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64)); \
}
// LOAD_CHIMU_0213
#define LOAD_CHIMU_0213_A64FXf \
{ \
const SiteSpinor & ref(in[offset]); \
Chimu_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64)); \
Chimu_20 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64)); \
Chimu_01 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64)); \
Chimu_21 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64)); \
Chimu_02 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64)); \
Chimu_22 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64)); \
Chimu_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64)); \
Chimu_30 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64)); \
Chimu_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64)); \
Chimu_31 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64)); \
Chimu_12 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64)); \
Chimu_32 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64)); \
}
// LOAD_CHIMU_0312
#define LOAD_CHIMU_0312_A64FXf \
{ \
const SiteSpinor & ref(in[offset]); \
Chimu_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64)); \
Chimu_30 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64)); \
Chimu_01 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64)); \
Chimu_31 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64)); \
Chimu_02 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64)); \
Chimu_32 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64)); \
Chimu_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64)); \
Chimu_20 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64)); \
Chimu_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64)); \
Chimu_21 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64)); \
Chimu_12 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64)); \
Chimu_22 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64)); \
}
// LOAD_TABLE0
#define LOAD_TABLE0 \
table0 = svld1(pg1, (uint32_t*)&lut[0]);
// LOAD_TABLE1
#define LOAD_TABLE1 \
table0 = svld1(pg1, (uint32_t*)&lut[1]);
// LOAD_TABLE2
#define LOAD_TABLE2 \
table0 = svld1(pg1, (uint32_t*)&lut[2]);
// LOAD_TABLE3
#define LOAD_TABLE3 \
table0 = svld1(pg1, (uint32_t*)&lut[3]);
// PERMUTE
#define PERMUTE_A64FXf \
Chi_00 = svtbl(Chi_00, table0); \
Chi_01 = svtbl(Chi_01, table0); \
Chi_02 = svtbl(Chi_02, table0); \
Chi_10 = svtbl(Chi_10, table0); \
Chi_11 = svtbl(Chi_11, table0); \
Chi_12 = svtbl(Chi_12, table0);
// LOAD_GAUGE
#define LOAD_GAUGE \
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
{ \
U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \
U_10 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -3 * 64)); \
U_20 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 0 * 64)); \
U_01 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -5 * 64)); \
U_11 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -2 * 64)); \
U_21 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \
}
// MULT_2SPIN
#define MULT_2SPIN_1_A64FXf(A) \
{ \
const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \
U_10 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -3 * 64)); \
U_20 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 0 * 64)); \
U_01 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -5 * 64)); \
U_11 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -2 * 64)); \
U_21 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \
UChi_00 = svcmla_x(pg1, zero0, U_00, Chi_00, 0); \
UChi_10 = svcmla_x(pg1, zero0, U_00, Chi_10, 0); \
UChi_01 = svcmla_x(pg1, zero0, U_10, Chi_00, 0); \
UChi_11 = svcmla_x(pg1, zero0, U_10, Chi_10, 0); \
UChi_02 = svcmla_x(pg1, zero0, U_20, Chi_00, 0); \
UChi_12 = svcmla_x(pg1, zero0, U_20, Chi_10, 0); \
UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_00, 90); \
UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_10, 90); \
UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_00, 90); \
UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_10, 90); \
UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_00, 90); \
UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_10, 90); \
U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -4 * 64)); \
U_10 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -1 * 64)); \
U_20 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 2 * 64)); \
}
// MULT_2SPIN_BACKEND
#define MULT_2SPIN_2_A64FXf \
{ \
UChi_00 = svcmla_x(pg1, UChi_00, U_01, Chi_01, 0); \
UChi_10 = svcmla_x(pg1, UChi_10, U_01, Chi_11, 0); \
UChi_01 = svcmla_x(pg1, UChi_01, U_11, Chi_01, 0); \
UChi_11 = svcmla_x(pg1, UChi_11, U_11, Chi_11, 0); \
UChi_02 = svcmla_x(pg1, UChi_02, U_21, Chi_01, 0); \
UChi_12 = svcmla_x(pg1, UChi_12, U_21, Chi_11, 0); \
UChi_00 = svcmla_x(pg1, UChi_00, U_01, Chi_01, 90); \
UChi_10 = svcmla_x(pg1, UChi_10, U_01, Chi_11, 90); \
UChi_01 = svcmla_x(pg1, UChi_01, U_11, Chi_01, 90); \
UChi_11 = svcmla_x(pg1, UChi_11, U_11, Chi_11, 90); \
UChi_02 = svcmla_x(pg1, UChi_02, U_21, Chi_01, 90); \
UChi_12 = svcmla_x(pg1, UChi_12, U_21, Chi_11, 90); \
UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_02, 0); \
UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_12, 0); \
UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_02, 0); \
UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_12, 0); \
UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_02, 0); \
UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_12, 0); \
UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_02, 90); \
UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_12, 90); \
UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_02, 90); \
UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_12, 90); \
UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_02, 90); \
UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_12, 90); \
}
// XP_PROJ
#define XP_PROJ_A64FXf \
{ \
Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_30, 90); \
Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_31, 90); \
Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_32, 90); \
Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_20, 90); \
Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_21, 90); \
Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_22, 90); \
}
// XP_RECON
#define XP_RECON_A64FXf \
result_20 = svcadd_x(pg1, zero0, UChi_10, 270); \
result_21 = svcadd_x(pg1, zero0, UChi_11, 270); \
result_22 = svcadd_x(pg1, zero0, UChi_12, 270); \
result_30 = svcadd_x(pg1, zero0, UChi_00, 270); \
result_31 = svcadd_x(pg1, zero0, UChi_01, 270); \
result_32 = svcadd_x(pg1, zero0, UChi_02, 270); \
result_00 = UChi_00; \
result_01 = UChi_01; \
result_02 = UChi_02; \
result_10 = UChi_10; \
result_11 = UChi_11; \
result_12 = UChi_12;
// XP_RECON_ACCUM
#define XP_RECON_ACCUM_A64FXf \
result_30 = svcadd_x(pg1, result_30, UChi_00, 270); \
result_00 = svadd_x(pg1, result_00, UChi_00); \
result_31 = svcadd_x(pg1, result_31, UChi_01, 270); \
result_01 = svadd_x(pg1, result_01, UChi_01); \
result_32 = svcadd_x(pg1, result_32, UChi_02, 270); \
result_02 = svadd_x(pg1, result_02, UChi_02); \
result_20 = svcadd_x(pg1, result_20, UChi_10, 270); \
result_10 = svadd_x(pg1, result_10, UChi_10); \
result_21 = svcadd_x(pg1, result_21, UChi_11, 270); \
result_11 = svadd_x(pg1, result_11, UChi_11); \
result_22 = svcadd_x(pg1, result_22, UChi_12, 270); \
result_12 = svadd_x(pg1, result_12, UChi_12);
// YP_PROJ
#define YP_PROJ_A64FXf \
{ \
Chi_00 = svsub_x(pg1, Chimu_00, Chimu_30); \
Chi_01 = svsub_x(pg1, Chimu_01, Chimu_31); \
Chi_02 = svsub_x(pg1, Chimu_02, Chimu_32); \
Chi_10 = svadd_x(pg1, Chimu_10, Chimu_20); \
Chi_11 = svadd_x(pg1, Chimu_11, Chimu_21); \
Chi_12 = svadd_x(pg1, Chimu_12, Chimu_22); \
}
// ZP_PROJ
#define ZP_PROJ_A64FXf \
{ \
Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_20, 90); \
Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_21, 90); \
Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_22, 90); \
Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_30, 270); \
Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_31, 270); \
Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_32, 270); \
}
// TP_PROJ
#define TP_PROJ_A64FXf \
{ \
Chi_00 = svadd_x(pg1, Chimu_00, Chimu_20); \
Chi_01 = svadd_x(pg1, Chimu_01, Chimu_21); \
Chi_02 = svadd_x(pg1, Chimu_02, Chimu_22); \
Chi_10 = svadd_x(pg1, Chimu_10, Chimu_30); \
Chi_11 = svadd_x(pg1, Chimu_11, Chimu_31); \
Chi_12 = svadd_x(pg1, Chimu_12, Chimu_32); \
}
// XM_PROJ
#define XM_PROJ_A64FXf \
{ \
Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_30, 270); \
Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_31, 270); \
Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_32, 270); \
Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_20, 270); \
Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_21, 270); \
Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_22, 270); \
}
// XM_RECON
#define XM_RECON_A64FXf \
result_20 = svcadd_x(pg1, zero0, UChi_10, 90); \
result_21 = svcadd_x(pg1, zero0, UChi_11, 90); \
result_22 = svcadd_x(pg1, zero0, UChi_12, 90); \
result_30 = svcadd_x(pg1, zero0, UChi_00, 90); \
result_31 = svcadd_x(pg1, zero0, UChi_01, 90); \
result_32 = svcadd_x(pg1, zero0, UChi_02, 90); \
result_00 = UChi_00; \
result_01 = UChi_01; \
result_02 = UChi_02; \
result_10 = UChi_10; \
result_11 = UChi_11; \
result_12 = UChi_12;
// YM_PROJ
#define YM_PROJ_A64FXf \
{ \
Chi_00 = svadd_x(pg1, Chimu_00, Chimu_30); \
Chi_01 = svadd_x(pg1, Chimu_01, Chimu_31); \
Chi_02 = svadd_x(pg1, Chimu_02, Chimu_32); \
Chi_10 = svsub_x(pg1, Chimu_10, Chimu_20); \
Chi_11 = svsub_x(pg1, Chimu_11, Chimu_21); \
Chi_12 = svsub_x(pg1, Chimu_12, Chimu_22); \
}
// ZM_PROJ
#define ZM_PROJ_A64FXf \
{ \
Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_20, 270); \
Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_21, 270); \
Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_22, 270); \
Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_30, 90); \
Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_31, 90); \
Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_32, 90); \
}
// TM_PROJ
#define TM_PROJ_A64FXf \
{ \
Chi_00 = svsub_x(pg1, Chimu_00, Chimu_20); \
Chi_01 = svsub_x(pg1, Chimu_01, Chimu_21); \
Chi_02 = svsub_x(pg1, Chimu_02, Chimu_22); \
Chi_10 = svsub_x(pg1, Chimu_10, Chimu_30); \
Chi_11 = svsub_x(pg1, Chimu_11, Chimu_31); \
Chi_12 = svsub_x(pg1, Chimu_12, Chimu_32); \
}
// XM_RECON_ACCUM
#define XM_RECON_ACCUM_A64FXf \
result_30 = svcadd_x(pg1, result_30, UChi_00, 90); \
result_31 = svcadd_x(pg1, result_31, UChi_01, 90); \
result_32 = svcadd_x(pg1, result_32, UChi_02, 90); \
result_20 = svcadd_x(pg1, result_20, UChi_10, 90); \
result_21 = svcadd_x(pg1, result_21, UChi_11, 90); \
result_22 = svcadd_x(pg1, result_22, UChi_12, 90); \
result_00 = svadd_x(pg1, result_00, UChi_00); \
result_01 = svadd_x(pg1, result_01, UChi_01); \
result_02 = svadd_x(pg1, result_02, UChi_02); \
result_10 = svadd_x(pg1, result_10, UChi_10); \
result_11 = svadd_x(pg1, result_11, UChi_11); \
result_12 = svadd_x(pg1, result_12, UChi_12);
// YP_RECON_ACCUM
#define YP_RECON_ACCUM_A64FXf \
result_00 = svadd_x(pg1, result_00, UChi_00); \
result_30 = svsub_x(pg1, result_30, UChi_00); \
result_01 = svadd_x(pg1, result_01, UChi_01); \
result_31 = svsub_x(pg1, result_31, UChi_01); \
result_02 = svadd_x(pg1, result_02, UChi_02); \
result_32 = svsub_x(pg1, result_32, UChi_02); \
result_10 = svadd_x(pg1, result_10, UChi_10); \
result_20 = svadd_x(pg1, result_20, UChi_10); \
result_11 = svadd_x(pg1, result_11, UChi_11); \
result_21 = svadd_x(pg1, result_21, UChi_11); \
result_12 = svadd_x(pg1, result_12, UChi_12); \
result_22 = svadd_x(pg1, result_22, UChi_12);
// YM_RECON_ACCUM
#define YM_RECON_ACCUM_A64FXf \
result_00 = svadd_x(pg1, result_00, UChi_00); \
result_30 = svadd_x(pg1, result_30, UChi_00); \
result_01 = svadd_x(pg1, result_01, UChi_01); \
result_31 = svadd_x(pg1, result_31, UChi_01); \
result_02 = svadd_x(pg1, result_02, UChi_02); \
result_32 = svadd_x(pg1, result_32, UChi_02); \
result_10 = svadd_x(pg1, result_10, UChi_10); \
result_20 = svsub_x(pg1, result_20, UChi_10); \
result_11 = svadd_x(pg1, result_11, UChi_11); \
result_21 = svsub_x(pg1, result_21, UChi_11); \
result_12 = svadd_x(pg1, result_12, UChi_12); \
result_22 = svsub_x(pg1, result_22, UChi_12);
// ZP_RECON_ACCUM
#define ZP_RECON_ACCUM_A64FXf \
result_20 = svcadd_x(pg1, result_20, UChi_00, 270); \
result_00 = svadd_x(pg1, result_00, UChi_00); \
result_21 = svcadd_x(pg1, result_21, UChi_01, 270); \
result_01 = svadd_x(pg1, result_01, UChi_01); \
result_22 = svcadd_x(pg1, result_22, UChi_02, 270); \
result_02 = svadd_x(pg1, result_02, UChi_02); \
result_30 = svcadd_x(pg1, result_30, UChi_10, 90); \
result_10 = svadd_x(pg1, result_10, UChi_10); \
result_31 = svcadd_x(pg1, result_31, UChi_11, 90); \
result_11 = svadd_x(pg1, result_11, UChi_11); \
result_32 = svcadd_x(pg1, result_32, UChi_12, 90); \
result_12 = svadd_x(pg1, result_12, UChi_12);
// ZM_RECON_ACCUM
#define ZM_RECON_ACCUM_A64FXf \
result_20 = svcadd_x(pg1, result_20, UChi_00, 90); \
result_00 = svadd_x(pg1, result_00, UChi_00); \
result_21 = svcadd_x(pg1, result_21, UChi_01, 90); \
result_01 = svadd_x(pg1, result_01, UChi_01); \
result_22 = svcadd_x(pg1, result_22, UChi_02, 90); \
result_02 = svadd_x(pg1, result_02, UChi_02); \
result_30 = svcadd_x(pg1, result_30, UChi_10, 270); \
result_10 = svadd_x(pg1, result_10, UChi_10); \
result_31 = svcadd_x(pg1, result_31, UChi_11, 270); \
result_11 = svadd_x(pg1, result_11, UChi_11); \
result_32 = svcadd_x(pg1, result_32, UChi_12, 270); \
result_12 = svadd_x(pg1, result_12, UChi_12);
// TP_RECON_ACCUM
#define TP_RECON_ACCUM_A64FXf \
result_00 = svadd_x(pg1, result_00, UChi_00); \
result_20 = svadd_x(pg1, result_20, UChi_00); \
result_01 = svadd_x(pg1, result_01, UChi_01); \
result_21 = svadd_x(pg1, result_21, UChi_01); \
result_02 = svadd_x(pg1, result_02, UChi_02); \
result_22 = svadd_x(pg1, result_22, UChi_02); \
result_10 = svadd_x(pg1, result_10, UChi_10); \
result_30 = svadd_x(pg1, result_30, UChi_10); \
result_11 = svadd_x(pg1, result_11, UChi_11); \
result_31 = svadd_x(pg1, result_31, UChi_11); \
result_12 = svadd_x(pg1, result_12, UChi_12); \
result_32 = svadd_x(pg1, result_32, UChi_12);
// TM_RECON_ACCUM
#define TM_RECON_ACCUM_A64FXf \
result_00 = svadd_x(pg1, result_00, UChi_00); \
result_20 = svsub_x(pg1, result_20, UChi_00); \
result_01 = svadd_x(pg1, result_01, UChi_01); \
result_21 = svsub_x(pg1, result_21, UChi_01); \
result_02 = svadd_x(pg1, result_02, UChi_02); \
result_22 = svsub_x(pg1, result_22, UChi_02); \
result_10 = svadd_x(pg1, result_10, UChi_10); \
result_30 = svsub_x(pg1, result_30, UChi_10); \
result_11 = svadd_x(pg1, result_11, UChi_11); \
result_31 = svsub_x(pg1, result_31, UChi_11); \
result_12 = svadd_x(pg1, result_12, UChi_12); \
result_32 = svsub_x(pg1, result_32, UChi_12);
// ZERO_PSI
#define ZERO_PSI_A64FXf \
result_00 = svdup_f32(0.); \
result_01 = svdup_f32(0.); \
result_02 = svdup_f32(0.); \
result_10 = svdup_f32(0.); \
result_11 = svdup_f32(0.); \
result_12 = svdup_f32(0.); \
result_20 = svdup_f32(0.); \
result_21 = svdup_f32(0.); \
result_22 = svdup_f32(0.); \
result_30 = svdup_f32(0.); \
result_31 = svdup_f32(0.); \
result_32 = svdup_f32(0.);
// PREFETCH_RESULT_L2_STORE (prefetch store to L2)
#define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXf(base) \
{ \
svprfd(pg1, (int64_t*)(base + 0), SV_PSTL2STRM); \
svprfd(pg1, (int64_t*)(base + 256), SV_PSTL2STRM); \
svprfd(pg1, (int64_t*)(base + 512), SV_PSTL2STRM); \
}
// PREFETCH_RESULT_L1_STORE (prefetch store to L1)
#define PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXf(base) \
{ \
svprfd(pg1, (int64_t*)(base + 0), SV_PSTL1STRM); \
svprfd(pg1, (int64_t*)(base + 256), SV_PSTL1STRM); \
svprfd(pg1, (int64_t*)(base + 512), SV_PSTL1STRM); \
}
// ADD_RESULT_INTERNAL
#define ADD_RESULT_INTERNAL_A64FXf \
result_00 = svadd_x(pg1, result_00, Chimu_00); \
result_01 = svadd_x(pg1, result_01, Chimu_01); \
result_02 = svadd_x(pg1, result_02, Chimu_02); \
result_10 = svadd_x(pg1, result_10, Chimu_10); \
result_11 = svadd_x(pg1, result_11, Chimu_11); \
result_12 = svadd_x(pg1, result_12, Chimu_12); \
result_20 = svadd_x(pg1, result_20, Chimu_20); \
result_21 = svadd_x(pg1, result_21, Chimu_21); \
result_22 = svadd_x(pg1, result_22, Chimu_22); \
result_30 = svadd_x(pg1, result_30, Chimu_30); \
result_31 = svadd_x(pg1, result_31, Chimu_31); \
result_32 = svadd_x(pg1, result_32, Chimu_32);

View File

@ -1,76 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Fujitsu_A64FX_undef.h
Copyright (C) 2020
Author: Nils Meyer <nils.meyer@ur.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#undef LOAD_CHIMU
#undef PREFETCH_CHIMU_L1
#undef PREFETCH_GAUGE_L1
#undef PREFETCH_CHIMU_L2
#undef PREFETCH_GAUGE_L2
#undef PREFETCH_GAUGE_L1_INTERNAL
#undef PREFETCH1_CHIMU
#undef PREFETCH_CHIMU
#undef PREFETCH_RESULT_L2_STORE
#undef PREFETCH_RESULT_L1_STORE
#undef LOAD_GAUGE
#undef LOCK_GAUGE
#undef UNLOCK_GAUGE
#undef MASK_REGS
#undef SAVE_RESULT
#undef ADD_RESULT
#undef MULT_2SPIN_1
#undef MULT_2SPIN_2
#undef MAYBEPERM
#undef LOAD_CHI
#undef XP_PROJ
#undef YP_PROJ
#undef ZP_PROJ
#undef TP_PROJ
#undef XM_PROJ
#undef YM_PROJ
#undef ZM_PROJ
#undef TM_PROJ
#undef XP_RECON
#undef XM_RECON
#undef XM_RECON_ACCUM
#undef YM_RECON_ACCUM
#undef ZM_RECON_ACCUM
#undef TM_RECON_ACCUM
#undef XP_RECON_ACCUM
#undef YP_RECON_ACCUM
#undef ZP_RECON_ACCUM
#undef TP_RECON_ACCUM
#undef PERMUTE
#undef PERMUTE_DIR0
#undef PERMUTE_DIR1
#undef PERMUTE_DIR2
#undef PERMUTE_DIR3
#undef LOAD_TABLE
#undef LOAD_TABLE0
#undef LOAD_TABLE1
#undef LOAD_TABLE2
#undef LOAD_TABLE3

View File

@ -1,942 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Grid_a64fx-2.h
Copyright (C) 2020
Author: Nils Meyer <nils.meyer@ur.de>
with support from Arm
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
/////////////////////////////////////////////////////
// Using SVE ACLE
/////////////////////////////////////////////////////
static_assert(GEN_SIMD_WIDTH % 64u == 0, "A64FX SIMD vector size is 64 bytes");
NAMESPACE_BEGIN(Grid);
NAMESPACE_BEGIN(Optimization);
// type traits giving the number of elements for each vector type
template <typename T> struct W;
template <> struct W<double> {
constexpr static unsigned int c = GEN_SIMD_WIDTH/16u;
constexpr static unsigned int r = GEN_SIMD_WIDTH/8u;
};
template <> struct W<float> {
constexpr static unsigned int c = GEN_SIMD_WIDTH/8u;
constexpr static unsigned int r = GEN_SIMD_WIDTH/4u;
};
template <> struct W<Integer> {
constexpr static unsigned int r = GEN_SIMD_WIDTH/4u;
};
template <> struct W<uint16_t> {
constexpr static unsigned int c = GEN_SIMD_WIDTH/4u;
constexpr static unsigned int r = GEN_SIMD_WIDTH/2u;
};
template <> struct W<uint64_t> {
constexpr static unsigned int c = GEN_SIMD_WIDTH/16u;
constexpr static unsigned int r = GEN_SIMD_WIDTH/8u;
};
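// Annotation (not in the original file): for the A64FX case
// GEN_SIMD_WIDTH == 64 (512-bit vectors) these traits evaluate to
//   W<double>  : r =  8 real lanes, c =  4 complex pairs
//   W<float>   : r = 16 real lanes, c =  8 complex pairs
//   W<uint16_t>: r = 32 real lanes, c = 16 complex pairs (half-prec comms)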
#ifdef ARMCLANGCOMPAT
// SIMD vector immediate types
template <typename T>
struct vec_imm {
alignas(GEN_SIMD_WIDTH) T v[W<T>::r];
};
// SIMD vector types
template <typename T>
struct vec {
alignas(GEN_SIMD_WIDTH) T v[W<T>::r];
vec() = default;
vec(const vec &rhs) { this->operator=(rhs); }
vec(const vec_imm<T> &rhs) {
// v = rhs.v
svst1(svptrue_b8(), (T*)this, svld1(svptrue_b8(), (T*)rhs.v));
}
inline vec &operator=(const vec &rhs) {
// v = rhs.v
svst1(svptrue_b8(), (T*)this, svld1(svptrue_b8(), (T*)rhs.v));
return *this;
};
};
#else // no ARMCLANGCOMPAT
#define vec_imm vec
// SIMD vector types
template <typename T>
struct vec {
alignas(GEN_SIMD_WIDTH) T v[W<T>::r];
};
#endif
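// Annotation (not in the original file): inside the ARMCLANGCOMPAT path
// copy construction and assignment are routed through svld1/svst1, so the
// whole register image moves as one SVE vector rather than as a
// member-wise aggregate copy (a workaround for armclang code generation);
// without the workaround, vec_imm is simply an alias for vec.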
typedef vec<float> vecf;
typedef vec<double> vecd;
typedef vec<uint16_t> vech; // half precision comms
typedef vec<Integer> veci;
NAMESPACE_END(Optimization)
NAMESPACE_END(Grid)
// low-level API
NAMESPACE_BEGIN(Grid);
NAMESPACE_BEGIN(Optimization);
template <typename T>
struct acle{};
template <>
struct acle<double>{
typedef svfloat64_t vt;
typedef svfloat64x2_t vt2;
typedef svfloat64x4_t vt4;
typedef float64_t pt;
typedef uint64_t uint;
typedef svuint64_t svuint;
static inline svbool_t pg1(){return svptrue_b64();}
static inline svbool_t pg2(){return svptrue_pat_b64(SV_VL4);}
static inline svbool_t pg4(){return svptrue_pat_b64(SV_VL2);}
static inline vec<uint64_t> tbl_swap(){
//const vec<uint64_t> t = {1, 0, 3, 2, 5, 4, 7, 6};
const vec_imm<uint64_t> t = {1, 0, 3, 2, 5, 4, 7, 6};
return t;
}
static inline vec<uint64_t> tbl0(){
//const vec<uint64_t> t = {4, 5, 6, 7, 0, 1, 2, 3};
const vec_imm<uint64_t> t = {4, 5, 6, 7, 0, 1, 2, 3};
return t;
}
static inline vec<uint64_t> tbl1(){
//const vec<uint64_t> t = {2, 3, 0, 1, 6, 7, 4, 5};
const vec_imm<uint64_t> t = {2, 3, 0, 1, 6, 7, 4, 5};
return t;
}
static inline vec<uint64_t> tbl_exch1a(){ // Exchange1
//const vec<uint64_t> t = {0, 1, 4, 5, 2, 3, 6, 7};
const vec_imm<uint64_t> t = {0, 1, 4, 5, 2, 3, 6, 7};
return t;
}
static inline vec<uint64_t> tbl_exch1b(){ // Exchange1
//const vec<uint64_t> t = {2, 3, 6, 7, 0, 1, 4, 5};
const vec_imm<uint64_t> t = {2, 3, 6, 7, 0, 1, 4, 5};
return t;
}
static inline vec<uint64_t> tbl_exch1c(){ // Exchange1
//const vec<uint64_t> t = {4, 5, 0, 1, 6, 7, 2, 3};
const vec_imm<uint64_t> t = {4, 5, 0, 1, 6, 7, 2, 3};
return t;
}
static inline svbool_t pg_even(){return svzip1_b64(svptrue_b64(), svpfalse_b());}
static inline svbool_t pg_odd() {return svzip1_b64(svpfalse_b(), svptrue_b64());}
static inline svfloat64_t zero(){return svdup_f64(0.);}
};
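// Annotation (not in the original file): pg_even/pg_odd interleave an
// all-true with an all-false predicate, e.g.
//   svzip1_b64(svptrue_b64(), svpfalse_b())  ->  1,0,1,0,...
// masking the even (real) or odd (imaginary) lanes of interleaved complex
// data; Conj, TimesI and TimesMinusI below use them with svneg_m to flip
// the sign of one half of each complex pair only.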
template <>
struct acle<float>{
typedef svfloat32_t vt;
typedef svfloat32x2_t vt2;
typedef float32_t pt;
typedef uint32_t uint;
typedef svuint32_t svuint;
static inline svbool_t pg1(){return svptrue_b32();}
static inline svbool_t pg2(){return svptrue_pat_b32(SV_VL8);}
// exchange neighboring elements
static inline vec<uint32_t> tbl_swap(){
//const vec<uint32_t> t = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
const vec_imm<uint32_t> t = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
return t;
}
static inline vec<uint32_t> tbl0(){
//const vec<uint32_t> t = {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7};
const vec_imm<uint32_t> t = {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7};
return t;
}
static inline vec<uint32_t> tbl1(){
//const vec<uint32_t> t = {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11};
const vec_imm<uint32_t> t = {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11};
return t;
}
static inline vec<uint32_t> tbl2(){
//const vec<uint32_t> t = {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13};
const vec_imm<uint32_t> t = {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13};
return t;
}
static inline vec<uint32_t> tbl_exch1a(){ // Exchange1
//const vec<uint32_t> t = {0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 };
const vec_imm<uint32_t> t = {0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 };
return t;
}
static inline vec<uint32_t> tbl_exch1b(){ // Exchange1
//const vec<uint32_t> t = {4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11 };
const vec_imm<uint32_t> t = {4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11 };
return t;
}
static inline vec<uint32_t> tbl_exch1c(){ // Exchange1
//const vec<uint32_t> t = {8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7};
const vec_imm<uint32_t> t = {8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7};
return t;
}
static inline svbool_t pg_even(){return svzip1_b32(svptrue_b32(), svpfalse_b());}
static inline svbool_t pg_odd() {return svzip1_b32(svpfalse_b(), svptrue_b32());}
static inline svfloat32_t zero(){return svdup_f32(0.);}
};
template <>
struct acle<uint16_t>{
typedef svfloat16_t vt;
typedef float16_t pt;
typedef uint16_t uint;
typedef svuint16_t svuint;
static inline svbool_t pg1(){return svptrue_b16();}
static inline svbool_t pg2(){return svptrue_pat_b16(SV_VL16);}
static inline svbool_t pg_even(){return svzip1_b16(svptrue_b16(), svpfalse_b());}
static inline svbool_t pg_odd() {return svzip1_b16(svpfalse_b(), svptrue_b16());}
static inline svfloat16_t zero(){return svdup_f16(0.);}
};
template <>
struct acle<Integer>{
typedef svuint32_t vt;
typedef svuint32x2_t vt2;
typedef Integer pt;
typedef uint32_t uint;
typedef svuint32_t svuint;
//static inline svbool_t pg1(){return svptrue_b16();}
static inline svbool_t pg1(){return svptrue_b32();}
static inline svbool_t pg2(){return svptrue_pat_b32(SV_VL8);}
static inline svbool_t pg_even(){return svzip1_b32(svptrue_b32(), svpfalse_b());}
static inline svbool_t pg_odd() {return svzip1_b32(svpfalse_b(), svptrue_b32());}
};
// ---------------------------------------------------
struct Vsplat{
// Complex float
inline vecf operator()(float a, float b){
vecf out;
svbool_t pg1 = acle<float>::pg1();
typename acle<float>::vt a_v = svdup_f32(a);
typename acle<float>::vt b_v = svdup_f32(b);
typename acle<float>::vt r_v = svzip1(a_v, b_v);
svst1(pg1, out.v, r_v);
return out;
}
// Real float
inline vecf operator()(float a){
vecf out;
svbool_t pg1 = acle<float>::pg1();
typename acle<float>::vt r_v = svdup_f32(a);
svst1(pg1, out.v, r_v);
return out;
}
// Complex double
inline vecd operator()(double a, double b){
vecd out;
svbool_t pg1 = acle<double>::pg1();
typename acle<double>::vt a_v = svdup_f64(a);
typename acle<double>::vt b_v = svdup_f64(b);
typename acle<double>::vt r_v = svzip1(a_v, b_v);
svst1(pg1, out.v, r_v);
return out;
}
// Real double
inline vecd operator()(double a){
vecd out;
svbool_t pg1 = acle<double>::pg1();
typename acle<double>::vt r_v = svdup_f64(a);
svst1(pg1, out.v, r_v);
return out;
}
// Integer
inline vec<Integer> operator()(Integer a){
vec<Integer> out;
svbool_t pg1 = acle<Integer>::pg1();
// TODO: add a check that Integer is really a uint32_t
typename acle<Integer>::vt r_v = svdup_u32(a);
svst1(pg1, out.v, r_v);
return out;
}
};
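// Annotation (not in the original file): a minimal usage sketch; the
// complex overload broadcasts a and b separately and interleaves them
// with svzip1, so the resulting vector holds {a, b, a, b, ...}:
//   Optimization::Vsplat splat;
//   vecf z = splat(1.0f, 2.0f);   // every lane pair: re = 1.0f, im = 2.0f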
struct Vstore{
// Real
template <typename T>
inline void operator()(vec<T> a, T *D){
svbool_t pg1 = acle<T>::pg1();
typename acle<T>::vt a_v = svld1(pg1, (typename acle<T>::pt*)&a.v);
svst1(pg1, D, a_v);
}
};
struct Vstream{
// Real
template <typename T>
inline void operator()(T * a, vec<T> b){
svbool_t pg1 = acle<T>::pg1();
typename acle<T>::vt b_v = svld1(pg1, b.v);
svstnt1(pg1, a, b_v);
//svst1(pg1, a, b_v);
}
};
struct Vset{
// Complex
template <typename T>
inline vec<T> operator()(std::complex<T> *a){
vec<T> out;
svbool_t pg1 = acle<T>::pg1();
typename acle<T>::vt a_v = svld1(pg1, (T*)a);
svst1(pg1, out.v, a_v);
return out;
}
// Real
template <typename T>
inline vec<T> operator()(T *a){
vec<T> out;
svbool_t pg1 = acle<T>::pg1();
typename acle<T>::vt a_v = svld1(pg1, a);
svst1(pg1, out.v, a_v);
return out;
}
};
/////////////////////////////////////////////////////
// Arithmetic operations
/////////////////////////////////////////////////////
struct Sum{
template <typename T>
inline vec<T> operator()(vec<T> a, vec<T> b){
vec<T> out;
svbool_t pg1 = acle<T>::pg1();
typename acle<T>::vt a_v = svld1(pg1, a.v);
typename acle<T>::vt b_v = svld1(pg1, b.v);
typename acle<T>::vt r_v = svadd_x(pg1, a_v, b_v);
svst1(pg1, out.v, r_v);
return out;
}
};
struct Sub{
template <typename T>
inline vec<T> operator()(vec<T> a, vec<T> b){
vec<T> out;
svbool_t pg1 = acle<T>::pg1();
typename acle<T>::vt a_v = svld1(pg1, a.v);
typename acle<T>::vt b_v = svld1(pg1, b.v);
typename acle<T>::vt r_v = svsub_x(pg1, a_v, b_v);
svst1(pg1, out.v, r_v);
return out;
}
};
struct Mult{
template <typename T>
inline vec<T> operator()(vec<T> a, vec<T> b, vec<T> c){
vec<T> out;
svbool_t pg1 = acle<T>::pg1();
typename acle<T>::vt a_v = svld1(pg1, a.v);
typename acle<T>::vt b_v = svld1(pg1, b.v);
typename acle<T>::vt c_v = svld1(pg1, c.v);
typename acle<T>::vt r_v = svmla_x(pg1, c_v, a_v, b_v);
svst1(pg1, out.v, r_v);
return out;
}
template <typename T>
inline vec<T> operator()(vec<T> a, vec<T> b){
vec<T> out;
svbool_t pg1 = acle<T>::pg1();
typename acle<T>::vt a_v = svld1(pg1, a.v);
typename acle<T>::vt b_v = svld1(pg1, b.v);
typename acle<T>::vt r_v = svmul_x(pg1, a_v, b_v);
svst1(pg1, out.v, r_v);
return out;
}
};
struct MultRealPart{
template <typename T>
inline vec<T> operator()(vec<T> a, vec<T> b){
vec<T> out;
svbool_t pg1 = acle<T>::pg1();
typename acle<T>::vt a_v = svld1(pg1, a.v);
typename acle<T>::vt b_v = svld1(pg1, b.v);
// using FCMLA
typename acle<T>::vt z_v = acle<T>::zero();
typename acle<T>::vt r_v = svcmla_x(pg1, z_v, a_v, b_v, 0);
svst1(pg1, out.v, r_v);
return out;
}
};
struct MaddRealPart{
template <typename T>
inline vec<T> operator()(vec<T> a, vec<T> b, vec<T> c){
vec<T> out;
svbool_t pg1 = acle<T>::pg1();
typename acle<T>::vt a_v = svld1(pg1, a.v);
typename acle<T>::vt b_v = svld1(pg1, b.v);
typename acle<T>::vt c_v = svld1(pg1, c.v);
// using FCMLA
typename acle<T>::vt r_v = svcmla_x(pg1, c_v, a_v, b_v, 0);
svst1(pg1, out.v, r_v);
return out;
}
};
struct MultComplex{
// Complex a*b
template <typename T>
inline vec<T> operator()(vec<T> a, vec<T> b){
vec<T> out;
svbool_t pg1 = acle<T>::pg1();
typename acle<T>::vt a_v = svld1(pg1, a.v);
typename acle<T>::vt b_v = svld1(pg1, b.v);
typename acle<T>::vt z_v = acle<T>::zero();
// using FCMLA
typename acle<T>::vt r_v = svcmla_x(pg1, z_v, a_v, b_v, 0);
r_v = svcmla_x(pg1, r_v, a_v, b_v, 90);
svst1(pg1, out.v, r_v);
return out;
}
};
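// Annotation (not in the original file): the two FCMLA passes implement
// the full complex product; starting from z = 0,
//   rotation  0: r.re += a.re*b.re, r.im += a.re*b.im
//   rotation 90: r.re -= a.im*b.im, r.im += a.im*b.re
// which sums to r = a*b per complex lane, with no explicit shuffles.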
struct MultAddComplex{
// Complex a*b+c
template <typename T>
inline vec<T> operator()(vec<T> a, vec<T> b, vec<T> c){
vec<T> out;
svbool_t pg1 = acle<T>::pg1();
typename acle<T>::vt a_v = svld1(pg1, a.v);
typename acle<T>::vt b_v = svld1(pg1, b.v);
typename acle<T>::vt c_v = svld1(pg1, c.v); \
// using FCMLA
typename acle<T>::vt r_v = svcmla_x(pg1, c_v, a_v, b_v, 0);
r_v = svcmla_x(pg1, r_v, a_v, b_v, 90);
svst1(pg1, out.v, r_v);
return out;
}
};
struct Div{
// Real
template <typename T>
inline vec<T> operator()(vec<T> a, vec<T> b){
vec<T> out;
svbool_t pg1 = acle<T>::pg1();
typename acle<T>::vt a_v = svld1(pg1, a.v);
typename acle<T>::vt b_v = svld1(pg1, b.v);
typename acle<T>::vt r_v = svdiv_x(pg1, a_v, b_v);
svst1(pg1, out.v, r_v);
return out;
}
};
struct Conj{
// Complex
template <typename T>
inline vec<T> operator()(vec<T> a){
vec<T> out;
svbool_t pg1 = acle<T>::pg1();
svbool_t pg_odd = acle<T>::pg_odd();
typename acle<T>::vt a_v = svld1(pg1, a.v);
//typename acle<T>::vt r_v = svneg_x(pg_odd, a_v);
typename acle<T>::vt r_v = svneg_m(a_v, pg_odd, a_v);
svst1(pg1, out.v, r_v);
return out;
}
};
struct TimesMinusI{
// Complex
template <typename T>
inline vec<T> operator()(vec<T> a, vec<T> b){
vec<T> out;
const vec<typename acle<T>::uint> tbl_swap = acle<T>::tbl_swap();
svbool_t pg1 = acle<T>::pg1();
svbool_t pg_odd = acle<T>::pg_odd();
typename acle<T>::svuint tbl_swap_v = svld1(pg1, tbl_swap.v);
typename acle<T>::vt a_v = svld1(pg1, a.v);
a_v = svtbl(a_v, tbl_swap_v);
typename acle<T>::vt r_v = svneg_m(a_v, pg_odd, a_v);
svst1(pg1, out.v, r_v);
return out;
}
};
struct TimesI{
// Complex
template <typename T>
inline vec<T> operator()(vec<T> a, vec<T> b){
vec<T> out;
const vec<typename acle<T>::uint> tbl_swap = acle<T>::tbl_swap();
svbool_t pg1 = acle<T>::pg1();
svbool_t pg_even = acle<T>::pg_even();
typename acle<T>::svuint tbl_swap_v = svld1(pg1, tbl_swap.v);
typename acle<T>::vt a_v = svld1(pg1, a.v);
a_v = svtbl(a_v, tbl_swap_v);
//typename acle<T>::vt r_v = svneg_x(pg_even, a_v);
typename acle<T>::vt r_v = svneg_m(a_v, pg_even, a_v);
svst1(pg1, out.v, r_v);
return out;
}
};
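// Annotation (not in the original file): TimesI multiplies by i via a
// table swap plus a masked negation: svtbl with tbl_swap exchanges the
// re/im lanes of every pair, then svneg_m on the even (real) lanes turns
// (re, im) into (-im, re) = i*(re + i*im).  TimesMinusI above is the
// mirror image, negating the odd lanes to give (im, -re) = -i*(re + i*im).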
struct PrecisionChange {
static inline vech StoH (const vecf &sa,const vecf &sb) {
vech ret;
svbool_t pg1s = acle<float>::pg1();
svbool_t pg1h = acle<uint16_t>::pg1();
typename acle<float>::vt sa_v = svld1(pg1s, sa.v);
typename acle<float>::vt sb_v = svld1(pg1s, sb.v);
typename acle<uint16_t>::vt ha_v = svcvt_f16_x(pg1s, sa_v);
typename acle<uint16_t>::vt hb_v = svcvt_f16_x(pg1s, sb_v);
typename acle<uint16_t>::vt r_v = svuzp1(ha_v, hb_v);
svst1(pg1h, (typename acle<uint16_t>::pt*)&ret.v, r_v);
return ret;
}
static inline void HtoS(vech h,vecf &sa,vecf &sb) {
svbool_t pg1h = acle<uint16_t>::pg1();
svbool_t pg1s = acle<float>::pg1();
typename acle<uint16_t>::vt h_v = svld1(pg1h, (typename acle<uint16_t>::pt*)&h.v);
typename acle<uint16_t>::vt ha_v = svzip1(h_v, h_v);
typename acle<uint16_t>::vt hb_v = svzip2(h_v, h_v);
typename acle<float>::vt sa_v = svcvt_f32_x(pg1s, ha_v);
typename acle<float>::vt sb_v = svcvt_f32_x(pg1s, hb_v);
svst1(pg1s, sa.v, sa_v);
svst1(pg1s, sb.v, sb_v);
}
static inline vecf DtoS (vecd a,vecd b) {
vecf ret;
svbool_t pg1d = acle<double>::pg1();
svbool_t pg1s = acle<float>::pg1();
typename acle<double>::vt a_v = svld1(pg1d, a.v);
typename acle<double>::vt b_v = svld1(pg1d, b.v);
typename acle<float>::vt sa_v = svcvt_f32_x(pg1d, a_v);
typename acle<float>::vt sb_v = svcvt_f32_x(pg1d, b_v);
typename acle<float>::vt r_v = svuzp1(sa_v, sb_v);
svst1(pg1s, ret.v, r_v);
return ret;
}
static inline void StoD (vecf s,vecd &a,vecd &b) {
svbool_t pg1s = acle<float>::pg1();
svbool_t pg1d = acle<double>::pg1();
typename acle<float>::vt s_v = svld1(pg1s, s.v);
typename acle<float>::vt sa_v = svzip1(s_v, s_v);
typename acle<float>::vt sb_v = svzip2(s_v, s_v);
typename acle<double>::vt a_v = svcvt_f64_x(pg1d, sa_v);
typename acle<double>::vt b_v = svcvt_f64_x(pg1d, sb_v);
svst1(pg1d, a.v, a_v);
svst1(pg1d, b.v, b_v);
}
static inline vech DtoH (vecd a,vecd b,vecd c,vecd d) {
vech ret;
svbool_t pg1d = acle<double>::pg1();
svbool_t pg1h = acle<uint16_t>::pg1();
typename acle<double>::vt a_v = svld1(pg1d, a.v);
typename acle<double>::vt b_v = svld1(pg1d, b.v);
typename acle<double>::vt c_v = svld1(pg1d, c.v);
typename acle<double>::vt d_v = svld1(pg1d, d.v);
typename acle<uint16_t>::vt ha_v = svcvt_f16_x(pg1d, a_v);
typename acle<uint16_t>::vt hb_v = svcvt_f16_x(pg1d, b_v);
typename acle<uint16_t>::vt hc_v = svcvt_f16_x(pg1d, c_v);
typename acle<uint16_t>::vt hd_v = svcvt_f16_x(pg1d, d_v);
typename acle<uint16_t>::vt hab_v = svuzp1(ha_v, hb_v);
typename acle<uint16_t>::vt hcd_v = svuzp1(hc_v, hd_v);
typename acle<uint16_t>::vt r_v = svuzp1(hab_v, hcd_v);
svst1(pg1h, (typename acle<uint16_t>::pt*)&ret.v, r_v);
return ret;
/*
vecf sa,sb;
sa = DtoS(a,b);
sb = DtoS(c,d);
return StoH(sa,sb);
*/
}
static inline void HtoD(vech h,vecd &a,vecd &b,vecd &c,vecd &d) {
svbool_t pg1h = acle<uint16_t>::pg1();
svbool_t pg1d = acle<double>::pg1();
typename acle<uint16_t>::vt h_v = svld1(pg1h, (typename acle<uint16_t>::pt*)&h.v);
typename acle<uint16_t>::vt sa_v = svzip1(h_v, h_v);
typename acle<uint16_t>::vt sb_v = svzip2(h_v, h_v);
typename acle<uint16_t>::vt da_v = svzip1(sa_v, sa_v);
typename acle<uint16_t>::vt db_v = svzip2(sa_v, sa_v);
typename acle<uint16_t>::vt dc_v = svzip1(sb_v, sb_v);
typename acle<uint16_t>::vt dd_v = svzip2(sb_v, sb_v);
typename acle<double>::vt a_v = svcvt_f64_x(pg1d, da_v);
typename acle<double>::vt b_v = svcvt_f64_x(pg1d, db_v);
typename acle<double>::vt c_v = svcvt_f64_x(pg1d, dc_v);
typename acle<double>::vt d_v = svcvt_f64_x(pg1d, dd_v);
svst1(pg1d, a.v, a_v);
svst1(pg1d, b.v, b_v);
svst1(pg1d, c.v, c_v);
svst1(pg1d, d.v, d_v);
/*
vecf sa,sb;
HtoS(h,sa,sb);
StoD(sa,a,b);
StoD(sb,c,d);
*/
}
};
struct Exchange{
// Exchange0 is valid for arbitrary SVE vector length
template <typename T>
static inline void Exchange0(vec<T> &out1, vec<T> &out2, const vec<T> &in1, const vec<T> &in2){
svbool_t pg1 = acle<T>::pg1();
typename acle<T>::vt a1_v = svld1(pg1, in1.v);
typename acle<T>::vt a2_v = svld1(pg1, in2.v);
typename acle<T>::vt r1_v = svext(a1_v, a1_v, (uint64_t)W<T>::c);
r1_v = svext(r1_v, a2_v, (uint64_t)W<T>::c);
typename acle<T>::vt r2_v = svext(a2_v, a2_v, (uint64_t)W<T>::c);
r2_v = svext(a1_v, r2_v, (uint64_t)W<T>::c);
svst1(pg1, out1.v, r1_v);
svst1(pg1, out2.v, r2_v);
}
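// Reference sketch of the shuffle above, assuming (as elsewhere in this
// file) that vec<T>::v is a plain array of W<T>::r elements and that
// W<T>::c == W<T>::r/2: out1 collects the low halves of both inputs,
// out2 the high halves.
template <typename T>
static inline void Exchange0_ref(vec<T> &out1, vec<T> &out2, const vec<T> &in1, const vec<T> &in2){
const int h = (int)W<T>::r / 2;
for(int i = 0; i < h; i++){
out1.v[i]     = in1.v[i];       // low half of in1
out1.v[h + i] = in2.v[i];       // low half of in2
out2.v[i]     = in1.v[h + i];   // high half of in1
out2.v[h + i] = in2.v[h + i];   // high half of in2
}
}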
template <typename T>
static inline void Exchange1(vec<T> &out1, vec<T> &out2, const vec<T> &in1, const vec<T> &in2){
// this one is tricky; svtrn2q* from SVE2 fits best, but it is not available in SVE1
// alternative: use 4-el structure; expect translation into ldp + stp -> SFI
svbool_t pg1 = acle<T>::pg1();
const vec<typename acle<T>::uint> tbl_exch1a = acle<T>::tbl_exch1a();
const vec<typename acle<T>::uint> tbl_exch1b = acle<T>::tbl_exch1b();
const vec<typename acle<T>::uint> tbl_exch1c = acle<T>::tbl_exch1c();
typename acle<T>::svuint tbl_exch1a_v = svld1(pg1, tbl_exch1a.v);
typename acle<T>::svuint tbl_exch1b_v = svld1(pg1, tbl_exch1b.v);
typename acle<T>::svuint tbl_exch1c_v = svld1(pg1, tbl_exch1c.v);
typename acle<T>::vt in1_v = svld1(pg1, in1.v);
typename acle<T>::vt in2_v = svld1(pg1, in2.v);
typename acle<T>::vt a1_v = svtbl(in1_v, tbl_exch1a_v);
typename acle<T>::vt a2_v = svtbl(in2_v, tbl_exch1b_v);
typename acle<T>::vt b1_v = svext(a2_v, a1_v, (uint64_t)(W<T>::r / 2u));
typename acle<T>::vt b2_v = svext(a1_v, a2_v, (uint64_t)(W<T>::r / 2u));
typename acle<T>::vt out1_v = svtbl(b1_v, tbl_exch1c_v);
typename acle<T>::vt out2_v = svtbl(b2_v, tbl_exch1a_v);
svst1(pg1, out1.v, out1_v);
svst1(pg1, out2.v, out2_v);
}
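// In effect Exchange1 is the 128-bit-granule transpose that svtrn1q/svtrn2q
// would give directly in SVE2. For a 512-bit vector of floats (16 lanes,
// granules of 4) the expected pattern is, schematically:
//   out1 = { in1[0:3], in2[0:3], in1[8:11],  in2[8:11]  }
//   out2 = { in1[4:7], in2[4:7], in1[12:15], in2[12:15] }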
template <typename T>
static inline void Exchange2(vec<T> &out1, vec<T> &out2, const vec<T> &in1, const vec<T> &in2){
svbool_t pg1 = acle<double>::pg1();
typename acle<double>::vt a1_v = svld1(pg1, (typename acle<double>::pt*)in1.v);
typename acle<double>::vt a2_v = svld1(pg1, (typename acle<double>::pt*)in2.v);
typename acle<double>::vt r1_v = svtrn1(a1_v, a2_v);
typename acle<double>::vt r2_v = svtrn2(a1_v, a2_v);
svst1(pg1, (typename acle<double>::pt*)out1.v, r1_v);
svst1(pg1, (typename acle<double>::pt*)out2.v, r2_v);
}
static inline void Exchange3(vecf &out1, vecf &out2, const vecf &in1, const vecf &in2){
svbool_t pg1 = acle<float>::pg1();
typename acle<float>::vt a1_v = svld1(pg1, in1.v);
typename acle<float>::vt a2_v = svld1(pg1, in2.v);
typename acle<float>::vt r1_v = svtrn1(a1_v, a2_v);
typename acle<float>::vt r2_v = svtrn2(a1_v, a2_v);
svst1(pg1, out1.v, r1_v);
svst1(pg1, out2.v, r2_v);
}
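// No Exchange3 for double: Exchange2 already swaps single 64-bit elements,
// and a finer (sub-element) granule does not exist, hence the assert below.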
static inline void Exchange3(vecd &out1, vecd &out2, const vecd &in1, const vecd &in2){
assert(0);
return;
}
};
struct Permute{
// Permute0 is valid for any SVE vector width
template <typename T>
static inline vec<T> Permute0(vec<T> in) {
vec<T> out;
svbool_t pg1 = acle<T>::pg1();
typename acle<T>::vt a_v = svld1(pg1, in.v);
typename acle<T>::vt r_v = svext(a_v, a_v, (uint64_t)(W<T>::r / 2u));
svst1(pg1, out.v, r_v);
return out;
}
static inline vecd Permute1(vecd in) {
vecd out;
const vec<typename acle<double>::uint> tbl_swap = acle<double>::tbl1();
svbool_t pg1 = acle<double>::pg1();
typename acle<double>::vt a_v = svld1(pg1, in.v);
typename acle<double>::svuint tbl_swap_v = svld1(pg1, tbl_swap.v);
typename acle<double>::vt r_v = svtbl(a_v, tbl_swap_v);
svst1(pg1, out.v, r_v);
return out;
}
static inline vecf Permute1(vecf in) {
vecf out;
const vec<typename acle<float>::uint> tbl_swap = acle<float>::tbl1();
svbool_t pg1 = acle<float>::pg1();
typename acle<float>::vt a_v = svld1(pg1, in.v);
typename acle<float>::svuint tbl_swap_v = svld1(pg1, tbl_swap.v);
typename acle<float>::vt r_v = svtbl(a_v, tbl_swap_v);
svst1(pg1, out.v, r_v);
return out;
}
static inline vecd Permute2(vecd in) {
vecd out;
const vec<typename acle<double>::uint> tbl_swap = acle<double>::tbl_swap();
svbool_t pg1 = acle<double>::pg1();
typename acle<double>::vt a_v = svld1(pg1, in.v);
typename acle<double>::svuint tbl_swap_v = svld1(pg1, tbl_swap.v);
typename acle<double>::vt r_v = svtbl(a_v, tbl_swap_v);
svst1(pg1, out.v, r_v);
return out;
}
static inline vecf Permute2(vecf in) {
vecf out;
const vec<typename acle<float>::uint> tbl_swap = acle<float>::tbl2();
svbool_t pg1 = acle<float>::pg1();
typename acle<float>::vt a_v = svld1(pg1, in.v);
typename acle<float>::svuint tbl_swap_v = svld1(pg1, tbl_swap.v);
typename acle<float>::vt r_v = svtbl(a_v, tbl_swap_v);
svst1(pg1, out.v, r_v);
return out;
}
static inline vecf Permute3(vecf in) {
vecf out;
const vec<typename acle<float>::uint> tbl_swap = acle<float>::tbl_swap();
svbool_t pg1 = acle<float>::pg1();
typename acle<float>::vt a_v = svld1(pg1, in.v);
typename acle<float>::svuint tbl_swap_v = svld1(pg1, tbl_swap.v);
typename acle<float>::vt r_v = svtbl(a_v, tbl_swap_v);
svst1(pg1, out.v, r_v);
return out;
}
static inline vecd Permute3(vecd in) {
return in;
}
};
struct Rotate{
template <int n, typename T> static inline vec<T> tRotate(vec<T> in){
vec<T> out;
svbool_t pg1 = acle<T>::pg1();
typename acle<T>::vt a_v = svld1(pg1, in.v);
typename acle<T>::vt r_v = svext(a_v, a_v, (uint64_t)(n%W<T>::r));
svst1(pg1, out.v, r_v);
return out;
}
template <typename T>
static inline vec<T> rotate(vec<T> in, int n){
switch(n){
case 0: return tRotate<0, T>(in); break;
case 1: return tRotate<1, T>(in); break;
case 2: return tRotate<2, T>(in); break;
case 3: return tRotate<3, T>(in); break;
case 4: return tRotate<4, T>(in); break;
case 5: return tRotate<5, T>(in); break;
case 6: return tRotate<6, T>(in); break;
case 7: return tRotate<7, T>(in); break;
case 8: return tRotate<8, T>(in); break;
case 9: return tRotate<9, T>(in); break;
case 10: return tRotate<10, T>(in); break;
case 11: return tRotate<11, T>(in); break;
case 12: return tRotate<12, T>(in); break;
case 13: return tRotate<13, T>(in); break;
case 14: return tRotate<14, T>(in); break;
case 15: return tRotate<15, T>(in); break;
default: assert(0);
}
}
};
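// Reference sketch of the rotation semantics, under the same vec<T> layout
// assumption as above: a left rotation by n lanes.
template <typename T>
static inline vec<T> rotate_ref(const vec<T> &in, int n){
vec<T> out;
const int r = (int)W<T>::r;
for(int i = 0; i < r; i++){
out.v[i] = in.v[(i + n) % r];  // lane i takes lane i+n (mod r) of the input
}
return out;
}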
// tree-based reduction
#define svred(pg, v)\
svaddv(pg, v);
// left-to-right reduction
// #define svred(pg, v)\
// svadda(pg, 0, v)
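// Note: svaddv reduces as an unordered tree, e.g. (a0+a1) + (a2+a3); the
// commented svadda variant accumulates strictly left to right,
// ((a0+a1)+a2)+a3. For floating point the two can differ in the last bits,
// so svadda is the choice when bit-wise reproducibility against a scalar
// loop matters, at the cost of serialising the adds.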
template <typename Out_type, typename In_type>
struct Reduce{
//Need templated class to overload output type
//General form must generate error if compiled
inline Out_type operator()(In_type in){
printf("Error, using wrong Reduce function\n");
exit(1);
return 0;
}
};
//Complex float Reduce
template <>
inline Grid::ComplexF Reduce<Grid::ComplexF, vecf>::operator()(vecf in){
svbool_t pg1 = acle<float>::pg1();
svbool_t pg_even = acle<float>::pg_even();
svbool_t pg_odd = acle<float>::pg_odd();
typename acle<float>::vt a_v = svld1(pg1, in.v);
float a = svred(pg_even, a_v);
float b = svred(pg_odd, a_v);
return Grid::ComplexF(a, b);
}
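// Complex lanes are stored re/im interleaved, so the even-lane predicate
// sums all real parts and the odd-lane predicate all imaginary parts:
// for {re0, im0, re1, im1, ...}, a = re0+re1+... and b = im0+im1+...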
//Real float Reduce
template <>
inline Grid::RealF Reduce<Grid::RealF, vecf>::operator()(vecf in){
svbool_t pg1 = acle<float>::pg1();
typename acle<float>::vt a_v = svld1(pg1, in.v);
float a = svred(pg1, a_v);
return a;
}
//Complex double Reduce
template <>
inline Grid::ComplexD Reduce<Grid::ComplexD, vecd>::operator()(vecd in){
svbool_t pg1 = acle<double>::pg1();
svbool_t pg_even = acle<double>::pg_even();
svbool_t pg_odd = acle<double>::pg_odd();
typename acle<double>::vt a_v = svld1(pg1, in.v);
double a = svred(pg_even, a_v);
double b = svred(pg_odd, a_v);
return Grid::ComplexD(a, b);
}
//Real double Reduce
template <>
inline Grid::RealD Reduce<Grid::RealD, vecd>::operator()(vecd in){
svbool_t pg1 = acle<double>::pg1();
typename acle<double>::vt a_v = svld1(pg1, in.v);
double a = svred(pg1, a_v);
return a;
}
//Integer Reduce
template <>
inline Integer Reduce<Integer, veci>::operator()(veci in){
svbool_t pg1 = acle<Integer>::pg1();
typename acle<Integer>::vt a_v = svld1(pg1, in.v);
Integer a = svred(pg1, a_v);
return a;
}
#undef svred
#undef vec_imm
NAMESPACE_END(Optimization)
//////////////////////////////////////////////////////////////////////////////////////
// Here assign types
typedef Optimization::vech SIMD_Htype; // Reduced precision type
typedef Optimization::vecf SIMD_Ftype; // Single precision type
typedef Optimization::vecd SIMD_Dtype; // Double precision type
typedef Optimization::veci SIMD_Itype; // Integer type
// prefetch utilities
inline void v_prefetch0(int size, const char *ptr){};
inline void prefetch_HINT_T0(const char *ptr){};
// Function name aliases
typedef Optimization::Vsplat VsplatSIMD;
typedef Optimization::Vstore VstoreSIMD;
typedef Optimization::Vset VsetSIMD;
typedef Optimization::Vstream VstreamSIMD;
template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
// Arithmetic operations
typedef Optimization::Sum SumSIMD;
typedef Optimization::Sub SubSIMD;
typedef Optimization::Div DivSIMD;
typedef Optimization::Mult MultSIMD;
typedef Optimization::MultComplex MultComplexSIMD;
typedef Optimization::MultAddComplex MultAddComplexSIMD;
typedef Optimization::MultRealPart MultRealPartSIMD;
typedef Optimization::MaddRealPart MaddRealPartSIMD;
typedef Optimization::Conj ConjSIMD;
typedef Optimization::TimesMinusI TimesMinusISIMD;
typedef Optimization::TimesI TimesISIMD;
NAMESPACE_END(Grid)

View File

@ -1,769 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Grid_a64fx-fixedsize.h
Copyright (C) 2020
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
with support from Arm
Richard Sandiford <richard.sandiford@arm.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
/////////////////////////////////////////////////////
// Using SVE ACLE with fixed-size data types
/////////////////////////////////////////////////////
// gcc 10 features
#if __ARM_FEATURE_SVE_BITS==512
/* gcc 10.0.1 and gcc 10.1 bug using ACLE data types CAS-159553-Y1K4C6
workaround: use gcc's internal data types, bugfix expected for gcc 10.2
typedef svbool_t pred __attribute__((arm_sve_vector_bits(512)));
typedef svfloat16_t vech __attribute__((arm_sve_vector_bits(512)));
typedef svfloat32_t vecf __attribute__((arm_sve_vector_bits(512)));
typedef svfloat64_t vecd __attribute__((arm_sve_vector_bits(512)));
typedef svuint32_t veci __attribute__((arm_sve_vector_bits(512)));
typedef svuint32_t lutf __attribute__((arm_sve_vector_bits(512))); // LUTs for float
typedef svuint64_t lutd __attribute__((arm_sve_vector_bits(512))); // LUTs for double
*/
typedef __SVBool_t pred __attribute__((arm_sve_vector_bits(512)));
typedef __SVFloat16_t vech __attribute__((arm_sve_vector_bits(512)));
typedef __SVFloat32_t vecf __attribute__((arm_sve_vector_bits(512)));
typedef __SVFloat64_t vecd __attribute__((arm_sve_vector_bits(512)));
typedef __SVUint32_t veci __attribute__((arm_sve_vector_bits(512)));
typedef __SVUint32_t lutf __attribute__((arm_sve_vector_bits(512))); // LUTs for float
typedef __SVUint64_t lutd __attribute__((arm_sve_vector_bits(512))); // LUTs for double
#else
#error "Oops. Illegal SVE vector size!?"
#endif /* __ARM_FEATURE_SVE_BITS */
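// The vector width is pinned at compile time; with gcc 10 this header is
// presumably built with something like
//   -O3 -march=armv8.2-a+sve -msve-vector-bits=512
// which defines __ARM_FEATURE_SVE_BITS=512 and enables the
// arm_sve_vector_bits attribute used above.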
// low-level API
NAMESPACE_BEGIN(Grid);
NAMESPACE_BEGIN(Optimization);
// convenience union types for tables eliminating loads
union ulutf {
lutf v;
uint32_t s[16];
};
union ulutd {
lutd v;
uint64_t s[8];
};
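// The unions let a permutation table be materialised as a vector constant
// rather than via an explicit svld1 from memory. Illustrative use,
// mirroring the tbl_* helpers below:
//   const ulutd t = { .s = {1, 0, 3, 2, 5, 4, 7, 6} };
//   lutd idx = t.v;             // table register, no explicit load
//   vecd r   = svtbl(a_v, idx); // permute a_v lane-wise by idx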
template <typename T>
struct acle{};
template <>
struct acle<double>{
static inline lutd tbl_swap(){
const ulutd t = { .s = {1, 0, 3, 2, 5, 4, 7, 6} };
return t.v;
}
static inline lutd tbl0(){
const ulutd t = { .s = {4, 5, 6, 7, 0, 1, 2, 3} };
return t.v;
}
static inline lutd tbl1(){
const ulutd t = { .s = {2, 3, 0, 1, 6, 7, 4, 5} };
return t.v;
}
static inline lutd tbl_exch1a(){ // Exchange1
const ulutd t = { .s = {0, 1, 4, 5, 2, 3, 6, 7} };
return t.v;
}
static inline lutd tbl_exch1b(){ // Exchange1
const ulutd t = { .s = {2, 3, 6, 7, 0, 1, 4, 5} };
return t.v;
}
static inline lutd tbl_exch1c(){ // Exchange1
const ulutd t = { .s = {4, 5, 0, 1, 6, 7, 2, 3} };
return t.v;
}
static inline pred pg1(){return svptrue_b64();}
static inline pred pg_even(){return svzip1_b64(svptrue_b64(), svpfalse_b());}
static inline pred pg_odd() {return svzip1_b64(svpfalse_b(), svptrue_b64());}
static inline vecd zero(){return svdup_f64(0.);}
};
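// pg_even/pg_odd are built by interleaving an all-true with an all-false
// predicate: svzip1(ptrue, pfalse) = {T,F,T,F,...} activates lanes 0,2,4,...
// (the real parts of interleaved complex data); swapping the arguments
// gives {F,T,F,T,...} for the odd (imaginary) lanes.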
template <>
struct acle<float>{
// exchange neighboring elements
static inline lutf tbl_swap(){
const ulutf t = { .s = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14} };
return t.v;
}
static inline lutf tbl0(){
const ulutf t = { .s = {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7} };
return t.v;
}
static inline lutf tbl1(){
const ulutf t = { .s = {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11} };
return t.v;
}
static inline lutf tbl2(){
const ulutf t = { .s = {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13} };
return t.v;
}
static inline lutf tbl_exch1a(){ // Exchange1
const ulutf t = { .s = {0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 } };
return t.v;
}
static inline lutf tbl_exch1b(){ // Exchange1
const ulutf t = { .s = {4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11 } };
return t.v;
}
static inline lutf tbl_exch1c(){ // Exchange1
const ulutf t = { .s = {8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7} };
return t.v;
}
static inline pred pg1(){return svptrue_b32();}
static inline pred pg_even(){return svzip1_b32(svptrue_b32(), svpfalse_b());}
static inline pred pg_odd() {return svzip1_b32(svpfalse_b(), svptrue_b32());}
static inline vecf zero(){return svdup_f32(0.);}
};
template <>
struct acle<uint16_t>{
static inline pred pg1(){return svptrue_b16();}
static inline pred pg_even(){return svzip1_b16(svptrue_b16(), svpfalse_b());}
static inline pred pg_odd() {return svzip1_b16(svpfalse_b(), svptrue_b16());}
static inline vech zero(){return svdup_f16(0.);}
};
template <>
struct acle<Integer>{
//static inline svbool_t pg1(){return svptrue_b16();}
static inline pred pg1(){return svptrue_b32();}
static inline pred pg_even(){return svzip1_b32(svptrue_b32(), svpfalse_b());}
static inline pred pg_odd() {return svzip1_b32(svpfalse_b(), svptrue_b32());}
};
// ---------------------------------------------------
struct Vsplat{
// Complex float
inline vecf operator()(float a, float b){
vecf a_v = svdup_f32(a);
vecf b_v = svdup_f32(b);
return svzip1(a_v, b_v);
}
// Real float
inline vecf operator()(float a){
return svdup_f32(a);
}
// Complex double
inline vecd operator()(double a, double b){
vecd a_v = svdup_f64(a);
vecd b_v = svdup_f64(b);
return svzip1(a_v, b_v);
}
// Real double
inline vecd operator()(double a){
return svdup_f64(a);
}
// Integer
inline veci operator()(Integer a){
return svdup_u32(a);
}
};
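// For the complex cases, svzip1(svdup(a), svdup(b)) interleaves the two
// broadcasts into {a, b, a, b, ...}, i.e. every complex lane holds a + i*b.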
struct Vstore{
// Real float
inline void operator()(vecf a, float *D){
pred pg1 = acle<float>::pg1();
svst1(pg1, D, a);
}
// Real double
inline void operator()(vecd a, double *D){
pred pg1 = acle<double>::pg1();
svst1(pg1, D, a);
}
// Real float
inline void operator()(veci a, Integer *D){
pred pg1 = acle<Integer>::pg1();
svst1(pg1, D, a);
}
};
struct Vstream{
// Real float
inline void operator()(float * a, vecf b){
pred pg1 = acle<float>::pg1();
svstnt1(pg1, a, b);
//svst1(pg1, a, b);
}
// Real double
inline void operator()(double * a, vecd b){
pred pg1 = acle<double>::pg1();
svstnt1(pg1, a, b);
//svst1(pg1, a, b);
}
};
struct Vset{
// Complex float
inline vecf operator()(Grid::ComplexF *a){
pred pg1 = acle<float>::pg1();
return svld1(pg1, (float*)a);
}
// Complex double
inline vecd operator()(Grid::ComplexD *a){
pred pg1 = acle<double>::pg1();
return svld1(pg1, (double*)a);
}
// Real float
inline vecf operator()(float *a){
pred pg1 = acle<float>::pg1();
return svld1(pg1, a);
}
// Real double
inline vecd operator()(double *a){
pred pg1 = acle<double>::pg1();
return svld1(pg1, a);
}
// Integer
inline veci operator()(Integer *a){
pred pg1 = acle<Integer>::pg1();
return svld1(pg1, a);
}
};
/////////////////////////////////////////////////////
// Arithmetic operations
/////////////////////////////////////////////////////
struct Sum{
// Complex/real float
inline vecf operator()(vecf a, vecf b){
pred pg1 = acle<float>::pg1();
return svadd_x(pg1, a, b);
}
// Complex/real double
inline vecd operator()(vecd a, vecd b){
pred pg1 = acle<double>::pg1();
return svadd_x(pg1, a, b);
}
// Integer
inline veci operator()(veci a, veci b){
pred pg1 = acle<Integer>::pg1();
return svadd_x(pg1, a, b);
}
};
struct Sub{
// Complex/real float
inline vecf operator()(vecf a, vecf b){
pred pg1 = acle<float>::pg1();
return svsub_x(pg1, a, b);
}
// Complex/real double
inline vecd operator()(vecd a, vecd b){
pred pg1 = acle<double>::pg1();
return svsub_x(pg1, a, b);
}
// Integer
inline veci operator()(veci a, veci b){
pred pg1 = acle<Integer>::pg1();
return svsub_x(pg1, a, b);
}
};
struct Mult{
// Real float fma
inline vecf operator()(vecf a, vecf b, vecf c){
pred pg1 = acle<float>::pg1();
return svmad_x(pg1, b, c, a);
}
// Real double fma
inline vecd operator()(vecd a, vecd b, vecd c){
pred pg1 = acle<double>::pg1();
return svmad_x(pg1, b, c, a);
}
// Real float
inline vecf operator()(vecf a, vecf b){
pred pg1 = acle<float>::pg1();
return svmul_x(pg1, a, b);
}
// Real double
inline vecd operator()(vecd a, vecd b){
pred pg1 = acle<double>::pg1();
return svmul_x(pg1, a, b);
}
// Integer
inline veci operator()(veci a, veci b){
pred pg1 = acle<Integer>::pg1();
return svmul_x(pg1, a, b);
}
};
struct MultRealPart{
// Complex float
inline vecf operator()(vecf a, vecf b){
pred pg1 = acle<float>::pg1();
// using FCMLA
vecf z_v = acle<float>::zero();
return svcmla_x(pg1, z_v, a, b, 0);
}
// Complex double
inline vecd operator()(vecd a, vecd b){
pred pg1 = acle<double>::pg1();
// using FCMLA
vecd z_v = acle<double>::zero();
return svcmla_x(pg1, z_v, a, b, 0);
}
};
struct MaddRealPart{
// Complex float
inline vecf operator()(vecf a, vecf b, vecf c){
pred pg1 = acle<float>::pg1();
// using FCMLA
return svcmla_x(pg1, c, a, b, 0);
}
// Complex double
inline vecd operator()(vecd a, vecd b, vecd c){
pred pg1 = acle<double>::pg1();
// using FCMLA
return svcmla_x(pg1, c, a, b, 0);
}
};
struct MultComplex{
// Complex a*b
// Complex float
inline vecf operator()(vecf a, vecf b){
pred pg1 = acle<float>::pg1();
vecf z = acle<float>::zero();
// using FCMLA
vecf r_v = svcmla_x(pg1, z, a, b, 0);
return svcmla_x(pg1, r_v, a, b, 90);
}
// Complex double
inline vecd operator()(vecd a, vecd b){
pred pg1 = acle<double>::pg1();
vecd z = acle<double>::zero();
// using FCMLA
vecd r_v = svcmla_x(pg1, z, a, b, 0);
return svcmla_x(pg1, r_v, a, b, 90);
}
};
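// Schematically, per complex lane the two FCMLA steps accumulate
//   rot 0  :  acc.re += a.re*b.re ;  acc.im += a.re*b.im
//   rot 90 :  acc.re -= a.im*b.im ;  acc.im += a.im*b.re
// so chaining them from acc = 0 yields the full complex product
//   a*b = (a.re*b.re - a.im*b.im) + i*(a.re*b.im + a.im*b.re),
// and MultRealPart above is just the rot-0 half, i.e. a.re * b.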
struct MultAddComplex{
// Complex a*b+c
// Complex float
inline vecf operator()(vecf a, vecf b, vecf c){
pred pg1 = acle<float>::pg1();
// using FCMLA
vecf r_v = svcmla_x(pg1, c, a, b, 0);
return svcmla_x(pg1, r_v, a, b, 90);
}
// Complex double
inline vecd operator()(vecd a, vecd b, vecd c){
pred pg1 = acle<double>::pg1();
// using FCMLA
vecd r_v = svcmla_x(pg1, c, a, b, 0);
return svcmla_x(pg1, r_v, a, b, 90);
}
};
struct Div{
// Real float
inline vecf operator()(vecf a, vecf b){
pred pg1 = acle<float>::pg1();
return svdiv_x(pg1, a, b);
}
// Real double
inline vecd operator()(vecd a, vecd b){
pred pg1 = acle<double>::pg1();
return svdiv_x(pg1, a, b);
}
};
struct Conj{
// Complex float
inline vecf operator()(vecf a){
pred pg_odd = acle<float>::pg_odd();
//return svneg_x(pg_odd, a); this is unsafe
return svneg_m(a, pg_odd, a);
}
// Complex double
inline vecd operator()(vecd a){
pred pg_odd = acle<double>::pg_odd();
//return svneg_x(pg_odd, a); this is unsafe
return svneg_m(a, pg_odd, a);
}
};
struct TimesMinusI{
// Complex float
inline vecf operator()(vecf a, vecf b){
lutf tbl_swap = acle<float>::tbl_swap();
pred pg1 = acle<float>::pg1();
pred pg_odd = acle<float>::pg_odd();
vecf a_v = svtbl(a, tbl_swap);
//return svneg_x(pg_odd, a_v); this is unsafe
return svneg_m(a_v, pg_odd, a_v);
}
// Complex double
inline vecd operator()(vecd a, vecd b){
lutd tbl_swap = acle<double>::tbl_swap();
pred pg1 = acle<double>::pg1();
pred pg_odd = acle<double>::pg_odd();
vecd a_v = svtbl(a, tbl_swap);
//return svneg_x(pg_odd, a_v); this is unsafe
return svneg_m(a_v, pg_odd, a_v);
}
};
struct TimesI{
// Complex float
inline vecf operator()(vecf a, vecf b){
lutf tbl_swap = acle<float>::tbl_swap();
pred pg1 = acle<float>::pg1();
pred pg_even = acle<float>::pg_even();
vecf a_v = svtbl(a, tbl_swap);
//return svneg_x(pg_even, a_v); this is unsafe
return svneg_m(a_v, pg_even, a_v);
}
// Complex double
inline vecd operator()(vecd a, vecd b){
lutd tbl_swap = acle<double>::tbl_swap();
pred pg1 = acle<double>::pg1();
pred pg_even = acle<double>::pg_even();
vecd a_v = svtbl(a, tbl_swap);
//return svneg_x(pg_even, a_v); this is unsafe
return svneg_m(a_v, pg_even, a_v);
}
};
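// Both operators use the identity i*(x + i*y) = -y + i*x: svtbl with
// tbl_swap exchanges the re/im lanes, then the merging negate flips one
// parity: even lanes for *i (the new real parts), odd lanes for *(-i),
// since -i*(x + i*y) = y - i*x. The merging form svneg_m keeps the
// untouched lanes from the swapped vector, which the _x form would not
// guarantee.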
struct PrecisionChange {
static inline vech StoH (vecf sa, vecf sb) {
pred pg1s = acle<float>::pg1();
vech ha_v = svcvt_f16_x(pg1s, sa);
vech hb_v = svcvt_f16_x(pg1s, sb);
return svuzp1(ha_v, hb_v);
}
static inline void HtoS(vech h,vecf &sa,vecf &sb) {
pred pg1s = acle<float>::pg1();
vech ha_v = svzip1(h, h);
vech hb_v = svzip2(h, h);
sa = svcvt_f32_x(pg1s, ha_v);
sb = svcvt_f32_x(pg1s, hb_v);
}
static inline vecf DtoS (vecd a,vecd b) {
pred pg1d = acle<double>::pg1();
vecf sa_v = svcvt_f32_x(pg1d, a);
vecf sb_v = svcvt_f32_x(pg1d, b);
return svuzp1(sa_v, sb_v);
}
static inline void StoD (vecf s,vecd &a,vecd &b) {
pred pg1d = acle<double>::pg1();
vecf sa_v = svzip1(s, s);
vecf sb_v = svzip2(s, s);
a = svcvt_f64_x(pg1d, sa_v);
b = svcvt_f64_x(pg1d, sb_v);
}
static inline vech DtoH (vecd a,vecd b,vecd c,vecd d) {
pred pg1d = acle<double>::pg1();
pred pg1h = acle<uint16_t>::pg1();
vech ha_v = svcvt_f16_x(pg1d, a);
vech hb_v = svcvt_f16_x(pg1d, b);
vech hc_v = svcvt_f16_x(pg1d, c);
vech hd_v = svcvt_f16_x(pg1d, d);
vech hab_v = svuzp1(ha_v, hb_v);
vech hcd_v = svuzp1(hc_v, hd_v);
return svuzp1(hab_v, hcd_v);
/*
vecf sa,sb;
sa = DtoS(a,b);
sb = DtoS(c,d);
return StoH(sa,sb);
*/
}
static inline void HtoD(vech h,vecd &a,vecd &b,vecd &c,vecd &d) {
pred pg1h = acle<uint16_t>::pg1();
pred pg1d = acle<double>::pg1();
vech sa_v = svzip1(h, h);
vech sb_v = svzip2(h, h);
vech da_v = svzip1(sa_v, sa_v);
vech db_v = svzip2(sa_v, sa_v);
vech dc_v = svzip1(sb_v, sb_v);
vech dd_v = svzip2(sb_v, sb_v);
a = svcvt_f64_x(pg1d, da_v);
b = svcvt_f64_x(pg1d, db_v);
c = svcvt_f64_x(pg1d, dc_v);
d = svcvt_f64_x(pg1d, dd_v);
/*
vecf sa,sb;
HtoS(h,sa,sb);
StoD(sa,a,b);
StoD(sb,c,d);
*/
}
};
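// The zip/uzp pairing reflects how SVE conversions address sub-lanes: a
// narrowing svcvt writes each result into the even (low) sub-lane of its
// wider container, so svuzp1 of two converted vectors packs the valid
// elements together; conversely, svzip1/svzip2 duplicate each narrow
// element so that a widening svcvt finds it in the sub-lane it reads from.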
struct Exchange{
// float
static inline void Exchange0(vecf &out1, vecf &out2, vecf in1, vecf in2){
vecf r1_v = svext(in1, in1, (uint64_t)8u);
vecf r2_v = svext(in2, in2, (uint64_t)8u);
out1 = svext(r1_v, in2, (uint64_t)8u);
out2 = svext(in1, r2_v, (uint64_t)8u);
}
static inline void Exchange1(vecf &out1, vecf &out2, vecf in1, vecf in2){
// this one is tricky; svtrn2q* from SVE2 fits best, but it is not available in SVE1
// alternative: use 4-el structure; expect translation into 4x ldp + 4x stp -> SFI
lutf tbl_exch1a = acle<float>::tbl_exch1a();
lutf tbl_exch1b = acle<float>::tbl_exch1b();
lutf tbl_exch1c = acle<float>::tbl_exch1c();
vecf a1_v = svtbl(in1, tbl_exch1a);
vecf a2_v = svtbl(in2, tbl_exch1b);
vecf b1_v = svext(a2_v, a1_v, (uint64_t)8u);
vecf b2_v = svext(a1_v, a2_v, (uint64_t)8u);
out1 = svtbl(b1_v, tbl_exch1c);
out2 = svtbl(b2_v, tbl_exch1a);
}
static inline void Exchange2(vecf &out1, vecf &out2, vecf in1, vecf in2){
out1 = (vecf)svtrn1((vecd)in1, (vecd)in2);
out2 = (vecf)svtrn2((vecd)in1, (vecd)in2);
}
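// Reinterpreting the float vectors as double vectors makes svtrn1/svtrn2
// operate on 64-bit granules, i.e. Exchange2 transposes pairs of floats:
//   out1 = { in1[0:1], in2[0:1], in1[4:5], in2[4:5], ... }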
static inline void Exchange3(vecf &out1, vecf &out2, vecf in1, vecf in2){
out1 = svtrn1(in1, in2);
out2 = svtrn2(in1, in2);
}
// double
static inline void Exchange0(vecd &out1, vecd &out2, vecd in1, vecd in2){
vecd r1_v = svext(in1, in1, (uint64_t)4u);
vecd r2_v = svext(in2, in2, (uint64_t)4u);
out1 = svext(r1_v, in2, (uint64_t)4u);
out2 = svext(in1, r2_v, (uint64_t)4u);
}
static inline void Exchange1(vecd &out1, vecd &out2, vecd in1, vecd in2){
// this one is tricky; svtrn2q* from SVE2 fits best, but it is not available in SVE1
// alternative: use 4-el structure; expect translation into 4x ldp + 4x stp -> SFI
lutd tbl_exch1a = acle<double>::tbl_exch1a();
lutd tbl_exch1b = acle<double>::tbl_exch1b();
lutd tbl_exch1c = acle<double>::tbl_exch1c();
vecd a1_v = svtbl(in1, tbl_exch1a);
vecd a2_v = svtbl(in2, tbl_exch1b);
vecd b1_v = svext(a2_v, a1_v, (uint64_t)4u);
vecd b2_v = svext(a1_v, a2_v, (uint64_t)4u);
out1 = svtbl(b1_v, tbl_exch1c);
out2 = svtbl(b2_v, tbl_exch1a);
}
static inline void Exchange2(vecd &out1, vecd &out2, vecd in1, vecd in2){
out1 = svtrn1(in1, in2);
out2 = svtrn2(in1, in2);
}
static inline void Exchange3(vecd &out1, vecd &out2, vecd in1, vecd in2){
assert(0);
return;
}
};
#undef VECTOR_FOR
struct Permute{
// float
static inline vecf Permute0(vecf in) {
return svext(in, in, (uint64_t)8u);
}
static inline vecf Permute1(vecf in) {
lutf tbl_swap = acle<float>::tbl1();
return svtbl(in, tbl_swap);
}
static inline vecf Permute2(vecf in) {
lutf tbl_swap = acle<float>::tbl2();
return svtbl(in, tbl_swap);
}
static inline vecf Permute3(vecf in) {
lutf tbl_swap = acle<float>::tbl_swap();
return svtbl(in, tbl_swap);
}
// double
static inline vecd Permute0(vecd in) {
return svext(in, in, (uint64_t)(8u / 2u));
}
static inline vecd Permute1(vecd in) {
lutd tbl_swap = acle<double>::tbl1();
return svtbl(in, tbl_swap);
}
static inline vecd Permute2(vecd in) {
lutd tbl_swap = acle<double>::tbl_swap();
return svtbl(in, tbl_swap);
}
static inline vecd Permute3(vecd in) {
return in;
}
};
struct Rotate{
static inline vecf rotate(vecf in, int n){
switch(n){
case 0: return tRotate<0>(in); break;
case 1: return tRotate<1>(in); break;
case 2: return tRotate<2>(in); break;
case 3: return tRotate<3>(in); break;
case 4: return tRotate<4>(in); break;
case 5: return tRotate<5>(in); break;
case 6: return tRotate<6>(in); break;
case 7: return tRotate<7>(in); break;
case 8: return tRotate<8>(in); break;
case 9: return tRotate<9>(in); break;
case 10: return tRotate<10>(in); break;
case 11: return tRotate<11>(in); break;
case 12: return tRotate<12>(in); break;
case 13: return tRotate<13>(in); break;
case 14: return tRotate<14>(in); break;
case 15: return tRotate<15>(in); break;
default: assert(0);
}
}
static inline vecd rotate(vecd in, int n){
switch(n){
case 0: return tRotate<0>(in); break;
case 1: return tRotate<1>(in); break;
case 2: return tRotate<2>(in); break;
case 3: return tRotate<3>(in); break;
case 4: return tRotate<4>(in); break;
case 5: return tRotate<5>(in); break;
case 6: return tRotate<6>(in); break;
case 7: return tRotate<7>(in); break;
default: assert(0);
}
}
template <int n> static inline vecf tRotate(vecf in){
return svext(in, in, (uint64_t)n);
}
template <int n> static inline vecd tRotate(vecd in){
return svext(in, in, (uint64_t)n);
}
};
// tree-based reduction
#define svred(pg, v)\
svaddv(pg, v);
// left-to-right reduction
// #define svred(pg, v)\
// svadda(pg, 0, v)
template <typename Out_type, typename In_type>
struct Reduce{
//Need templated class to overload output type
//General form must generate error if compiled
inline Out_type operator()(In_type in){
printf("Error, using wrong Reduce function\n");
exit(1);
return 0;
}
};
//Complex float Reduce
template <>
inline Grid::ComplexF Reduce<Grid::ComplexF, vecf>::operator()(vecf in){
pred pg_even = acle<float>::pg_even();
pred pg_odd = acle<float>::pg_odd();
float a = svred(pg_even, in);
float b = svred(pg_odd, in);
return Grid::ComplexF(a, b);
}
//Real float Reduce
template <>
inline Grid::RealF Reduce<Grid::RealF, vecf>::operator()(vecf in){
pred pg1 = acle<float>::pg1();
return svred(pg1, in);
}
//Complex double Reduce
template <>
inline Grid::ComplexD Reduce<Grid::ComplexD, vecd>::operator()(vecd in){
pred pg_even = acle<double>::pg_even();
pred pg_odd = acle<double>::pg_odd();
double a = svred(pg_even, in);
double b = svred(pg_odd, in);
return Grid::ComplexD(a, b);
}
//Real double Reduce
template <>
inline Grid::RealD Reduce<Grid::RealD, vecd>::operator()(vecd in){
pred pg1 = acle<double>::pg1();
return svred(pg1, in);
}
//Integer Reduce
template <>
inline Integer Reduce<Integer, veci>::operator()(veci in){
pred pg1 = acle<Integer>::pg1();
return svred(pg1, in);
}
#undef svred
NAMESPACE_END(Optimization);
//////////////////////////////////////////////////////////////////////////////////////
// Here assign types
typedef vech SIMD_Htype; // Reduced precision type
typedef vecf SIMD_Ftype; // Single precision type
typedef vecd SIMD_Dtype; // Double precision type
typedef veci SIMD_Itype; // Integer type
// prefetch utilities
inline void v_prefetch0(int size, const char *ptr){};
inline void prefetch_HINT_T0(const char *ptr){};
// Function name aliases
typedef Optimization::Vsplat VsplatSIMD;
typedef Optimization::Vstore VstoreSIMD;
typedef Optimization::Vset VsetSIMD;
typedef Optimization::Vstream VstreamSIMD;
template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
// Arithmetic operations
typedef Optimization::Sum SumSIMD;
typedef Optimization::Sub SubSIMD;
typedef Optimization::Div DivSIMD;
typedef Optimization::Mult MultSIMD;
typedef Optimization::MultComplex MultComplexSIMD;
typedef Optimization::MultAddComplex MultAddComplexSIMD;
typedef Optimization::MultRealPart MultRealPartSIMD;
typedef Optimization::MaddRealPart MaddRealPartSIMD;
typedef Optimization::Conj ConjSIMD;
typedef Optimization::TimesMinusI TimesMinusISIMD;
typedef Optimization::TimesI TimesISIMD;
NAMESPACE_END(Grid);

View File

@ -41,11 +41,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
namespace Grid { namespace Grid {
#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))
typedef struct { uint16_t x;} half;
#endif
typedef struct Half2_t { half x; half y; } Half2;
#define COALESCE_GRANULARITY ( GEN_SIMD_WIDTH ) #define COALESCE_GRANULARITY ( GEN_SIMD_WIDTH )
template<class pair> template<class pair>
@ -130,14 +125,14 @@ inline accelerator GpuVector<N,datum> operator/(const GpuVector<N,datum> l,const
} }
constexpr int NSIMD_RealH = COALESCE_GRANULARITY / sizeof(half); constexpr int NSIMD_RealH = COALESCE_GRANULARITY / sizeof(half);
constexpr int NSIMD_ComplexH = COALESCE_GRANULARITY / sizeof(Half2); constexpr int NSIMD_ComplexH = COALESCE_GRANULARITY / sizeof(half2);
constexpr int NSIMD_RealF = COALESCE_GRANULARITY / sizeof(float); constexpr int NSIMD_RealF = COALESCE_GRANULARITY / sizeof(float);
constexpr int NSIMD_ComplexF = COALESCE_GRANULARITY / sizeof(float2); constexpr int NSIMD_ComplexF = COALESCE_GRANULARITY / sizeof(float2);
constexpr int NSIMD_RealD = COALESCE_GRANULARITY / sizeof(double); constexpr int NSIMD_RealD = COALESCE_GRANULARITY / sizeof(double);
constexpr int NSIMD_ComplexD = COALESCE_GRANULARITY / sizeof(double2); constexpr int NSIMD_ComplexD = COALESCE_GRANULARITY / sizeof(double2);
constexpr int NSIMD_Integer = COALESCE_GRANULARITY / sizeof(Integer); constexpr int NSIMD_Integer = COALESCE_GRANULARITY / sizeof(Integer);
typedef GpuComplex<Half2 > GpuComplexH; typedef GpuComplex<half2 > GpuComplexH;
typedef GpuComplex<float2 > GpuComplexF; typedef GpuComplex<float2 > GpuComplexF;
typedef GpuComplex<double2> GpuComplexD; typedef GpuComplex<double2> GpuComplexD;
@ -152,9 +147,11 @@ typedef GpuVector<NSIMD_Integer, Integer > GpuVectorI;
accelerator_inline float half2float(half h) accelerator_inline float half2float(half h)
{ {
float f; float f;
#if defined(GRID_CUDA) || defined(GRID_HIP) #ifdef GRID_SIMT
f = __half2float(h); f = __half2float(h);
#else #else
//f = __half2float(h);
__half_raw hr(h);
Grid_half hh; Grid_half hh;
hh.x = hr.x; hh.x = hr.x;
f= sfw_half_to_float(hh); f= sfw_half_to_float(hh);
@ -164,11 +161,13 @@ accelerator_inline float half2float(half h)
accelerator_inline half float2half(float f) accelerator_inline half float2half(float f)
{ {
half h; half h;
#if defined(GRID_CUDA) || defined(GRID_HIP) #ifdef GRID_SIMT
h = __float2half(f); h = __float2half(f);
#else #else
Grid_half hh = sfw_float_to_half(f); Grid_half hh = sfw_float_to_half(f);
h.x = hh.x; __half_raw hr;
hr.x = hh.x;
h = __half(hr);
#endif #endif
return h; return h;
} }
@ -524,7 +523,7 @@ namespace Optimization {
//////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////
// Single / Half // Single / Half
//////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////
static accelerator_inline GpuVectorCH StoH (GpuVectorCF a,GpuVectorCF b) { static accelerator_inline GpuVectorCH StoH (GpuVectorCF a,GpuVectorCF b) {
int N = GpuVectorCF::N; int N = GpuVectorCF::N;
GpuVectorCH h; GpuVectorCH h;
for(int i=0;i<N;i++) { for(int i=0;i<N;i++) {

View File

@ -110,63 +110,9 @@ accelerator_inline Grid_half sfw_float_to_half(float ff) {
#ifdef GPU_VEC #ifdef GPU_VEC
#include "Grid_gpu_vec.h" #include "Grid_gpu_vec.h"
#endif #endif
/*
#ifdef GEN #ifdef GEN
#include "Grid_generic.h" #include "Grid_generic.h"
#endif #endif
*/
#ifdef GEN
#if defined(A64FX) || defined(A64FXFIXEDSIZE) // breakout A64FX SVE ACLE here
#include <arm_sve.h>
#if defined(A64FX) // VLA
#pragma message("building A64FX / SVE ACLE VLA")
#if defined(ARMCLANGCOMPAT)
#pragma message("applying data types patch")
#endif
#include "Grid_a64fx-2.h"
#endif
#if defined(A64FXFIXEDSIZE) // fixed size data types
#pragma message("building for A64FX / SVE ACLE fixed size")
#include "Grid_a64fx-fixedsize.h"
#endif
#else
//#pragma message("building GEN") // generic
#include "Grid_generic.h"
#endif
#endif
#ifdef A64FX
#include <arm_sve.h>
#ifdef __ARM_FEATURE_SVE_BITS
//#pragma message("building A64FX SVE VLS")
#include "Grid_a64fx-fixedsize.h"
#else
#pragma message("building A64FX SVE VLA")
#if defined(ARMCLANGCOMPAT)
#pragma message("applying data types patch")
#endif
#include "Grid_a64fx-2.h"
#endif
#endif
/*
#ifdef A64FXVLA
#pragma message("building A64FX VLA")
#if defined(ARMCLANGCOMPAT)
#pragma message("applying data types patch")
#endif
#include <arm_sve.h>
#include "Grid_a64fx-2.h"
#endif
#ifdef A64FXVLS
#pragma message("building A64FX VLS")
#include <arm_sve.h>
#include "Grid_a64fx-fixedsize.h"
#endif
*/
#ifdef SSE4 #ifdef SSE4
#include "Grid_sse4.h" #include "Grid_sse4.h"
#endif #endif
@ -217,12 +163,6 @@ template <typename T> struct is_complex : public std::false_type {};
template <> struct is_complex<ComplexD> : public std::true_type {}; template <> struct is_complex<ComplexD> : public std::true_type {};
template <> struct is_complex<ComplexF> : public std::true_type {}; template <> struct is_complex<ComplexF> : public std::true_type {};
template <typename T> struct is_ComplexD : public std::false_type {};
template <> struct is_ComplexD<ComplexD> : public std::true_type {};
template <typename T> struct is_ComplexF : public std::false_type {};
template <> struct is_ComplexF<ComplexF> : public std::true_type {};
template<typename T, typename V=void> struct is_real : public std::false_type {}; template<typename T, typename V=void> struct is_real : public std::false_type {};
template<typename T> struct is_real<T, typename std::enable_if<std::is_floating_point<T>::value, template<typename T> struct is_real<T, typename std::enable_if<std::is_floating_point<T>::value,
void>::type> : public std::true_type {}; void>::type> : public std::true_type {};
@ -283,69 +223,6 @@ public:
return sizeof(Vector_type) / sizeof(Scalar_type); return sizeof(Vector_type) / sizeof(Scalar_type);
} }
#ifdef ARMCLANGCOMPAT
template <class S = Scalar_type>
accelerator_inline Grid_simd &operator=(const Grid_simd<typename std::enable_if<!is_complex<S>::value, S>::type, Vector_type> &&rhs) {
//v = rhs.v;
svst1(svptrue_b8(), (Scalar_type*)this, svld1(svptrue_b8(), (Scalar_type*)&(rhs.v)));
return *this;
};
template <class S = Scalar_type>
accelerator_inline Grid_simd &operator=(const Grid_simd<typename std::enable_if<!is_complex<S>::value, S>::type, Vector_type> &rhs) {
//v = rhs.v;
svst1(svptrue_b8(), (Scalar_type*)this, svld1(svptrue_b8(), (Scalar_type*)&(rhs.v)));
return *this;
};
/*
template <class S = Scalar_type>
accelerator_inline Grid_simd &operator=(const Grid_simd<typename std::enable_if<is_complex<S>::value, S>::type, Vector_type> &&rhs) {
//v = rhs.v;
svst1(svptrue_b8(), (int8_t*)this, svld1(svptrue_b8(), (int8_t*)&(rhs.v)));
return *this;
};
template <class S = Scalar_type>
accelerator_inline Grid_simd &operator=(const Grid_simd<typename std::enable_if<is_complex<S>::value, S>::type, Vector_type> &rhs) {
//v = rhs.v;
svst1(svptrue_b8(), (int8_t*)this, svld1(svptrue_b8(), (int8_t*)&(rhs.v)));
return *this;
};
*/
// ComplexF
template <class S = Scalar_type>
accelerator_inline Grid_simd &operator=(const Grid_simd<typename std::enable_if<is_ComplexF<S>::value, S>::type, Vector_type> &&rhs) {
//v = rhs.v;
svst1(svptrue_b32(), (float*)this, svld1(svptrue_b32(), (float*)&(rhs.v)));
return *this;
};
template <class S = Scalar_type>
accelerator_inline Grid_simd &operator=(const Grid_simd<typename std::enable_if<is_ComplexF<S>::value, S>::type, Vector_type> &rhs) {
//v = rhs.v;
svst1(svptrue_b32(), (float*)this, svld1(svptrue_b32(), (float*)&(rhs.v)));
return *this;
};
// ComplexD
template <class S = Scalar_type>
accelerator_inline Grid_simd &operator=(const Grid_simd<typename std::enable_if<is_ComplexD<S>::value, S>::type, Vector_type> &&rhs) {
//v = rhs.v;
svst1(svptrue_b64(), (double*)this, svld1(svptrue_b64(), (double*)&(rhs.v)));
return *this;
};
template <class S = Scalar_type>
accelerator_inline Grid_simd &operator=(const Grid_simd<typename std::enable_if<is_ComplexD<S>::value, S>::type, Vector_type> &rhs) {
//v = rhs.v;
svst1(svptrue_b64(), (double*)this, svld1(svptrue_b64(), (double*)&(rhs.v)));
return *this;
};
#else
accelerator_inline Grid_simd &operator=(const Grid_simd &&rhs) { accelerator_inline Grid_simd &operator=(const Grid_simd &&rhs) {
v = rhs.v; v = rhs.v;
return *this; return *this;
@ -355,23 +232,10 @@ public:
return *this; return *this;
}; // faster than not declaring it and leaving to the compiler }; // faster than not declaring it and leaving to the compiler
#endif
accelerator Grid_simd() = default; accelerator Grid_simd() = default;
accelerator_inline Grid_simd(const Grid_simd &rhs) : v(rhs.v){}; // compiles in movaps
#ifdef ARMCLANGCOMPAT accelerator_inline Grid_simd(const Grid_simd &&rhs) : v(rhs.v){};
template <class S = Scalar_type>
accelerator_inline Grid_simd(const Grid_simd<typename std::enable_if<!is_complex<S>::value, S>::type, Vector_type> &rhs) { this->operator=(rhs); }
template <class S = Scalar_type>
accelerator_inline Grid_simd(const Grid_simd<typename std::enable_if<!is_complex<S>::value, S>::type, Vector_type> &&rhs) { this->operator=(rhs); }
template <class S = Scalar_type>
accelerator_inline Grid_simd(const Grid_simd<typename std::enable_if<is_complex<S>::value, S>::type, Vector_type> &rhs) { this->operator=(rhs); }
template <class S = Scalar_type>
accelerator_inline Grid_simd(const Grid_simd<typename std::enable_if<is_complex<S>::value, S>::type, Vector_type> &&rhs) { this->operator=(rhs); }
#else
accelerator_inline Grid_simd(const Grid_simd &rhs) : v(rhs.v){}; // compiles in movaps
accelerator_inline Grid_simd(const Grid_simd &&rhs) : v(rhs.v){};
#endif
accelerator_inline Grid_simd(const Real a) { vsplat(*this, Scalar_type(a)); }; accelerator_inline Grid_simd(const Real a) { vsplat(*this, Scalar_type(a)); };
// Enable if complex type // Enable if complex type
template <typename S = Scalar_type> accelerator_inline template <typename S = Scalar_type> accelerator_inline
@ -394,20 +258,11 @@ public:
/////////////////////////////////////////////// ///////////////////////////////////////////////
// FIXME -- alias this to an accelerator_inline MAC struct. // FIXME -- alias this to an accelerator_inline MAC struct.
#if defined(A64FX) || defined(A64FXFIXEDSIZE)
friend accelerator_inline void mac(Grid_simd *__restrict__ y,
const Grid_simd *__restrict__ a,
const Grid_simd *__restrict__ x) {
*y = fxmac((*a), (*x), (*y));
};
#else
friend accelerator_inline void mac(Grid_simd *__restrict__ y, friend accelerator_inline void mac(Grid_simd *__restrict__ y,
const Grid_simd *__restrict__ a, const Grid_simd *__restrict__ a,
const Grid_simd *__restrict__ x) { const Grid_simd *__restrict__ x) {
*y = (*a) * (*x) + (*y); *y = (*a) * (*x) + (*y);
}; };
#endif
friend accelerator_inline void mult(Grid_simd *__restrict__ y, friend accelerator_inline void mult(Grid_simd *__restrict__ y,
const Grid_simd *__restrict__ l, const Grid_simd *__restrict__ l,
@ -886,27 +741,6 @@ accelerator_inline Grid_simd<S, V> operator*(Grid_simd<S, V> a, Grid_simd<S, V>
return ret; return ret;
}; };
// ---------------- A64FX MAC -------------------
// Distinguish between complex types and others
#if defined(A64FX) || defined(A64FXFIXEDSIZE)
template <class S, class V, IfComplex<S> = 0>
accelerator_inline Grid_simd<S, V> fxmac(Grid_simd<S, V> a, Grid_simd<S, V> b, Grid_simd<S, V> c) {
Grid_simd<S, V> ret;
ret.v = trinary<V>(a.v, b.v, c.v, MultAddComplexSIMD());
return ret;
};
// Real/Integer types
template <class S, class V, IfNotComplex<S> = 0>
accelerator_inline Grid_simd<S, V> fxmac(Grid_simd<S, V> a, Grid_simd<S, V> b, Grid_simd<S, V> c) {
Grid_simd<S, V> ret;
ret.v = trinary<V>(a.v, b.v, c.v, MultSIMD());
return ret;
};
#endif
// ----------------------------------------------
// Distinguish between complex types and others // Distinguish between complex types and others
template <class S, class V, IfComplex<S> = 0> template <class S, class V, IfComplex<S> = 0>
accelerator_inline Grid_simd<S, V> operator/(Grid_simd<S, V> a, Grid_simd<S, V> b) { accelerator_inline Grid_simd<S, V> operator/(Grid_simd<S, V> a, Grid_simd<S, V> b) {
@ -1085,14 +919,6 @@ accelerator_inline void precisionChange(vRealD *out,vRealF *in,int nvec)
for(int m=0;m*2<nvec;m++){ for(int m=0;m*2<nvec;m++){
int n=m*2; int n=m*2;
Optimization::PrecisionChange::StoD(in[m].v,out[n].v,out[n+1].v); Optimization::PrecisionChange::StoD(in[m].v,out[n].v,out[n+1].v);
// Bug in gcc 10.0.1 and gcc 10.1 using fixed-size SVE ACLE data types CAS-159553-Y1K4C6
// function call results in compile-time error:
// In function void Grid::precisionChange(Grid::vRealD*, Grid::vRealF*, int):
// .../Grid_vector_types.h:961:56: error:
// cannot bind non-const lvalue reference of type vecd& {aka svfloat64_t&}
// to an rvalue of type vecd {aka svfloat64_t}
// 961 | Optimization::PrecisionChange::StoD(in[m].v,out[n].v,out[n+1].v);
// | ~~~~~~~^
} }
} }
accelerator_inline void precisionChange(vRealD *out,vRealH *in,int nvec) accelerator_inline void precisionChange(vRealD *out,vRealH *in,int nvec)

View File

@ -93,11 +93,6 @@ accelerator_inline ComplexF pow(const ComplexF& r,RealF y){ return(std::pow(r,y)
using std::abs; using std::abs;
using std::pow; using std::pow;
using std::sqrt; using std::sqrt;
using std::log;
using std::exp;
using std::sin;
using std::cos;
accelerator_inline RealF conjugate(const RealF & r){ return r; } accelerator_inline RealF conjugate(const RealF & r){ return r; }
accelerator_inline RealD conjugate(const RealD & r){ return r; } accelerator_inline RealD conjugate(const RealD & r){ return r; }

File diff suppressed because it is too large

View File

@ -1307,8 +1307,8 @@ public:
std::cout << GridLogMessage << " Stencil SHM " << (shm_bytes)/gatheralltime/1000.*NP/NN << " GB/s per node"<<std::endl; std::cout << GridLogMessage << " Stencil SHM " << (shm_bytes)/gatheralltime/1000.*NP/NN << " GB/s per node"<<std::endl;
auto all_bytes = comms_bytes+shm_bytes; auto all_bytes = comms_bytes+shm_bytes;
std::cout << GridLogMessage << " Stencil SHM all " << (all_bytes)/gatheralltime/1000. << " GB/s per rank"<<std::endl; std::cout << GridLogMessage << " Stencil SHM all" << (all_bytes)/gatheralltime/1000. << " GB/s per rank"<<std::endl;
std::cout << GridLogMessage << " Stencil SHM all " << (all_bytes)/gatheralltime/1000.*NP/NN << " GB/s per node"<<std::endl; std::cout << GridLogMessage << " Stencil SHM all" << (all_bytes)/gatheralltime/1000.*NP/NN << " GB/s per node"<<std::endl;
auto membytes = (shm_bytes + comms_bytes/2) // read/write auto membytes = (shm_bytes + comms_bytes/2) // read/write
+ (shm_bytes+comms_bytes)/2 * sizeof(vobj)/sizeof(cobj); + (shm_bytes+comms_bytes)/2 * sizeof(vobj)/sizeof(cobj);

View File

@ -272,7 +272,7 @@ public:
static auto traceIndex(const iVector<vtype,N> arg) -> iScalar<RemoveCRV(arg._internal[0])> static auto traceIndex(const iVector<vtype,N> arg) -> iScalar<RemoveCRV(arg._internal[0])>
{ {
iScalar<RemoveCRV(arg._internal[0])> ret; iScalar<RemoveCRV(arg._internal[0])> ret;
zeroit(ret); ret._internal=Zero();
for(int i=0;i<N;i++){ for(int i=0;i<N;i++){
ret._internal = ret._internal+ arg._internal[i]; ret._internal = ret._internal+ arg._internal[i];
} }

View File

@ -190,36 +190,6 @@ NAMESPACE_BEGIN(Grid);
typedef ComplexD DoublePrecision; typedef ComplexD DoublePrecision;
typedef ComplexD DoublePrecision2; typedef ComplexD DoublePrecision2;
}; };
#ifdef GRID_CUDA
template<> struct GridTypeMapper<std::complex<float> > : public GridTypeMapper_Base {
typedef std::complex<float> scalar_type;
typedef std::complex<double> scalar_typeD;
typedef scalar_type vector_type;
typedef scalar_typeD vector_typeD;
typedef scalar_type tensor_reduced;
typedef scalar_type scalar_object;
typedef scalar_typeD scalar_objectD;
typedef scalar_type Complexified;
typedef RealF Realified;
typedef scalar_typeD DoublePrecision;
typedef scalar_typeD DoublePrecision2;
};
template<> struct GridTypeMapper<std::complex<double> > : public GridTypeMapper_Base {
typedef std::complex<double> scalar_type;
typedef std::complex<double> scalar_typeD;
typedef scalar_type vector_type;
typedef scalar_typeD vector_typeD;
typedef scalar_type tensor_reduced;
typedef scalar_type scalar_object;
typedef scalar_typeD scalar_objectD;
typedef scalar_type Complexified;
typedef RealD Realified;
typedef scalar_typeD DoublePrecision;
typedef scalar_typeD DoublePrecision2;
};
#endif
template<> struct GridTypeMapper<ComplexD2> : public GridTypeMapper_Base { template<> struct GridTypeMapper<ComplexD2> : public GridTypeMapper_Base {
typedef ComplexD2 scalar_type; typedef ComplexD2 scalar_type;
typedef ComplexD2 scalar_typeD; typedef ComplexD2 scalar_typeD;

View File

@ -16,54 +16,40 @@ void acceleratorInit(void)
char * localRankStr = NULL; char * localRankStr = NULL;
int rank = 0, world_rank=0; int rank = 0, world_rank=0;
#define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK" #define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK"
#define ENV_RANK_OMPI "OMPI_COMM_WORLD_RANK"
#define ENV_LOCAL_RANK_SLURM "SLURM_LOCALID"
#define ENV_RANK_SLURM "SLURM_PROCID"
#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK" #define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK"
#define ENV_RANK_OMPI "OMPI_COMM_WORLD_RANK"
#define ENV_RANK_MVAPICH "MV2_COMM_WORLD_RANK" #define ENV_RANK_MVAPICH "MV2_COMM_WORLD_RANK"
// We extract the local rank initialization using an environment variable // We extract the local rank initialization using an environment variable
if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL) { if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL)
printf("OPENMPI detected\n"); {
rank = atoi(localRankStr); rank = atoi(localRankStr);
} else if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL) { }
printf("MVAPICH detected\n"); if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL)
{
rank = atoi(localRankStr); rank = atoi(localRankStr);
} else if ((localRankStr = getenv(ENV_LOCAL_RANK_SLURM)) != NULL) {
printf("SLURM detected\n");
rank = atoi(localRankStr);
} else {
printf("MPI version is unknown - bad things may happen\n");
} }
if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);} if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);}
if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);} if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
if ((localRankStr = getenv(ENV_RANK_SLURM )) != NULL) { world_rank = atoi(localRankStr);}
size_t totalDeviceMem=0; size_t totalDeviceMem=0;
for (int i = 0; i < nDevices; i++) { for (int i = 0; i < nDevices; i++) {
#define GPU_PROP_FMT(canMapHostMemory,FMT) printf("AcceleratorCudaInit[%d]: " #canMapHostMemory ": " FMT" \n",rank,prop.canMapHostMemory); #define GPU_PROP_FMT(canMapHostMemory,FMT) printf("AcceleratorCudaInit: " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory);
#define GPU_PROP(canMapHostMemory) GPU_PROP_FMT(canMapHostMemory,"%d"); #define GPU_PROP(canMapHostMemory) GPU_PROP_FMT(canMapHostMemory,"%d");
cudaGetDeviceProperties(&gpu_props[i], i); cudaGetDeviceProperties(&gpu_props[i], i);
cudaDeviceProp prop; cudaDeviceProp prop;
prop = gpu_props[i]; prop = gpu_props[i];
totalDeviceMem = prop.totalGlobalMem; totalDeviceMem = prop.totalGlobalMem;
if ( world_rank == 0) { if ( world_rank == 0) {
#ifndef GRID_IBM_SUMMIT printf("AcceleratorCudaInit: ========================\n");
if ( i==rank ) { printf("AcceleratorCudaInit: Device Number : %d\n", i);
printf("AcceleratorCudaInit[%d]: ========================\n",rank); printf("AcceleratorCudaInit: ========================\n");
printf("AcceleratorCudaInit[%d]: Device Number : %d\n", rank,i); printf("AcceleratorCudaInit: Device identifier: %s\n", prop.name);
printf("AcceleratorCudaInit[%d]: ========================\n",rank);
printf("AcceleratorCudaInit[%d]: Device identifier: %s\n",rank, prop.name);
GPU_PROP_FMT(totalGlobalMem,"%lld");
GPU_PROP_FMT(totalGlobalMem,"%lld"); GPU_PROP(managedMemory);
GPU_PROP(managedMemory); GPU_PROP(isMultiGpuBoard);
GPU_PROP(isMultiGpuBoard); GPU_PROP(warpSize);
GPU_PROP(warpSize);
GPU_PROP(pciBusID);
GPU_PROP(pciDeviceID);
}
#endif
// GPU_PROP(unifiedAddressing); // GPU_PROP(unifiedAddressing);
// GPU_PROP(l2CacheSize); // GPU_PROP(l2CacheSize);
// GPU_PROP(singleToDoublePrecisionPerfRatio); // GPU_PROP(singleToDoublePrecisionPerfRatio);
@ -75,9 +61,9 @@ void acceleratorInit(void)
#ifdef GRID_IBM_SUMMIT #ifdef GRID_IBM_SUMMIT
// IBM Jsrun makes cuda Device numbering screwy and not match rank // IBM Jsrun makes cuda Device numbering screwy and not match rank
if ( world_rank == 0 ) printf("AcceleratorCudaInit: IBM Summit or similar - use default device\n"); if ( world_rank == 0 ) printf("AcceleratorCudaInit: IBM Summit or similar - NOT setting device to node rank\n");
#else #else
printf("AcceleratorCudaInit: rank %d setting device to node rank %d\n",world_rank,rank); if ( world_rank == 0 ) printf("AcceleratorCudaInit: setting device to node rank\n");
cudaSetDevice(rank); cudaSetDevice(rank);
#endif #endif
if ( world_rank == 0 ) printf("AcceleratorCudaInit: ================================================\n"); if ( world_rank == 0 ) printf("AcceleratorCudaInit: ================================================\n");
@ -110,24 +96,20 @@ void acceleratorInit(void)
if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);} if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);}
if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);} if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
printf("world_rank %d has %d devices\n",world_rank,nDevices);
size_t totalDeviceMem=0;
for (int i = 0; i < nDevices; i++) { for (int i = 0; i < nDevices; i++) {
#define GPU_PROP_FMT(canMapHostMemory,FMT) printf("AcceleratorHipInit: " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory); #define GPU_PROP_FMT(canMapHostMemory,FMT) printf("AcceleratorHipInit: " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory);
#define GPU_PROP(canMapHostMemory) GPU_PROP_FMT(canMapHostMemory,"%d"); #define GPU_PROP(canMapHostMemory) GPU_PROP_FMT(canMapHostMemory,"%d");
hipGetDeviceProperties(&gpu_props[i], i); hipGetDeviceProperties(&gpu_props[i], i);
hipDeviceProp_t prop;
prop = gpu_props[i];
totalDeviceMem = prop.totalGlobalMem;
if ( world_rank == 0) { if ( world_rank == 0) {
hipDeviceProp_t prop;
prop = gpu_props[i];
printf("AcceleratorHipInit: ========================\n"); printf("AcceleratorHipInit: ========================\n");
printf("AcceleratorHipInit: Device Number : %d\n", i); printf("AcceleratorHipInit: Device Number : %d\n", i);
printf("AcceleratorHipInit: ========================\n"); printf("AcceleratorHipInit: ========================\n");
printf("AcceleratorHipInit: Device identifier: %s\n", prop.name); printf("AcceleratorHipInit: Device identifier: %s\n", prop.name);
GPU_PROP_FMT(totalGlobalMem,"%lu");
// GPU_PROP(managedMemory); // GPU_PROP(managedMemory);
GPU_PROP(isMultiGpuBoard); GPU_PROP(isMultiGpuBoard);
GPU_PROP(warpSize); GPU_PROP(warpSize);
@ -136,7 +118,6 @@ void acceleratorInit(void)
// GPU_PROP(singleToDoublePrecisionPerfRatio); // GPU_PROP(singleToDoublePrecisionPerfRatio);
} }
} }
MemoryManager::DeviceMaxBytes = (8*totalDeviceMem)/10; // Assume 80% ours
#undef GPU_PROP_FMT #undef GPU_PROP_FMT
#undef GPU_PROP #undef GPU_PROP
#ifdef GRID_IBM_SUMMIT #ifdef GRID_IBM_SUMMIT

View File

@ -70,7 +70,6 @@ NAMESPACE_BEGIN(Grid);
// //
// Memory management: // Memory management:
// //
// int acceleratorIsCommunicable(void *pointer);
// void *acceleratorAllocShared(size_t bytes); // void *acceleratorAllocShared(size_t bytes);
// void acceleratorFreeShared(void *ptr); // void acceleratorFreeShared(void *ptr);
// //
@ -91,7 +90,6 @@ void acceleratorInit(void);
////////////////////////////////////////////// //////////////////////////////////////////////
#ifdef GRID_CUDA #ifdef GRID_CUDA
#include <cuda.h>
#ifdef __CUDA_ARCH__ #ifdef __CUDA_ARCH__
#define GRID_SIMT #define GRID_SIMT
@ -167,16 +165,6 @@ inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);}; inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);} inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
inline int acceleratorIsCommunicable(void *ptr)
{
int uvm;
auto
cuerr = cuPointerGetAttribute( &uvm, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr) ptr);
assert(cuerr == cudaSuccess );
if(uvm) return 0;
else return 1;
}
#endif #endif
////////////////////////////////////////////// //////////////////////////////////////////////
@ -231,15 +219,6 @@ inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);};
inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);}; inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();} inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
inline int acceleratorIsCommunicable(void *ptr)
{
#if 0
auto uvm = cl::sycl::usm::get_pointer_type(ptr, theGridAccelerator->get_context());
if ( uvm = cl::sycl::usm::alloc::shared ) return 1;
else return 0;
#endif
return 1;
}
#endif #endif
@ -307,15 +286,18 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
inline void *acceleratorAllocShared(size_t bytes) inline void *acceleratorAllocShared(size_t bytes)
{ {
#if 0
void *ptr=NULL; void *ptr=NULL;
auto err = hipMallocManaged((void **)&ptr,bytes); auto err = hipMallocManaged((void **)&ptr,bytes);
if( err != hipSuccess ) { if( err != hipSuccess ) {
ptr = (void *) NULL; ptr = (void *) NULL;
printf(" hipMallocManaged failed for %ld %s \n",bytes,hipGetErrorString(err)); printf(" hipMallocManaged failed for %d %s \n",bytes,hipGetErrorString(err));
} }
return ptr; return ptr;
#else
return malloc(bytes);
#endif
}; };
inline int acceleratorIsCommunicable(void *ptr){ return 1; }
inline void *acceleratorAllocDevice(size_t bytes) inline void *acceleratorAllocDevice(size_t bytes)
{ {
@ -323,7 +305,7 @@ inline void *acceleratorAllocDevice(size_t bytes)
auto err = hipMalloc((void **)&ptr,bytes); auto err = hipMalloc((void **)&ptr,bytes);
if( err != hipSuccess ) { if( err != hipSuccess ) {
ptr = (void *) NULL; ptr = (void *) NULL;
printf(" hipMalloc failed for %ld %s \n",bytes,hipGetErrorString(err)); printf(" hipMalloc failed for %d %s \n",bytes,hipGetErrorString(err));
} }
return ptr; return ptr;
}; };
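
The diff also widens the failure message's format from `%d` to `%ld`; since `bytes` is a `size_t`, `%zu` is the fully portable choice. A checked-allocation sketch along those lines (assumes the HIP runtime header and `<cstdio>`):

``` cpp
// Sketch only: error-checked device allocation with a portable size_t format.
inline void *checkedDeviceAlloc(size_t bytes)
{
  void *ptr = nullptr;
  auto err = hipMalloc((void **)&ptr, bytes);
  if (err != hipSuccess) {
    ptr = nullptr;
    std::printf(" hipMalloc failed for %zu %s \n", bytes, hipGetErrorString(err));
  }
  return ptr;
}
```
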
@ -370,7 +352,6 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA spec
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { memcpy(to,from,bytes);} inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { memcpy(to,from,bytes);}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ memcpy(to,from,bytes);} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ memcpy(to,from,bytes);}
inline int acceleratorIsCommunicable(void *ptr){ return 1; }
#ifdef HAVE_MM_MALLOC_H #ifdef HAVE_MM_MALLOC_H
inline void *acceleratorAllocShared(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);}; inline void *acceleratorAllocShared(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
inline void *acceleratorAllocDevice(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);}; inline void *acceleratorAllocDevice(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};


@ -99,10 +99,10 @@ inline std::ostream & operator<<(std::ostream &os, const AcceleratorVector<T,_nd
{ {
os << "["; os << "[";
for(int s=0;s<v.size();s++) { for(int s=0;s<v.size();s++) {
os << v[s]; os << v[s] << " ";
if( s < (v.size()-1) ){ }
os << " "; if (v.size() > 0) {
} os << "\b";
} }
os << "]"; os << "]";
return os; return os;
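
The rewritten `operator<<` emits the separator only between elements, instead of printing a trailing blank and "erasing" it with `\b`; the backspace is a terminal control character and ends up verbatim in redirected logs. A standalone sketch of the new pattern:

``` cpp
#include <iostream>
#include <vector>

// Prints "[8 16 32]": the space goes only *between* elements, so there is
// no trailing blank and no control character in the stream.
template <typename T>
std::ostream &printVec(std::ostream &os, const std::vector<T> &v)
{
  os << "[";
  for (std::size_t s = 0; s < v.size(); s++) {
    os << v[s];
    if (s + 1 < v.size()) os << " ";
  }
  os << "]";
  return os;
}
```
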


@ -318,13 +318,6 @@ void Grid_init(int *argc,char ***argv)
Grid_debug_handler_init(); Grid_debug_handler_init();
} }
#if defined(A64FX)
if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-overlap") ){
std::cout << "Option --comms-overlap currently not supported on QPACE4. Exiting." << std::endl;
exit(EXIT_FAILURE);
}
#endif
////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////
// Memory manager // Memory manager
////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////
@ -377,7 +370,9 @@ void Grid_init(int *argc,char ***argv)
std::cout << GridLogMessage << "Mapped stencil comms buffers as MAP_HUGETLB "<<std::endl; std::cout << GridLogMessage << "Mapped stencil comms buffers as MAP_HUGETLB "<<std::endl;
} }
MemoryManager::InitMessage(); #ifndef GRID_UVM
std::cout << GridLogMessage << "MemoryManager Cache "<< MemoryManager::DeviceMaxBytes <<" bytes "<<std::endl;
#endif
if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-mem") ){ if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-mem") ){
MemoryProfiler::debug = true; MemoryProfiler::debug = true;
@ -472,7 +467,7 @@ void Grid_init(int *argc,char ***argv)
if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){ if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
LebesgueOrder::UseLebesgueOrder=1; LebesgueOrder::UseLebesgueOrder=1;
} }
CartesianCommunicator::nCommThreads = 1; CartesianCommunicator::nCommThreads = -1;
if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-threads") ){ if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-threads") ){
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--comms-threads"); arg= GridCmdOptionPayload(*argv,*argv+*argc,"--comms-threads");
GridCmdOptionInt(arg,CartesianCommunicator::nCommThreads); GridCmdOptionInt(arg,CartesianCommunicator::nCommThreads);
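
The `--comms-threads` handling shows the option-parsing idiom used throughout `Grid_init`: test presence, fetch the payload string, convert it in place. Schematically:

``` cpp
// Sketch of the three-step pattern around GridCmdOptionExists /
// GridCmdOptionPayload / GridCmdOptionInt.
int nThreads = 1;                                              // default
if (GridCmdOptionExists(*argv, *argv + *argc, "--comms-threads")) {
  std::string arg = GridCmdOptionPayload(*argv, *argv + *argc, "--comms-threads");
  GridCmdOptionInt(arg, nThreads);                             // parsed value
}
```
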

README

@ -111,10 +111,11 @@ Now you can execute the `configure` script to generate makefiles (here from a bu
``` bash ``` bash
mkdir build; cd build mkdir build; cd build
../configure --enable-simd=AVX --enable-comms=mpi-auto --prefix=<path> ../configure --enable-precision=double --enable-simd=AVX --enable-comms=mpi-auto --prefix=<path>
``` ```
where `--enable-simd=` sets the SIMD type, `--enable- where `--enable-precision=` sets the default precision,
`--enable-simd=` sets the SIMD type, `--enable-
comms=`, and `<path>` should be replaced by the prefix path where you want to comms=`, and `<path>` should be replaced by the prefix path where you want to
install Grid. Other options are detailed in the next section; you can also use `configure install Grid. Other options are detailed in the next section; you can also use `configure
--help` to display them. Like any other program using GNU autotools, the --help` to display them. Like any other program using GNU autotools, the
@ -145,8 +146,8 @@ If you want to build all the tests at once just use `make tests`.
- `--enable-numa`: enable NUMA first touch optimisation - `--enable-numa`: enable NUMA first touch optimisation
- `--enable-simd=<code>`: setup Grid for the SIMD target `<code>` (default: `GEN`). A list of possible SIMD targets is detailed in a section below. - `--enable-simd=<code>`: setup Grid for the SIMD target `<code>` (default: `GEN`). A list of possible SIMD targets is detailed in a section below.
- `--enable-gen-simd-width=<size>`: select the size (in bytes) of the generic SIMD vector type (default: 32 bytes). - `--enable-gen-simd-width=<size>`: select the size (in bytes) of the generic SIMD vector type (default: 32 bytes).
- `--enable-precision={single|double}`: set the default precision (default: `double`). **Deprecated option** - `--enable-precision={single|double}`: set the default precision (default: `double`).
- `--enable-comms=<comm>`: Use `<comm>` for message passing (default: `none`). A list of possible comms targets is detailed in a section below. - `--enable-precision=<comm>`: Use `<comm>` for message passing (default: `none`). A list of possible comms targets is detailed in a section below.
- `--enable-rng={sitmo|ranlux48|mt19937}`: choose the RNG (default: `sitmo `). - `--enable-rng={sitmo|ranlux48|mt19937}`: choose the RNG (default: `sitmo `).
- `--disable-timers`: disable system dependent high-resolution timers. - `--disable-timers`: disable system dependent high-resolution timers.
- `--enable-chroma`: enable Chroma regression tests. - `--enable-chroma`: enable Chroma regression tests.
@ -200,7 +201,8 @@ Alternatively, some CPU codenames can be directly used:
The following configuration is recommended for the Intel Knights Landing platform: The following configuration is recommended for the Intel Knights Landing platform:
``` bash ``` bash
../configure --enable-simd=KNL \ ../configure --enable-precision=double\
--enable-simd=KNL \
--enable-comms=mpi-auto \ --enable-comms=mpi-auto \
--enable-mkl \ --enable-mkl \
CXX=icpc MPICXX=mpiicpc CXX=icpc MPICXX=mpiicpc
@ -210,7 +212,8 @@ The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library.
If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use: If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
``` bash ``` bash
../configure --enable-simd=KNL \ ../configure --enable-precision=double\
--enable-simd=KNL \
--enable-comms=mpi \ --enable-comms=mpi \
--enable-mkl \ --enable-mkl \
CXX=CC CC=cc CXX=CC CC=cc
@ -229,7 +232,8 @@ for interior communication. This is the mpi3 communications implementation.
We recommend four ranks per node for best performance, but optimum is local volume dependent. We recommend four ranks per node for best performance, but optimum is local volume dependent.
``` bash ``` bash
../configure --enable-simd=KNL \ ../configure --enable-precision=double\
--enable-simd=KNL \
--enable-comms=mpi3-auto \ --enable-comms=mpi3-auto \
--enable-mkl \ --enable-mkl \
CC=icpc MPICXX=mpiicpc CC=icpc MPICXX=mpiicpc
@ -240,7 +244,8 @@ We recommend four ranks per node for best performance, but optimum is local volu
The following configuration is recommended for the Intel Haswell platform: The following configuration is recommended for the Intel Haswell platform:
``` bash ``` bash
../configure --enable-simd=AVX2 \ ../configure --enable-precision=double\
--enable-simd=AVX2 \
--enable-comms=mpi3-auto \ --enable-comms=mpi3-auto \
--enable-mkl \ --enable-mkl \
CXX=icpc MPICXX=mpiicpc CXX=icpc MPICXX=mpiicpc
@ -257,7 +262,8 @@ where `<path>` is the UNIX prefix where GMP and MPFR are installed.
If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use: If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
``` bash ``` bash
../configure --enable-simd=AVX2 \ ../configure --enable-precision=double\
--enable-simd=AVX2 \
--enable-comms=mpi3 \ --enable-comms=mpi3 \
--enable-mkl \ --enable-mkl \
CXX=CC CC=cc CXX=CC CC=cc
@ -274,7 +280,8 @@ This is the default.
The following configuration is recommended for the Intel Skylake platform: The following configuration is recommended for the Intel Skylake platform:
``` bash ``` bash
../configure --enable-simd=AVX512 \ ../configure --enable-precision=double\
--enable-simd=AVX512 \
--enable-comms=mpi3 \ --enable-comms=mpi3 \
--enable-mkl \ --enable-mkl \
CXX=mpiicpc CXX=mpiicpc
@ -291,7 +298,8 @@ where `<path>` is the UNIX prefix where GMP and MPFR are installed.
If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use: If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
``` bash ``` bash
../configure --enable-simd=AVX512 \ ../configure --enable-precision=double\
--enable-simd=AVX512 \
--enable-comms=mpi3 \ --enable-comms=mpi3 \
--enable-mkl \ --enable-mkl \
CXX=CC CC=cc CXX=CC CC=cc
@ -322,7 +330,8 @@ and 8 threads per rank.
The following configuration is recommended for the AMD EPYC platform. The following configuration is recommended for the AMD EPYC platform.
``` bash ``` bash
../configure --enable-simd=AVX2 \ ../configure --enable-precision=double\
--enable-simd=AVX2 \
--enable-comms=mpi3 \ --enable-comms=mpi3 \
CXX=mpicxx CXX=mpicxx
``` ```


@ -115,10 +115,11 @@ Now you can execute the `configure` script to generate makefiles (here from a bu
``` bash ``` bash
mkdir build; cd build mkdir build; cd build
../configure --enable-simd=AVX --enable-comms=mpi-auto --prefix=<path> ../configure --enable-precision=double --enable-simd=AVX --enable-comms=mpi-auto --prefix=<path>
``` ```
where `--enable-simd=` sets the SIMD type, `--enable- where `--enable-precision=` sets the default precision,
`--enable-simd=` sets the SIMD type, `--enable-
comms=`, and `<path>` should be replaced by the prefix path where you want to comms=`, and `<path>` should be replaced by the prefix path where you want to
install Grid. Other options are detailed in the next section; you can also use `configure install Grid. Other options are detailed in the next section; you can also use `configure
--help` to display them. Like any other program using GNU autotools, the --help` to display them. Like any other program using GNU autotools, the
@ -149,8 +150,8 @@ If you want to build all the tests at once just use `make tests`.
- `--enable-numa`: enable NUMA first touch optimisation - `--enable-numa`: enable NUMA first touch optimisation
- `--enable-simd=<code>`: setup Grid for the SIMD target `<code>` (default: `GEN`). A list of possible SIMD targets is detailed in a section below. - `--enable-simd=<code>`: setup Grid for the SIMD target `<code>` (default: `GEN`). A list of possible SIMD targets is detailed in a section below.
- `--enable-gen-simd-width=<size>`: select the size (in bytes) of the generic SIMD vector type (default: 32 bytes). - `--enable-gen-simd-width=<size>`: select the size (in bytes) of the generic SIMD vector type (default: 32 bytes).
- `--enable-precision={single|double}`: set the default precision (default: `double`). **Deprecated option** - `--enable-precision={single|double}`: set the default precision (default: `double`).
- `--enable-comms=<comm>`: Use `<comm>` for message passing (default: `none`). A list of possible comms targets is detailed in a section below. - `--enable-precision=<comm>`: Use `<comm>` for message passing (default: `none`). A list of possible comms targets is detailed in a section below.
- `--enable-rng={sitmo|ranlux48|mt19937}`: choose the RNG (default: `sitmo `). - `--enable-rng={sitmo|ranlux48|mt19937}`: choose the RNG (default: `sitmo `).
- `--disable-timers`: disable system dependent high-resolution timers. - `--disable-timers`: disable system dependent high-resolution timers.
- `--enable-chroma`: enable Chroma regression tests. - `--enable-chroma`: enable Chroma regression tests.
@ -204,7 +205,8 @@ Alternatively, some CPU codenames can be directly used:
The following configuration is recommended for the Intel Knights Landing platform: The following configuration is recommended for the Intel Knights Landing platform:
``` bash ``` bash
../configure --enable-simd=KNL \ ../configure --enable-precision=double\
--enable-simd=KNL \
--enable-comms=mpi-auto \ --enable-comms=mpi-auto \
--enable-mkl \ --enable-mkl \
CXX=icpc MPICXX=mpiicpc CXX=icpc MPICXX=mpiicpc
@ -214,7 +216,8 @@ The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library.
If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use: If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
``` bash ``` bash
../configure --enable-simd=KNL \ ../configure --enable-precision=double\
--enable-simd=KNL \
--enable-comms=mpi \ --enable-comms=mpi \
--enable-mkl \ --enable-mkl \
CXX=CC CC=cc CXX=CC CC=cc
@ -233,7 +236,8 @@ for interior communication. This is the mpi3 communications implementation.
We recommend four ranks per node for best performance, but optimum is local volume dependent. We recommend four ranks per node for best performance, but optimum is local volume dependent.
``` bash ``` bash
../configure --enable-simd=KNL \ ../configure --enable-precision=double\
--enable-simd=KNL \
--enable-comms=mpi3-auto \ --enable-comms=mpi3-auto \
--enable-mkl \ --enable-mkl \
CC=icpc MPICXX=mpiicpc CC=icpc MPICXX=mpiicpc
@ -244,7 +248,8 @@ We recommend four ranks per node for best performance, but optimum is local volu
The following configuration is recommended for the Intel Haswell platform: The following configuration is recommended for the Intel Haswell platform:
``` bash ``` bash
../configure --enable-simd=AVX2 \ ../configure --enable-precision=double\
--enable-simd=AVX2 \
--enable-comms=mpi3-auto \ --enable-comms=mpi3-auto \
--enable-mkl \ --enable-mkl \
CXX=icpc MPICXX=mpiicpc CXX=icpc MPICXX=mpiicpc
@ -261,7 +266,8 @@ where `<path>` is the UNIX prefix where GMP and MPFR are installed.
If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use: If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
``` bash ``` bash
../configure --enable-simd=AVX2 \ ../configure --enable-precision=double\
--enable-simd=AVX2 \
--enable-comms=mpi3 \ --enable-comms=mpi3 \
--enable-mkl \ --enable-mkl \
CXX=CC CC=cc CXX=CC CC=cc
@ -278,7 +284,8 @@ This is the default.
The following configuration is recommended for the Intel Skylake platform: The following configuration is recommended for the Intel Skylake platform:
``` bash ``` bash
../configure --enable-simd=AVX512 \ ../configure --enable-precision=double\
--enable-simd=AVX512 \
--enable-comms=mpi3 \ --enable-comms=mpi3 \
--enable-mkl \ --enable-mkl \
CXX=mpiicpc CXX=mpiicpc
@ -295,7 +302,8 @@ where `<path>` is the UNIX prefix where GMP and MPFR are installed.
If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use: If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
``` bash ``` bash
../configure --enable-simd=AVX512 \ ../configure --enable-precision=double\
--enable-simd=AVX512 \
--enable-comms=mpi3 \ --enable-comms=mpi3 \
--enable-mkl \ --enable-mkl \
CXX=CC CC=cc CXX=CC CC=cc
@ -326,7 +334,8 @@ and 8 threads per rank.
The following configuration is recommended for the AMD EPYC platform. The following configuration is recommended for the AMD EPYC platform.
``` bash ``` bash
../configure --enable-simd=AVX2 \ ../configure --enable-precision=double\
--enable-simd=AVX2 \
--enable-comms=mpi3 \ --enable-comms=mpi3 \
CXX=mpicxx CXX=mpicxx
``` ```


@ -1,89 +0,0 @@
* gcc 10.1 prebuild, QPACE4 interactive login w/ MPI
scl enable gcc-toolset-10 bash
module load mpi/openmpi-aarch64
../configure --enable-simd=A64FX --enable-comms=mpi3 --enable-shm=shmget CXX=mpicxx CC=mpicc
================================== deprecated ================================================
* gcc 10.1 prebuild, QPACE4 interactive login
scl enable gcc-toolset-10 bash
../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp CXX=g++ CC=gcc CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN"
* gcc 10.1 prebuild w/ MPI, QPACE4 interactive login
scl enable gcc-toolset-10 bash
module load mpi/openmpi-aarch64
../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=mpi-auto --enable-shm=shmget --enable-openmp CXX=mpicxx CC=mpicc CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN"
------------------------------------------------------------------------------
* armclang 20.2 (qp4)
../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -mcpu=a64fx -DA64FX -DARMCLANGCOMPAT -DA64FXASM -DDSLASHINTRIN"
------------------------------------------------------------------------------
* gcc 10.0.1 VLA (merlin)
../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp CXX=g++-10.0.1 CC=gcc-10.0.1 CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static
* gcc 10.0.1 fixed-size ACLE (merlin)
../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp CXX=g++-10.0.1 CC=gcc-10.0.1 CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN"
* gcc 10.0.1 fixed-size ACLE (fjt) w/ MPI
export OMPI_CC=gcc-10.0.1
export OMPI_CXX=g++-10.0.1
export MPICH_CC=gcc-10.0.1
export MPICH_CXX=g++-10.0.1
../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=mpi3 --enable-openmp CXX=mpiFCC CC=mpifcc CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN -DTOFU -I/opt/FJSVxtclanga/tcsds-1.2.25/include/mpi/fujitsu -lrt" LDFLAGS="-L/opt/FJSVxtclanga/tcsds-1.2.25/lib64 -lrt"
--------------------------------------------------------
* armclang 20.0 VLA (merlin)
../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -fno-unroll-loops -mllvm -vectorizer-min-trip-count=2 -march=armv8-a+sve -DARMCLANGCOMPAT -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static
TODO check ARMCLANGCOMPAT
* armclang 20.1 VLA (merlin)
../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -mcpu=a64fx -DARMCLANGCOMPAT -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static
TODO check ARMCLANGCOMPAT
* armclang 20.1 VLA (fjt cluster)
../configure --with-lime=$HOME/local --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -mcpu=a64fx -DARMCLANGCOMPAT -DA64FX -DA64FXASM -DDSLASHINTRIN -DTOFU"
TODO check ARMCLANGCOMPAT
* armclang 20.1 VLA w/MPI (fjt cluster)
../configure --with-lime=$HOME/local --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=mpi3 --enable-openmp CXX=mpiFCC CC=mpifcc CXXFLAGS="-std=c++11 -mcpu=a64fx -DA64FX -DA64FXASM -DDSLASHINTRIN -DTOFU -I/opt/FJSVxtclanga/tcsds-1.2.25/include/mpi/fujitsu -lrt" LDFLAGS="-L/opt/FJSVxtclanga/tcsds-1.2.25/lib64"
No ARMCLANGCOMPAT -> still correct?
--------------------------------------------------------
* Fujitsu fcc
../configure --with-lime=$HOME/grid-a64fx/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp --with-mpfr=/home/users/gre/gre-1/grid-a64fx/mpfr-build/install CXX=FCC CC=fcc CXXFLAGS="-Nclang -Kfast -DA64FX -DA64FXASM -DDSLASHINTRIN"
* Fujitsu fcc w/ MPI
../configure --with-lime=$HOME/grid-a64fx/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=mpi --enable-openmp --with-mpfr=/home/users/gre/gre-1/grid-a64fx/mpfr-build/install CXX=mpiFCC CC=mpifcc CXXFLAGS="-Nclang -Kfast -DA64FX -DA64FXASM -DDSLASHINTRIN -DTOFU"


@ -1,16 +1,8 @@
#include "Benchmark_IO.hpp" #include "Benchmark_IO.hpp"
#ifndef BENCH_IO_LMIN
#define BENCH_IO_LMIN 8
#endif
#ifndef BENCH_IO_LMAX #ifndef BENCH_IO_LMAX
#define BENCH_IO_LMAX 32 #define BENCH_IO_LMAX 40
#endif
#ifndef BENCH_IO_NPASS
#define BENCH_IO_NPASS 10
#endif #endif
using namespace Grid; using namespace Grid;
@ -20,179 +12,37 @@ std::string filestem(const int l)
return "iobench_l" + std::to_string(l); return "iobench_l" + std::to_string(l);
} }
int vol(const int i)
{
return BENCH_IO_LMIN + 2*i;
}
int volInd(const int l)
{
return (l - BENCH_IO_LMIN)/2;
}
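
`vol` and `volInd` are inverse maps between a table index and the local lattice extent: with `BENCH_IO_LMIN` equal to 8 the benchmarked extents are 8, 10, 12, and so on. A quick sanity check (sketch):

``` cpp
// vol(0) == 8, vol(1) == 10, ...; volInd inverts vol on even extents.
for (int i = 0; i < 4; ++i) assert(volInd(vol(i)) == i);
```
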
template <typename Mat>
void stats(Mat &mean, Mat &stdDev, const std::vector<Mat> &data)
{
auto nr = data[0].rows(), nc = data[0].cols();
Eigen::MatrixXd sqSum(nr, nc);
double n = static_cast<double>(data.size());
assert(n > 1.);
mean = Mat::Zero(nr, nc);
sqSum = Mat::Zero(nr, nc);
for (auto &d: data)
{
mean += d;
sqSum += d.cwiseProduct(d);
}
stdDev = ((sqSum - mean.cwiseProduct(mean)/n)/(n - 1.)).cwiseSqrt();
mean /= n;
}
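
A quick check of `stats()` as defined above: three 2x2 samples whose (0,0) entries are 1, 2 and 3 should give mean 2 and unbiased standard deviation 1 in that entry (sketch):

``` cpp
// Exercise stats() on three samples; all other entries stay zero.
std::vector<Eigen::MatrixXd> data(3, Eigen::MatrixXd::Zero(2, 2));
data[0](0, 0) = 1.; data[1](0, 0) = 2.; data[2](0, 0) = 3.;
Eigen::MatrixXd mean, stdDev;
stats(mean, stdDev, data);
// mean(0, 0) == 2 and stdDev(0, 0) == 1.
```
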
#define grid_printf(...) \
{\
char _buf[1024];\
sprintf(_buf, __VA_ARGS__);\
MSG << _buf;\
}
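
`grid_printf` formats into a fixed 1024-byte stack buffer with `sprintf`; a bounds-checked variant is a one-line change (sketch, assuming `<cstdio>`):

``` cpp
// snprintf truncates instead of overflowing the fixed buffer.
#define grid_printf_safe(...) \
{\
  char _buf[1024];\
  snprintf(_buf, sizeof(_buf), __VA_ARGS__);\
  MSG << _buf;\
}
```
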
enum {sRead = 0, sWrite = 1, gRead = 2, gWrite = 3};
int main (int argc, char ** argv) int main (int argc, char ** argv)
{ {
#ifdef HAVE_LIME
Grid_init(&argc,&argv); Grid_init(&argc,&argv);
int64_t threads = GridThread::GetThreads(); int64_t threads = GridThread::GetThreads();
auto mpi = GridDefaultMpi();
unsigned int nVol = (BENCH_IO_LMAX - BENCH_IO_LMIN)/2 + 1;
unsigned int nRelVol = (BENCH_IO_LMAX - 24)/2 + 1;
std::vector<Eigen::MatrixXd> perf(BENCH_IO_NPASS, Eigen::MatrixXd::Zero(nVol, 4));
std::vector<Eigen::VectorXd> avPerf(BENCH_IO_NPASS, Eigen::VectorXd::Zero(4));
std::vector<int> latt;
MSG << "Grid is setup to use " << threads << " threads" << std::endl; MSG << "Grid is setup to use " << threads << " threads" << std::endl;
MSG << "MPI partition " << mpi << std::endl; MSG << SEP << std::endl;
for (unsigned int i = 0; i < BENCH_IO_NPASS; ++i) MSG << "Benchmark Lime write" << std::endl;
MSG << SEP << std::endl;
for (int l = 4; l <= BENCH_IO_LMAX; l += 2)
{ {
MSG << BIGSEP << std::endl; auto mpi = GridDefaultMpi();
MSG << "Pass " << i + 1 << "/" << BENCH_IO_NPASS << std::endl; std::vector<int> latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]};
MSG << BIGSEP << std::endl;
MSG << SEP << std::endl;
MSG << "Benchmark std write" << std::endl;
MSG << SEP << std::endl;
for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
{
latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]};
MSG << "-- Local volume " << l << "^4" << std::endl; std::cout << "-- Local volume " << l << "^4" << std::endl;
writeBenchmark<LatticeFermion>(latt, filestem(l), stdWrite<LatticeFermion>); writeBenchmark<LatticeFermion>(latt, filestem(l), limeWrite<LatticeFermion>);
perf[i](volInd(l), sWrite) = BinaryIO::lastPerf.mbytesPerSecond;
}
MSG << SEP << std::endl;
MSG << "Benchmark std read" << std::endl;
MSG << SEP << std::endl;
for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
{
latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]};
MSG << "-- Local volume " << l << "^4" << std::endl;
readBenchmark<LatticeFermion>(latt, filestem(l), stdRead<LatticeFermion>);
perf[i](volInd(l), sRead) = BinaryIO::lastPerf.mbytesPerSecond;
}
#ifdef HAVE_LIME
MSG << SEP << std::endl;
MSG << "Benchmark Grid C-Lime write" << std::endl;
MSG << SEP << std::endl;
for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
{
latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]};
MSG << "-- Local volume " << l << "^4" << std::endl;
writeBenchmark<LatticeFermion>(latt, filestem(l), limeWrite<LatticeFermion>);
perf[i](volInd(l), gWrite) = BinaryIO::lastPerf.mbytesPerSecond;
}
MSG << SEP << std::endl;
MSG << "Benchmark Grid C-Lime read" << std::endl;
MSG << SEP << std::endl;
for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
{
latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]};
MSG << "-- Local volume " << l << "^4" << std::endl;
readBenchmark<LatticeFermion>(latt, filestem(l), limeRead<LatticeFermion>);
perf[i](volInd(l), gRead) = BinaryIO::lastPerf.mbytesPerSecond;
}
#endif
avPerf[i].fill(0.);
for (int f = 0; f < 4; ++f)
for (int l = 24; l <= BENCH_IO_LMAX; l += 2)
{
avPerf[i](f) += perf[i](volInd(l), f);
}
avPerf[i] /= nRelVol;
} }
Eigen::MatrixXd mean(nVol, 4), stdDev(nVol, 4), rob(nVol, 4); MSG << "Benchmark Lime read" << std::endl;
Eigen::VectorXd avMean(4), avStdDev(4), avRob(4); MSG << SEP << std::endl;
double n = BENCH_IO_NPASS; for (int l = 4; l <= BENCH_IO_LMAX; l += 2)
stats(mean, stdDev, perf);
stats(avMean, avStdDev, avPerf);
rob.fill(100.);
rob -= 100.*stdDev.cwiseQuotient(mean.cwiseAbs());
avRob.fill(100.);
avRob -= 100.*avStdDev.cwiseQuotient(avMean.cwiseAbs());
MSG << BIGSEP << std::endl;
MSG << "SUMMARY" << std::endl;
MSG << BIGSEP << std::endl;
MSG << "Summary of individual results (all results in MB/s)." << std::endl;
MSG << "Every second colum gives the standard deviation of the previous column." << std::endl;
MSG << std::endl;
grid_printf("%4s %12s %12s %12s %12s %12s %12s %12s %12s\n",
"L", "std read", "std dev", "std write", "std dev",
"Grid read", "std dev", "Grid write", "std dev");
for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
{ {
grid_printf("%4d %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f\n", auto mpi = GridDefaultMpi();
l, mean(volInd(l), sRead), stdDev(volInd(l), sRead), std::vector<int> latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]};
mean(volInd(l), sWrite), stdDev(volInd(l), sWrite),
mean(volInd(l), gRead), stdDev(volInd(l), gRead), std::cout << "-- Local volume " << l << "^4" << std::endl;
mean(volInd(l), gWrite), stdDev(volInd(l), gWrite)); readBenchmark<LatticeFermion>(latt, filestem(l), limeRead<LatticeFermion>);
} }
MSG << std::endl;
MSG << "Robustness of individual results, in \%. (rob = 100\% - std dev / mean)" << std::endl;
MSG << std::endl;
grid_printf("%4s %12s %12s %12s %12s\n",
"L", "std read", "std write", "Grid read", "Grid write");
for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
{
grid_printf("%4d %12.1f %12.1f %12.1f %12.1f\n",
l, rob(volInd(l), sRead), rob(volInd(l), sWrite),
rob(volInd(l), gRead), rob(volInd(l), gWrite));
}
MSG << std::endl;
MSG << "Summary of results averaged over local volumes 24^4-" << BENCH_IO_LMAX << "^4 (all results in MB/s)." << std::endl;
MSG << "Every second colum gives the standard deviation of the previous column." << std::endl;
MSG << std::endl;
grid_printf("%12s %12s %12s %12s %12s %12s %12s %12s\n",
"std read", "std dev", "std write", "std dev",
"Grid read", "std dev", "Grid write", "std dev");
grid_printf("%12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f\n",
avMean(sRead), avStdDev(sRead), avMean(sWrite), avStdDev(sWrite),
avMean(gRead), avStdDev(gRead), avMean(gWrite), avStdDev(gWrite));
MSG << std::endl;
MSG << "Robustness of volume-averaged results, in \%. (rob = 100\% - std dev / mean)" << std::endl;
MSG << std::endl;
grid_printf("%12s %12s %12s %12s\n",
"std read", "std write", "Grid read", "Grid write");
grid_printf("%12.1f %12.1f %12.1f %12.1f\n",
avRob(sRead), avRob(sWrite), avRob(gRead), avRob(gWrite));
Grid_finalize(); Grid_finalize();
#endif
return EXIT_SUCCESS; return EXIT_SUCCESS;
} }


@ -5,8 +5,6 @@
#ifdef HAVE_LIME #ifdef HAVE_LIME
#define MSG std::cout << GridLogMessage #define MSG std::cout << GridLogMessage
#define SEP \ #define SEP \
"-----------------------------------------------------------------------------"
#define BIGSEP \
"=============================================================================" "============================================================================="
namespace Grid { namespace Grid {
@ -16,152 +14,13 @@ using WriterFn = std::function<void(const std::string, Field &)> ;
template <typename Field> template <typename Field>
using ReaderFn = std::function<void(Field &, const std::string)>; using ReaderFn = std::function<void(Field &, const std::string)>;
// AP 06/10/2020: Standard C version in case one is suspicious of the C++ API
//
// template <typename Field>
// void stdWrite(const std::string filestem, Field &vec)
// {
// std::string rankStr = std::to_string(vec.Grid()->ThisRank());
// std::FILE *file = std::fopen((filestem + "." + rankStr + ".bin").c_str(), "wb");
// size_t size;
// uint32_t crc;
// GridStopWatch ioWatch, crcWatch;
// size = vec.Grid()->lSites()*sizeof(typename Field::scalar_object);
// autoView(vec_v, vec, CpuRead);
// crcWatch.Start();
// crc = GridChecksum::crc32(vec_v.cpu_ptr, size);
// std::fwrite(&crc, sizeof(uint32_t), 1, file);
// crcWatch.Stop();
// MSG << "Std I/O write: Data CRC32 " << std::hex << crc << std::dec << std::endl;
// ioWatch.Start();
// std::fwrite(vec_v.cpu_ptr, sizeof(typename Field::scalar_object), vec.Grid()->lSites(), file);
// ioWatch.Stop();
// std::fclose(file);
// size *= vec.Grid()->ProcessorCount();
// auto &p = BinaryIO::lastPerf;
// p.size = size;
// p.time = ioWatch.useconds();
// p.mbytesPerSecond = size/1024./1024./(ioWatch.useconds()/1.e6);
// MSG << "Std I/O write: Wrote " << p.size << " bytes in " << ioWatch.Elapsed()
// << ", " << p.mbytesPerSecond << " MB/s" << std::endl;
// MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl;
// }
//
// template <typename Field>
// void stdRead(Field &vec, const std::string filestem)
// {
// std::string rankStr = std::to_string(vec.Grid()->ThisRank());
// std::FILE *file = std::fopen((filestem + "." + rankStr + ".bin").c_str(), "rb");
// size_t size;
// uint32_t crcRead, crcData;
// GridStopWatch ioWatch, crcWatch;
// size = vec.Grid()->lSites()*sizeof(typename Field::scalar_object);
// crcWatch.Start();
// std::fread(&crcRead, sizeof(uint32_t), 1, file);
// crcWatch.Stop();
// {
// autoView(vec_v, vec, CpuWrite);
// ioWatch.Start();
// std::fread(vec_v.cpu_ptr, sizeof(typename Field::scalar_object), vec.Grid()->lSites(), file);
// ioWatch.Stop();
// std::fclose(file);
// }
// {
// autoView(vec_v, vec, CpuRead);
// crcWatch.Start();
// crcData = GridChecksum::crc32(vec_v.cpu_ptr, size);
// crcWatch.Stop();
// }
// MSG << "Std I/O read: Data CRC32 " << std::hex << crcData << std::dec << std::endl;
// assert(crcData == crcRead);
// size *= vec.Grid()->ProcessorCount();
// auto &p = BinaryIO::lastPerf;
// p.size = size;
// p.time = ioWatch.useconds();
// p.mbytesPerSecond = size/1024./1024./(ioWatch.useconds()/1.e6);
// MSG << "Std I/O read: Read " << p.size << " bytes in " << ioWatch.Elapsed()
// << ", " << p.mbytesPerSecond << " MB/s" << std::endl;
// MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl;
// }
template <typename Field>
void stdWrite(const std::string filestem, Field &vec)
{
std::string rankStr = std::to_string(vec.Grid()->ThisRank());
std::ofstream file(filestem + "." + rankStr + ".bin", std::ios::out | std::ios::binary);
size_t size, sizec;
uint32_t crc;
GridStopWatch ioWatch, crcWatch;
size = vec.Grid()->lSites()*sizeof(typename Field::scalar_object);
sizec = size/sizeof(char); // just in case of...
autoView(vec_v, vec, CpuRead);
crcWatch.Start();
crc = GridChecksum::crc32(vec_v.cpu_ptr, size);
file.write(reinterpret_cast<char *>(&crc), sizeof(uint32_t)/sizeof(char));
crcWatch.Stop();
MSG << "Std I/O write: Data CRC32 " << std::hex << crc << std::dec << std::endl;
ioWatch.Start();
file.write(reinterpret_cast<char *>(vec_v.cpu_ptr), sizec);
file.flush();
ioWatch.Stop();
size *= vec.Grid()->ProcessorCount();
auto &p = BinaryIO::lastPerf;
p.size = size;
p.time = ioWatch.useconds();
p.mbytesPerSecond = size/1024./1024./(ioWatch.useconds()/1.e6);
MSG << "Std I/O write: Wrote " << p.size << " bytes in " << ioWatch.Elapsed()
<< ", " << p.mbytesPerSecond << " MB/s" << std::endl;
MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl;
}
template <typename Field>
void stdRead(Field &vec, const std::string filestem)
{
std::string rankStr = std::to_string(vec.Grid()->ThisRank());
std::ifstream file(filestem + "." + rankStr + ".bin", std::ios::in | std::ios::binary);
size_t size, sizec;
uint32_t crcRead, crcData;
GridStopWatch ioWatch, crcWatch;
size = vec.Grid()->lSites()*sizeof(typename Field::scalar_object);
sizec = size/sizeof(char); // just in case of...
crcWatch.Start();
file.read(reinterpret_cast<char *>(&crcRead), sizeof(uint32_t)/sizeof(char));
crcWatch.Stop();
{
autoView(vec_v, vec, CpuWrite);
ioWatch.Start();
file.read(reinterpret_cast<char *>(vec_v.cpu_ptr), sizec);
ioWatch.Stop();
}
{
autoView(vec_v, vec, CpuRead);
crcWatch.Start();
crcData = GridChecksum::crc32(vec_v.cpu_ptr, size);
crcWatch.Stop();
}
MSG << "Std I/O read: Data CRC32 " << std::hex << crcData << std::dec << std::endl;
assert(crcData == crcRead);
size *= vec.Grid()->ProcessorCount();
auto &p = BinaryIO::lastPerf;
p.size = size;
p.time = ioWatch.useconds();
p.mbytesPerSecond = size/1024./1024./(ioWatch.useconds()/1.e6);
MSG << "Std I/O read: Read " << p.size << " bytes in " << ioWatch.Elapsed()
<< ", " << p.mbytesPerSecond << " MB/s" << std::endl;
MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl;
}
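
Both std:: stream helpers write one file per MPI rank and prepend a CRC32 of the local payload, which the reader then verifies. A round-trip sketch, assuming `grid` is a valid `GridBase*`:

``` cpp
// Per-rank std:: stream round trip; files are <stem>.<rank>.bin.
LatticeFermion vec(grid);
stdWrite<LatticeFermion>("iobench_l8", vec);  // writes CRC32 + raw payload
stdRead<LatticeFermion>(vec, "iobench_l8");   // asserts the CRC32 matches
```
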
template <typename Field> template <typename Field>
void limeWrite(const std::string filestem, Field &vec) void limeWrite(const std::string filestem, Field &vec)
{ {
emptyUserRecord record; emptyUserRecord record;
ScidacWriter binWriter(vec.Grid()->IsBoss()); ScidacWriter binWriter(vec.Grid()->IsBoss());
binWriter.open(filestem + ".lime.bin"); binWriter.open(filestem + ".bin");
binWriter.writeScidacFieldRecord(vec, record); binWriter.writeScidacFieldRecord(vec, record);
binWriter.close(); binWriter.close();
} }
@ -172,7 +31,7 @@ void limeRead(Field &vec, const std::string filestem)
emptyUserRecord record; emptyUserRecord record;
ScidacReader binReader; ScidacReader binReader;
binReader.open(filestem + ".lime.bin"); binReader.open(filestem + ".bin");
binReader.readScidacFieldRecord(vec, record); binReader.readScidacFieldRecord(vec, record);
binReader.close(); binReader.close();
} }
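
The suffix change to `.lime.bin` keeps the Lime output distinct from the `.bin` files produced by the std:: helpers above. Round trip, schematically:

``` cpp
// SciDAC/Lime round trip; the file on disk is <stem>.lime.bin.
limeWrite<LatticeFermion>("iobench_l8", vec);
limeRead<LatticeFermion>(vec, "iobench_l8");
```
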
@ -214,18 +73,12 @@ void writeBenchmark(const Coordinate &latt, const std::string filename,
auto simd = GridDefaultSimd(latt.size(), Field::vector_type::Nsimd()); auto simd = GridDefaultSimd(latt.size(), Field::vector_type::Nsimd());
std::shared_ptr<GridCartesian> gBasePt(SpaceTimeGrid::makeFourDimGrid(latt, simd, mpi)); std::shared_ptr<GridCartesian> gBasePt(SpaceTimeGrid::makeFourDimGrid(latt, simd, mpi));
std::shared_ptr<GridBase> gPt; std::shared_ptr<GridBase> gPt;
std::random_device rd;
makeGrid(gPt, gBasePt, Ls, rb); makeGrid(gPt, gBasePt, Ls, rb);
GridBase *g = gPt.get(); GridBase *g = gPt.get();
GridParallelRNG rng(g); GridParallelRNG rng(g);
Field vec(g); Field vec(g);
rng.SeedFixedIntegers({static_cast<int>(rd()), static_cast<int>(rd()),
static_cast<int>(rd()), static_cast<int>(rd()),
static_cast<int>(rd()), static_cast<int>(rd()),
static_cast<int>(rd()), static_cast<int>(rd())});
random(rng, vec); random(rng, vec);
write(filename, vec); write(filename, vec);
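
Seeding from `std::random_device` gives each benchmark pass statistically independent field content; when reproducible content is wanted instead, a fixed seed vector does the job (sketch):

``` cpp
// Deterministic alternative to the random_device seeding above.
rng.SeedFixedIntegers({1, 2, 3, 4, 5, 6, 7, 8});
```
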
@ -243,8 +96,8 @@ void readBenchmark(const Coordinate &latt, const std::string filename,
makeGrid(gPt, gBasePt, Ls, rb); makeGrid(gPt, gBasePt, Ls, rb);
GridBase *g = gPt.get(); GridBase *g = gPt.get();
Field vec(g); Field vec(g);
read(vec, filename); read(vec, filename);
} }

Some files were not shown because too many files have changed in this diff.