diff --git a/Grid/Grid_Eigen_Dense.h b/Grid/Grid_Eigen_Dense.h index 9556c03d..c62d9cdb 100644 --- a/Grid/Grid_Eigen_Dense.h +++ b/Grid/Grid_Eigen_Dense.h @@ -34,6 +34,12 @@ #define __SYCL__REDEFINE__ #endif +/* HIP save and restore compile environment*/ +#ifdef GRID_HIP +#pragma push +#pragma push_macro("__HIP_DEVICE_COMPILE__") +#endif +#define EIGEN_NO_HIP #include #include @@ -42,7 +48,7 @@ #ifdef __NVCC__REDEFINE__ #pragma pop_macro("__CUDACC__") #pragma pop_macro("__NVCC__") -#pragma pop_macro("GRID_SIMT") +#pragma pop_macro("__CUDA_ARCH__") #pragma pop #endif @@ -52,6 +58,12 @@ #pragma pop #endif +/*HIP restore*/ +#ifdef __HIP__REDEFINE__ +#pragma pop_macro("__HIP_DEVICE_COMPILE__") +#pragma pop +#endif + #if defined __GNUC__ #pragma GCC diagnostic pop #endif diff --git a/Grid/allocator/AlignedAllocator.h b/Grid/allocator/AlignedAllocator.h index ebb3162b..249732fb 100644 --- a/Grid/allocator/AlignedAllocator.h +++ b/Grid/allocator/AlignedAllocator.h @@ -65,8 +65,7 @@ public: MemoryManager::CpuFree((void *)__p,bytes); } - // FIXME: hack for the copy constructor, eventually it must be avoided - //void construct(pointer __p, const _Tp& __val) { new((void *)__p) _Tp(__val); }; + // FIXME: hack for the copy constructor: it must be avoided to avoid single thread loop void construct(pointer __p, const _Tp& __val) { assert(0);}; void construct(pointer __p) { }; void destroy(pointer __p) { }; @@ -74,6 +73,9 @@ public: template inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; } template inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; } +////////////////////////////////////////////////////////////////////////////////////// +// Unified virtual memory +////////////////////////////////////////////////////////////////////////////////////// template class uvmAllocator { public: @@ -109,22 +111,63 @@ public: MemoryManager::SharedFree((void *)__p,bytes); } - // FIXME: hack for the copy constructor, eventually it must be avoided void construct(pointer __p, const _Tp& __val) { new((void *)__p) _Tp(__val); }; - //void construct(pointer __p, const _Tp& __val) { }; void construct(pointer __p) { }; void destroy(pointer __p) { }; }; template inline bool operator==(const uvmAllocator<_Tp>&, const uvmAllocator<_Tp>&){ return true; } template inline bool operator!=(const uvmAllocator<_Tp>&, const uvmAllocator<_Tp>&){ return false; } +//////////////////////////////////////////////////////////////////////////////// +// Device memory +//////////////////////////////////////////////////////////////////////////////// +template +class devAllocator { +public: + typedef std::size_t size_type; + typedef std::ptrdiff_t difference_type; + typedef _Tp* pointer; + typedef const _Tp* const_pointer; + typedef _Tp& reference; + typedef const _Tp& const_reference; + typedef _Tp value_type; + + template struct rebind { typedef devAllocator<_Tp1> other; }; + devAllocator() throw() { } + devAllocator(const devAllocator&) throw() { } + template devAllocator(const devAllocator<_Tp1>&) throw() { } + ~devAllocator() throw() { } + pointer address(reference __x) const { return &__x; } + size_type max_size() const throw() { return size_t(-1) / sizeof(_Tp); } + + pointer allocate(size_type __n, const void* _p= 0) + { + size_type bytes = __n*sizeof(_Tp); + profilerAllocate(bytes); + _Tp *ptr = (_Tp*) MemoryManager::AcceleratorAllocate(bytes); + assert( ( (_Tp*)ptr != (_Tp *)NULL ) ); + return ptr; + } + + void deallocate(pointer __p, 
size_type __n) + { + size_type bytes = __n * sizeof(_Tp); + profilerFree(bytes); + MemoryManager::AcceleratorFree((void *)__p,bytes); + } + void construct(pointer __p, const _Tp& __val) { }; + void construct(pointer __p) { }; + void destroy(pointer __p) { }; +}; +template inline bool operator==(const devAllocator<_Tp>&, const devAllocator<_Tp>&){ return true; } +template inline bool operator!=(const devAllocator<_Tp>&, const devAllocator<_Tp>&){ return false; } + //////////////////////////////////////////////////////////////////////////////// // Template typedefs //////////////////////////////////////////////////////////////////////////////// -template using commAllocator = uvmAllocator; +//template using commAllocator = devAllocator; template using Vector = std::vector >; -template using commVector = std::vector >; -//template using Matrix = std::vector > >; +template using commVector = std::vector >; NAMESPACE_END(Grid); diff --git a/Grid/allocator/MemoryManager.cc b/Grid/allocator/MemoryManager.cc index c43bc33a..bfa06938 100644 --- a/Grid/allocator/MemoryManager.cc +++ b/Grid/allocator/MemoryManager.cc @@ -138,11 +138,20 @@ void MemoryManager::Init(void) Ncache[SharedSmall]=Nc; } } + +} + +void MemoryManager::InitMessage(void) { + +#ifndef GRID_UVM + std::cout << GridLogMessage << "MemoryManager Cache "<< MemoryManager::DeviceMaxBytes <<" bytes "< &list, - void *xmit, - int xmit_to_rank, - void *recv, - int recv_from_rank, - int bytes); - - void SendToRecvFromComplete(std::vector &waitall); - double StencilSendToRecvFrom(void *xmit, int xmit_to_rank, void *recv, diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc index 0e525674..83f71233 100644 --- a/Grid/communicator/Communicator_mpi3.cc +++ b/Grid/communicator/Communicator_mpi3.cc @@ -1,6 +1,6 @@ /************************************************************************************* - Grid physics library, www.github.com/paboyle/Grid + Grid physics library, www.github.com/paboyle/Grid Source file: ./lib/communicator/Communicator_mpi.cc @@ -35,7 +35,7 @@ Grid_MPI_Comm CartesianCommunicator::communicator_world; //////////////////////////////////////////// // First initialise of comms system //////////////////////////////////////////// -void CartesianCommunicator::Init(int *argc, char ***argv) +void CartesianCommunicator::Init(int *argc, char ***argv) { int flag; @@ -43,8 +43,16 @@ void CartesianCommunicator::Init(int *argc, char ***argv) MPI_Initialized(&flag); // needed to coexist with other libs apparently if ( !flag ) { - MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided); +#if defined (TOFU) // FUGAKU, credits go to Issaku Kanamori + nCommThreads=1; + // wrong results here too + // For now: comms-overlap leads to wrong results in Benchmark_wilson even on single node MPI runs + // other comms schemes are ok + MPI_Init_thread(argc,argv,MPI_THREAD_SERIALIZED,&provided); +#else + MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided); +#endif //If only 1 comms thread we require any threading mode other than SINGLE, but for multiple comms threads we need MULTIPLE if( (nCommThreads == 1) && (provided == MPI_THREAD_SINGLE) ) { assert(0); @@ -91,7 +99,7 @@ void CartesianCommunicator::ProcessorCoorFromRank(int rank, Coordinate &coor) //////////////////////////////////////////////////////////////////////////////////////////////////////// // Initialises from communicator_world //////////////////////////////////////////////////////////////////////////////////////////////////////// 
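// ---------------------------------------------------------------------------
// [Editor's sketch, not part of the patch] How the devAllocator/commVector
// change in Grid/allocator/AlignedAllocator.h above is meant to be used: a
// commVector now owns device-resident storage obtained from
// MemoryManager::AcceleratorAllocate, and its construct() is a no-op, so the
// host never touches the elements; they are filled on the accelerator and the
// raw pointer is handed to (GPU-aware) MPI. Names such as 'vobj', 'words' and
// 'sendbuf' are placeholders.
//
//   commVector<vobj> sendbuf(words);            // storage from devAllocator
//   vobj *d_ptr = &sendbuf[0];                  // device pointer, not host memory
//   assert(acceleratorIsCommunicable(d_ptr));   // same check made in the comms code below
//   // fill with accelerator_for(...) on the device, then pass d_ptr to MPI
// ---------------------------------------------------------------------------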
-CartesianCommunicator::CartesianCommunicator(const Coordinate &processors) +CartesianCommunicator::CartesianCommunicator(const Coordinate &processors) { MPI_Comm optimal_comm; //////////////////////////////////////////////////// @@ -110,7 +118,7 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors) ////////////////////////////////// // Try to subdivide communicator ////////////////////////////////// -CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const CartesianCommunicator &parent,int &srank) +CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const CartesianCommunicator &parent,int &srank) { _ndimension = processors.size(); assert(_ndimension>=1); int parent_ndimension = parent._ndimension; assert(_ndimension >= parent._ndimension); @@ -127,7 +135,7 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const ////////////////////////////////////////////////////////////////////////////////////////////////////// // split the communicator ////////////////////////////////////////////////////////////////////////////////////////////////////// - // int Nparent = parent._processors ; + // int Nparent = parent._processors ; int Nparent; MPI_Comm_size(parent.communicator,&Nparent); @@ -149,13 +157,13 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const } // rank within subcomm ; srank is rank of subcomm within blocks of subcomms - int crank; + int crank; // Mpi uses the reverse Lexico convention to us; so reversed routines called Lexicographic::IndexFromCoorReversed(ccoor,crank,processors); // processors is the split grid dimensions Lexicographic::IndexFromCoorReversed(scoor,srank,ssize); // ssize is the number of split grids MPI_Comm comm_split; - if ( Nchild > 1 ) { + if ( Nchild > 1 ) { //////////////////////////////////////////////////////////////// // Split the communicator @@ -180,11 +188,11 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const SetCommunicator(comm_split); /////////////////////////////////////////////// - // Free the temp communicator + // Free the temp communicator /////////////////////////////////////////////// MPI_Comm_free(&comm_split); - if(0){ + if(0){ std::cout << " ndim " <<_ndimension<<" " << parent._ndimension << std::endl; for(int d=0;d reqs(0); - // unsigned long xcrc = crc32(0L, Z_NULL, 0); - // unsigned long rcrc = crc32(0L, Z_NULL, 0); - // xcrc = crc32(xcrc,(unsigned char *)xmit,bytes); - SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes); - SendToRecvFromComplete(reqs); - // rcrc = crc32(rcrc,(unsigned char *)recv,bytes); - // printf("proc %d SendToRecvFrom %d bytes %lx %lx\n",_processor,bytes,xcrc,rcrc); -} -void CartesianCommunicator::SendRecvPacket(void *xmit, - void *recv, - int sender, - int receiver, - int bytes) -{ - MPI_Status stat; - assert(sender != receiver); - int tag = sender; - if ( _processor == sender ) { - MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator); - } - if ( _processor == receiver ) { - MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat); - } -} -// Basic Halo comms primitive -void CartesianCommunicator::SendToRecvFromBegin(std::vector &list, - void *xmit, - int dest, - void *recv, - int from, - int bytes) -{ + unsigned long xcrc = crc32(0L, Z_NULL, 0); + unsigned long rcrc = crc32(0L, Z_NULL, 0); + int myrank = _processor; int ierr; - if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) { - MPI_Request xrq; - MPI_Request rrq; + // Enforce no UVM in comms, 
device or host OK + assert(acceleratorIsCommunicable(xmit)); + assert(acceleratorIsCommunicable(recv)); - ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq); - ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq); - - assert(ierr==0); - list.push_back(xrq); - list.push_back(rrq); - } else { - // Give the CPU to MPI immediately; can use threads to overlap optionally - ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank, - recv,bytes,MPI_CHAR,from, from, - communicator,MPI_STATUS_IGNORE); - assert(ierr==0); - } + // Give the CPU to MPI immediately; can use threads to overlap optionally + // printf("proc %d SendToRecvFrom %d bytes Sendrecv \n",_processor,bytes); + ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank, + recv,bytes,MPI_CHAR,from, from, + communicator,MPI_STATUS_IGNORE); + assert(ierr==0); + + // xcrc = crc32(xcrc,(unsigned char *)xmit,bytes); + // rcrc = crc32(rcrc,(unsigned char *)recv,bytes); + // printf("proc %d SendToRecvFrom %d bytes xcrc %lx rcrc %lx\n",_processor,bytes,xcrc,rcrc); fflush } - +// Basic Halo comms primitive double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, int dest, void *recv, @@ -367,7 +343,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vectorStencilSendToRecvFromComplete(list,dir); } return off_node_bytes; } -void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector &waitall,int dir) -{ - SendToRecvFromComplete(waitall); -} -void CartesianCommunicator::StencilBarrier(void) -{ - MPI_Barrier (ShmComm); -} -void CartesianCommunicator::SendToRecvFromComplete(std::vector &list) +void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector &list,int dir) { int nreq=list.size(); @@ -422,6 +390,13 @@ void CartesianCommunicator::SendToRecvFromComplete(std::vector & assert(ierr==0); list.resize(0); } +void CartesianCommunicator::StencilBarrier(void) +{ + MPI_Barrier (ShmComm); +} +//void CartesianCommunicator::SendToRecvFromComplete(std::vector &list) +//{ +//} void CartesianCommunicator::Barrier(void) { int ierr = MPI_Barrier(communicator); @@ -436,8 +411,8 @@ void CartesianCommunicator::Broadcast(int root,void* data, int bytes) communicator); assert(ierr==0); } -int CartesianCommunicator::RankWorld(void){ - int r; +int CartesianCommunicator::RankWorld(void){ + int r; MPI_Comm_rank(communicator_world,&r); return r; } @@ -470,7 +445,7 @@ void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t // When 24*4 bytes multiples get 50x 10^9 >>> 2x10^9 Y2K bug. 
// (Turns up on 32^3 x 64 Gparity too) MPI_Datatype object; - int iwords; + int iwords; int ibytes; iwords = words; ibytes = bytes; @@ -483,5 +458,3 @@ void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t } NAMESPACE_END(Grid); - - diff --git a/Grid/communicator/Communicator_none.cc b/Grid/communicator/Communicator_none.cc index 81900371..6cb431a2 100644 --- a/Grid/communicator/Communicator_none.cc +++ b/Grid/communicator/Communicator_none.cc @@ -77,15 +77,6 @@ void CartesianCommunicator::GlobalSumVector(uint64_t *,int N){} void CartesianCommunicator::GlobalXOR(uint32_t &){} void CartesianCommunicator::GlobalXOR(uint64_t &){} -void CartesianCommunicator::SendRecvPacket(void *xmit, - void *recv, - int xmit_to_rank, - int recv_from_rank, - int bytes) -{ - assert(0); -} - // Basic Halo comms primitive -- should never call in single node void CartesianCommunicator::SendToRecvFrom(void *xmit, @@ -96,20 +87,6 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit, { assert(0); } -void CartesianCommunicator::SendToRecvFromBegin(std::vector &list, - void *xmit, - int dest, - void *recv, - int from, - int bytes) -{ - assert(0); -} - -void CartesianCommunicator::SendToRecvFromComplete(std::vector &list) -{ - assert(0); -} void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes) { bcopy(in,out,bytes*words); @@ -137,10 +114,6 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, int recv_from_rank, int bytes, int dir) { - std::vector list; - // Discard the "dir" - SendToRecvFromBegin (list,xmit,xmit_to_rank,recv,recv_from_rank,bytes); - SendToRecvFromComplete(list); return 2.0*bytes; } double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector &list, @@ -150,13 +123,10 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector &waitall,int dir) { - SendToRecvFromComplete(waitall); } void CartesianCommunicator::StencilBarrier(void){}; diff --git a/Grid/communicator/SharedMemoryMPI.cc b/Grid/communicator/SharedMemoryMPI.cc index 45fefc71..0cbde9eb 100644 --- a/Grid/communicator/SharedMemoryMPI.cc +++ b/Grid/communicator/SharedMemoryMPI.cc @@ -32,6 +32,9 @@ Author: Peter Boyle #ifdef GRID_CUDA #include #endif +#ifdef GRID_HIP +#include +#endif NAMESPACE_BEGIN(Grid); #define header "SharedMemoryMpi: " @@ -47,7 +50,12 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm) ///////////////////////////////////////////////////////////////////// // Split into groups that can share memory ///////////////////////////////////////////////////////////////////// +#ifndef GRID_MPI3_SHM_NONE MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&WorldShmComm); +#else + MPI_Comm_split(comm, WorldRank, 0, &WorldShmComm); +#endif + MPI_Comm_rank(WorldShmComm ,&WorldShmRank); MPI_Comm_size(WorldShmComm ,&WorldShmSize); @@ -420,7 +428,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) //////////////////////////////////////////////////////////////////////////////////////////// // Hugetlbfs mapping intended //////////////////////////////////////////////////////////////////////////////////////////// -#ifdef GRID_CUDA +#if defined(GRID_CUDA) ||defined(GRID_HIP) void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) { void * ShmCommBuf ; @@ -443,17 +451,15 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) /////////////////////////////////////////////////////////////////////////////////////////////////////////// // Each MPI rank should allocate our 
own buffer /////////////////////////////////////////////////////////////////////////////////////////////////////////// - auto err = cudaMalloc(&ShmCommBuf, bytes); - if ( err != cudaSuccess) { - std::cerr << " SharedMemoryMPI.cc cudaMallocManaged failed for " << bytes<<" bytes " < NAMESPACE_BEGIN(Grid); -template -auto Cshift(const LatticeUnaryExpression &expr,int dim,int shift) - -> Lattice -{ - return Cshift(closure(expr),dim,shift); -} -template -auto Cshift(const LatticeBinaryExpression &expr,int dim,int shift) - -> Lattice -{ - return Cshift(closure(expr),dim,shift); -} -template -auto Cshift(const LatticeTrinaryExpression &expr,int dim,int shift) - -> Lattice +template::value,void>::type * = nullptr> +auto Cshift(const Expression &expr,int dim,int shift) -> decltype(closure(expr)) { return Cshift(closure(expr),dim,shift); } diff --git a/Grid/cshift/Cshift_common.h b/Grid/cshift/Cshift_common.h index 4de2bbe2..b0dd068d 100644 --- a/Grid/cshift/Cshift_common.h +++ b/Grid/cshift/Cshift_common.h @@ -76,8 +76,8 @@ Gather_plane_simple (const Lattice &rhs,commVector &buffer,int dimen autoView(rhs_v , rhs, AcceleratorRead); auto buffer_p = & buffer[0]; auto table = &Cshift_table[0]; - accelerator_for(i,ent,1,{ - buffer_p[table[i].first]=rhs_v[table[i].second]; + accelerator_for(i,ent,vobj::Nsimd(),{ + coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second])); }); } } @@ -185,8 +185,8 @@ template void Scatter_plane_simple (Lattice &rhs,commVector void Scatter_plane_merge(Lattice &rhs,ExtractPointerA if(cbmask ==0x3 ) { autoView( rhs_v , rhs, AcceleratorWrite); + int _slice_stride = rhs.Grid()->_slice_stride[dimension]; + int _slice_block = rhs.Grid()->_slice_block[dimension]; accelerator_for2d(n,e1,b,e2,1,{ - int o = n*rhs.Grid()->_slice_stride[dimension]; - int offset = b+n*rhs.Grid()->_slice_block[dimension]; + int o = n*_slice_stride; + int offset = b+n*_slice_block; merge(rhs_v[so+o+b],pointers,offset); }); } else { @@ -220,6 +222,7 @@ template void Scatter_plane_merge(Lattice &rhs,ExtractPointerA // Test_cshift_red_black code. 
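// ---------------------------------------------------------------------------
// [Editor's note, not part of the patch] The hunks above follow the usual
// offload idiom: values reached through a host pointer (rhs.Grid()) are copied
// into locals before the accelerator_for / accelerator_for2d, so the device
// lambda captures plain ints by value instead of dereferencing host memory:
//
//   int _slice_stride = rhs.Grid()->_slice_stride[dimension]; // host lookup, done once
//   int _slice_block  = rhs.Grid()->_slice_block[dimension];
//   accelerator_for2d(n,e1,b,e2,1,{
//     int o      = n*_slice_stride;      // device code only sees the captured ints
//     int offset = b+n*_slice_block;
//     merge(rhs_v[so+o+b],pointers,offset);
//   });
// ---------------------------------------------------------------------------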
// std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME std::cout<<" Unthreaded warning -- buffer is not densely packed ??"< void Copy_plane(Lattice& lhs,const Lattice &rhs autoView(rhs_v , rhs, AcceleratorRead); autoView(lhs_v , lhs, AcceleratorWrite); auto table = &Cshift_table[0]; - accelerator_for(i,ent,1,{ - lhs_v[table[i].first]=rhs_v[table[i].second]; + accelerator_for(i,ent,vobj::Nsimd(),{ + coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second])); }); } } diff --git a/Grid/lattice/Lattice.h b/Grid/lattice/Lattice.h index a3017198..28ea0294 100644 --- a/Grid/lattice/Lattice.h +++ b/Grid/lattice/Lattice.h @@ -37,6 +37,7 @@ Author: Peter Boyle #include #include //#include +#include #include #include #include diff --git a/Grid/lattice/Lattice_ET.h b/Grid/lattice/Lattice_ET.h index 91b456d9..c43844f8 100644 --- a/Grid/lattice/Lattice_ET.h +++ b/Grid/lattice/Lattice_ET.h @@ -42,9 +42,24 @@ NAMESPACE_BEGIN(Grid); //////////////////////////////////////////////////// // Predicated where support //////////////////////////////////////////////////// +#ifdef GRID_SIMT +// drop to scalar in SIMT; cleaner in fact template -accelerator_inline vobj predicatedWhere(const iobj &predicate, const vobj &iftrue, - const robj &iffalse) { +accelerator_inline vobj predicatedWhere(const iobj &predicate, + const vobj &iftrue, + const robj &iffalse) +{ + Integer mask = TensorRemove(predicate); + typename std::remove_const::type ret= iffalse; + if (mask) ret=iftrue; + return ret; +} +#else +template +accelerator_inline vobj predicatedWhere(const iobj &predicate, + const vobj &iftrue, + const robj &iffalse) +{ typename std::remove_const::type ret; typedef typename vobj::scalar_object scalar_object; @@ -68,6 +83,7 @@ accelerator_inline vobj predicatedWhere(const iobj &predicate, const vobj &iftru merge(ret, falsevals); return ret; } +#endif ///////////////////////////////////////////////////// //Specialization of getVectorType for lattices @@ -81,32 +97,62 @@ struct getVectorType >{ //-- recursive evaluation of expressions; -- // handle leaves of syntax tree /////////////////////////////////////////////////// -template accelerator_inline +template::value&&!is_lattice_expr::value,sobj>::type * = nullptr> +accelerator_inline sobj eval(const uint64_t ss, const sobj &arg) { return arg; } - template accelerator_inline -const lobj & eval(const uint64_t ss, const LatticeView &arg) +auto eval(const uint64_t ss, const LatticeView &arg) -> decltype(arg(ss)) +{ + return arg(ss); +} + +//////////////////////////////////////////// +//-- recursive evaluation of expressions; -- +// whole vector return, used only for expression return type inference +/////////////////////////////////////////////////// +template accelerator_inline +sobj vecEval(const uint64_t ss, const sobj &arg) +{ + return arg; +} +template accelerator_inline +const lobj & vecEval(const uint64_t ss, const LatticeView &arg) { return arg[ss]; } -// What needs this? -// Cannot be legal on accelerator -// Comparison must convert -#if 1 -template accelerator_inline -const lobj & eval(const uint64_t ss, const Lattice &arg) -{ - auto view = arg.View(AcceleratorRead); - return view[ss]; -} -#endif - /////////////////////////////////////////////////// // handle nodes in syntax tree- eval one operand +// vecEval needed (but never called as all expressions offloaded) to infer the return type +// in SIMT contexts of closure. 
+/////////////////////////////////////////////////// +template accelerator_inline +auto vecEval(const uint64_t ss, const LatticeUnaryExpression &expr) + -> decltype(expr.op.func( vecEval(ss, expr.arg1))) +{ + return expr.op.func( vecEval(ss, expr.arg1) ); +} +// vecEval two operands +template accelerator_inline +auto vecEval(const uint64_t ss, const LatticeBinaryExpression &expr) + -> decltype(expr.op.func( vecEval(ss,expr.arg1),vecEval(ss,expr.arg2))) +{ + return expr.op.func( vecEval(ss,expr.arg1), vecEval(ss,expr.arg2) ); +} +// vecEval three operands +template accelerator_inline +auto vecEval(const uint64_t ss, const LatticeTrinaryExpression &expr) + -> decltype(expr.op.func(vecEval(ss, expr.arg1), vecEval(ss, expr.arg2), vecEval(ss, expr.arg3))) +{ + return expr.op.func(vecEval(ss, expr.arg1), vecEval(ss, expr.arg2), vecEval(ss, expr.arg3)); +} + +/////////////////////////////////////////////////// +// handle nodes in syntax tree- eval one operand coalesced /////////////////////////////////////////////////// template accelerator_inline auto eval(const uint64_t ss, const LatticeUnaryExpression &expr) @@ -114,23 +160,41 @@ auto eval(const uint64_t ss, const LatticeUnaryExpression &expr) { return expr.op.func( eval(ss, expr.arg1) ); } -/////////////////////// // eval two operands -/////////////////////// template accelerator_inline auto eval(const uint64_t ss, const LatticeBinaryExpression &expr) -> decltype(expr.op.func( eval(ss,expr.arg1),eval(ss,expr.arg2))) { return expr.op.func( eval(ss,expr.arg1), eval(ss,expr.arg2) ); } -/////////////////////// // eval three operands -/////////////////////// template accelerator_inline auto eval(const uint64_t ss, const LatticeTrinaryExpression &expr) - -> decltype(expr.op.func(eval(ss, expr.arg1), eval(ss, expr.arg2), eval(ss, expr.arg3))) + -> decltype(expr.op.func(eval(ss, expr.arg1), + eval(ss, expr.arg2), + eval(ss, expr.arg3))) { - return expr.op.func(eval(ss, expr.arg1), eval(ss, expr.arg2), eval(ss, expr.arg3)); +#ifdef GRID_SIMT + // Handles Nsimd (vInteger) != Nsimd(ComplexD) + typedef decltype(vecEval(ss, expr.arg2)) rvobj; + typedef typename std::remove_reference::type vobj; + + const int Nsimd = vobj::vector_type::Nsimd(); + + auto vpred = vecEval(ss,expr.arg1); + + ExtractBuffer mask(Nsimd); + extract(TensorRemove(vpred), mask); + + int s = acceleratorSIMTlane(Nsimd); + return expr.op.func(mask[s], + eval(ss, expr.arg2), + eval(ss, expr.arg3)); +#else + return expr.op.func(eval(ss, expr.arg1), + eval(ss, expr.arg2), + eval(ss, expr.arg3)); +#endif } ////////////////////////////////////////////////////////////////////////// @@ -228,7 +292,7 @@ template inline void ExpressionViewOpen(LatticeBinaryExpression &expr) { ExpressionViewOpen(expr.arg1); // recurse AST - ExpressionViewOpen(expr.arg2); // recurse AST + ExpressionViewOpen(expr.arg2); // rrecurse AST } template inline void ExpressionViewOpen(LatticeTrinaryExpression &expr) @@ -272,9 +336,8 @@ inline void ExpressionViewClose(LatticeTrinaryExpression &expr) // Unary operators and funcs //////////////////////////////////////////// #define GridUnopClass(name, ret) \ - template \ struct name { \ - static auto accelerator_inline func(const arg a) -> decltype(ret) { return ret; } \ + template static auto accelerator_inline func(const _arg a) -> decltype(ret) { return ret; } \ }; GridUnopClass(UnarySub, -a); @@ -285,8 +348,6 @@ GridUnopClass(UnaryTrace, trace(a)); GridUnopClass(UnaryTranspose, transpose(a)); GridUnopClass(UnaryTa, Ta(a)); GridUnopClass(UnaryProjectOnGroup, 
ProjectOnGroup(a)); -GridUnopClass(UnaryReal, real(a)); -GridUnopClass(UnaryImag, imag(a)); GridUnopClass(UnaryToReal, toReal(a)); GridUnopClass(UnaryToComplex, toComplex(a)); GridUnopClass(UnaryTimesI, timesI(a)); @@ -305,10 +366,10 @@ GridUnopClass(UnaryExp, exp(a)); // Binary operators //////////////////////////////////////////// #define GridBinOpClass(name, combination) \ - template \ struct name { \ + template \ static auto accelerator_inline \ - func(const left &lhs, const right &rhs) \ + func(const _left &lhs, const _right &rhs) \ -> decltype(combination) const \ { \ return combination; \ @@ -328,10 +389,10 @@ GridBinOpClass(BinaryOrOr, lhs || rhs); // Trinary conditional op //////////////////////////////////////////////////// #define GridTrinOpClass(name, combination) \ - template \ struct name { \ + template \ static auto accelerator_inline \ - func(const predicate &pred, const left &lhs, const right &rhs) \ + func(const _predicate &pred, const _left &lhs, const _right &rhs) \ -> decltype(combination) const \ { \ return combination; \ @@ -339,17 +400,17 @@ GridBinOpClass(BinaryOrOr, lhs || rhs); }; GridTrinOpClass(TrinaryWhere, - (predicatedWhere::type, - typename std::remove_reference::type>(pred, lhs,rhs))); + (predicatedWhere< + typename std::remove_reference<_predicate>::type, + typename std::remove_reference<_left>::type, + typename std::remove_reference<_right>::type>(pred, lhs,rhs))); //////////////////////////////////////////// // Operator syntactical glue //////////////////////////////////////////// - -#define GRID_UNOP(name) name -#define GRID_BINOP(name) name -#define GRID_TRINOP(name) name +#define GRID_UNOP(name) name +#define GRID_BINOP(name) name +#define GRID_TRINOP(name) name #define GRID_DEF_UNOP(op, name) \ template ::value||is_lattice_expr::value,T1>::type * = nullptr> \ @@ -401,8 +462,6 @@ GRID_DEF_UNOP(trace, UnaryTrace); GRID_DEF_UNOP(transpose, UnaryTranspose); GRID_DEF_UNOP(Ta, UnaryTa); GRID_DEF_UNOP(ProjectOnGroup, UnaryProjectOnGroup); -GRID_DEF_UNOP(real, UnaryReal); -GRID_DEF_UNOP(imag, UnaryImag); GRID_DEF_UNOP(toReal, UnaryToReal); GRID_DEF_UNOP(toComplex, UnaryToComplex); GRID_DEF_UNOP(timesI, UnaryTimesI); @@ -435,29 +494,36 @@ GRID_DEF_TRINOP(where, TrinaryWhere); ///////////////////////////////////////////////////////////// template auto closure(const LatticeUnaryExpression &expr) - -> Lattice + -> Lattice { - Lattice ret(expr); + Lattice ret(expr); return ret; } template auto closure(const LatticeBinaryExpression &expr) - -> Lattice + -> Lattice { - Lattice ret(expr); + Lattice ret(expr); return ret; } template auto closure(const LatticeTrinaryExpression &expr) - -> Lattice + -> Lattice { - Lattice ret(expr); + Lattice ret(expr); return ret; } +#define EXPRESSION_CLOSURE(function) \ + template::value,void>::type * = nullptr> \ + auto function(Expression &expr) -> decltype(function(closure(expr))) \ + { \ + return function(closure(expr)); \ + } + #undef GRID_UNOP #undef GRID_BINOP diff --git a/Grid/lattice/Lattice_arith.h b/Grid/lattice/Lattice_arith.h index a3ae1f28..3c269c58 100644 --- a/Grid/lattice/Lattice_arith.h +++ b/Grid/lattice/Lattice_arith.h @@ -60,9 +60,9 @@ void mac(Lattice &ret,const Lattice &lhs,const Lattice &rhs){ autoView( lhs_v , lhs, AcceleratorRead); autoView( rhs_v , rhs, AcceleratorRead); accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ - decltype(coalescedRead(obj1())) tmp; auto lhs_t=lhs_v(ss); auto rhs_t=rhs_v(ss); + auto tmp =ret_v(ss); mac(&tmp,&lhs_t,&rhs_t); coalescedWrite(ret_v[ss],tmp); }); @@ -124,7 +124,7 @@ void 
mac(Lattice &ret,const Lattice &lhs,const obj3 &rhs){ autoView( ret_v , ret, AcceleratorWrite); autoView( lhs_v , lhs, AcceleratorRead); accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ - decltype(coalescedRead(obj1())) tmp; + auto tmp =ret_v(ss); auto lhs_t=lhs_v(ss); mac(&tmp,&lhs_t,&rhs); coalescedWrite(ret_v[ss],tmp); @@ -182,7 +182,7 @@ void mac(Lattice &ret,const obj2 &lhs,const Lattice &rhs){ autoView( ret_v , ret, AcceleratorWrite); autoView( rhs_v , lhs, AcceleratorRead); accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{ - decltype(coalescedRead(obj1())) tmp; + auto tmp =ret_v(ss); auto rhs_t=rhs_v(ss); mac(&tmp,&lhs,&rhs_t); coalescedWrite(ret_v[ss],tmp); diff --git a/Grid/lattice/Lattice_base.h b/Grid/lattice/Lattice_base.h index 73b1b6a1..3ad9f913 100644 --- a/Grid/lattice/Lattice_base.h +++ b/Grid/lattice/Lattice_base.h @@ -123,9 +123,9 @@ public: auto exprCopy = expr; ExpressionViewOpen(exprCopy); auto me = View(AcceleratorWriteDiscard); - accelerator_for(ss,me.size(),1,{ + accelerator_for(ss,me.size(),vobj::Nsimd(),{ auto tmp = eval(ss,exprCopy); - vstream(me[ss],tmp); + coalescedWrite(me[ss],tmp); }); me.ViewClose(); ExpressionViewClose(exprCopy); @@ -146,9 +146,9 @@ public: auto exprCopy = expr; ExpressionViewOpen(exprCopy); auto me = View(AcceleratorWriteDiscard); - accelerator_for(ss,me.size(),1,{ + accelerator_for(ss,me.size(),vobj::Nsimd(),{ auto tmp = eval(ss,exprCopy); - vstream(me[ss],tmp); + coalescedWrite(me[ss],tmp); }); me.ViewClose(); ExpressionViewClose(exprCopy); @@ -168,9 +168,9 @@ public: auto exprCopy = expr; ExpressionViewOpen(exprCopy); auto me = View(AcceleratorWriteDiscard); - accelerator_for(ss,me.size(),1,{ + accelerator_for(ss,me.size(),vobj::Nsimd(),{ auto tmp = eval(ss,exprCopy); - vstream(me[ss],tmp); + coalescedWrite(me[ss],tmp); }); me.ViewClose(); ExpressionViewClose(exprCopy); diff --git a/Grid/lattice/Lattice_basis.h b/Grid/lattice/Lattice_basis.h index 9f1155eb..af9d7280 100644 --- a/Grid/lattice/Lattice_basis.h +++ b/Grid/lattice/Lattice_basis.h @@ -54,13 +54,34 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm) typedef decltype(basis[0].View(AcceleratorRead)) View; Vector basis_v; basis_v.reserve(basis.size()); + typedef typename std::remove_reference::type vobj; + typedef typename std::remove_reference::type Coeff_t; GridBase* grid = basis[0].Grid(); for(int k=0;k Bt(Nm * max_threads); + thread_region + { + vobj* B = &Bt[Nm * thread_num()]; + thread_for_in_region(ss, grid->oSites(),{ + for(int j=j0; joSites(); uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead - typedef typename std::remove_reference::type vobj; - Vector Bt(siteBlock * nrot); auto Bp=&Bt[0]; // GPU readable copy of matrix - Vector Qt_jv(Nm*Nm); - double *Qt_p = & Qt_jv[0]; + Vector Qt_jv(Nm*Nm); + Coeff_t *Qt_p = & Qt_jv[0]; thread_for(i,Nm*Nm,{ int j = i/Nm; int k = i%Nm; @@ -118,6 +137,7 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm) coalescedWrite(basis_v[jj][sss],coalescedRead(Bp[ss*nrot+j])); }); } +#endif for(int k=0;k vPredicate ; -/* -template accelerator_inline -vobj predicatedWhere(const iobj &predicate, const vobj &iftrue, const robj &iffalse) -{ - typename std::remove_const::type ret; - - typedef typename vobj::scalar_object scalar_object; - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - - const int Nsimd = vobj::vector_type::Nsimd(); - - ExtractBuffer mask(Nsimd); - ExtractBuffer truevals(Nsimd); - 
ExtractBuffer falsevals(Nsimd); - - extract(iftrue, truevals); - extract(iffalse, falsevals); - extract(TensorRemove(predicate), mask); - - for (int s = 0; s < Nsimd; s++) { - if (mask[s]) falsevals[s] = truevals[s]; - } - - merge(ret, falsevals); - return ret; -} -*/ ////////////////////////////////////////////////////////////////////////// // compare lattice to lattice ////////////////////////////////////////////////////////////////////////// diff --git a/Grid/lattice/Lattice_peekpoke.h b/Grid/lattice/Lattice_peekpoke.h index c79becf2..5caab214 100644 --- a/Grid/lattice/Lattice_peekpoke.h +++ b/Grid/lattice/Lattice_peekpoke.h @@ -182,6 +182,14 @@ inline void peekLocalSite(sobj &s,const LatticeView &l,Coordinate &site) return; }; +template +inline void peekLocalSite(sobj &s,const Lattice &l,Coordinate &site) +{ + autoView(lv,l,CpuRead); + peekLocalSite(s,lv,site); + return; +}; + // Must be CPU write view template inline void pokeLocalSite(const sobj &s,LatticeView &l,Coordinate &site) @@ -210,6 +218,14 @@ inline void pokeLocalSite(const sobj &s,LatticeView &l,Coordinate &site) return; }; +template +inline void pokeLocalSite(const sobj &s, Lattice &l,Coordinate &site) +{ + autoView(lv,l,CpuWrite); + pokeLocalSite(s,lv,site); + return; +}; + NAMESPACE_END(Grid); #endif diff --git a/Grid/lattice/Lattice_real_imag.h b/Grid/lattice/Lattice_real_imag.h new file mode 100644 index 00000000..003300cc --- /dev/null +++ b/Grid/lattice/Lattice_real_imag.h @@ -0,0 +1,79 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/lattice/Lattice_reality.h + + Copyright (C) 2015 + +Author: Azusa Yamaguchi +Author: Peter Boyle +Author: neo + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#ifndef GRID_LATTICE_REAL_IMAG_H +#define GRID_LATTICE_REAL_IMAG_H + + +// FIXME .. 
this is the sector of the code +// I am most worried about the directions +// The choice of burying complex in the SIMD +// is making the use of "real" and "imag" very cumbersome + +NAMESPACE_BEGIN(Grid); + +template inline Lattice real(const Lattice &lhs){ + Lattice ret(lhs.Grid()); + + autoView( lhs_v, lhs, AcceleratorRead); + autoView( ret_v, ret, AcceleratorWrite); + + ret.Checkerboard()=lhs.Checkerboard(); + accelerator_for( ss, lhs_v.size(), 1, { + ret_v[ss] =real(lhs_v[ss]); + }); + return ret; +}; +template inline Lattice imag(const Lattice &lhs){ + Lattice ret(lhs.Grid()); + + autoView( lhs_v, lhs, AcceleratorRead); + autoView( ret_v, ret, AcceleratorWrite); + + ret.Checkerboard()=lhs.Checkerboard(); + accelerator_for( ss, lhs_v.size(), 1, { + ret_v[ss] =imag(lhs_v[ss]); + }); + return ret; +}; + +template::value,void>::type * = nullptr> + auto real(const Expression &expr) -> decltype(real(closure(expr))) +{ + return real(closure(expr)); +} +template::value,void>::type * = nullptr> + auto imag(const Expression &expr) -> decltype(imag(closure(expr))) +{ + return imag(closure(expr)); +} + +NAMESPACE_END(Grid); + +#endif diff --git a/Grid/lattice/Lattice_reduction_gpu.h b/Grid/lattice/Lattice_reduction_gpu.h index 5f490507..d8a47ae1 100644 --- a/Grid/lattice/Lattice_reduction_gpu.h +++ b/Grid/lattice/Lattice_reduction_gpu.h @@ -2,12 +2,13 @@ NAMESPACE_BEGIN(Grid); #ifdef GRID_HIP extern hipDeviceProp_t *gpu_props; +#define WARP_SIZE 64 #endif #ifdef GRID_CUDA extern cudaDeviceProp *gpu_props; +#define WARP_SIZE 32 #endif -#define WARP_SIZE 32 __device__ unsigned int retirementCount = 0; template @@ -64,7 +65,7 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid // cannot use overloaded operators for sobj as they are not volatile-qualified memcpy((void *)&sdata[tid], (void *)&mySum, sizeof(sobj)); - __syncwarp(); + acceleratorSynchronise(); const Iterator VEC = WARP_SIZE; const Iterator vid = tid & (VEC-1); @@ -78,9 +79,9 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid beta += temp; memcpy((void *)&sdata[tid], (void *)&beta, sizeof(sobj)); } - __syncwarp(); + acceleratorSynchronise(); } - __syncthreads(); + acceleratorSynchroniseAll(); if (threadIdx.x == 0) { beta = Zero(); @@ -90,7 +91,7 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid } memcpy((void *)&sdata[0], (void *)&beta, sizeof(sobj)); } - __syncthreads(); + acceleratorSynchroniseAll(); } diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h index beceecc9..e698e40e 100644 --- a/Grid/lattice/Lattice_transfer.h +++ b/Grid/lattice/Lattice_transfer.h @@ -240,6 +240,8 @@ template autoView( fineX_ , fineX, AcceleratorRead); autoView( fineY_ , fineY, AcceleratorRead); autoView( coarseA_, coarseA, AcceleratorRead); + Coordinate fine_rdimensions = fine->_rdimensions; + Coordinate coarse_rdimensions = coarse->_rdimensions; accelerator_for(sf, fine->oSites(), CComplex::Nsimd(), { @@ -247,9 +249,9 @@ template Coordinate coor_c(_ndimension); Coordinate coor_f(_ndimension); - Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions); + Lexicographic::CoorFromIndex(coor_f,sf,fine_rdimensions); for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d]; - Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions); + Lexicographic::IndexFromCoor(coor_c,sc,coarse_rdimensions); // z = A x + y #ifdef GRID_SIMT @@ -353,11 +355,14 @@ inline void blockSum(Lattice &coarseData,const Lattice &fineData) autoView( 
coarseData_ , coarseData, AcceleratorWrite); autoView( fineData_ , fineData, AcceleratorRead); + Coordinate fine_rdimensions = fine->_rdimensions; + Coordinate coarse_rdimensions = coarse->_rdimensions; + accelerator_for(sc,coarse->oSites(),1,{ // One thread per sub block Coordinate coor_c(_ndimension); - Lexicographic::CoorFromIndex(coor_c,sc,coarse->_rdimensions); // Block coordinate + Lexicographic::CoorFromIndex(coor_c,sc,coarse_rdimensions); // Block coordinate coarseData_[sc]=Zero(); for(int sb=0;sb &coarseData,const Lattice &fineData) Coordinate coor_f(_ndimension); Lexicographic::CoorFromIndex(coor_b,sb,block_r); // Block sub coordinate for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d] + coor_b[d]; - Lexicographic::IndexFromCoor(coor_f,sf,fine->_rdimensions); + Lexicographic::IndexFromCoor(coor_f,sf,fine_rdimensions); coarseData_[sc]=coarseData_[sc]+fineData_[sf]; } diff --git a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h index 625eda63..ca660610 100644 --- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h +++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h @@ -208,7 +208,7 @@ public: LebesgueOrder LebesgueEvenOdd; // Comms buffer - std::vector > comm_buf; + // std::vector > comm_buf; /////////////////////////////////////////////////////////////// // Conserved current utilities diff --git a/Grid/qcd/action/fermion/StaggeredKernels.h b/Grid/qcd/action/fermion/StaggeredKernels.h index 30deee06..d67105bb 100644 --- a/Grid/qcd/action/fermion/StaggeredKernels.h +++ b/Grid/qcd/action/fermion/StaggeredKernels.h @@ -63,17 +63,20 @@ template class StaggeredKernels : public FermionOperator , pub /////////////////////////////////////////////////////////////////////////////////////// // Generic Nc kernels /////////////////////////////////////////////////////////////////////////////////////// - template accelerator_inline + template + static accelerator_inline void DhopSiteGeneric(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf, int LLs, int sU, const FermionFieldView &in, FermionFieldView &out,int dag); - template accelerator_inline + + template static accelerator_inline void DhopSiteGenericInt(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf, int LLs, int sU, const FermionFieldView &in, FermionFieldView &out,int dag); - template accelerator_inline + + template static accelerator_inline void DhopSiteGenericExt(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf, int LLs, int sU, @@ -82,17 +85,20 @@ template class StaggeredKernels : public FermionOperator , pub /////////////////////////////////////////////////////////////////////////////////////// // Nc=3 specific kernels /////////////////////////////////////////////////////////////////////////////////////// - template accelerator_inline + + template static accelerator_inline void DhopSiteHand(StencilView &st, DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, SiteSpinor * buf, int LLs, int sU, const FermionFieldView &in, FermionFieldView &out,int dag); - template accelerator_inline + + template static accelerator_inline void DhopSiteHandInt(StencilView &st, DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, SiteSpinor * buf, int LLs, int sU, const FermionFieldView &in, FermionFieldView &out,int dag); - template accelerator_inline + + template static accelerator_inline void DhopSiteHandExt(StencilView &st, DoubledGaugeFieldView 
&U,DoubledGaugeFieldView &UUU, SiteSpinor * buf, int LLs, int sU, @@ -101,6 +107,7 @@ template class StaggeredKernels : public FermionOperator , pub /////////////////////////////////////////////////////////////////////////////////////// // Asm Nc=3 specific kernels /////////////////////////////////////////////////////////////////////////////////////// + void DhopSiteAsm(StencilView &st, DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, SiteSpinor * buf, int LLs, int sU, diff --git a/Grid/qcd/action/fermion/WilsonFermion.h b/Grid/qcd/action/fermion/WilsonFermion.h index 1c4dd3cf..bf8926d0 100644 --- a/Grid/qcd/action/fermion/WilsonFermion.h +++ b/Grid/qcd/action/fermion/WilsonFermion.h @@ -50,14 +50,14 @@ public: double, nu); WilsonAnisotropyCoefficients(): - isAnisotropic(false), - t_direction(Nd-1), - xi_0(1.0), + isAnisotropic(false), + t_direction(Nd-1), + xi_0(1.0), nu(1.0){} }; template -class WilsonFermion : public WilsonKernels, public WilsonFermionStatic +class WilsonFermion : public WilsonKernels, public WilsonFermionStatic { public: INHERIT_IMPL_TYPES(Impl); @@ -74,6 +74,20 @@ public: FermionField _tmp; FermionField &tmp(void) { return _tmp; } + void Report(void); + void ZeroCounters(void); + double DhopCalls; + double DhopCommTime; + double DhopComputeTime; + double DhopComputeTime2; + double DhopFaceTime; + double DhopTotalTime; + + double DerivCalls; + double DerivCommTime; + double DerivComputeTime; + double DerivDhopComputeTime; + ////////////////////////////////////////////////////////////////// // override multiply; cut number routines if pass dagger argument // and also make interface more uniformly consistent @@ -138,7 +152,7 @@ public: // Constructor WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid, RealD _mass, - const ImplParams &p = ImplParams(), + const ImplParams &p = ImplParams(), const WilsonAnisotropyCoefficients &anis = WilsonAnisotropyCoefficients() ); // DoubleStore impl dependent @@ -170,9 +184,9 @@ public: LebesgueOrder Lebesgue; LebesgueOrder LebesgueEvenOdd; - + WilsonAnisotropyCoefficients anisotropyCoeff; - + /////////////////////////////////////////////////////////////// // Conserved current utilities /////////////////////////////////////////////////////////////// @@ -186,7 +200,7 @@ public: PropagatorField &q_out, PropagatorField &phys_src, Current curr_type, - unsigned int mu, + unsigned int mu, unsigned int tmin, unsigned int tmax, ComplexField &lattice_cmplx); @@ -196,5 +210,3 @@ typedef WilsonFermion WilsonFermionF; typedef WilsonFermion WilsonFermionD; NAMESPACE_END(Grid); - - diff --git a/Grid/qcd/action/fermion/WilsonFermion5D.h b/Grid/qcd/action/fermion/WilsonFermion5D.h index 804b1d10..80231bb4 100644 --- a/Grid/qcd/action/fermion/WilsonFermion5D.h +++ b/Grid/qcd/action/fermion/WilsonFermion5D.h @@ -215,7 +215,7 @@ public: LebesgueOrder LebesgueEvenOdd; // Comms buffer - std::vector > comm_buf; + // std::vector > comm_buf; }; diff --git a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h index e79b64dc..b3fbe096 100644 --- a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h @@ -799,7 +799,7 @@ void CayleyFermion5D::SeqConservedCurrent(PropagatorField &q_in, PropagatorField tmp(UGrid); PropagatorField Utmp(UGrid); - LatticeInteger zz (UGrid); zz=0.0; + PropagatorField zz (UGrid); zz=0.0; LatticeInteger lcoor(UGrid); 
LatticeCoordinate(lcoor,Nd-1); for (int s=0;s::SeqConservedCurrent(PropagatorField &q_in, PropagatorField tmp(UGrid); PropagatorField Utmp(UGrid); - LatticeInteger zz (UGrid); zz=0.0; + PropagatorField zz (UGrid); zz=0.0; LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1); for(int s=0;s -template +template accelerator_inline void StaggeredKernels::DhopSiteHand(StencilView &st, DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, SiteSpinor *buf, int sF, int sU, @@ -221,7 +221,7 @@ void StaggeredKernels::DhopSiteHand(StencilView &st, template -template +template accelerator_inline void StaggeredKernels::DhopSiteHandInt(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor *buf, int sF, int sU, @@ -300,7 +300,7 @@ void StaggeredKernels::DhopSiteHandInt(StencilView &st, template -template +template accelerator_inline void StaggeredKernels::DhopSiteHandExt(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor *buf, int sF, int sU, diff --git a/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h index 141725a7..0b6f9fb0 100644 --- a/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h @@ -78,7 +78,7 @@ StaggeredKernels::StaggeredKernels(const ImplParams &p) : Base(p){}; // Int, Ext, Int+Ext cases for comms overlap //////////////////////////////////////////////////////////////////////////////////// template -template +template accelerator_inline void StaggeredKernels::DhopSiteGeneric(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor *buf, int sF, int sU, @@ -126,7 +126,7 @@ void StaggeredKernels::DhopSiteGeneric(StencilView &st, // Only contributions from interior of our node /////////////////////////////////////////////////// template -template +template accelerator_inline void StaggeredKernels::DhopSiteGenericInt(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor *buf, int sF, int sU, @@ -174,7 +174,7 @@ void StaggeredKernels::DhopSiteGenericInt(StencilView &st, // Only contributions from exterior of our node /////////////////////////////////////////////////// template -template +template accelerator_inline void StaggeredKernels::DhopSiteGenericExt(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor *buf, int sF, int sU, @@ -224,7 +224,7 @@ void StaggeredKernels::DhopSiteGenericExt(StencilView &st, //////////////////////////////////////////////////////////////////////////////////// // Driving / wrapping routine to select right kernel //////////////////////////////////////////////////////////////////////////////////// -template +template void StaggeredKernels::DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp) { @@ -253,7 +253,7 @@ void StaggeredKernels::DhopDirKernel(StencilImpl &st, DoubledGaugeFieldVie ThisKernel::A(st_v,U_v,UUU_v,buf,sF,sU,in_v,out_v,dag); \ }); -template +template void StaggeredKernels::DhopImproved(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU, const FermionField &in, FermionField &out, int dag, int interior,int exterior) @@ -293,7 +293,7 @@ void StaggeredKernels::DhopImproved(StencilImpl &st, LebesgueOrder &lo, } assert(0 && " Kernel optimisation case not 
covered "); } -template +template void StaggeredKernels::DhopNaive(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, const FermionField &in, FermionField &out, int dag, int interior,int exterior) diff --git a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h index f647bef8..4977ea68 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h @@ -43,7 +43,7 @@ WilsonFermion::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid, RealD _mass, const ImplParams &p, const WilsonAnisotropyCoefficients &anis) - : + : Kernels(p), _grid(&Fgrid), _cbgrid(&Hgrid), @@ -75,8 +75,93 @@ WilsonFermion::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid, StencilOdd.BuildSurfaceList(1,vol4); } +template +void WilsonFermion::Report(void) +{ + RealD NP = _grid->_Nprocessors; + RealD NN = _grid->NodeCount(); + RealD volume = 1; + Coordinate latt = _grid->GlobalDimensions(); + for(int mu=0;mu 0 ) { + std::cout << GridLogMessage << "#### Dhop calls report " << std::endl; + std::cout << GridLogMessage << "WilsonFermion Number of DhopEO Calls : " << DhopCalls << std::endl; + std::cout << GridLogMessage << "WilsonFermion TotalTime /Calls : " << DhopTotalTime / DhopCalls << " us" << std::endl; + std::cout << GridLogMessage << "WilsonFermion CommTime /Calls : " << DhopCommTime / DhopCalls << " us" << std::endl; + std::cout << GridLogMessage << "WilsonFermion FaceTime /Calls : " << DhopFaceTime / DhopCalls << " us" << std::endl; + std::cout << GridLogMessage << "WilsonFermion ComputeTime1/Calls : " << DhopComputeTime / DhopCalls << " us" << std::endl; + std::cout << GridLogMessage << "WilsonFermion ComputeTime2/Calls : " << DhopComputeTime2/ DhopCalls << " us" << std::endl; + + // Average the compute time + _grid->GlobalSum(DhopComputeTime); + DhopComputeTime/=NP; + RealD mflops = 1320*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting + std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NN << std::endl; + + RealD Fullmflops = 1320*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting + std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl; + + } + + if ( DerivCalls > 0 ) { + std::cout << GridLogMessage << "#### Deriv calls report "<< std::endl; + std::cout << GridLogMessage << "WilsonFermion Number of Deriv Calls : " < 0 || DhopCalls > 0){ + std::cout << GridLogMessage << "WilsonFermion Stencil" < 0){ + std::cout << GridLogMessage << "WilsonFermion Stencil Reporti()" < +void WilsonFermion::ZeroCounters(void) { + DhopCalls = 0; // ok + DhopCommTime = 0; + DhopComputeTime = 0; + DhopComputeTime2= 0; + DhopFaceTime = 0; + DhopTotalTime = 0; + + DerivCalls = 0; // ok + DerivCommTime = 0; + DerivComputeTime = 0; + DerivDhopComputeTime = 0; + + Stencil.ZeroCounters(); + StencilEven.ZeroCounters(); + StencilOdd.ZeroCounters(); + Stencil.ZeroCountersi(); + StencilEven.ZeroCountersi(); + 
StencilOdd.ZeroCountersi(); +} + + template -void WilsonFermion::ImportGauge(const GaugeField &_Umu) +void WilsonFermion::ImportGauge(const GaugeField &_Umu) { GaugeField HUmu(_Umu.Grid()); @@ -107,7 +192,7 @@ void WilsonFermion::ImportGauge(const GaugeField &_Umu) ///////////////////////////// template -void WilsonFermion::M(const FermionField &in, FermionField &out) +void WilsonFermion::M(const FermionField &in, FermionField &out) { out.Checkerboard() = in.Checkerboard(); Dhop(in, out, DaggerNo); @@ -115,7 +200,7 @@ void WilsonFermion::M(const FermionField &in, FermionField &out) } template -void WilsonFermion::Mdag(const FermionField &in, FermionField &out) +void WilsonFermion::Mdag(const FermionField &in, FermionField &out) { out.Checkerboard() = in.Checkerboard(); Dhop(in, out, DaggerYes); @@ -123,7 +208,7 @@ void WilsonFermion::Mdag(const FermionField &in, FermionField &out) } template -void WilsonFermion::Meooe(const FermionField &in, FermionField &out) +void WilsonFermion::Meooe(const FermionField &in, FermionField &out) { if (in.Checkerboard() == Odd) { DhopEO(in, out, DaggerNo); @@ -133,7 +218,7 @@ void WilsonFermion::Meooe(const FermionField &in, FermionField &out) } template -void WilsonFermion::MeooeDag(const FermionField &in, FermionField &out) +void WilsonFermion::MeooeDag(const FermionField &in, FermionField &out) { if (in.Checkerboard() == Odd) { DhopEO(in, out, DaggerYes); @@ -141,9 +226,9 @@ void WilsonFermion::MeooeDag(const FermionField &in, FermionField &out) DhopOE(in, out, DaggerYes); } } - + template -void WilsonFermion::Mooee(const FermionField &in, FermionField &out) +void WilsonFermion::Mooee(const FermionField &in, FermionField &out) { out.Checkerboard() = in.Checkerboard(); typename FermionField::scalar_type scal(diag_mass); @@ -151,80 +236,80 @@ void WilsonFermion::Mooee(const FermionField &in, FermionField &out) } template -void WilsonFermion::MooeeDag(const FermionField &in, FermionField &out) +void WilsonFermion::MooeeDag(const FermionField &in, FermionField &out) { out.Checkerboard() = in.Checkerboard(); Mooee(in, out); } template -void WilsonFermion::MooeeInv(const FermionField &in, FermionField &out) +void WilsonFermion::MooeeInv(const FermionField &in, FermionField &out) { out.Checkerboard() = in.Checkerboard(); out = (1.0/(diag_mass))*in; } - + template -void WilsonFermion::MooeeInvDag(const FermionField &in, FermionField &out) +void WilsonFermion::MooeeInvDag(const FermionField &in, FermionField &out) { out.Checkerboard() = in.Checkerboard(); MooeeInv(in,out); } template void WilsonFermion::MomentumSpacePropagator(FermionField &out, const FermionField &in,RealD _m,std::vector twist) -{ +{ typedef typename FermionField::vector_type vector_type; typedef typename FermionField::scalar_type ScalComplex; typedef Lattice > LatComplex; - - // what type LatticeComplex + + // what type LatticeComplex conformable(_grid,out.Grid()); - + Gamma::Algebra Gmu [] = { Gamma::Algebra::GammaX, Gamma::Algebra::GammaY, Gamma::Algebra::GammaZ, Gamma::Algebra::GammaT }; - + Coordinate latt_size = _grid->_fdimensions; - + FermionField num (_grid); num = Zero(); LatComplex wilson(_grid); wilson= Zero(); LatComplex one (_grid); one = ScalComplex(1.0,0.0); - + LatComplex denom(_grid); denom= Zero(); - LatComplex kmu(_grid); + LatComplex kmu(_grid); ScalComplex ci(0.0,1.0); // momphase = n * 2pi / L for(int mu=0;mu void WilsonFermion::DerivInternal(StencilImpl &st, DoubledGaugeField &U, GaugeField &mat, const FermionField &A, const FermionField &B, int dag) { + DerivCalls++; 
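// [Editor's note, not part of the patch] The counters added throughout this
// file all use the same accumulate-elapsed-time idiom, later printed by Report():
//
//   DerivCommTime -= usecond();        // subtract start time (microseconds)
//   st.HaloExchange(B, compressor);
//   DerivCommTime += usecond();        // add end time; net effect is += elapsed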
assert((dag == DaggerNo) || (dag == DaggerYes)); Compressor compressor(dag); @@ -242,8 +328,11 @@ void WilsonFermion::DerivInternal(StencilImpl &st, DoubledGaugeField &U, FermionField Atilde(B.Grid()); Atilde = A; + DerivCommTime-=usecond(); st.HaloExchange(B, compressor); + DerivCommTime+=usecond(); + DerivComputeTime-=usecond(); for (int mu = 0; mu < Nd; mu++) { //////////////////////////////////////////////////////////////////////// // Flip gamma (1+g)<->(1-g) if dag @@ -251,6 +340,7 @@ void WilsonFermion::DerivInternal(StencilImpl &st, DoubledGaugeField &U, int gamma = mu; if (!dag) gamma += Nd; + DerivDhopComputeTime -= usecond(); int Ls=1; Kernels::DhopDirKernel(st, U, st.CommBuf(), Ls, B.Grid()->oSites(), B, Btilde, mu, gamma); @@ -258,11 +348,13 @@ void WilsonFermion::DerivInternal(StencilImpl &st, DoubledGaugeField &U, // spin trace outer product ////////////////////////////////////////////////// Impl::InsertForce4D(mat, Btilde, Atilde, mu); + DerivDhopComputeTime += usecond(); } + DerivComputeTime += usecond(); } template -void WilsonFermion::DhopDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) +void WilsonFermion::DhopDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) { conformable(U.Grid(), _grid); conformable(U.Grid(), V.Grid()); @@ -274,13 +366,13 @@ void WilsonFermion::DhopDeriv(GaugeField &mat, const FermionField &U, cons } template -void WilsonFermion::DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) +void WilsonFermion::DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) { conformable(U.Grid(), _cbgrid); conformable(U.Grid(), V.Grid()); //conformable(U.Grid(), mat.Grid()); not general, leaving as a comment (Guido) // Motivation: look at the SchurDiff operator - + assert(V.Checkerboard() == Even); assert(U.Checkerboard() == Odd); mat.Checkerboard() = Odd; @@ -289,7 +381,7 @@ void WilsonFermion::DhopDerivOE(GaugeField &mat, const FermionField &U, co } template -void WilsonFermion::DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) +void WilsonFermion::DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) { conformable(U.Grid(), _cbgrid); conformable(U.Grid(), V.Grid()); @@ -303,7 +395,7 @@ void WilsonFermion::DhopDerivEO(GaugeField &mat, const FermionField &U, co } template -void WilsonFermion::Dhop(const FermionField &in, FermionField &out, int dag) +void WilsonFermion::Dhop(const FermionField &in, FermionField &out, int dag) { conformable(in.Grid(), _grid); // verifies full grid conformable(in.Grid(), out.Grid()); @@ -314,7 +406,7 @@ void WilsonFermion::Dhop(const FermionField &in, FermionField &out, int da } template -void WilsonFermion::DhopOE(const FermionField &in, FermionField &out, int dag) +void WilsonFermion::DhopOE(const FermionField &in, FermionField &out, int dag) { conformable(in.Grid(), _cbgrid); // verifies half grid conformable(in.Grid(), out.Grid()); // drops the cb check @@ -326,7 +418,7 @@ void WilsonFermion::DhopOE(const FermionField &in, FermionField &out, int } template -void WilsonFermion::DhopEO(const FermionField &in, FermionField &out,int dag) +void WilsonFermion::DhopEO(const FermionField &in, FermionField &out,int dag) { conformable(in.Grid(), _cbgrid); // verifies half grid conformable(in.Grid(), out.Grid()); // drops the cb check @@ -338,18 +430,18 @@ void WilsonFermion::DhopEO(const FermionField &in, FermionField &out,int d } template -void 
WilsonFermion::Mdir(const FermionField &in, FermionField &out, int dir, int disp) +void WilsonFermion::Mdir(const FermionField &in, FermionField &out, int dir, int disp) { DhopDir(in, out, dir, disp); } template -void WilsonFermion::MdirAll(const FermionField &in, std::vector &out) +void WilsonFermion::MdirAll(const FermionField &in, std::vector &out) { DhopDirAll(in, out); } template -void WilsonFermion::DhopDir(const FermionField &in, FermionField &out, int dir, int disp) +void WilsonFermion::DhopDir(const FermionField &in, FermionField &out, int dir, int disp) { Compressor compressor(DaggerNo); Stencil.HaloExchange(in, compressor); @@ -361,12 +453,12 @@ void WilsonFermion::DhopDir(const FermionField &in, FermionField &out, int DhopDirCalc(in, out, dirdisp, gamma, DaggerNo); }; template -void WilsonFermion::DhopDirAll(const FermionField &in, std::vector &out) +void WilsonFermion::DhopDirAll(const FermionField &in, std::vector &out) { Compressor compressor(DaggerNo); Stencil.HaloExchange(in, compressor); - assert((out.size()==8)||(out.size()==9)); + assert((out.size()==8)||(out.size()==9)); for(int dir=0;dir::DhopDirAll(const FermionField &in, std::vector -void WilsonFermion::DhopDirCalc(const FermionField &in, FermionField &out,int dirdisp, int gamma, int dag) +void WilsonFermion::DhopDirCalc(const FermionField &in, FermionField &out,int dirdisp, int gamma, int dag) { int Ls=1; uint64_t Nsite=in.oSites(); @@ -390,22 +482,23 @@ template void WilsonFermion::DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, const FermionField &in, - FermionField &out, int dag) + FermionField &out, int dag) { + DhopTotalTime-=usecond(); #ifdef GRID_OMP if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) DhopInternalOverlappedComms(st,lo,U,in,out,dag); else -#endif +#endif DhopInternalSerial(st,lo,U,in,out,dag); - + DhopTotalTime+=usecond(); } template void WilsonFermion::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, const FermionField &in, - FermionField &out, int dag) + FermionField &out, int dag) { assert((dag == DaggerNo) || (dag == DaggerYes)); @@ -417,38 +510,53 @@ void WilsonFermion::DhopInternalOverlappedComms(StencilImpl &st, LebesgueO ///////////////////////////// std::vector > requests; st.Prepare(); + DhopFaceTime-=usecond(); st.HaloGather(in,compressor); + DhopFaceTime+=usecond(); + + DhopCommTime -=usecond(); st.CommunicateBegin(requests); ///////////////////////////// // Overlap with comms ///////////////////////////// + DhopFaceTime-=usecond(); st.CommsMergeSHM(compressor); + DhopFaceTime+=usecond(); ///////////////////////////// // do the compute interior ///////////////////////////// int Opt = WilsonKernelsStatic::Opt; + DhopComputeTime-=usecond(); if (dag == DaggerYes) { Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0); } else { Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0); - } + } + DhopComputeTime+=usecond(); ///////////////////////////// // Complete comms ///////////////////////////// st.CommunicateComplete(requests); + DhopCommTime +=usecond(); + + DhopFaceTime-=usecond(); st.CommsMerge(compressor); + DhopFaceTime+=usecond(); ///////////////////////////// // do the compute exterior ///////////////////////////// + + DhopComputeTime2-=usecond(); if (dag == DaggerYes) { Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,0,1); } else { Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,0,1); } + DhopComputeTime2+=usecond(); }; @@ -456,24 
+564,28 @@ template void WilsonFermion::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, const FermionField &in, - FermionField &out, int dag) + FermionField &out, int dag) { assert((dag == DaggerNo) || (dag == DaggerYes)); Compressor compressor(dag); + DhopCommTime-=usecond(); st.HaloExchange(in, compressor); + DhopCommTime+=usecond(); + DhopComputeTime-=usecond(); int Opt = WilsonKernelsStatic::Opt; if (dag == DaggerYes) { Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out); } else { Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out); } + DhopComputeTime+=usecond(); }; /*Change ends */ /******************************************************************************* * Conserved current utilities for Wilson fermions, for contracting propagators - * to make a conserved current sink or inserting the conserved current + * to make a conserved current sink or inserting the conserved current * sequentially. ******************************************************************************/ template @@ -493,12 +605,12 @@ void WilsonFermion::ContractConservedCurrent(PropagatorField &q_in_1, template -void WilsonFermion::SeqConservedCurrent(PropagatorField &q_in, +void WilsonFermion::SeqConservedCurrent(PropagatorField &q_in, PropagatorField &q_out, PropagatorField &src, Current curr_type, unsigned int mu, - unsigned int tmin, + unsigned int tmin, unsigned int tmax, ComplexField &lattice_cmplx) { diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h new file mode 100644 index 00000000..2e587dfa --- /dev/null +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h @@ -0,0 +1,574 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + + + Source file: ./lib/qcd/action/fermion/WilsonKernelsAsmA64FX.h + + Copyright (C) 2020 + +Author: Nils Meyer Regensburg University + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#pragma once + +//#if defined(A64FXASM) +#if defined(A64FX) + +// safety include +#include + +// undefine everything related to kernels +#include + +// enable A64FX body +#define WILSONKERNELSASMBODYA64FX +//#pragma message("A64FX Dslash: WilsonKernelsAsmBodyA64FX.h") + + /////////////////////////////////////////////////////////// + // If we are A64FX specialise the single precision routine + /////////////////////////////////////////////////////////// +#if defined(DSLASHINTRIN) +//#pragma message ("A64FX Dslash: intrin") +#include +#else +#pragma message ("A64FX Dslash: asm") +#include +#endif + +/// Switch off the 5d vectorised code optimisations +#undef DWFVEC5D + +///////////////////////////////////////////////////////////////// +// XYZT vectorised, undag Kernel, single +///////////////////////////////////////////////////////////////// +#undef KERNEL_DAG +#define INTERIOR_AND_EXTERIOR +#undef INTERIOR +#undef EXTERIOR +template<> void +WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +#undef INTERIOR_AND_EXTERIOR +#define INTERIOR +#undef EXTERIOR +template<> void +WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +#undef INTERIOR_AND_EXTERIOR +#undef INTERIOR +#define EXTERIOR +template<> void +WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int 
Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + + +///////////////////////////////////////////////////////////////// +// XYZT vectorised, dag Kernel, single +///////////////////////////////////////////////////////////////// +#define KERNEL_DAG +#define INTERIOR_AND_EXTERIOR +#undef INTERIOR +#undef EXTERIOR +template<> void +WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +#undef INTERIOR_AND_EXTERIOR +#define INTERIOR +#undef EXTERIOR +template<> void +WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +#undef INTERIOR_AND_EXTERIOR +#undef INTERIOR +#define EXTERIOR +template<> void 
+WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + + +// undefine +#include + +/////////////////////////////////////////////////////////// +// If we are A64FX specialise the double precision routine +/////////////////////////////////////////////////////////// + +#if defined(DSLASHINTRIN) +#include +#else +#include +#endif + +// former KNL +//#define MAYBEPERM(A,perm) if (perm) { A ; } +//#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf) +//#define COMPLEX_SIGNS(isigns) vComplexD *isigns = &signsD[0]; + + +#define INTERIOR_AND_EXTERIOR +#undef INTERIOR +#undef EXTERIOR + +///////////////////////////////////////////////////////////////// +// XYZT vectorised, undag Kernel, double +///////////////////////////////////////////////////////////////// +#undef KERNEL_DAG +#define INTERIOR_AND_EXTERIOR +#undef INTERIOR +#undef EXTERIOR +template<> void +WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +#undef INTERIOR_AND_EXTERIOR +#define INTERIOR +#undef EXTERIOR +template<> void +WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif 
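The new WilsonKernelsAsmA64FX.h stamps out one specialisation per precision and per dagger/interior/exterior combination by toggling the KERNEL_DAG, INTERIOR_AND_EXTERIOR, INTERIOR and EXTERIOR macros and then re-including a shared body header. A toy sketch of that preprocessor pattern, with hypothetical names and the shared body collapsed into a macro so the example is self-contained:

    #include <iostream>

    // stand-in for the shared kernel body that gets re-included per variant
    #define BODY(NAME, SIGN)                 \
      int NAME(int x) { return SIGN * x; }

    #define KERNEL_DAG
    #ifdef KERNEL_DAG
    BODY(dslash_dag, -1)   // dagger variant (the real body swaps the +/- projector set)
    #endif
    #undef KERNEL_DAG
    BODY(dslash, +1)       // undagged variant

    int main() {
      std::cout << dslash(3) << " " << dslash_dag(3) << std::endl;  // prints: 3 -3
      return 0;
    }

In the real file the shared body is WilsonKernelsAsmBodyA64FX.h, and the toggles select which projector/reconstruct macros and which interior/exterior leg logic the body expands to.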
+ +template<> void +WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +#undef INTERIOR_AND_EXTERIOR +#undef INTERIOR +#define EXTERIOR +template<> void +WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +///////////////////////////////////////////////////////////////// +// XYZT vectorised, dag Kernel, double +///////////////////////////////////////////////////////////////// +#define KERNEL_DAG +#define INTERIOR_AND_EXTERIOR +#undef INTERIOR +#undef EXTERIOR +template<> void +WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +#undef INTERIOR_AND_EXTERIOR +#define INTERIOR +#undef EXTERIOR +template<> void +WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView 
&in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +#undef INTERIOR_AND_EXTERIOR +#undef INTERIOR +#define EXTERIOR +template<> void +WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + + + +// undefs +#undef WILSONKERNELSASMBODYA64FX +#include + +#endif //A64FXASM diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h new file mode 100644 index 00000000..406e5c25 --- /dev/null +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h @@ -0,0 +1,380 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: WilsonKernelsAsmBodyA64FX.h + + Copyright (C) 2020 + +Author: Nils Meyer Regensburg University + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#ifdef KERNEL_DAG +#define DIR0_PROJ XP_PROJ +#define DIR1_PROJ YP_PROJ +#define DIR2_PROJ ZP_PROJ +#define DIR3_PROJ TP_PROJ +#define DIR4_PROJ XM_PROJ +#define DIR5_PROJ YM_PROJ +#define DIR6_PROJ ZM_PROJ +#define DIR7_PROJ TM_PROJ +#define DIR0_RECON XP_RECON +#define DIR1_RECON YP_RECON_ACCUM +#define DIR2_RECON ZP_RECON_ACCUM +#define DIR3_RECON TP_RECON_ACCUM +#define DIR4_RECON XM_RECON_ACCUM +#define DIR5_RECON YM_RECON_ACCUM +#define DIR6_RECON ZM_RECON_ACCUM +#define DIR7_RECON TM_RECON_ACCUM +#else +#define DIR0_PROJ XM_PROJ +#define DIR1_PROJ YM_PROJ +#define DIR2_PROJ ZM_PROJ +#define DIR3_PROJ TM_PROJ +#define DIR4_PROJ XP_PROJ +#define DIR5_PROJ YP_PROJ +#define DIR6_PROJ ZP_PROJ +#define DIR7_PROJ TP_PROJ +#define DIR0_RECON XM_RECON +#define DIR1_RECON YM_RECON_ACCUM +#define DIR2_RECON ZM_RECON_ACCUM +#define DIR3_RECON TM_RECON_ACCUM +#define DIR4_RECON XP_RECON_ACCUM +#define DIR5_RECON YP_RECON_ACCUM +#define DIR6_RECON ZP_RECON_ACCUM +#define DIR7_RECON TP_RECON_ACCUM +#endif + +//using namespace std; + +#undef SHOW +//#define SHOW + +#undef WHERE + +#ifdef INTERIOR_AND_EXTERIOR +#define WHERE "INT_AND_EXT" +#endif + +#ifdef INTERIOR +#define WHERE "INT" +#endif + +#ifdef EXTERIOR +#define WHERE "EXT" +#endif + +//#pragma message("here") + + + +//////////////////////////////////////////////////////////////////////////////// +// Comms then compute kernel +//////////////////////////////////////////////////////////////////////////////// +#ifdef INTERIOR_AND_EXTERIOR + +#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ + basep = st.GetPFInfo(nent,plocal); nent++; \ + if ( local ) { \ + LOAD_CHIMU(base); \ + LOAD_TABLE(PERMUTE_DIR); \ + PROJ; \ + MAYBEPERM(PERMUTE_DIR,perm); \ + } else { \ + LOAD_CHI(base); \ + } \ + base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \ + MULT_2SPIN_1(Dir); \ + PREFETCH_CHIMU(base); \ + PREFETCH_CHIMU_L2(basep); \ + /* PREFETCH_GAUGE_L1(NxtDir); */ \ + MULT_2SPIN_2; \ + if (s == 0) { \ + if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \ + } \ + RECON; \ + +#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ + base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ + PREFETCH1_CHIMU(base); \ + ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) + +#define RESULT(base,basep) SAVE_RESULT(base,basep); + +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Pre comms kernel -- prefetch like normal because it is mostly right +//////////////////////////////////////////////////////////////////////////////// +#ifdef INTERIOR + +#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ + basep = st.GetPFInfo(nent,plocal); nent++; \ + if ( local ) { \ + LOAD_CHIMU(base); \ + LOAD_TABLE(PERMUTE_DIR); \ + PROJ; \ + MAYBEPERM(PERMUTE_DIR,perm); \ + }else if ( st.same_node[Dir] ) {LOAD_CHI(base);} \ + base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \ + if ( local || st.same_node[Dir] ) { \ + MULT_2SPIN_1(Dir); \ + PREFETCH_CHIMU(base); \ + /* PREFETCH_GAUGE_L1(NxtDir); */ \ + MULT_2SPIN_2; \ + if (s == 0) { \ + if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \ + } \ + RECON; \ + PREFETCH_CHIMU_L2(basep); \ + } else { PREFETCH_CHIMU(base); } \ + +#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ + base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ + 
PREFETCH1_CHIMU(base); \ + ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) + +#define RESULT(base,basep) SAVE_RESULT(base,basep); + +#endif +//////////////////////////////////////////////////////////////////////////////// +// Post comms kernel +//////////////////////////////////////////////////////////////////////////////// +#ifdef EXTERIOR + + +#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ + base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ + if((!local)&&(!st.same_node[Dir]) ) { \ + LOAD_CHI(base); \ + MULT_2SPIN_1(Dir); \ + PREFETCH_CHIMU(base); \ + /* PREFETCH_GAUGE_L1(NxtDir); */ \ + MULT_2SPIN_2; \ + if (s == 0) { \ + if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \ + } \ + RECON; \ + nmu++; \ + } + +#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ + nmu=0; \ + base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++;\ + if((!local)&&(!st.same_node[Dir]) ) { \ + LOAD_CHI(base); \ + MULT_2SPIN_1(Dir); \ + PREFETCH_CHIMU(base); \ + /* PREFETCH_GAUGE_L1(NxtDir); */ \ + MULT_2SPIN_2; \ + if (s == 0) { \ + if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \ + } \ + RECON; \ + nmu++; \ + } + +#define RESULT(base,basep) if (nmu){ ADD_RESULT(base,base);} + +#endif +{ + int nmu; + int local,perm, ptype; + uint64_t base; + uint64_t basep; + const uint64_t plocal =(uint64_t) & in[0]; + + MASK_REGS; + int nmax=U.oSites(); + for(int site=0;site=nmax) ssn=0; + // int sUn=lo.Reorder(ssn); + int sUn=ssn; + LOCK_GAUGE(0); +#else + int sU =ssU; + int ssn=ssU+1; if(ssn>=nmax) ssn=0; + int sUn=ssn; +#endif + for(int s=0;s void \ + template<> accelerator_inline void \ WilsonKernels::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \ int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \ { \ @@ -662,7 +662,7 @@ NAMESPACE_BEGIN(Grid); HAND_DOP_SITE(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ } \ \ - template<> void \ + template<> accelerator_inline void \ WilsonKernels::HandDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \ int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \ { \ @@ -678,7 +678,7 @@ NAMESPACE_BEGIN(Grid); HAND_DOP_SITE_DAG(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ } \ \ - template<> void \ + template<> accelerator_inline void \ WilsonKernels::HandDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \ int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \ { \ @@ -694,7 +694,7 @@ NAMESPACE_BEGIN(Grid); HAND_DOP_SITE_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ } \ \ - template<> void \ + template<> accelerator_inline void \ WilsonKernels::HandDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \ int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \ { \ @@ -710,7 +710,7 @@ NAMESPACE_BEGIN(Grid); HAND_DOP_SITE_DAG_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ } \ \ - template<> void \ + template<> accelerator_inline void \ WilsonKernels::HandDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \ int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \ { \ @@ -727,7 +727,7 @@ NAMESPACE_BEGIN(Grid); nmu = 0; \ HAND_DOP_SITE_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ } \ - template<> void \ + template<> accelerator_inline void \ WilsonKernels::HandDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \ int ss,int sU,const FermionFieldView &in, 
FermionFieldView &out) \ { \ diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h index f7b018fa..89ae5668 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h @@ -495,7 +495,7 @@ Author: paboyle NAMESPACE_BEGIN(Grid); -template void +template accelerator_inline void WilsonKernels::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int ss,int sU,const FermionFieldView &in, FermionFieldView &out) { @@ -519,7 +519,7 @@ WilsonKernels::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,Site HAND_RESULT(ss); } -template +template accelerator_inline void WilsonKernels::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int ss,int sU,const FermionFieldView &in, FermionFieldView &out) { @@ -542,7 +542,7 @@ void WilsonKernels::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView HAND_RESULT(ss); } -template void +template accelerator_inline void WilsonKernels::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int ss,int sU,const FermionFieldView &in, FermionFieldView &out) { @@ -566,7 +566,7 @@ WilsonKernels::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,Si HAND_RESULT(ss); } -template +template accelerator_inline void WilsonKernels::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int ss,int sU,const FermionFieldView &in, FermionFieldView &out) { @@ -589,7 +589,7 @@ void WilsonKernels::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldVi HAND_RESULT(ss); } -template void +template accelerator_inline void WilsonKernels::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int ss,int sU,const FermionFieldView &in, FermionFieldView &out) { @@ -614,7 +614,7 @@ WilsonKernels::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,Si HAND_RESULT_EXT(ss); } -template +template accelerator_inline void WilsonKernels::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int ss,int sU,const FermionFieldView &in, FermionFieldView &out) { diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h.debug b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h.debug new file mode 100644 index 00000000..6f3edbb5 --- /dev/null +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h.debug @@ -0,0 +1,943 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/qcd/action/fermion/WilsonKernelsHand.cc + + Copyright (C) 2015 + +Author: Peter Boyle +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ + +#pragma once + +#include + + +#undef LOAD_CHIMU +#undef LOAD_CHI +#undef MULT_2SPIN +#undef PERMUTE_DIR +#undef XP_PROJ +#undef YP_PROJ +#undef ZP_PROJ +#undef TP_PROJ +#undef XM_PROJ +#undef YM_PROJ +#undef ZM_PROJ +#undef TM_PROJ +#undef XP_RECON +#undef XP_RECON_ACCUM +#undef XM_RECON +#undef XM_RECON_ACCUM +#undef YP_RECON_ACCUM +#undef YM_RECON_ACCUM +#undef ZP_RECON_ACCUM +#undef ZM_RECON_ACCUM +#undef TP_RECON_ACCUM +#undef TM_RECON_ACCUM +#undef ZERO_RESULT +#undef Chimu_00 +#undef Chimu_01 +#undef Chimu_02 +#undef Chimu_10 +#undef Chimu_11 +#undef Chimu_12 +#undef Chimu_20 +#undef Chimu_21 +#undef Chimu_22 +#undef Chimu_30 +#undef Chimu_31 +#undef Chimu_32 +#undef HAND_STENCIL_LEG +#undef HAND_STENCIL_LEG_INT +#undef HAND_STENCIL_LEG_EXT +#undef HAND_RESULT +#undef HAND_RESULT_INT +#undef HAND_RESULT_EXT + +#define REGISTER + +#define LOAD_CHIMU \ + {const SiteSpinor & ref (in[offset]); \ + Chimu_00=ref()(0)(0);\ + Chimu_01=ref()(0)(1);\ + Chimu_02=ref()(0)(2);\ + Chimu_10=ref()(1)(0);\ + Chimu_11=ref()(1)(1);\ + Chimu_12=ref()(1)(2);\ + Chimu_20=ref()(2)(0);\ + Chimu_21=ref()(2)(1);\ + Chimu_22=ref()(2)(2);\ + Chimu_30=ref()(3)(0);\ + Chimu_31=ref()(3)(1);\ + Chimu_32=ref()(3)(2);\ + std::cout << std::endl << "DEBUG -- LOAD_CHIMU" << std::endl; \ + std::cout << "Chimu_00 -- " << Chimu_00 << std::endl; \ + std::cout << "Chimu_01 -- " << Chimu_01 << std::endl; \ + std::cout << "Chimu_02 -- " << Chimu_02 << std::endl; \ + std::cout << "Chimu_10 -- " << Chimu_10 << std::endl; \ + std::cout << "Chimu_11 -- " << Chimu_11 << std::endl; \ + std::cout << "Chimu_12 -- " << Chimu_12 << std::endl; \ + std::cout << "Chimu_20 -- " << Chimu_20 << std::endl; \ + std::cout << "Chimu_21 -- " << Chimu_21 << std::endl; \ + std::cout << "Chimu_22 -- " << Chimu_22 << std::endl; \ + std::cout << "Chimu_30 -- " << Chimu_30 << std::endl; \ + std::cout << "Chimu_31 -- " << Chimu_31 << std::endl; \ + std::cout << "Chimu_32 -- " << Chimu_32 << std::endl; \ +} + +#define LOAD_CHI\ + {const SiteHalfSpinor &ref(buf[offset]); \ + Chi_00 = ref()(0)(0);\ + Chi_01 = ref()(0)(1);\ + Chi_02 = ref()(0)(2);\ + Chi_10 = ref()(1)(0);\ + Chi_11 = ref()(1)(1);\ + Chi_12 = ref()(1)(2);\ + std::cout << std::endl << "DEBUG -- LOAD_CHI" << std::endl; \ + std::cout << "Chi_00 -- " << Chi_00 << std::endl; \ + std::cout << "Chi_01 -- " << Chi_01 << std::endl; \ + std::cout << "Chi_02 -- " << Chi_02 << std::endl; \ + std::cout << "Chi_10 -- " << Chi_10 << std::endl; \ + std::cout << "Chi_11 -- " << Chi_11 << std::endl; \ + std::cout << "Chi_12 -- " << Chi_12 << std::endl; \ + } + +// To splat or not to splat depends on the implementation +#define MULT_2SPIN(A)\ + {auto & ref(U[sU](A)); \ + Impl::loadLinkElement(U_00,ref()(0,0)); \ + Impl::loadLinkElement(U_10,ref()(1,0)); \ + Impl::loadLinkElement(U_20,ref()(2,0)); \ + Impl::loadLinkElement(U_01,ref()(0,1)); \ + Impl::loadLinkElement(U_11,ref()(1,1)); \ + Impl::loadLinkElement(U_21,ref()(2,1)); \ + UChi_00 = U_00*Chi_00;\ + UChi_10 = U_00*Chi_10;\ + UChi_01 = U_10*Chi_00;\ + UChi_11 = U_10*Chi_10;\ + UChi_02 = U_20*Chi_00;\ + UChi_12 = U_20*Chi_10;\ + UChi_00+= U_01*Chi_01;\ + 
UChi_10+= U_01*Chi_11;\ + UChi_01+= U_11*Chi_01;\ + UChi_11+= U_11*Chi_11;\ + UChi_02+= U_21*Chi_01;\ + UChi_12+= U_21*Chi_11;\ + Impl::loadLinkElement(U_00,ref()(0,2)); \ + Impl::loadLinkElement(U_10,ref()(1,2)); \ + Impl::loadLinkElement(U_20,ref()(2,2)); \ + UChi_00+= U_00*Chi_02;\ + UChi_10+= U_00*Chi_12;\ + UChi_01+= U_10*Chi_02;\ + UChi_11+= U_10*Chi_12;\ + UChi_02+= U_20*Chi_02;\ + UChi_12+= U_20*Chi_12;\ + std::cout << std::endl << "DEBUG -- MULT_2SPIN" << std::endl; \ + std::cout << "UChi_00 -- " << UChi_00 << std::endl; \ + std::cout << "UChi_01 -- " << UChi_01 << std::endl; \ + std::cout << "UChi_02 -- " << UChi_02 << std::endl; \ + std::cout << "UChi_10 -- " << UChi_10 << std::endl; \ + std::cout << "UChi_11 -- " << UChi_11 << std::endl; \ + std::cout << "UChi_12 -- " << UChi_12 << std::endl; \ + } + + +#define PERMUTE_DIR(dir) \ +std::cout << std::endl << "DEBUG -- PERM PRE" << std::endl; \ +std::cout << "Chi_00 -- " << Chi_00 << std::endl; \ +std::cout << "Chi_01 -- " << Chi_01 << std::endl; \ +std::cout << "Chi_02 -- " << Chi_02 << std::endl; \ +std::cout << "Chi_10 -- " << Chi_10 << std::endl; \ +std::cout << "Chi_11 -- " << Chi_11 << std::endl; \ +std::cout << "Chi_12 -- " << Chi_12 << std::endl; \ + permute##dir(Chi_00,Chi_00);\ + permute##dir(Chi_01,Chi_01);\ + permute##dir(Chi_02,Chi_02);\ + permute##dir(Chi_10,Chi_10);\ + permute##dir(Chi_11,Chi_11);\ + permute##dir(Chi_12,Chi_12);\ + std::cout << std::endl << "DEBUG -- PERM POST" << std::endl; \ + std::cout << "Chi_00 -- " << Chi_00 << std::endl; \ + std::cout << "Chi_01 -- " << Chi_01 << std::endl; \ + std::cout << "Chi_02 -- " << Chi_02 << std::endl; \ + std::cout << "Chi_10 -- " << Chi_10 << std::endl; \ + std::cout << "Chi_11 -- " << Chi_11 << std::endl; \ + std::cout << "Chi_12 -- " << Chi_12 << std::endl; + +// hspin(0)=fspin(0)+timesI(fspin(3)); +// hspin(1)=fspin(1)+timesI(fspin(2)); +#define XP_PROJ \ + Chi_00 = Chimu_00+timesI(Chimu_30);\ + Chi_01 = Chimu_01+timesI(Chimu_31);\ + Chi_02 = Chimu_02+timesI(Chimu_32);\ + Chi_10 = Chimu_10+timesI(Chimu_20);\ + Chi_11 = Chimu_11+timesI(Chimu_21);\ + Chi_12 = Chimu_12+timesI(Chimu_22);\ + std::cout << std::endl << "DEBUG -- XP_PROJ" << std::endl; \ + std::cout << "Chi_00 -- " << Chi_00 << std::endl; \ + std::cout << "Chi_01 -- " << Chi_01 << std::endl; \ + std::cout << "Chi_02 -- " << Chi_02 << std::endl; \ + std::cout << "Chi_10 -- " << Chi_10 << std::endl; \ + std::cout << "Chi_11 -- " << Chi_11 << std::endl; \ + std::cout << "Chi_12 -- " << Chi_12 << std::endl; + +#define YP_PROJ \ + Chi_00 = Chimu_00-Chimu_30;\ + Chi_01 = Chimu_01-Chimu_31;\ + Chi_02 = Chimu_02-Chimu_32;\ + Chi_10 = Chimu_10+Chimu_20;\ + Chi_11 = Chimu_11+Chimu_21;\ + Chi_12 = Chimu_12+Chimu_22;\ + std::cout << std::endl << "DEBUG -- YP_PROJ" << std::endl; \ + std::cout << "Chi_00 -- " << Chi_00 << std::endl; \ + std::cout << "Chi_01 -- " << Chi_01 << std::endl; \ + std::cout << "Chi_02 -- " << Chi_02 << std::endl; \ + std::cout << "Chi_10 -- " << Chi_10 << std::endl; \ + std::cout << "Chi_11 -- " << Chi_11 << std::endl; \ + std::cout << "Chi_12 -- " << Chi_12 << std::endl; + +#define ZP_PROJ \ + Chi_00 = Chimu_00+timesI(Chimu_20); \ + Chi_01 = Chimu_01+timesI(Chimu_21); \ + Chi_02 = Chimu_02+timesI(Chimu_22); \ + Chi_10 = Chimu_10-timesI(Chimu_30); \ + Chi_11 = Chimu_11-timesI(Chimu_31); \ + Chi_12 = Chimu_12-timesI(Chimu_32);\ + std::cout << std::endl << "DEBUG -- ZP_PROJ" << std::endl; \ + std::cout << "Chi_00 -- " << Chi_00 << std::endl; \ + std::cout << "Chi_01 -- " << Chi_01 << std::endl; 
\ + std::cout << "Chi_02 -- " << Chi_02 << std::endl; \ + std::cout << "Chi_10 -- " << Chi_10 << std::endl; \ + std::cout << "Chi_11 -- " << Chi_11 << std::endl; \ + std::cout << "Chi_12 -- " << Chi_12 << std::endl; + +#define TP_PROJ \ + Chi_00 = Chimu_00+Chimu_20; \ + Chi_01 = Chimu_01+Chimu_21; \ + Chi_02 = Chimu_02+Chimu_22; \ + Chi_10 = Chimu_10+Chimu_30; \ + Chi_11 = Chimu_11+Chimu_31; \ + Chi_12 = Chimu_12+Chimu_32;\ + std::cout << std::endl << "DEBUG -- TP_PROJ" << std::endl; \ + std::cout << "Chi_00 -- " << Chi_00 << std::endl; \ + std::cout << "Chi_01 -- " << Chi_01 << std::endl; \ + std::cout << "Chi_02 -- " << Chi_02 << std::endl; \ + std::cout << "Chi_10 -- " << Chi_10 << std::endl; \ + std::cout << "Chi_11 -- " << Chi_11 << std::endl; \ + std::cout << "Chi_12 -- " << Chi_12 << std::endl; + + +// hspin(0)=fspin(0)-timesI(fspin(3)); +// hspin(1)=fspin(1)-timesI(fspin(2)); +#define XM_PROJ \ + Chi_00 = Chimu_00-timesI(Chimu_30);\ + Chi_01 = Chimu_01-timesI(Chimu_31);\ + Chi_02 = Chimu_02-timesI(Chimu_32);\ + Chi_10 = Chimu_10-timesI(Chimu_20);\ + Chi_11 = Chimu_11-timesI(Chimu_21);\ + Chi_12 = Chimu_12-timesI(Chimu_22);\ + std::cout << std::endl << "DEBUG -- XM_PROJ" << std::endl; \ + std::cout << "Chi_00 -- " << Chi_00 << std::endl; \ + std::cout << "Chi_01 -- " << Chi_01 << std::endl; \ + std::cout << "Chi_02 -- " << Chi_02 << std::endl; \ + std::cout << "Chi_10 -- " << Chi_10 << std::endl; \ + std::cout << "Chi_11 -- " << Chi_11 << std::endl; \ + std::cout << "Chi_12 -- " << Chi_12 << std::endl; + +#define YM_PROJ \ + Chi_00 = Chimu_00+Chimu_30;\ + Chi_01 = Chimu_01+Chimu_31;\ + Chi_02 = Chimu_02+Chimu_32;\ + Chi_10 = Chimu_10-Chimu_20;\ + Chi_11 = Chimu_11-Chimu_21;\ + Chi_12 = Chimu_12-Chimu_22;\ + std::cout << std::endl << "DEBUG -- YM_PROJ" << std::endl; \ + std::cout << "Chi_00 -- " << Chi_00 << std::endl; \ + std::cout << "Chi_01 -- " << Chi_01 << std::endl; \ + std::cout << "Chi_02 -- " << Chi_02 << std::endl; \ + std::cout << "Chi_10 -- " << Chi_10 << std::endl; \ + std::cout << "Chi_11 -- " << Chi_11 << std::endl; \ + std::cout << "Chi_12 -- " << Chi_12 << std::endl; + +#define ZM_PROJ \ + Chi_00 = Chimu_00-timesI(Chimu_20); \ + Chi_01 = Chimu_01-timesI(Chimu_21); \ + Chi_02 = Chimu_02-timesI(Chimu_22); \ + Chi_10 = Chimu_10+timesI(Chimu_30); \ + Chi_11 = Chimu_11+timesI(Chimu_31); \ + Chi_12 = Chimu_12+timesI(Chimu_32);\ + std::cout << std::endl << "DEBUG -- ZM_PROJ" << std::endl; \ + std::cout << "Chi_00 -- " << Chi_00 << std::endl; \ + std::cout << "Chi_01 -- " << Chi_01 << std::endl; \ + std::cout << "Chi_02 -- " << Chi_02 << std::endl; \ + std::cout << "Chi_10 -- " << Chi_10 << std::endl; \ + std::cout << "Chi_11 -- " << Chi_11 << std::endl; \ + std::cout << "Chi_12 -- " << Chi_12 << std::endl; + +#define TM_PROJ \ + Chi_00 = Chimu_00-Chimu_20; \ + Chi_01 = Chimu_01-Chimu_21; \ + Chi_02 = Chimu_02-Chimu_22; \ + Chi_10 = Chimu_10-Chimu_30; \ + Chi_11 = Chimu_11-Chimu_31; \ + Chi_12 = Chimu_12-Chimu_32;\ + std::cout << std::endl << "DEBUG -- TM_PROJ" << std::endl; \ + std::cout << "Chi_00 -- " << Chi_00 << std::endl; \ + std::cout << "Chi_01 -- " << Chi_01 << std::endl; \ + std::cout << "Chi_02 -- " << Chi_02 << std::endl; \ + std::cout << "Chi_10 -- " << Chi_10 << std::endl; \ + std::cout << "Chi_11 -- " << Chi_11 << std::endl; \ + std::cout << "Chi_12 -- " << Chi_12 << std::endl; + +// fspin(0)=hspin(0); +// fspin(1)=hspin(1); +// fspin(2)=timesMinusI(hspin(1)); +// fspin(3)=timesMinusI(hspin(0)); +#define XP_RECON\ + result_00 = UChi_00;\ + result_01 = 
UChi_01;\ + result_02 = UChi_02;\ + result_10 = UChi_10;\ + result_11 = UChi_11;\ + result_12 = UChi_12;\ + result_20 = timesMinusI(UChi_10);\ + result_21 = timesMinusI(UChi_11);\ + result_22 = timesMinusI(UChi_12);\ + result_30 = timesMinusI(UChi_00);\ + result_31 = timesMinusI(UChi_01);\ + result_32 = timesMinusI(UChi_02);\ + std::cout << std::endl << "DEBUG -- XP_RECON" << std::endl; \ + std::cout << "result_00 -- " << result_00 << std::endl; \ + std::cout << "result_01 -- " << result_01 << std::endl; \ + std::cout << "result_02 -- " << result_02 << std::endl; \ + std::cout << "result_10 -- " << result_10 << std::endl; \ + std::cout << "result_11 -- " << result_11 << std::endl; \ + std::cout << "result_12 -- " << result_12 << std::endl; \ + std::cout << "result_20 -- " << result_20 << std::endl; \ + std::cout << "result_21 -- " << result_21 << std::endl; \ + std::cout << "result_22 -- " << result_22 << std::endl; \ + std::cout << "result_30 -- " << result_30 << std::endl; \ + std::cout << "result_31 -- " << result_31 << std::endl; \ + std::cout << "result_32 -- " << result_32 << std::endl; + +#define XP_RECON_ACCUM\ + result_00+=UChi_00;\ + result_01+=UChi_01;\ + result_02+=UChi_02;\ + result_10+=UChi_10;\ + result_11+=UChi_11;\ + result_12+=UChi_12;\ + result_20-=timesI(UChi_10);\ + result_21-=timesI(UChi_11);\ + result_22-=timesI(UChi_12);\ + result_30-=timesI(UChi_00);\ + result_31-=timesI(UChi_01);\ + result_32-=timesI(UChi_02);\ + std::cout << std::endl << "DEBUG -- XP_RECON_ACCUM" << std::endl; \ + std::cout << "result_00 -- " << result_00 << std::endl; \ + std::cout << "result_01 -- " << result_01 << std::endl; \ + std::cout << "result_02 -- " << result_02 << std::endl; \ + std::cout << "result_10 -- " << result_10 << std::endl; \ + std::cout << "result_11 -- " << result_11 << std::endl; \ + std::cout << "result_12 -- " << result_12 << std::endl; \ + std::cout << "result_20 -- " << result_20 << std::endl; \ + std::cout << "result_21 -- " << result_21 << std::endl; \ + std::cout << "result_22 -- " << result_22 << std::endl; \ + std::cout << "result_30 -- " << result_30 << std::endl; \ + std::cout << "result_31 -- " << result_31 << std::endl; \ + std::cout << "result_32 -- " << result_32 << std::endl; + +#define XM_RECON\ + result_00 = UChi_00;\ + result_01 = UChi_01;\ + result_02 = UChi_02;\ + result_10 = UChi_10;\ + result_11 = UChi_11;\ + result_12 = UChi_12;\ + result_20 = timesI(UChi_10);\ + result_21 = timesI(UChi_11);\ + result_22 = timesI(UChi_12);\ + result_30 = timesI(UChi_00);\ + result_31 = timesI(UChi_01);\ + result_32 = timesI(UChi_02);\ + std::cout << std::endl << "DEBUG -- XM_RECON" << std::endl; \ + std::cout << "result_00 -- " << result_00 << std::endl; \ + std::cout << "result_01 -- " << result_01 << std::endl; \ + std::cout << "result_02 -- " << result_02 << std::endl; \ + std::cout << "result_10 -- " << result_10 << std::endl; \ + std::cout << "result_11 -- " << result_11 << std::endl; \ + std::cout << "result_12 -- " << result_12 << std::endl; \ + std::cout << "result_20 -- " << result_20 << std::endl; \ + std::cout << "result_21 -- " << result_21 << std::endl; \ + std::cout << "result_22 -- " << result_22 << std::endl; \ + std::cout << "result_30 -- " << result_30 << std::endl; \ + std::cout << "result_31 -- " << result_31 << std::endl; \ + std::cout << "result_32 -- " << result_32 << std::endl; + +#define XM_RECON_ACCUM\ + result_00+= UChi_00;\ + result_01+= UChi_01;\ + result_02+= UChi_02;\ + result_10+= UChi_10;\ + result_11+= UChi_11;\ + result_12+= 
UChi_12;\ + result_20+= timesI(UChi_10);\ + result_21+= timesI(UChi_11);\ + result_22+= timesI(UChi_12);\ + result_30+= timesI(UChi_00);\ + result_31+= timesI(UChi_01);\ + result_32+= timesI(UChi_02);\ + std::cout << std::endl << "DEBUG -- XM_RECON_ACCUM" << std::endl; \ + std::cout << "result_00 -- " << result_00 << std::endl; \ + std::cout << "result_01 -- " << result_01 << std::endl; \ + std::cout << "result_02 -- " << result_02 << std::endl; \ + std::cout << "result_10 -- " << result_10 << std::endl; \ + std::cout << "result_11 -- " << result_11 << std::endl; \ + std::cout << "result_12 -- " << result_12 << std::endl; \ + std::cout << "result_20 -- " << result_20 << std::endl; \ + std::cout << "result_21 -- " << result_21 << std::endl; \ + std::cout << "result_22 -- " << result_22 << std::endl; \ + std::cout << "result_30 -- " << result_30 << std::endl; \ + std::cout << "result_31 -- " << result_31 << std::endl; \ + std::cout << "result_32 -- " << result_32 << std::endl; + +#define YP_RECON_ACCUM\ + result_00+= UChi_00;\ + result_01+= UChi_01;\ + result_02+= UChi_02;\ + result_10+= UChi_10;\ + result_11+= UChi_11;\ + result_12+= UChi_12;\ + result_20+= UChi_10;\ + result_21+= UChi_11;\ + result_22+= UChi_12;\ + result_30-= UChi_00;\ + result_31-= UChi_01;\ + result_32-= UChi_02;\ + std::cout << std::endl << "DEBUG -- YP_RECON_ACCUM" << std::endl; \ + std::cout << "result_00 -- " << result_00 << std::endl; \ + std::cout << "result_01 -- " << result_01 << std::endl; \ + std::cout << "result_02 -- " << result_02 << std::endl; \ + std::cout << "result_10 -- " << result_10 << std::endl; \ + std::cout << "result_11 -- " << result_11 << std::endl; \ + std::cout << "result_12 -- " << result_12 << std::endl; \ + std::cout << "result_20 -- " << result_20 << std::endl; \ + std::cout << "result_21 -- " << result_21 << std::endl; \ + std::cout << "result_22 -- " << result_22 << std::endl; \ + std::cout << "result_30 -- " << result_30 << std::endl; \ + std::cout << "result_31 -- " << result_31 << std::endl; \ + std::cout << "result_32 -- " << result_32 << std::endl; + +#define YM_RECON_ACCUM\ + result_00+= UChi_00;\ + result_01+= UChi_01;\ + result_02+= UChi_02;\ + result_10+= UChi_10;\ + result_11+= UChi_11;\ + result_12+= UChi_12;\ + result_20-= UChi_10;\ + result_21-= UChi_11;\ + result_22-= UChi_12;\ + result_30+= UChi_00;\ + result_31+= UChi_01;\ + result_32+= UChi_02;\ + std::cout << std::endl << "DEBUG -- YM_RECON_ACCUM" << std::endl; \ + std::cout << "result_00 -- " << result_00 << std::endl; \ + std::cout << "result_01 -- " << result_01 << std::endl; \ + std::cout << "result_02 -- " << result_02 << std::endl; \ + std::cout << "result_10 -- " << result_10 << std::endl; \ + std::cout << "result_11 -- " << result_11 << std::endl; \ + std::cout << "result_12 -- " << result_12 << std::endl; \ + std::cout << "result_20 -- " << result_20 << std::endl; \ + std::cout << "result_21 -- " << result_21 << std::endl; \ + std::cout << "result_22 -- " << result_22 << std::endl; \ + std::cout << "result_30 -- " << result_30 << std::endl; \ + std::cout << "result_31 -- " << result_31 << std::endl; \ + std::cout << "result_32 -- " << result_32 << std::endl; + +#define ZP_RECON_ACCUM\ + result_00+= UChi_00;\ + result_01+= UChi_01;\ + result_02+= UChi_02;\ + result_10+= UChi_10;\ + result_11+= UChi_11;\ + result_12+= UChi_12;\ + result_20-= timesI(UChi_00); \ + result_21-= timesI(UChi_01); \ + result_22-= timesI(UChi_02); \ + result_30+= timesI(UChi_10); \ + result_31+= timesI(UChi_11); \ + result_32+= 
timesI(UChi_12);\ + std::cout << std::endl << "DEBUG -- ZP_RECON_ACCUM" << std::endl; \ + std::cout << "result_00 -- " << result_00 << std::endl; \ + std::cout << "result_01 -- " << result_01 << std::endl; \ + std::cout << "result_02 -- " << result_02 << std::endl; \ + std::cout << "result_10 -- " << result_10 << std::endl; \ + std::cout << "result_11 -- " << result_11 << std::endl; \ + std::cout << "result_12 -- " << result_12 << std::endl; \ + std::cout << "result_20 -- " << result_20 << std::endl; \ + std::cout << "result_21 -- " << result_21 << std::endl; \ + std::cout << "result_22 -- " << result_22 << std::endl; \ + std::cout << "result_30 -- " << result_30 << std::endl; \ + std::cout << "result_31 -- " << result_31 << std::endl; \ + std::cout << "result_32 -- " << result_32 << std::endl; + +#define ZM_RECON_ACCUM\ + result_00+= UChi_00;\ + result_01+= UChi_01;\ + result_02+= UChi_02;\ + result_10+= UChi_10;\ + result_11+= UChi_11;\ + result_12+= UChi_12;\ + result_20+= timesI(UChi_00); \ + result_21+= timesI(UChi_01); \ + result_22+= timesI(UChi_02); \ + result_30-= timesI(UChi_10); \ + result_31-= timesI(UChi_11); \ + result_32-= timesI(UChi_12);\ + std::cout << std::endl << "DEBUG -- ZM_RECON_ACCUM" << std::endl; \ + std::cout << "result_00 -- " << result_00 << std::endl; \ + std::cout << "result_01 -- " << result_01 << std::endl; \ + std::cout << "result_02 -- " << result_02 << std::endl; \ + std::cout << "result_10 -- " << result_10 << std::endl; \ + std::cout << "result_11 -- " << result_11 << std::endl; \ + std::cout << "result_12 -- " << result_12 << std::endl; \ + std::cout << "result_20 -- " << result_20 << std::endl; \ + std::cout << "result_21 -- " << result_21 << std::endl; \ + std::cout << "result_22 -- " << result_22 << std::endl; \ + std::cout << "result_30 -- " << result_30 << std::endl; \ + std::cout << "result_31 -- " << result_31 << std::endl; \ + std::cout << "result_32 -- " << result_32 << std::endl; + +#define TP_RECON_ACCUM\ + result_00+= UChi_00;\ + result_01+= UChi_01;\ + result_02+= UChi_02;\ + result_10+= UChi_10;\ + result_11+= UChi_11;\ + result_12+= UChi_12;\ + result_20+= UChi_00; \ + result_21+= UChi_01; \ + result_22+= UChi_02; \ + result_30+= UChi_10; \ + result_31+= UChi_11; \ + result_32+= UChi_12;\ + std::cout << std::endl << "DEBUG -- TP_RECON_ACCUM" << std::endl; \ + std::cout << "result_00 -- " << result_00 << std::endl; \ + std::cout << "result_01 -- " << result_01 << std::endl; \ + std::cout << "result_02 -- " << result_02 << std::endl; \ + std::cout << "result_10 -- " << result_10 << std::endl; \ + std::cout << "result_11 -- " << result_11 << std::endl; \ + std::cout << "result_12 -- " << result_12 << std::endl; \ + std::cout << "result_20 -- " << result_20 << std::endl; \ + std::cout << "result_21 -- " << result_21 << std::endl; \ + std::cout << "result_22 -- " << result_22 << std::endl; \ + std::cout << "result_30 -- " << result_30 << std::endl; \ + std::cout << "result_31 -- " << result_31 << std::endl; \ + std::cout << "result_32 -- " << result_32 << std::endl; + +#define TM_RECON_ACCUM\ + result_00+= UChi_00;\ + result_01+= UChi_01;\ + result_02+= UChi_02;\ + result_10+= UChi_10;\ + result_11+= UChi_11;\ + result_12+= UChi_12;\ + result_20-= UChi_00; \ + result_21-= UChi_01; \ + result_22-= UChi_02; \ + result_30-= UChi_10; \ + result_31-= UChi_11; \ + result_32-= UChi_12;\ + std::cout << std::endl << "DEBUG -- TM_RECON_ACCUM" << std::endl; \ + std::cout << "result_00 -- " << result_00 << std::endl; \ + std::cout << "result_01 -- " << 
result_01 << std::endl; \ + std::cout << "result_02 -- " << result_02 << std::endl; \ + std::cout << "result_10 -- " << result_10 << std::endl; \ + std::cout << "result_11 -- " << result_11 << std::endl; \ + std::cout << "result_12 -- " << result_12 << std::endl; \ + std::cout << "result_20 -- " << result_20 << std::endl; \ + std::cout << "result_21 -- " << result_21 << std::endl; \ + std::cout << "result_22 -- " << result_22 << std::endl; \ + std::cout << "result_30 -- " << result_30 << std::endl; \ + std::cout << "result_31 -- " << result_31 << std::endl; \ + std::cout << "result_32 -- " << result_32 << std::endl; + +#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON) \ + SE=st.GetEntry(ptype,DIR,ss); \ + offset = SE->_offset; \ + local = SE->_is_local; \ + perm = SE->_permute; \ + if ( local ) { \ + LOAD_CHIMU; \ + PROJ; \ + if ( perm) { \ + PERMUTE_DIR(PERM); \ + } \ + } else { \ + LOAD_CHI; \ + } \ + MULT_2SPIN(DIR); \ + RECON; + +#define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON) \ + SE=st.GetEntry(ptype,DIR,ss); \ + offset = SE->_offset; \ + local = SE->_is_local; \ + perm = SE->_permute; \ + if ( local ) { \ + LOAD_CHIMU; \ + PROJ; \ + if ( perm) { \ + PERMUTE_DIR(PERM); \ + } \ + } else if ( st.same_node[DIR] ) { \ + LOAD_CHI; \ + } \ + if (local || st.same_node[DIR] ) { \ + MULT_2SPIN(DIR); \ + RECON; \ + } + +#define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON) \ + SE=st.GetEntry(ptype,DIR,ss); \ + offset = SE->_offset; \ + if((!SE->_is_local)&&(!st.same_node[DIR]) ) { \ + LOAD_CHI; \ + MULT_2SPIN(DIR); \ + RECON; \ + nmu++; \ + } + +#define HAND_RESULT(ss) \ + { \ + SiteSpinor & ref (out[ss]); \ + vstream(ref()(0)(0),result_00); \ + vstream(ref()(0)(1),result_01); \ + vstream(ref()(0)(2),result_02); \ + vstream(ref()(1)(0),result_10); \ + vstream(ref()(1)(1),result_11); \ + vstream(ref()(1)(2),result_12); \ + vstream(ref()(2)(0),result_20); \ + vstream(ref()(2)(1),result_21); \ + vstream(ref()(2)(2),result_22); \ + vstream(ref()(3)(0),result_30); \ + vstream(ref()(3)(1),result_31); \ + vstream(ref()(3)(2),result_32); \ + std::cout << std::endl << "DEBUG -- RESULT" << std::endl; \ + std::cout << "result_00 -- " << result_00 << std::endl; \ + std::cout << "result_01 -- " << result_01 << std::endl; \ + std::cout << "result_02 -- " << result_02 << std::endl; \ + std::cout << "result_10 -- " << result_10 << std::endl; \ + std::cout << "result_11 -- " << result_11 << std::endl; \ + std::cout << "result_12 -- " << result_12 << std::endl; \ + std::cout << "result_20 -- " << result_20 << std::endl; \ + std::cout << "result_21 -- " << result_21 << std::endl; \ + std::cout << "result_22 -- " << result_22 << std::endl; \ + std::cout << "result_30 -- " << result_30 << std::endl; \ + std::cout << "result_31 -- " << result_31 << std::endl; \ + std::cout << "result_32 -- " << result_32 << std::endl;\ + } + +#define HAND_RESULT_EXT(ss) \ + if (nmu){ \ + SiteSpinor & ref (out[ss]); \ + ref()(0)(0)+=result_00; \ + ref()(0)(1)+=result_01; \ + ref()(0)(2)+=result_02; \ + ref()(1)(0)+=result_10; \ + ref()(1)(1)+=result_11; \ + ref()(1)(2)+=result_12; \ + ref()(2)(0)+=result_20; \ + ref()(2)(1)+=result_21; \ + ref()(2)(2)+=result_22; \ + ref()(3)(0)+=result_30; \ + ref()(3)(1)+=result_31; \ + ref()(3)(2)+=result_32; \ + std::cout << std::endl << "DEBUG -- RESULT EXT" << std::endl; \ + std::cout << "result_00 -- " << result_00 << std::endl; \ + std::cout << "result_01 -- " << result_01 << std::endl; \ + std::cout << "result_02 -- " << result_02 << std::endl; \ + std::cout << "result_10 -- " << result_10 << 
std::endl; \ + std::cout << "result_11 -- " << result_11 << std::endl; \ + std::cout << "result_12 -- " << result_12 << std::endl; \ + std::cout << "result_20 -- " << result_20 << std::endl; \ + std::cout << "result_21 -- " << result_21 << std::endl; \ + std::cout << "result_22 -- " << result_22 << std::endl; \ + std::cout << "result_30 -- " << result_30 << std::endl; \ + std::cout << "result_31 -- " << result_31 << std::endl; \ + std::cout << "result_32 -- " << result_32 << std::endl;\ + } + + +#define HAND_DECLARATIONS(a) \ + Simd result_00; \ + Simd result_01; \ + Simd result_02; \ + Simd result_10; \ + Simd result_11; \ + Simd result_12; \ + Simd result_20; \ + Simd result_21; \ + Simd result_22; \ + Simd result_30; \ + Simd result_31; \ + Simd result_32; \ + Simd Chi_00; \ + Simd Chi_01; \ + Simd Chi_02; \ + Simd Chi_10; \ + Simd Chi_11; \ + Simd Chi_12; \ + Simd UChi_00; \ + Simd UChi_01; \ + Simd UChi_02; \ + Simd UChi_10; \ + Simd UChi_11; \ + Simd UChi_12; \ + Simd U_00; \ + Simd U_10; \ + Simd U_20; \ + Simd U_01; \ + Simd U_11; \ + Simd U_21;\ + Simd debugreg;\ + svbool_t pg1; \ + pg1 = svptrue_b64(); \ + +#define ZERO_RESULT \ + result_00=Zero(); \ + result_01=Zero(); \ + result_02=Zero(); \ + result_10=Zero(); \ + result_11=Zero(); \ + result_12=Zero(); \ + result_20=Zero(); \ + result_21=Zero(); \ + result_22=Zero(); \ + result_30=Zero(); \ + result_31=Zero(); \ + result_32=Zero(); + +#define Chimu_00 Chi_00 +#define Chimu_01 Chi_01 +#define Chimu_02 Chi_02 +#define Chimu_10 Chi_10 +#define Chimu_11 Chi_11 +#define Chimu_12 Chi_12 +#define Chimu_20 UChi_00 +#define Chimu_21 UChi_01 +#define Chimu_22 UChi_02 +#define Chimu_30 UChi_10 +#define Chimu_31 UChi_11 +#define Chimu_32 UChi_12 + +NAMESPACE_BEGIN(Grid); + +template void +WilsonKernels::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, + int ss,int sU,const FermionFieldView &in, FermionFieldView &out) +{ +// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... 
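+// Each HAND_STENCIL_LEG below: fetch the stencil entry, load the neighbour
+// spinor (LOAD_CHIMU if the site is local, LOAD_CHI from the comms buffer
+// otherwise), spin-project, permute SIMD lanes when the entry requires it,
+// multiply by the gauge link (MULT_2SPIN) and reconstruct/accumulate into the
+// twelve result_* registers; HAND_RESULT then streams them into out[ss].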
+ typedef typename Simd::scalar_type S; + typedef typename Simd::vector_type V; + + HAND_DECLARATIONS(ignore); + + int offset,local,perm, ptype; + StencilEntry *SE; + + HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON); + HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM); + HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM); + HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM); + HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM); + HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM); + HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM); + HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM); + HAND_RESULT(ss); +} + +template +void WilsonKernels::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, + int ss,int sU,const FermionFieldView &in, FermionFieldView &out) +{ + typedef typename Simd::scalar_type S; + typedef typename Simd::vector_type V; + + HAND_DECLARATIONS(ignore); + + StencilEntry *SE; + int offset,local,perm, ptype; + + HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON); + HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM); + HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM); + HAND_STENCIL_LEG(TP_PROJ,0,Tp,TP_RECON_ACCUM); + HAND_STENCIL_LEG(XM_PROJ,3,Xm,XM_RECON_ACCUM); + HAND_STENCIL_LEG(YM_PROJ,2,Ym,YM_RECON_ACCUM); + HAND_STENCIL_LEG(ZM_PROJ,1,Zm,ZM_RECON_ACCUM); + HAND_STENCIL_LEG(TM_PROJ,0,Tm,TM_RECON_ACCUM); + HAND_RESULT(ss); +} + +template void +WilsonKernels::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, + int ss,int sU,const FermionFieldView &in, FermionFieldView &out) +{ +// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... + typedef typename Simd::scalar_type S; + typedef typename Simd::vector_type V; + + HAND_DECLARATIONS(ignore); + + int offset,local,perm, ptype; + StencilEntry *SE; + ZERO_RESULT; + HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM); + HAND_STENCIL_LEG_INT(YM_PROJ,2,Yp,YM_RECON_ACCUM); + HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM); + HAND_STENCIL_LEG_INT(TM_PROJ,0,Tp,TM_RECON_ACCUM); + HAND_STENCIL_LEG_INT(XP_PROJ,3,Xm,XP_RECON_ACCUM); + HAND_STENCIL_LEG_INT(YP_PROJ,2,Ym,YP_RECON_ACCUM); + HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM); + HAND_STENCIL_LEG_INT(TP_PROJ,0,Tm,TP_RECON_ACCUM); + HAND_RESULT(ss); +} + +template +void WilsonKernels::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, + int ss,int sU,const FermionFieldView &in, FermionFieldView &out) +{ + typedef typename Simd::scalar_type S; + typedef typename Simd::vector_type V; + + HAND_DECLARATIONS(ignore); + + StencilEntry *SE; + int offset,local,perm, ptype; + ZERO_RESULT; + HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM); + HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM); + HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM); + HAND_STENCIL_LEG_INT(TP_PROJ,0,Tp,TP_RECON_ACCUM); + HAND_STENCIL_LEG_INT(XM_PROJ,3,Xm,XM_RECON_ACCUM); + HAND_STENCIL_LEG_INT(YM_PROJ,2,Ym,YM_RECON_ACCUM); + HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM); + HAND_STENCIL_LEG_INT(TM_PROJ,0,Tm,TM_RECON_ACCUM); + HAND_RESULT(ss); +} + +template void +WilsonKernels::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, + int ss,int sU,const FermionFieldView &in, FermionFieldView &out) +{ +// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... 
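+// Exterior variant: HAND_STENCIL_LEG_EXT only processes legs whose neighbour
+// is neither local nor on the same node (i.e. data arriving from the halo
+// exchange), counting them in nmu; HAND_RESULT_EXT adds the partial sum into
+// out[ss] only when at least one such leg contributed.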
+ typedef typename Simd::scalar_type S; + typedef typename Simd::vector_type V; + + HAND_DECLARATIONS(ignore); + + int offset, ptype; + StencilEntry *SE; + int nmu=0; + ZERO_RESULT; + HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xp,XM_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(YM_PROJ,2,Yp,YM_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tp,TM_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xm,XP_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(YP_PROJ,2,Ym,YP_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tm,TP_RECON_ACCUM); + HAND_RESULT_EXT(ss); +} + +template +void WilsonKernels::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, + int ss,int sU,const FermionFieldView &in, FermionFieldView &out) +{ + typedef typename Simd::scalar_type S; + typedef typename Simd::vector_type V; + + HAND_DECLARATIONS(ignore); + + StencilEntry *SE; + int offset, ptype; + int nmu=0; + ZERO_RESULT; + HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(YP_PROJ,2,Yp,YP_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tp,TP_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xm,XM_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(YM_PROJ,2,Ym,YM_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tm,TM_RECON_ACCUM); + HAND_RESULT_EXT(ss); +} + +////////////// Wilson ; uses this implementation ///////////////////// + +NAMESPACE_END(Grid); +#undef LOAD_CHIMU +#undef LOAD_CHI +#undef MULT_2SPIN +#undef PERMUTE_DIR +#undef XP_PROJ +#undef YP_PROJ +#undef ZP_PROJ +#undef TP_PROJ +#undef XM_PROJ +#undef YM_PROJ +#undef ZM_PROJ +#undef TM_PROJ +#undef XP_RECON +#undef XP_RECON_ACCUM +#undef XM_RECON +#undef XM_RECON_ACCUM +#undef YP_RECON_ACCUM +#undef YM_RECON_ACCUM +#undef ZP_RECON_ACCUM +#undef ZM_RECON_ACCUM +#undef TP_RECON_ACCUM +#undef TM_RECON_ACCUM +#undef ZERO_RESULT +#undef Chimu_00 +#undef Chimu_01 +#undef Chimu_02 +#undef Chimu_10 +#undef Chimu_11 +#undef Chimu_12 +#undef Chimu_20 +#undef Chimu_21 +#undef Chimu_22 +#undef Chimu_30 +#undef Chimu_31 +#undef Chimu_32 +#undef HAND_STENCIL_LEG +#undef HAND_STENCIL_LEG_INT +#undef HAND_STENCIL_LEG_EXT +#undef HAND_RESULT +#undef HAND_RESULT_INT +#undef HAND_RESULT_EXT diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h index 9ca29367..c5f50bbb 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h @@ -43,11 +43,11 @@ NAMESPACE_BEGIN(Grid); accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip) { #ifdef GRID_SIMT - static_assert(sizeof(StencilEntry)==sizeof(uint4),"Unexpected Stencil Entry Size"); + static_assert(sizeof(StencilEntry)==sizeof(uint4),"Unexpected Stencil Entry Size"); uint4 * mem_pun = (uint4 *)mem; // force 128 bit loads uint4 * chip_pun = (uint4 *)&chip; * chip_pun = * mem_pun; -#else +#else chip = *mem; #endif return; @@ -66,7 +66,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip) acceleratorSynchronise(); \ Impl::multLink(Uchi, U[sU], chi, Dir, SE, st); \ Recon(result, Uchi); - + #define GENERIC_STENCIL_LEG_INT(Dir,spProj,Recon) \ SE = st.GetEntry(ptype, Dir, sF); \ if (SE->_is_local) { \ @@ -81,7 +81,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip) 
Impl::multLink(Uchi, U[sU], chi, Dir, SE, st); \ Recon(result, Uchi); \ } \ - acceleratorSynchronise(); + acceleratorSynchronise(); #define GENERIC_STENCIL_LEG_EXT(Dir,spProj,Recon) \ SE = st.GetEntry(ptype, Dir, sF); \ @@ -91,7 +91,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip) Recon(result, Uchi); \ nmu++; \ } \ - acceleratorSynchronise(); + acceleratorSynchronise(); #define GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,Recon) \ if (SE->_is_local ) { \ @@ -103,7 +103,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip) } \ acceleratorSynchronise(); \ Impl::multLink(Uchi, U[sU], chi, dir, SE, st); \ - Recon(result, Uchi); + Recon(result, Uchi); #define GENERIC_DHOPDIR_LEG(Dir,spProj,Recon) \ if (gamma == Dir) { \ @@ -114,7 +114,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip) //////////////////////////////////////////////////////////////////// // All legs kernels ; comms then compute //////////////////////////////////////////////////////////////////// -template +template accelerator_inline void WilsonKernels::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out) @@ -140,10 +140,10 @@ void WilsonKernels::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldV coalescedWrite(out[sF],result,lane); }; -template +template accelerator_inline void WilsonKernels::GenericDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int sF, - int sU, const FermionFieldView &in, FermionFieldView &out) + int sU, const FermionFieldView &in, FermionFieldView &out) { typedef decltype(coalescedRead(buf[0])) calcHalfSpinor; typedef decltype(coalescedRead(in[0])) calcSpinor; @@ -169,7 +169,7 @@ void WilsonKernels::GenericDhopSite(StencilView &st, DoubledGaugeFieldView //////////////////////////////////////////////////////////////////// // Interior kernels //////////////////////////////////////////////////////////////////// -template +template accelerator_inline void WilsonKernels::GenericDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out) @@ -197,10 +197,10 @@ void WilsonKernels::GenericDhopSiteDagInt(StencilView &st, DoubledGaugeFi coalescedWrite(out[sF], result,lane); }; -template +template accelerator_inline void WilsonKernels::GenericDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int sF, - int sU, const FermionFieldView &in, FermionFieldView &out) + int sU, const FermionFieldView &in, FermionFieldView &out) { typedef decltype(coalescedRead(buf[0])) calcHalfSpinor; typedef decltype(coalescedRead(in[0])) calcSpinor; @@ -227,7 +227,7 @@ void WilsonKernels::GenericDhopSiteInt(StencilView &st, DoubledGaugeField //////////////////////////////////////////////////////////////////// // Exterior kernels //////////////////////////////////////////////////////////////////// -template +template accelerator_inline void WilsonKernels::GenericDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out) @@ -251,17 +251,17 @@ void WilsonKernels::GenericDhopSiteDagExt(StencilView &st, DoubledGaugeFi GENERIC_STENCIL_LEG_EXT(Ym,spProjYm,accumReconYm); GENERIC_STENCIL_LEG_EXT(Zm,spProjZm,accumReconZm); GENERIC_STENCIL_LEG_EXT(Tm,spProjTm,accumReconTm); - if ( nmu ) { + if ( nmu ) { auto out_t = coalescedRead(out[sF],lane); out_t = 
out_t + result; coalescedWrite(out[sF],out_t,lane); } }; -template +template accelerator_inline void WilsonKernels::GenericDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int sF, - int sU, const FermionFieldView &in, FermionFieldView &out) + int sU, const FermionFieldView &in, FermionFieldView &out) { typedef decltype(coalescedRead(buf[0])) calcHalfSpinor; typedef decltype(coalescedRead(in[0])) calcSpinor; @@ -282,7 +282,7 @@ void WilsonKernels::GenericDhopSiteExt(StencilView &st, DoubledGaugeField GENERIC_STENCIL_LEG_EXT(Yp,spProjYm,accumReconYm); GENERIC_STENCIL_LEG_EXT(Zp,spProjZm,accumReconZm); GENERIC_STENCIL_LEG_EXT(Tp,spProjTm,accumReconTm); - if ( nmu ) { + if ( nmu ) { auto out_t = coalescedRead(out[sF],lane); out_t = out_t + result; coalescedWrite(out[sF],out_t,lane); @@ -290,7 +290,7 @@ void WilsonKernels::GenericDhopSiteExt(StencilView &st, DoubledGaugeField }; #define DhopDirMacro(Dir,spProj,spRecon) \ - template \ + template accelerator_inline \ void WilsonKernels::DhopDir##Dir(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF, \ int sU, const FermionFieldView &in, FermionFieldView &out, int dir) \ { \ @@ -307,7 +307,7 @@ void WilsonKernels::GenericDhopSiteExt(StencilView &st, DoubledGaugeField SE = st.GetEntry(ptype, dir, sF); \ GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,spRecon); \ coalescedWrite(out[sF], result,lane); \ - } + } DhopDirMacro(Xp,spProjXp,spReconXp); DhopDirMacro(Yp,spProjYp,spReconYp); @@ -318,9 +318,9 @@ DhopDirMacro(Ym,spProjYm,spReconYm); DhopDirMacro(Zm,spProjZm,spReconZm); DhopDirMacro(Tm,spProjTm,spReconTm); -template +template accelerator_inline void WilsonKernels::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF, - int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int gamma) + int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int gamma) { typedef decltype(coalescedRead(buf[0])) calcHalfSpinor; typedef decltype(coalescedRead(in[0])) calcSpinor; @@ -346,7 +346,7 @@ void WilsonKernels::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,Si template void WilsonKernels::DhopDirAll( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int Ls, - int Nsite, const FermionField &in, std::vector &out) + int Nsite, const FermionField &in, std::vector &out) { autoView(U_v ,U,AcceleratorRead); autoView(in_v ,in,AcceleratorRead); @@ -362,8 +362,8 @@ void WilsonKernels::DhopDirAll( StencilImpl &st, DoubledGaugeField &U,Site autoView(out_Tp,out[7],AcceleratorWrite); auto CBp=st.CommBuf(); accelerator_for(sss,Nsite*Ls,Simd::Nsimd(),{ - int sU=sss/Ls; - int sF =sss; + int sU=sss/Ls; + int sF =sss; DhopDirXm(st_v,U_v,CBp,sF,sU,in_v,out_Xm,0); DhopDirYm(st_v,U_v,CBp,sF,sU,in_v,out_Ym,1); DhopDirZm(st_v,U_v,CBp,sF,sU,in_v,out_Zm,2); @@ -378,7 +378,7 @@ void WilsonKernels::DhopDirAll( StencilImpl &st, DoubledGaugeField &U,Site template void WilsonKernels::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int Ls, - int Nsite, const FermionField &in, FermionField &out, int dirdisp, int gamma) + int Nsite, const FermionField &in, FermionField &out, int dirdisp, int gamma) { assert(dirdisp<=7); assert(dirdisp>=0); @@ -387,7 +387,7 @@ void WilsonKernels::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S autoView(in_v ,in ,AcceleratorRead); autoView(out_v,out,AcceleratorWrite); autoView(st_v ,st ,AcceleratorRead); - auto CBp=st.CommBuf(); + auto CBp=st.CommBuf(); #define LoopBody(Dir) \ case Dir : \ 
accelerator_for(ss,Nsite,Simd::Nsimd(),{ \ @@ -414,7 +414,7 @@ void WilsonKernels::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S break; } #undef LoopBody -} +} #define KERNEL_CALLNB(A) \ const uint64_t NN = Nsite*Ls; \ @@ -424,7 +424,7 @@ void WilsonKernels::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S WilsonKernels::A(st_v,U_v,buf,sF,sU,in_v,out_v); \ }); -#define KERNEL_CALL(A) KERNEL_CALLNB(A); accelerator_barrier(); +#define KERNEL_CALL(A) KERNEL_CALLNB(A); accelerator_barrier(); #define ASM_CALL(A) \ thread_for( ss, Nsite, { \ @@ -436,14 +436,14 @@ void WilsonKernels::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S template void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf, int Ls, int Nsite, const FermionField &in, FermionField &out, - int interior,int exterior) + int interior,int exterior) { autoView(U_v , U,AcceleratorRead); autoView(in_v , in,AcceleratorRead); autoView(out_v,out,AcceleratorWrite); autoView(st_v , st,AcceleratorRead); - if( interior && exterior ) { + if( interior && exterior ) { if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;} #ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;} @@ -455,7 +455,7 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteInt); return;} if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); return;} #endif - } else if( exterior ) { + } else if( exterior ) { if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteExt); return;} #ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt); return;} @@ -467,14 +467,14 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField template void WilsonKernels::DhopDagKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf, int Ls, int Nsite, const FermionField &in, FermionField &out, - int interior,int exterior) + int interior,int exterior) { autoView(U_v ,U,AcceleratorRead); autoView(in_v ,in,AcceleratorRead); autoView(out_v,out,AcceleratorWrite); autoView(st_v ,st,AcceleratorRead); - if( interior && exterior ) { + if( interior && exterior ) { if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDag); return;} #ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDag); return;} @@ -486,7 +486,7 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagInt); return;} if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagInt); return;} #endif - } else if( exterior ) { + } else if( exterior ) { if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagExt); return;} #ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagExt); return;} @@ -501,4 +501,3 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField #undef ASM_CALL NAMESPACE_END(Grid); - diff --git a/Grid/qcd/action/fermion/instantiation/WilsonAdjImplD/WilsonKernelsInstantiationWilsonAdjImplD.cc b/Grid/qcd/action/fermion/instantiation/WilsonAdjImplD/WilsonKernelsInstantiationWilsonAdjImplD.cc deleted file mode 120000 index 01c35e7b..00000000 --- 
a/Grid/qcd/action/fermion/instantiation/WilsonAdjImplD/WilsonKernelsInstantiationWilsonAdjImplD.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonKernelsInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonAdjImplD/WilsonKernelsInstantiationWilsonAdjImplD.cc b/Grid/qcd/action/fermion/instantiation/WilsonAdjImplD/WilsonKernelsInstantiationWilsonAdjImplD.cc new file mode 100644 index 00000000..f0b15e3b --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/WilsonAdjImplD/WilsonKernelsInstantiationWilsonAdjImplD.cc @@ -0,0 +1,51 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./lib/qcd/action/fermion/WilsonKernels.cc + +Copyright (C) 2015, 2020 + +Author: Peter Boyle +Author: Peter Boyle +Author: paboyle +Author: Nils Meyer Regensburg University + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +#include +#include +#include + +#ifndef AVX512 +#ifndef QPX +#ifndef A64FX +#ifndef A64FXFIXEDSIZE +#include +#endif +#endif +#endif +#endif + +NAMESPACE_BEGIN(Grid); + +#include "impl.h" +template class WilsonKernels; + +NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/instantiation/WilsonAdjImplF/WilsonKernelsInstantiationWilsonAdjImplF.cc b/Grid/qcd/action/fermion/instantiation/WilsonAdjImplF/WilsonKernelsInstantiationWilsonAdjImplF.cc deleted file mode 120000 index 01c35e7b..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonAdjImplF/WilsonKernelsInstantiationWilsonAdjImplF.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonKernelsInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonAdjImplF/WilsonKernelsInstantiationWilsonAdjImplF.cc b/Grid/qcd/action/fermion/instantiation/WilsonAdjImplF/WilsonKernelsInstantiationWilsonAdjImplF.cc new file mode 100644 index 00000000..f0b15e3b --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/WilsonAdjImplF/WilsonKernelsInstantiationWilsonAdjImplF.cc @@ -0,0 +1,51 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./lib/qcd/action/fermion/WilsonKernels.cc + +Copyright (C) 2015, 2020 + +Author: Peter Boyle +Author: Peter Boyle +Author: paboyle +Author: Nils Meyer Regensburg University + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. 
+ +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +#include +#include +#include + +#ifndef AVX512 +#ifndef QPX +#ifndef A64FX +#ifndef A64FXFIXEDSIZE +#include +#endif +#endif +#endif +#endif + +NAMESPACE_BEGIN(Grid); + +#include "impl.h" +template class WilsonKernels; + +NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplD/WilsonKernelsInstantiationWilsonImplD.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplD/WilsonKernelsInstantiationWilsonImplD.cc deleted file mode 120000 index 01c35e7b..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplD/WilsonKernelsInstantiationWilsonImplD.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonKernelsInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplD/WilsonKernelsInstantiationWilsonImplD.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplD/WilsonKernelsInstantiationWilsonImplD.cc new file mode 100644 index 00000000..f0b15e3b --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/WilsonImplD/WilsonKernelsInstantiationWilsonImplD.cc @@ -0,0 +1,51 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./lib/qcd/action/fermion/WilsonKernels.cc + +Copyright (C) 2015, 2020 + +Author: Peter Boyle +Author: Peter Boyle +Author: paboyle +Author: Nils Meyer Regensburg University + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +#include +#include +#include + +#ifndef AVX512 +#ifndef QPX +#ifndef A64FX +#ifndef A64FXFIXEDSIZE +#include +#endif +#endif +#endif +#endif + +NAMESPACE_BEGIN(Grid); + +#include "impl.h" +template class WilsonKernels; + +NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplDF/WilsonKernelsInstantiationWilsonImplDF.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplDF/WilsonKernelsInstantiationWilsonImplDF.cc deleted file mode 120000 index 01c35e7b..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplDF/WilsonKernelsInstantiationWilsonImplDF.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonKernelsInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplDF/WilsonKernelsInstantiationWilsonImplDF.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplDF/WilsonKernelsInstantiationWilsonImplDF.cc new file mode 100644 index 00000000..f0b15e3b --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/WilsonImplDF/WilsonKernelsInstantiationWilsonImplDF.cc @@ -0,0 +1,51 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./lib/qcd/action/fermion/WilsonKernels.cc + +Copyright (C) 2015, 2020 + +Author: Peter Boyle +Author: Peter Boyle +Author: paboyle +Author: Nils Meyer Regensburg University + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +#include +#include +#include + +#ifndef AVX512 +#ifndef QPX +#ifndef A64FX +#ifndef A64FXFIXEDSIZE +#include +#endif +#endif +#endif +#endif + +NAMESPACE_BEGIN(Grid); + +#include "impl.h" +template class WilsonKernels; + +NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplF/WilsonKernelsInstantiationWilsonImplF.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplF/WilsonKernelsInstantiationWilsonImplF.cc deleted file mode 120000 index 01c35e7b..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplF/WilsonKernelsInstantiationWilsonImplF.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonKernelsInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplF/WilsonKernelsInstantiationWilsonImplF.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplF/WilsonKernelsInstantiationWilsonImplF.cc new file mode 100644 index 00000000..f0b15e3b --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/WilsonImplF/WilsonKernelsInstantiationWilsonImplF.cc @@ -0,0 +1,51 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./lib/qcd/action/fermion/WilsonKernels.cc + +Copyright (C) 2015, 2020 + +Author: Peter Boyle +Author: Peter Boyle +Author: paboyle +Author: Nils Meyer Regensburg University + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +#include +#include +#include + +#ifndef AVX512 +#ifndef QPX +#ifndef A64FX +#ifndef A64FXFIXEDSIZE +#include +#endif +#endif +#endif +#endif + +NAMESPACE_BEGIN(Grid); + +#include "impl.h" +template class WilsonKernels; + +NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplFH/WilsonKernelsInstantiationWilsonImplFH.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplFH/WilsonKernelsInstantiationWilsonImplFH.cc deleted file mode 120000 index 01c35e7b..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplFH/WilsonKernelsInstantiationWilsonImplFH.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonKernelsInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplFH/WilsonKernelsInstantiationWilsonImplFH.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplFH/WilsonKernelsInstantiationWilsonImplFH.cc new file mode 100644 index 00000000..f0b15e3b --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/WilsonImplFH/WilsonKernelsInstantiationWilsonImplFH.cc @@ -0,0 +1,51 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./lib/qcd/action/fermion/WilsonKernels.cc + +Copyright (C) 2015, 2020 + +Author: Peter Boyle +Author: Peter Boyle +Author: paboyle +Author: Nils Meyer Regensburg University + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +#include +#include +#include + +#ifndef AVX512 +#ifndef QPX +#ifndef A64FX +#ifndef A64FXFIXEDSIZE +#include +#endif +#endif +#endif +#endif + +NAMESPACE_BEGIN(Grid); + +#include "impl.h" +template class WilsonKernels; + +NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/instantiation/WilsonKernelsInstantiation.cc.master b/Grid/qcd/action/fermion/instantiation/WilsonKernelsInstantiation.cc.master index 9af5ed85..f0b15e3b 100644 --- a/Grid/qcd/action/fermion/instantiation/WilsonKernelsInstantiation.cc.master +++ b/Grid/qcd/action/fermion/instantiation/WilsonKernelsInstantiation.cc.master @@ -4,11 +4,12 @@ Grid physics library, www.github.com/paboyle/Grid Source file: ./lib/qcd/action/fermion/WilsonKernels.cc -Copyright (C) 2015 +Copyright (C) 2015, 2020 Author: Peter Boyle Author: Peter Boyle Author: paboyle +Author: Nils Meyer Regensburg University This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -34,14 +35,17 @@ directory #ifndef AVX512 #ifndef QPX +#ifndef A64FX +#ifndef A64FXFIXEDSIZE #include #endif #endif +#endif +#endif NAMESPACE_BEGIN(Grid); #include "impl.h" -template class WilsonKernels; +template class WilsonKernels; NAMESPACE_END(Grid); - diff --git a/Grid/qcd/action/fermion/instantiation/WilsonKernelsInstantiationAsm.cc b/Grid/qcd/action/fermion/instantiation/WilsonKernelsInstantiationAsm.cc index f6f235c8..a8e9e6d9 100644 --- a/Grid/qcd/action/fermion/instantiation/WilsonKernelsInstantiationAsm.cc +++ b/Grid/qcd/action/fermion/instantiation/WilsonKernelsInstantiationAsm.cc @@ -37,6 +37,7 @@ directory //////////////////////////////////////////////////////////////////////// NAMESPACE_BEGIN(Grid); #include +#include #include NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplD.cc b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplD.cc deleted file mode 120000 index 01c35e7b..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplD.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonKernelsInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplD.cc b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplD.cc new file mode 100644 index 00000000..f0b15e3b --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplD.cc @@ -0,0 +1,51 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./lib/qcd/action/fermion/WilsonKernels.cc + +Copyright (C) 2015, 2020 + +Author: Peter Boyle +Author: Peter Boyle +Author: paboyle +Author: Nils Meyer Regensburg University + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either 
version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +#include +#include +#include + +#ifndef AVX512 +#ifndef QPX +#ifndef A64FX +#ifndef A64FXFIXEDSIZE +#include +#endif +#endif +#endif +#endif + +NAMESPACE_BEGIN(Grid); + +#include "impl.h" +template class WilsonKernels; + +NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplF.cc b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplF.cc deleted file mode 120000 index 01c35e7b..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplF.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonKernelsInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplF.cc b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplF.cc new file mode 100644 index 00000000..f0b15e3b --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplF.cc @@ -0,0 +1,51 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./lib/qcd/action/fermion/WilsonKernels.cc + +Copyright (C) 2015, 2020 + +Author: Peter Boyle +Author: Peter Boyle +Author: paboyle +Author: Nils Meyer Regensburg University + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +#include +#include +#include + +#ifndef AVX512 +#ifndef QPX +#ifndef A64FX +#ifndef A64FXFIXEDSIZE +#include +#endif +#endif +#endif +#endif + +NAMESPACE_BEGIN(Grid); + +#include "impl.h" +template class WilsonKernels; + +NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplD.cc b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplD.cc deleted file mode 120000 index 01c35e7b..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplD.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonKernelsInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplD.cc b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplD.cc new file mode 100644 index 00000000..f0b15e3b --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplD.cc @@ -0,0 +1,51 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./lib/qcd/action/fermion/WilsonKernels.cc + +Copyright (C) 2015, 2020 + +Author: Peter Boyle +Author: Peter Boyle +Author: paboyle +Author: Nils Meyer Regensburg University + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +#include +#include +#include + +#ifndef AVX512 +#ifndef QPX +#ifndef A64FX +#ifndef A64FXFIXEDSIZE +#include +#endif +#endif +#endif +#endif + +NAMESPACE_BEGIN(Grid); + +#include "impl.h" +template class WilsonKernels; + +NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplF.cc b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplF.cc deleted file mode 120000 index 01c35e7b..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplF.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonKernelsInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplF.cc b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplF.cc new file mode 100644 index 00000000..f0b15e3b --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplF.cc @@ -0,0 +1,51 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./lib/qcd/action/fermion/WilsonKernels.cc + +Copyright (C) 2015, 2020 + +Author: Peter Boyle +Author: Peter Boyle +Author: paboyle +Author: Nils Meyer Regensburg University + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +#include +#include +#include + +#ifndef AVX512 +#ifndef QPX +#ifndef A64FX +#ifndef A64FXFIXEDSIZE +#include +#endif +#endif +#endif +#endif + +NAMESPACE_BEGIN(Grid); + +#include "impl.h" +template class WilsonKernels; + +NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD/WilsonKernelsInstantiationZWilsonImplD.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplD/WilsonKernelsInstantiationZWilsonImplD.cc deleted file mode 120000 index 01c35e7b..00000000 --- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD/WilsonKernelsInstantiationZWilsonImplD.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonKernelsInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD/WilsonKernelsInstantiationZWilsonImplD.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplD/WilsonKernelsInstantiationZWilsonImplD.cc new file mode 100644 index 00000000..f0b15e3b --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/ZWilsonImplD/WilsonKernelsInstantiationZWilsonImplD.cc @@ -0,0 +1,51 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./lib/qcd/action/fermion/WilsonKernels.cc + +Copyright (C) 2015, 2020 + +Author: Peter Boyle +Author: Peter Boyle +Author: paboyle +Author: Nils Meyer Regensburg University + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +#include +#include +#include + +#ifndef AVX512 +#ifndef QPX +#ifndef A64FX +#ifndef A64FXFIXEDSIZE +#include +#endif +#endif +#endif +#endif + +NAMESPACE_BEGIN(Grid); + +#include "impl.h" +template class WilsonKernels; + +NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplDF/WilsonKernelsInstantiationZWilsonImplDF.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplDF/WilsonKernelsInstantiationZWilsonImplDF.cc deleted file mode 120000 index 01c35e7b..00000000 --- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplDF/WilsonKernelsInstantiationZWilsonImplDF.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonKernelsInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplDF/WilsonKernelsInstantiationZWilsonImplDF.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplDF/WilsonKernelsInstantiationZWilsonImplDF.cc new file mode 100644 index 00000000..f0b15e3b --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/ZWilsonImplDF/WilsonKernelsInstantiationZWilsonImplDF.cc @@ -0,0 +1,51 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./lib/qcd/action/fermion/WilsonKernels.cc + +Copyright (C) 2015, 2020 + +Author: Peter Boyle +Author: Peter Boyle +Author: paboyle +Author: Nils Meyer Regensburg University + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +#include +#include +#include + +#ifndef AVX512 +#ifndef QPX +#ifndef A64FX +#ifndef A64FXFIXEDSIZE +#include +#endif +#endif +#endif +#endif + +NAMESPACE_BEGIN(Grid); + +#include "impl.h" +template class WilsonKernels; + +NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplF/WilsonKernelsInstantiationZWilsonImplF.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplF/WilsonKernelsInstantiationZWilsonImplF.cc deleted file mode 120000 index 01c35e7b..00000000 --- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplF/WilsonKernelsInstantiationZWilsonImplF.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonKernelsInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplF/WilsonKernelsInstantiationZWilsonImplF.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplF/WilsonKernelsInstantiationZWilsonImplF.cc new file mode 100644 index 00000000..f0b15e3b --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/ZWilsonImplF/WilsonKernelsInstantiationZWilsonImplF.cc @@ -0,0 +1,51 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./lib/qcd/action/fermion/WilsonKernels.cc + +Copyright (C) 2015, 2020 + +Author: Peter Boyle +Author: Peter Boyle +Author: paboyle +Author: Nils Meyer Regensburg University + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +#include +#include +#include + +#ifndef AVX512 +#ifndef QPX +#ifndef A64FX +#ifndef A64FXFIXEDSIZE +#include +#endif +#endif +#endif +#endif + +NAMESPACE_BEGIN(Grid); + +#include "impl.h" +template class WilsonKernels; + +NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplFH/WilsonKernelsInstantiationZWilsonImplFH.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplFH/WilsonKernelsInstantiationZWilsonImplFH.cc deleted file mode 120000 index 01c35e7b..00000000 --- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplFH/WilsonKernelsInstantiationZWilsonImplFH.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonKernelsInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplFH/WilsonKernelsInstantiationZWilsonImplFH.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplFH/WilsonKernelsInstantiationZWilsonImplFH.cc new file mode 100644 index 00000000..f0b15e3b --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/ZWilsonImplFH/WilsonKernelsInstantiationZWilsonImplFH.cc @@ -0,0 +1,51 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./lib/qcd/action/fermion/WilsonKernels.cc + +Copyright (C) 2015, 2020 + +Author: Peter Boyle +Author: Peter Boyle +Author: paboyle +Author: Nils Meyer Regensburg University + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +#include +#include +#include + +#ifndef AVX512 +#ifndef QPX +#ifndef A64FX +#ifndef A64FXFIXEDSIZE +#include +#endif +#endif +#endif +#endif + +NAMESPACE_BEGIN(Grid); + +#include "impl.h" +template class WilsonKernels; + +NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/gauge/GaugeImplementations.h b/Grid/qcd/action/gauge/GaugeImplementations.h index 19bc5aa6..a14aec1b 100644 --- a/Grid/qcd/action/gauge/GaugeImplementations.h +++ b/Grid/qcd/action/gauge/GaugeImplementations.h @@ -59,7 +59,7 @@ public: } static inline GaugeLinkField CovShiftIdentityBackward(const GaugeLinkField &Link, int mu) { - return Cshift(closure(adj(Link)), mu, -1); + return Cshift(adj(Link), mu, -1); } static inline GaugeLinkField CovShiftIdentityForward(const GaugeLinkField &Link, int mu) { diff --git a/Grid/qcd/hmc/integrators/Integrator.h b/Grid/qcd/hmc/integrators/Integrator.h index 91af7372..d5475704 100644 --- a/Grid/qcd/hmc/integrators/Integrator.h +++ b/Grid/qcd/hmc/integrators/Integrator.h @@ -301,9 +301,9 @@ public: t_P[level] = 0; } - for (int step = 0; step < Params.MDsteps; ++step) { // MD step - int first_step = (step == 0); - int last_step = (step == Params.MDsteps - 1); + for (int stp = 0; stp < Params.MDsteps; ++stp) { // MD step + int first_step = (stp == 0); + int last_step = (stp == Params.MDsteps - 1); this->step(U, 0, first_step, last_step); } diff --git a/Grid/qcd/utils/CovariantCshift.h b/Grid/qcd/utils/CovariantCshift.h index 6ac69150..cee1fa12 100644 --- a/Grid/qcd/utils/CovariantCshift.h +++ b/Grid/qcd/utils/CovariantCshift.h @@ -53,23 +53,21 @@ namespace PeriodicBC { return Cshift(tmp,mu,-1);// moves towards positive mu } - template auto - CovShiftForward(const Lattice &Link, - int mu, - const LatticeUnaryExpression &expr) - -> Lattice + template::value,void>::type * = nullptr> + auto CovShiftForward(const Lattice &Link, + int mu, + const Expr &expr) -> decltype(closure(expr)) { - Lattice arg(expr); + auto arg = closure(expr); return CovShiftForward(Link,mu,arg); } - template auto - CovShiftBackward(const Lattice &Link, - int mu, - const LatticeUnaryExpression &expr) - -> Lattice + template::value,void>::type * = nullptr> + auto CovShiftBackward(const Lattice &Link, + int mu, + const Expr &expr) -> decltype(closure(expr)) { - Lattice arg(expr); - return CovShiftForward(Link,mu,arg); + auto arg = closure(expr); + return CovShiftBackward(Link,mu,arg); } } @@ -142,26 +140,23 @@ namespace ConjugateBC { return Cshift(tmp,mu,-1);// moves towards positive mu } - template auto - CovShiftForward(const Lattice &Link, - int mu, - const LatticeUnaryExpression &expr) - -> Lattice + template::value,void>::type * = nullptr> + auto CovShiftForward(const Lattice &Link, + int mu, + const Expr &expr) -> decltype(closure(expr)) { - Lattice arg(expr); + auto arg = closure(expr); return CovShiftForward(Link,mu,arg); } - template auto - CovShiftBackward(const Lattice &Link, - int mu, - const LatticeUnaryExpression &expr) - -> Lattice + template::value,void>::type * = nullptr> + auto CovShiftBackward(const Lattice &Link, + int mu, + const Expr &expr) -> decltype(closure(expr)) { - Lattice arg(expr); - return CovShiftForward(Link,mu,arg); + auto arg = closure(expr); + return CovShiftBackward(Link,mu,arg); } - } diff --git a/Grid/qcd/utils/SUnAdjoint.h b/Grid/qcd/utils/SUnAdjoint.h index 1d530373..18d6b875 100644 --- 
a/Grid/qcd/utils/SUnAdjoint.h +++ b/Grid/qcd/utils/SUnAdjoint.h @@ -39,7 +39,7 @@ public: typedef iSUnAdjointMatrix AMatrixF; typedef iSUnAdjointMatrix AMatrixD; - typedef iSUnAdjointMatrix vAMatrix; + typedef iSUnAdjointMatrix vAMatrix; typedef iSUnAdjointMatrix vAMatrixF; typedef iSUnAdjointMatrix vAMatrixD; @@ -47,14 +47,9 @@ public: typedef Lattice LatticeAdjMatrixF; typedef Lattice LatticeAdjMatrixD; - typedef Lattice >, Nd> > - LatticeAdjField; - typedef Lattice >, Nd> > - LatticeAdjFieldF; - typedef Lattice >, Nd> > - LatticeAdjFieldD; - - + typedef Lattice >, Nd> > LatticeAdjField; + typedef Lattice >, Nd> > LatticeAdjFieldF; + typedef Lattice >, Nd> > LatticeAdjFieldD; template @@ -128,7 +123,9 @@ public: } // Projects the algebra components a lattice matrix (of dimension ncol*ncol -1 ) - static void projectOnAlgebra(typename SU::LatticeAlgebraVector &h_out, const LatticeAdjMatrix &in, Real scale = 1.0) { + static void projectOnAlgebra(typename SU::LatticeAlgebraVector &h_out, const LatticeAdjMatrix &in, Real scale = 1.0) + { + conformable(h_out, in); h_out = Zero(); AMatrix iTa; @@ -136,7 +133,7 @@ public: for (int a = 0; a < Dimension; a++) { generator(a, iTa); - auto tmp = real(trace(iTa * in)) * coefficient; + LatticeComplex tmp = real(trace(iTa * in)) * coefficient; pokeColour(h_out, tmp, a); } } diff --git a/Grid/qcd/utils/WilsonLoops.h b/Grid/qcd/utils/WilsonLoops.h index fdd53698..0367c9fa 100644 --- a/Grid/qcd/utils/WilsonLoops.h +++ b/Grid/qcd/utils/WilsonLoops.h @@ -485,7 +485,7 @@ public: // Up staple ___ ___ // | | - tmp = Cshift(closure(adj(U[nu])), nu, -1); + tmp = Cshift(adj(U[nu]), nu, -1); tmp = adj(U2[mu]) * tmp; tmp = Cshift(tmp, mu, -2); @@ -519,7 +519,7 @@ public: // // | | - tmp = Cshift(closure(adj(U2[nu])), nu, -2); + tmp = Cshift(adj(U2[nu]), nu, -2); tmp = Gimpl::CovShiftBackward(U[mu], mu, tmp); tmp = U2[nu] * Cshift(tmp, nu, 2); Stap += Cshift(tmp, mu, 1); diff --git a/Grid/simd/Fujitsu_A64FX_asm_double.h b/Grid/simd/Fujitsu_A64FX_asm_double.h new file mode 100644 index 00000000..76c556d7 --- /dev/null +++ b/Grid/simd/Fujitsu_A64FX_asm_double.h @@ -0,0 +1,779 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: Fujitsu_A64FX_asm_double.h + + Copyright (C) 2020 + +Author: Nils Meyer + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#define LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_A64FXd(base) +#define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXd(A) +#define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A) +#define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXd(A) +#define PREFETCH_GAUGE_L2(A) PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A) +#define PF_GAUGE(A) +#define PREFETCH_RESULT_L2_STORE(A) PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXd(A) +#define PREFETCH_RESULT_L1_STORE(A) PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXd(A) +#define PREFETCH1_CHIMU(A) PREFETCH_CHIMU_L1(A) +#define PREFETCH_CHIMU(A) PREFETCH_CHIMU_L1(A) +#define LOCK_GAUGE(A) +#define UNLOCK_GAUGE(A) +#define MASK_REGS DECLARATIONS_A64FXd +#define SAVE_RESULT(A,B) RESULT_A64FXd(A); PREFETCH_RESULT_L2_STORE(B) +#define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXd(Dir) +#define MULT_2SPIN_2 MULT_2SPIN_2_A64FXd +#define LOAD_CHI(base) LOAD_CHI_A64FXd(base) +#define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base) +#define XP_PROJ XP_PROJ_A64FXd +#define YP_PROJ YP_PROJ_A64FXd +#define ZP_PROJ ZP_PROJ_A64FXd +#define TP_PROJ TP_PROJ_A64FXd +#define XM_PROJ XM_PROJ_A64FXd +#define YM_PROJ YM_PROJ_A64FXd +#define ZM_PROJ ZM_PROJ_A64FXd +#define TM_PROJ TM_PROJ_A64FXd +#define XP_RECON XP_RECON_A64FXd +#define XM_RECON XM_RECON_A64FXd +#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXd +#define YM_RECON_ACCUM YM_RECON_ACCUM_A64FXd +#define ZM_RECON_ACCUM ZM_RECON_ACCUM_A64FXd +#define TM_RECON_ACCUM TM_RECON_ACCUM_A64FXd +#define XP_RECON_ACCUM XP_RECON_ACCUM_A64FXd +#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXd +#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXd +#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXd +#define PERMUTE_DIR0 0 +#define PERMUTE_DIR1 1 +#define PERMUTE_DIR2 2 +#define PERMUTE_DIR3 3 +#define PERMUTE PERMUTE_A64FXd; +#define LOAD_TABLE(Dir) if (Dir == 0) { LOAD_TABLE0; } else if (Dir == 1) { LOAD_TABLE1; } else if (Dir == 2) { LOAD_TABLE2; } +#define MAYBEPERM(Dir,perm) if (Dir != 3) { if (perm) { PERMUTE; } } +// DECLARATIONS +#define DECLARATIONS_A64FXd \ + const uint64_t lut[4][8] = { \ + {4, 5, 6, 7, 0, 1, 2, 3}, \ + {2, 3, 0, 1, 6, 7, 4, 5}, \ + {1, 0, 3, 2, 5, 4, 7, 6}, \ + {0, 1, 2, 4, 5, 6, 7, 8} };\ +asm ( \ + "fmov z31.d , 0 \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// RESULT +#define RESULT_A64FXd(base) \ +{ \ +asm ( \ + "str z0, [%[storeptr], -6, mul vl] \n\t" \ + "str z1, [%[storeptr], -5, mul vl] \n\t" \ + "str z2, [%[storeptr], -4, mul vl] \n\t" \ + "str z3, [%[storeptr], -3, mul vl] \n\t" \ + "str z4, [%[storeptr], -2, mul vl] \n\t" \ + "str z5, [%[storeptr], -1, mul vl] \n\t" \ + "str z6, [%[storeptr], 0, mul vl] \n\t" \ + "str z7, [%[storeptr], 1, mul vl] \n\t" \ + "str z8, [%[storeptr], 2, mul vl] \n\t" \ + "str z9, [%[storeptr], 3, mul vl] \n\t" \ + "str z10, [%[storeptr], 4, mul vl] \n\t" \ + "str z11, [%[storeptr], 5, mul vl] \n\t" \ + : \ + : [storeptr] "r" (base + 2 * 3 * 64) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// PREFETCH_CHIMU_L2 
(prefetch to L2) +#define PREFETCH_CHIMU_L2_INTERNAL_A64FXd(base) \ +{ \ +asm ( \ + "prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (base) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// PREFETCH_CHIMU_L1 (prefetch to L1) +#define PREFETCH_CHIMU_L1_INTERNAL_A64FXd(base) \ +{ \ +asm ( \ + "prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ + "prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ + "prfd PLDL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (base) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// PREFETCH_GAUGE_L2 (prefetch to L2) +#define PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A) \ +{ \ + const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \ +asm ( \ + "prfd PLDL2STRM, p5, [%[fetchptr], -4, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 12, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 16, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 20, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 24, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 28, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (baseU) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// PREFETCH_GAUGE_L1 (prefetch to L1) +#define PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A) \ +{ \ + const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ +asm ( \ + "prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ + "prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ + "prfd PLDL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (baseU) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// LOAD_CHI +#define LOAD_CHI_A64FXd(base) \ +{ \ +asm ( \ + "ldr z12, [%[fetchptr], 0, mul vl] \n\t" \ + "ldr z13, [%[fetchptr], 1, mul vl] \n\t" \ + "ldr z14, [%[fetchptr], 2, mul vl] \n\t" \ + "ldr z15, [%[fetchptr], 3, mul vl] \n\t" \ + "ldr z16, [%[fetchptr], 4, mul vl] \n\t" \ + "ldr z17, [%[fetchptr], 5, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (base) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// LOAD_CHIMU +#define LOAD_CHIMU_INTERLEAVED_A64FXd(base) \ +{ \ +asm ( \ + "ptrue p5.d \n\t" \ + "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \ + "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \ + "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \ + "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \ + "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \ + "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \ + "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \ + "ldr 
z19, [%[fetchptr], 1, mul vl] \n\t" \ + "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \ + "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \ + "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \ + "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (base + 2 * 3 * 64) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// LOAD_CHIMU_0213 +#define LOAD_CHIMU_0213_A64FXd \ +{ \ + const SiteSpinor & ref(in[offset]); \ +asm ( \ + "ptrue p5.d \n\t" \ + "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \ + "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \ + "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \ + "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \ + "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \ + "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \ + "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \ + "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \ + "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \ + "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \ + "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \ + "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (&ref[2][0]) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// LOAD_CHIMU_0312 +#define LOAD_CHIMU_0312_A64FXd \ +{ \ + const SiteSpinor & ref(in[offset]); \ +asm ( \ + "ptrue p5.d \n\t" \ + "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \ + "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \ + "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \ + "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \ + "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \ + "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \ + "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \ + "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \ + "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \ + "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \ + "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \ + "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (&ref[2][0]) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// LOAD_TABLE0 +#define LOAD_TABLE0 \ +asm ( \ + "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ + : \ + : [tableptr] "r" (&lut[0]),[index] "i" (0) \ + : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// LOAD_TABLE1 +#define LOAD_TABLE1 \ +asm ( \ + "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ + : \ + : [tableptr] "r" (&lut[0]),[index] "i" (1) \ + : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// LOAD_TABLE2 +#define LOAD_TABLE2 \ +asm ( \ + "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ + : \ + : [tableptr] "r" (&lut[0]),[index] "i" (2) \ + : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// LOAD_TABLE3 +#define LOAD_TABLE3 \ +asm ( \ + "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ + : \ + : 
[tableptr] "r" (&lut[0]),[index] "i" (3) \ + : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// PERMUTE +#define PERMUTE_A64FXd \ +asm ( \ + "tbl z12.d, { z12.d }, z30.d \n\t" \ + "tbl z13.d, { z13.d }, z30.d \n\t" \ + "tbl z14.d, { z14.d }, z30.d \n\t" \ + "tbl z15.d, { z15.d }, z30.d \n\t" \ + "tbl z16.d, { z16.d }, z30.d \n\t" \ + "tbl z17.d, { z17.d }, z30.d \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// LOAD_GAUGE +#define LOAD_GAUGE \ + const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ +{ \ +asm ( \ + "ptrue p5.d \n\t" \ + "ldr z24, [%[fetchptr], -6, mul vl] \n\t" \ + "ldr z25, [%[fetchptr], -3, mul vl] \n\t" \ + "ldr z26, [%[fetchptr], 0, mul vl] \n\t" \ + "ldr z27, [%[fetchptr], -5, mul vl] \n\t" \ + "ldr z28, [%[fetchptr], -2, mul vl] \n\t" \ + "ldr z29, [%[fetchptr], 1, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (baseU + 2 * 3 * 64) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// MULT_2SPIN +#define MULT_2SPIN_1_A64FXd(A) \ +{ \ + const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ +asm ( \ + "ldr z24, [%[fetchptr], -6, mul vl] \n\t" \ + "ldr z25, [%[fetchptr], -3, mul vl] \n\t" \ + "ldr z26, [%[fetchptr], 0, mul vl] \n\t" \ + "ldr z27, [%[fetchptr], -5, mul vl] \n\t" \ + "ldr z28, [%[fetchptr], -2, mul vl] \n\t" \ + "ldr z29, [%[fetchptr], 1, mul vl] \n\t" \ + "movprfx z18.d, p5/m, z31.d \n\t" \ + "fcmla z18.d, p5/m, z24.d, z12.d, 0 \n\t" \ + "movprfx z21.d, p5/m, z31.d \n\t" \ + "fcmla z21.d, p5/m, z24.d, z15.d, 0 \n\t" \ + "movprfx z19.d, p5/m, z31.d \n\t" \ + "fcmla z19.d, p5/m, z25.d, z12.d, 0 \n\t" \ + "movprfx z22.d, p5/m, z31.d \n\t" \ + "fcmla z22.d, p5/m, z25.d, z15.d, 0 \n\t" \ + "movprfx z20.d, p5/m, z31.d \n\t" \ + "fcmla z20.d, p5/m, z26.d, z12.d, 0 \n\t" \ + "movprfx z23.d, p5/m, z31.d \n\t" \ + "fcmla z23.d, p5/m, z26.d, z15.d, 0 \n\t" \ + "fcmla z18.d, p5/m, z24.d, z12.d, 90 \n\t" \ + "fcmla z21.d, p5/m, z24.d, z15.d, 90 \n\t" \ + "fcmla z19.d, p5/m, z25.d, z12.d, 90 \n\t" \ + "fcmla z22.d, p5/m, z25.d, z15.d, 90 \n\t" \ + "fcmla z20.d, p5/m, z26.d, z12.d, 90 \n\t" \ + "fcmla z23.d, p5/m, z26.d, z15.d, 90 \n\t" \ + "ldr z24, [%[fetchptr], -4, mul vl] \n\t" \ + "ldr z25, [%[fetchptr], -1, mul vl] \n\t" \ + "ldr z26, [%[fetchptr], 2, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (baseU + 2 * 3 * 64) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// MULT_2SPIN_BACKEND +#define MULT_2SPIN_2_A64FXd \ +{ \ +asm ( \ + "fcmla z18.d, p5/m, z27.d, z13.d, 0 \n\t" \ + "fcmla z21.d, p5/m, z27.d, z16.d, 0 \n\t" \ + "fcmla z19.d, p5/m, z28.d, z13.d, 0 \n\t" \ + "fcmla z22.d, p5/m, z28.d, z16.d, 0 \n\t" \ + "fcmla z20.d, p5/m, z29.d, z13.d, 0 \n\t" \ + "fcmla z23.d, p5/m, z29.d, z16.d, 0 \n\t" \ + "fcmla z18.d, p5/m, z27.d, z13.d, 90 \n\t" \ + "fcmla z21.d, p5/m, z27.d, z16.d, 90 \n\t" \ + "fcmla z19.d, p5/m, z28.d, z13.d, 90 \n\t" \ + "fcmla z22.d, p5/m, z28.d, z16.d, 90 \n\t" \ + 
"fcmla z20.d, p5/m, z29.d, z13.d, 90 \n\t" \ + "fcmla z23.d, p5/m, z29.d, z16.d, 90 \n\t" \ + "fcmla z18.d, p5/m, z24.d, z14.d, 0 \n\t" \ + "fcmla z21.d, p5/m, z24.d, z17.d, 0 \n\t" \ + "fcmla z19.d, p5/m, z25.d, z14.d, 0 \n\t" \ + "fcmla z22.d, p5/m, z25.d, z17.d, 0 \n\t" \ + "fcmla z20.d, p5/m, z26.d, z14.d, 0 \n\t" \ + "fcmla z23.d, p5/m, z26.d, z17.d, 0 \n\t" \ + "fcmla z18.d, p5/m, z24.d, z14.d, 90 \n\t" \ + "fcmla z21.d, p5/m, z24.d, z17.d, 90 \n\t" \ + "fcmla z19.d, p5/m, z25.d, z14.d, 90 \n\t" \ + "fcmla z22.d, p5/m, z25.d, z17.d, 90 \n\t" \ + "fcmla z20.d, p5/m, z26.d, z14.d, 90 \n\t" \ + "fcmla z23.d, p5/m, z26.d, z17.d, 90 \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// XP_PROJ +#define XP_PROJ_A64FXd \ +{ \ +asm ( \ + "fcadd z12.d, p5/m, z12.d, z21.d, 90 \n\t" \ + "fcadd z13.d, p5/m, z13.d, z22.d, 90 \n\t" \ + "fcadd z14.d, p5/m, z14.d, z23.d, 90 \n\t" \ + "fcadd z15.d, p5/m, z15.d, z18.d, 90 \n\t" \ + "fcadd z16.d, p5/m, z16.d, z19.d, 90 \n\t" \ + "fcadd z17.d, p5/m, z17.d, z20.d, 90 \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// XP_RECON +#define XP_RECON_A64FXd \ +asm ( \ + "movprfx z6.d, p5/m, z31.d \n\t" \ + "fcadd z6.d, p5/m, z6.d, z21.d, 270 \n\t" \ + "movprfx z7.d, p5/m, z31.d \n\t" \ + "fcadd z7.d, p5/m, z7.d, z22.d, 270 \n\t" \ + "movprfx z8.d, p5/m, z31.d \n\t" \ + "fcadd z8.d, p5/m, z8.d, z23.d, 270 \n\t" \ + "movprfx z9.d, p5/m, z31.d \n\t" \ + "fcadd z9.d, p5/m, z9.d, z18.d, 270 \n\t" \ + "movprfx z10.d, p5/m, z31.d \n\t" \ + "fcadd z10.d, p5/m, z10.d, z19.d, 270 \n\t" \ + "movprfx z11.d, p5/m, z31.d \n\t" \ + "fcadd z11.d, p5/m, z11.d, z20.d, 270 \n\t" \ + "mov z0.d, p5/m, z18.d \n\t" \ + "mov z1.d, p5/m, z19.d \n\t" \ + "mov z2.d, p5/m, z20.d \n\t" \ + "mov z3.d, p5/m, z21.d \n\t" \ + "mov z4.d, p5/m, z22.d \n\t" \ + "mov z5.d, p5/m, z23.d \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// XP_RECON_ACCUM +#define XP_RECON_ACCUM_A64FXd \ +asm ( \ + "fcadd z9.d, p5/m, z9.d, z18.d, 270 \n\t" \ + "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ + "fcadd z10.d, p5/m, z10.d, z19.d, 270 \n\t" \ + "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ + "fcadd z11.d, p5/m, z11.d, z20.d, 270 \n\t" \ + "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ + "fcadd z6.d, p5/m, z6.d, z21.d, 270 \n\t" \ + "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ + "fcadd z7.d, p5/m, z7.d, z22.d, 270 \n\t" \ + "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ + "fcadd z8.d, p5/m, z8.d, z23.d, 270 \n\t" \ + "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// YP_PROJ +#define YP_PROJ_A64FXd \ +{ \ +asm ( \ + "fsub z12.d, p5/m, z12.d, z21.d \n\t" \ + "fsub z13.d, p5/m, z13.d, z22.d \n\t" \ + "fsub z14.d, p5/m, z14.d, z23.d \n\t" \ + "fadd z15.d, p5/m, z15.d, z18.d \n\t" \ + "fadd z16.d, p5/m, z16.d, z19.d \n\t" \ + "fadd z17.d, p5/m, z17.d, z20.d \n\t" \ + : \ + : \ + : 
"p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// ZP_PROJ +#define ZP_PROJ_A64FXd \ +{ \ +asm ( \ + "fcadd z12.d, p5/m, z12.d, z18.d, 90 \n\t" \ + "fcadd z13.d, p5/m, z13.d, z19.d, 90 \n\t" \ + "fcadd z14.d, p5/m, z14.d, z20.d, 90 \n\t" \ + "fcadd z15.d, p5/m, z15.d, z21.d, 270 \n\t" \ + "fcadd z16.d, p5/m, z16.d, z22.d, 270 \n\t" \ + "fcadd z17.d, p5/m, z17.d, z23.d, 270 \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// TP_PROJ +#define TP_PROJ_A64FXd \ +{ \ +asm ( \ + "fadd z12.d, p5/m, z12.d, z18.d \n\t" \ + "fadd z13.d, p5/m, z13.d, z19.d \n\t" \ + "fadd z14.d, p5/m, z14.d, z20.d \n\t" \ + "fadd z15.d, p5/m, z15.d, z21.d \n\t" \ + "fadd z16.d, p5/m, z16.d, z22.d \n\t" \ + "fadd z17.d, p5/m, z17.d, z23.d \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// XM_PROJ +#define XM_PROJ_A64FXd \ +{ \ +asm ( \ + "fcadd z12.d, p5/m, z12.d, z21.d, 270 \n\t" \ + "fcadd z13.d, p5/m, z13.d, z22.d, 270 \n\t" \ + "fcadd z14.d, p5/m, z14.d, z23.d, 270 \n\t" \ + "fcadd z15.d, p5/m, z15.d, z18.d, 270 \n\t" \ + "fcadd z16.d, p5/m, z16.d, z19.d, 270 \n\t" \ + "fcadd z17.d, p5/m, z17.d, z20.d, 270 \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// XM_RECON +#define XM_RECON_A64FXd \ +asm ( \ + "movprfx z6.d, p5/m, z31.d \n\t" \ + "fcadd z6.d, p5/m, z6.d, z21.d, 90 \n\t" \ + "movprfx z7.d, p5/m, z31.d \n\t" \ + "fcadd z7.d, p5/m, z7.d, z22.d, 90 \n\t" \ + "movprfx z8.d, p5/m, z31.d \n\t" \ + "fcadd z8.d, p5/m, z8.d, z23.d, 90 \n\t" \ + "movprfx z9.d, p5/m, z31.d \n\t" \ + "fcadd z9.d, p5/m, z9.d, z18.d, 90 \n\t" \ + "movprfx z10.d, p5/m, z31.d \n\t" \ + "fcadd z10.d, p5/m, z10.d, z19.d, 90 \n\t" \ + "movprfx z11.d, p5/m, z31.d \n\t" \ + "fcadd z11.d, p5/m, z11.d, z20.d, 90 \n\t" \ + "mov z0.d, p5/m, z18.d \n\t" \ + "mov z1.d, p5/m, z19.d \n\t" \ + "mov z2.d, p5/m, z20.d \n\t" \ + "mov z3.d, p5/m, z21.d \n\t" \ + "mov z4.d, p5/m, z22.d \n\t" \ + "mov z5.d, p5/m, z23.d \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// YM_PROJ +#define YM_PROJ_A64FXd \ +{ \ +asm ( \ + "fadd z12.d, p5/m, z12.d, z21.d \n\t" \ + "fadd z13.d, p5/m, z13.d, z22.d \n\t" \ + "fadd z14.d, p5/m, z14.d, z23.d \n\t" \ + "fsub z15.d, p5/m, z15.d, z18.d \n\t" \ + "fsub z16.d, p5/m, z16.d, z19.d \n\t" \ + "fsub z17.d, p5/m, z17.d, z20.d \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// ZM_PROJ +#define ZM_PROJ_A64FXd \ +{ \ +asm ( \ + "fcadd z12.d, p5/m, z12.d, z18.d, 270 \n\t" \ + "fcadd z13.d, p5/m, z13.d, z19.d, 270 \n\t" \ + "fcadd z14.d, p5/m, z14.d, z20.d, 270 \n\t" \ + "fcadd z15.d, p5/m, z15.d, 
z21.d, 90 \n\t" \ + "fcadd z16.d, p5/m, z16.d, z22.d, 90 \n\t" \ + "fcadd z17.d, p5/m, z17.d, z23.d, 90 \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// TM_PROJ +#define TM_PROJ_A64FXd \ +{ \ +asm ( \ + "ptrue p5.d \n\t" \ + "fsub z12.d, p5/m, z12.d, z18.d \n\t" \ + "fsub z13.d, p5/m, z13.d, z19.d \n\t" \ + "fsub z14.d, p5/m, z14.d, z20.d \n\t" \ + "fsub z15.d, p5/m, z15.d, z21.d \n\t" \ + "fsub z16.d, p5/m, z16.d, z22.d \n\t" \ + "fsub z17.d, p5/m, z17.d, z23.d \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// XM_RECON_ACCUM +#define XM_RECON_ACCUM_A64FXd \ +asm ( \ + "fcadd z9.d, p5/m, z9.d, z18.d, 90 \n\t" \ + "fcadd z10.d, p5/m, z10.d, z19.d, 90 \n\t" \ + "fcadd z11.d, p5/m, z11.d, z20.d, 90 \n\t" \ + "fcadd z6.d, p5/m, z6.d, z21.d, 90 \n\t" \ + "fcadd z7.d, p5/m, z7.d, z22.d, 90 \n\t" \ + "fcadd z8.d, p5/m, z8.d, z23.d, 90 \n\t" \ + "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ + "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ + "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ + "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ + "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ + "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// YP_RECON_ACCUM +#define YP_RECON_ACCUM_A64FXd \ +asm ( \ + "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ + "fsub z9.d, p5/m, z9.d, z18.d \n\t" \ + "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ + "fsub z10.d, p5/m, z10.d, z19.d \n\t" \ + "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ + "fsub z11.d, p5/m, z11.d, z20.d \n\t" \ + "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ + "fadd z6.d, p5/m, z6.d, z21.d \n\t" \ + "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ + "fadd z7.d, p5/m, z7.d, z22.d \n\t" \ + "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ + "fadd z8.d, p5/m, z8.d, z23.d \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// YM_RECON_ACCUM +#define YM_RECON_ACCUM_A64FXd \ +asm ( \ + "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ + "fadd z9.d, p5/m, z9.d, z18.d \n\t" \ + "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ + "fadd z10.d, p5/m, z10.d, z19.d \n\t" \ + "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ + "fadd z11.d, p5/m, z11.d, z20.d \n\t" \ + "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ + "fsub z6.d, p5/m, z6.d, z21.d \n\t" \ + "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ + "fsub z7.d, p5/m, z7.d, z22.d \n\t" \ + "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ + "fsub z8.d, p5/m, z8.d, z23.d \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// ZP_RECON_ACCUM +#define ZP_RECON_ACCUM_A64FXd \ +asm ( \ + "fcadd z6.d, p5/m, z6.d, z18.d, 270 \n\t" \ + "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ + "fcadd z7.d, p5/m, z7.d, z19.d, 270 \n\t" \ + "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ + "fcadd z8.d, p5/m, z8.d, z20.d, 270 \n\t" \ + "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ + "fcadd z9.d, p5/m, z9.d, z21.d, 90 \n\t" \ + 
"fadd z3.d, p5/m, z3.d, z21.d \n\t" \ + "fcadd z10.d, p5/m, z10.d, z22.d, 90 \n\t" \ + "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ + "fcadd z11.d, p5/m, z11.d, z23.d, 90 \n\t" \ + "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// ZM_RECON_ACCUM +#define ZM_RECON_ACCUM_A64FXd \ +asm ( \ + "fcadd z6.d, p5/m, z6.d, z18.d, 90 \n\t" \ + "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ + "fcadd z7.d, p5/m, z7.d, z19.d, 90 \n\t" \ + "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ + "fcadd z8.d, p5/m, z8.d, z20.d, 90 \n\t" \ + "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ + "fcadd z9.d, p5/m, z9.d, z21.d, 270 \n\t" \ + "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ + "fcadd z10.d, p5/m, z10.d, z22.d, 270 \n\t" \ + "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ + "fcadd z11.d, p5/m, z11.d, z23.d, 270 \n\t" \ + "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// TP_RECON_ACCUM +#define TP_RECON_ACCUM_A64FXd \ +asm ( \ + "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ + "fadd z6.d, p5/m, z6.d, z18.d \n\t" \ + "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ + "fadd z7.d, p5/m, z7.d, z19.d \n\t" \ + "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ + "fadd z8.d, p5/m, z8.d, z20.d \n\t" \ + "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ + "fadd z9.d, p5/m, z9.d, z21.d \n\t" \ + "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ + "fadd z10.d, p5/m, z10.d, z22.d \n\t" \ + "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ + "fadd z11.d, p5/m, z11.d, z23.d \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// TM_RECON_ACCUM +#define TM_RECON_ACCUM_A64FXd \ +asm ( \ + "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ + "fsub z6.d, p5/m, z6.d, z18.d \n\t" \ + "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ + "fsub z7.d, p5/m, z7.d, z19.d \n\t" \ + "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ + "fsub z8.d, p5/m, z8.d, z20.d \n\t" \ + "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ + "fsub z9.d, p5/m, z9.d, z21.d \n\t" \ + "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ + "fsub z10.d, p5/m, z10.d, z22.d \n\t" \ + "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ + "fsub z11.d, p5/m, z11.d, z23.d \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// ZERO_PSI +#define ZERO_PSI_A64FXd \ +asm ( \ + "ptrue p5.d \n\t" \ + "fmov z0.d , 0 \n\t" \ + "fmov z1.d , 0 \n\t" \ + "fmov z2.d , 0 \n\t" \ + "fmov z3.d , 0 \n\t" \ + "fmov z4.d , 0 \n\t" \ + "fmov z5.d , 0 \n\t" \ + "fmov z6.d , 0 \n\t" \ + "fmov z7.d , 0 \n\t" \ + "fmov z8.d , 0 \n\t" \ + "fmov z9.d , 0 \n\t" \ + "fmov z10.d , 0 \n\t" \ + "fmov z11.d , 0 \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// PREFETCH_RESULT_L2_STORE (prefetch store to L2) +#define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXd(base) \ +{ \ +asm ( \ + "prfd PSTL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ + "prfd PSTL2STRM, p5, [%[fetchptr], 4, 
mul vl] \n\t" \ + "prfd PSTL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (base) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// PREFETCH_RESULT_L1_STORE (prefetch store to L1) +#define PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXd(base) \ +{ \ +asm ( \ + "prfd PSTL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ + "prfd PSTL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ + "prfd PSTL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (base) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// ADD_RESULT_INTERNAL +#define ADD_RESULT_INTERNAL_A64FXd \ +asm ( \ + "fadd z0.d, p5/m, z0.d, z12.d \n\t" \ + "fadd z1.d, p5/m, z1.d, z13.d \n\t" \ + "fadd z2.d, p5/m, z2.d, z14.d \n\t" \ + "fadd z3.d, p5/m, z3.d, z15.d \n\t" \ + "fadd z4.d, p5/m, z4.d, z16.d \n\t" \ + "fadd z5.d, p5/m, z5.d, z17.d \n\t" \ + "fadd z6.d, p5/m, z6.d, z18.d \n\t" \ + "fadd z7.d, p5/m, z7.d, z19.d \n\t" \ + "fadd z8.d, p5/m, z8.d, z20.d \n\t" \ + "fadd z9.d, p5/m, z9.d, z21.d \n\t" \ + "fadd z10.d, p5/m, z10.d, z22.d \n\t" \ + "fadd z11.d, p5/m, z11.d, z23.d \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + diff --git a/Grid/simd/Fujitsu_A64FX_asm_single.h b/Grid/simd/Fujitsu_A64FX_asm_single.h new file mode 100644 index 00000000..d809f83b --- /dev/null +++ b/Grid/simd/Fujitsu_A64FX_asm_single.h @@ -0,0 +1,779 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: Fujitsu_A64FX_asm_single.h + + Copyright (C) 2020 + +Author: Nils Meyer + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#define LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_A64FXf(base) +#define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXf(A) +#define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A) +#define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXf(A) +#define PREFETCH_GAUGE_L2(A) PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A) +#define PF_GAUGE(A) +#define PREFETCH_RESULT_L2_STORE(A) PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXf(A) +#define PREFETCH_RESULT_L1_STORE(A) PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXf(A) +#define PREFETCH1_CHIMU(A) PREFETCH_CHIMU_L1(A) +#define PREFETCH_CHIMU(A) PREFETCH_CHIMU_L1(A) +#define LOCK_GAUGE(A) +#define UNLOCK_GAUGE(A) +#define MASK_REGS DECLARATIONS_A64FXf +#define SAVE_RESULT(A,B) RESULT_A64FXf(A); PREFETCH_RESULT_L2_STORE(B) +#define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXf(Dir) +#define MULT_2SPIN_2 MULT_2SPIN_2_A64FXf +#define LOAD_CHI(base) LOAD_CHI_A64FXf(base) +#define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base) +#define XP_PROJ XP_PROJ_A64FXf +#define YP_PROJ YP_PROJ_A64FXf +#define ZP_PROJ ZP_PROJ_A64FXf +#define TP_PROJ TP_PROJ_A64FXf +#define XM_PROJ XM_PROJ_A64FXf +#define YM_PROJ YM_PROJ_A64FXf +#define ZM_PROJ ZM_PROJ_A64FXf +#define TM_PROJ TM_PROJ_A64FXf +#define XP_RECON XP_RECON_A64FXf +#define XM_RECON XM_RECON_A64FXf +#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXf +#define YM_RECON_ACCUM YM_RECON_ACCUM_A64FXf +#define ZM_RECON_ACCUM ZM_RECON_ACCUM_A64FXf +#define TM_RECON_ACCUM TM_RECON_ACCUM_A64FXf +#define XP_RECON_ACCUM XP_RECON_ACCUM_A64FXf +#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXf +#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXf +#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXf +#define PERMUTE_DIR0 0 +#define PERMUTE_DIR1 1 +#define PERMUTE_DIR2 2 +#define PERMUTE_DIR3 3 +#define PERMUTE PERMUTE_A64FXf; +#define LOAD_TABLE(Dir) if (Dir == 0) { LOAD_TABLE0; } else if (Dir == 1) { LOAD_TABLE1 } else if (Dir == 2) { LOAD_TABLE2; } else if (Dir == 3) { LOAD_TABLE3; } +#define MAYBEPERM(A,perm) if (perm) { PERMUTE; } +// DECLARATIONS +#define DECLARATIONS_A64FXf \ + const uint32_t lut[4][16] = { \ + {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}, \ + {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}, \ + {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}, \ + {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14} }; \ +asm ( \ + "fmov z31.s , 0 \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// RESULT +#define RESULT_A64FXf(base) \ +{ \ +asm ( \ + "str z0, [%[storeptr], -6, mul vl] \n\t" \ + "str z1, [%[storeptr], -5, mul vl] \n\t" \ + "str z2, [%[storeptr], -4, mul vl] \n\t" \ + "str z3, [%[storeptr], -3, mul vl] \n\t" \ + "str z4, [%[storeptr], -2, mul vl] \n\t" \ + "str z5, [%[storeptr], -1, mul vl] \n\t" \ + "str z6, [%[storeptr], 0, mul vl] \n\t" \ + "str z7, [%[storeptr], 1, mul vl] \n\t" \ + "str z8, [%[storeptr], 2, mul vl] \n\t" \ + "str z9, [%[storeptr], 3, mul vl] \n\t" \ + "str z10, [%[storeptr], 4, mul vl] \n\t" \ + "str z11, [%[storeptr], 5, mul vl] \n\t" \ + : \ + : [storeptr] "r" (base + 2 * 3 * 64) \ + : 
"p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// PREFETCH_CHIMU_L2 (prefetch to L2) +#define PREFETCH_CHIMU_L2_INTERNAL_A64FXf(base) \ +{ \ +asm ( \ + "prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (base) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// PREFETCH_CHIMU_L1 (prefetch to L1) +#define PREFETCH_CHIMU_L1_INTERNAL_A64FXf(base) \ +{ \ +asm ( \ + "prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ + "prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ + "prfd PLDL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (base) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// PREFETCH_GAUGE_L2 (prefetch to L2) +#define PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A) \ +{ \ + const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \ +asm ( \ + "prfd PLDL2STRM, p5, [%[fetchptr], -4, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 12, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 16, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 20, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 24, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 28, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (baseU) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// PREFETCH_GAUGE_L1 (prefetch to L1) +#define PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A) \ +{ \ + const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ +asm ( \ + "prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ + "prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ + "prfd PLDL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (baseU) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// LOAD_CHI +#define LOAD_CHI_A64FXf(base) \ +{ \ +asm ( \ + "ldr z12, [%[fetchptr], 0, mul vl] \n\t" \ + "ldr z13, [%[fetchptr], 1, mul vl] \n\t" \ + "ldr z14, [%[fetchptr], 2, mul vl] \n\t" \ + "ldr z15, [%[fetchptr], 3, mul vl] \n\t" \ + "ldr z16, [%[fetchptr], 4, mul vl] \n\t" \ + "ldr z17, [%[fetchptr], 5, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (base) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// LOAD_CHIMU +#define LOAD_CHIMU_INTERLEAVED_A64FXf(base) \ +{ \ +asm ( \ + "ptrue p5.s \n\t" \ + "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \ + "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \ + 
"ldr z15, [%[fetchptr], -3, mul vl] \n\t" \ + "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \ + "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \ + "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \ + "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \ + "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \ + "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \ + "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \ + "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \ + "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (base + 2 * 3 * 64) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// LOAD_CHIMU_0213 +#define LOAD_CHIMU_0213_A64FXf \ +{ \ + const SiteSpinor & ref(in[offset]); \ +asm ( \ + "ptrue p5.s \n\t" \ + "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \ + "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \ + "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \ + "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \ + "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \ + "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \ + "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \ + "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \ + "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \ + "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \ + "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \ + "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (&ref[2][0]) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// LOAD_CHIMU_0312 +#define LOAD_CHIMU_0312_A64FXf \ +{ \ + const SiteSpinor & ref(in[offset]); \ +asm ( \ + "ptrue p5.s \n\t" \ + "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \ + "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \ + "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \ + "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \ + "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \ + "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \ + "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \ + "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \ + "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \ + "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \ + "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \ + "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (&ref[2][0]) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// LOAD_TABLE0 +#define LOAD_TABLE0 \ +asm ( \ + "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ + : \ + : [tableptr] "r" (&lut[0]),[index] "i" (0) \ + : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// LOAD_TABLE1 +#define LOAD_TABLE1 \ +asm ( \ + "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ + : \ + : [tableptr] "r" (&lut[0]),[index] "i" (1) \ + : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// LOAD_TABLE2 +#define LOAD_TABLE2 \ +asm ( \ + "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ + : \ + : [tableptr] "r" (&lut[0]),[index] "i" (2) \ + : 
"memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// LOAD_TABLE3 +#define LOAD_TABLE3 \ +asm ( \ + "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ + : \ + : [tableptr] "r" (&lut[0]),[index] "i" (3) \ + : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// PERMUTE +#define PERMUTE_A64FXf \ +asm ( \ + "tbl z12.s, { z12.s }, z30.s \n\t" \ + "tbl z13.s, { z13.s }, z30.s \n\t" \ + "tbl z14.s, { z14.s }, z30.s \n\t" \ + "tbl z15.s, { z15.s }, z30.s \n\t" \ + "tbl z16.s, { z16.s }, z30.s \n\t" \ + "tbl z17.s, { z17.s }, z30.s \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// LOAD_GAUGE +#define LOAD_GAUGE \ + const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ +{ \ +asm ( \ + "ptrue p5.s \n\t" \ + "ldr z24, [%[fetchptr], -6, mul vl] \n\t" \ + "ldr z25, [%[fetchptr], -3, mul vl] \n\t" \ + "ldr z26, [%[fetchptr], 0, mul vl] \n\t" \ + "ldr z27, [%[fetchptr], -5, mul vl] \n\t" \ + "ldr z28, [%[fetchptr], -2, mul vl] \n\t" \ + "ldr z29, [%[fetchptr], 1, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (baseU + 2 * 3 * 64) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// MULT_2SPIN +#define MULT_2SPIN_1_A64FXf(A) \ +{ \ + const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ +asm ( \ + "ldr z24, [%[fetchptr], -6, mul vl] \n\t" \ + "ldr z25, [%[fetchptr], -3, mul vl] \n\t" \ + "ldr z26, [%[fetchptr], 0, mul vl] \n\t" \ + "ldr z27, [%[fetchptr], -5, mul vl] \n\t" \ + "ldr z28, [%[fetchptr], -2, mul vl] \n\t" \ + "ldr z29, [%[fetchptr], 1, mul vl] \n\t" \ + "movprfx z18.s, p5/m, z31.s \n\t" \ + "fcmla z18.s, p5/m, z24.s, z12.s, 0 \n\t" \ + "movprfx z21.s, p5/m, z31.s \n\t" \ + "fcmla z21.s, p5/m, z24.s, z15.s, 0 \n\t" \ + "movprfx z19.s, p5/m, z31.s \n\t" \ + "fcmla z19.s, p5/m, z25.s, z12.s, 0 \n\t" \ + "movprfx z22.s, p5/m, z31.s \n\t" \ + "fcmla z22.s, p5/m, z25.s, z15.s, 0 \n\t" \ + "movprfx z20.s, p5/m, z31.s \n\t" \ + "fcmla z20.s, p5/m, z26.s, z12.s, 0 \n\t" \ + "movprfx z23.s, p5/m, z31.s \n\t" \ + "fcmla z23.s, p5/m, z26.s, z15.s, 0 \n\t" \ + "fcmla z18.s, p5/m, z24.s, z12.s, 90 \n\t" \ + "fcmla z21.s, p5/m, z24.s, z15.s, 90 \n\t" \ + "fcmla z19.s, p5/m, z25.s, z12.s, 90 \n\t" \ + "fcmla z22.s, p5/m, z25.s, z15.s, 90 \n\t" \ + "fcmla z20.s, p5/m, z26.s, z12.s, 90 \n\t" \ + "fcmla z23.s, p5/m, z26.s, z15.s, 90 \n\t" \ + "ldr z24, [%[fetchptr], -4, mul vl] \n\t" \ + "ldr z25, [%[fetchptr], -1, mul vl] \n\t" \ + "ldr z26, [%[fetchptr], 2, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (baseU + 2 * 3 * 64) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// MULT_2SPIN_BACKEND +#define MULT_2SPIN_2_A64FXf \ +{ \ +asm ( \ + "fcmla z18.s, p5/m, z27.s, z13.s, 0 \n\t" \ + "fcmla z21.s, p5/m, z27.s, z16.s, 0 \n\t" \ + "fcmla z19.s, p5/m, z28.s, z13.s, 0 \n\t" \ + "fcmla 
z22.s, p5/m, z28.s, z16.s, 0 \n\t" \ + "fcmla z20.s, p5/m, z29.s, z13.s, 0 \n\t" \ + "fcmla z23.s, p5/m, z29.s, z16.s, 0 \n\t" \ + "fcmla z18.s, p5/m, z27.s, z13.s, 90 \n\t" \ + "fcmla z21.s, p5/m, z27.s, z16.s, 90 \n\t" \ + "fcmla z19.s, p5/m, z28.s, z13.s, 90 \n\t" \ + "fcmla z22.s, p5/m, z28.s, z16.s, 90 \n\t" \ + "fcmla z20.s, p5/m, z29.s, z13.s, 90 \n\t" \ + "fcmla z23.s, p5/m, z29.s, z16.s, 90 \n\t" \ + "fcmla z18.s, p5/m, z24.s, z14.s, 0 \n\t" \ + "fcmla z21.s, p5/m, z24.s, z17.s, 0 \n\t" \ + "fcmla z19.s, p5/m, z25.s, z14.s, 0 \n\t" \ + "fcmla z22.s, p5/m, z25.s, z17.s, 0 \n\t" \ + "fcmla z20.s, p5/m, z26.s, z14.s, 0 \n\t" \ + "fcmla z23.s, p5/m, z26.s, z17.s, 0 \n\t" \ + "fcmla z18.s, p5/m, z24.s, z14.s, 90 \n\t" \ + "fcmla z21.s, p5/m, z24.s, z17.s, 90 \n\t" \ + "fcmla z19.s, p5/m, z25.s, z14.s, 90 \n\t" \ + "fcmla z22.s, p5/m, z25.s, z17.s, 90 \n\t" \ + "fcmla z20.s, p5/m, z26.s, z14.s, 90 \n\t" \ + "fcmla z23.s, p5/m, z26.s, z17.s, 90 \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// XP_PROJ +#define XP_PROJ_A64FXf \ +{ \ +asm ( \ + "fcadd z12.s, p5/m, z12.s, z21.s, 90 \n\t" \ + "fcadd z13.s, p5/m, z13.s, z22.s, 90 \n\t" \ + "fcadd z14.s, p5/m, z14.s, z23.s, 90 \n\t" \ + "fcadd z15.s, p5/m, z15.s, z18.s, 90 \n\t" \ + "fcadd z16.s, p5/m, z16.s, z19.s, 90 \n\t" \ + "fcadd z17.s, p5/m, z17.s, z20.s, 90 \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// XP_RECON +#define XP_RECON_A64FXf \ +asm ( \ + "movprfx z6.s, p5/m, z31.s \n\t" \ + "fcadd z6.s, p5/m, z6.s, z21.s, 270 \n\t" \ + "movprfx z7.s, p5/m, z31.s \n\t" \ + "fcadd z7.s, p5/m, z7.s, z22.s, 270 \n\t" \ + "movprfx z8.s, p5/m, z31.s \n\t" \ + "fcadd z8.s, p5/m, z8.s, z23.s, 270 \n\t" \ + "movprfx z9.s, p5/m, z31.s \n\t" \ + "fcadd z9.s, p5/m, z9.s, z18.s, 270 \n\t" \ + "movprfx z10.s, p5/m, z31.s \n\t" \ + "fcadd z10.s, p5/m, z10.s, z19.s, 270 \n\t" \ + "movprfx z11.s, p5/m, z31.s \n\t" \ + "fcadd z11.s, p5/m, z11.s, z20.s, 270 \n\t" \ + "mov z0.s, p5/m, z18.s \n\t" \ + "mov z1.s, p5/m, z19.s \n\t" \ + "mov z2.s, p5/m, z20.s \n\t" \ + "mov z3.s, p5/m, z21.s \n\t" \ + "mov z4.s, p5/m, z22.s \n\t" \ + "mov z5.s, p5/m, z23.s \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// XP_RECON_ACCUM +#define XP_RECON_ACCUM_A64FXf \ +asm ( \ + "fcadd z9.s, p5/m, z9.s, z18.s, 270 \n\t" \ + "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ + "fcadd z10.s, p5/m, z10.s, z19.s, 270 \n\t" \ + "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ + "fcadd z11.s, p5/m, z11.s, z20.s, 270 \n\t" \ + "fadd z2.s, p5/m, z2.s, z20.s \n\t" \ + "fcadd z6.s, p5/m, z6.s, z21.s, 270 \n\t" \ + "fadd z3.s, p5/m, z3.s, z21.s \n\t" \ + "fcadd z7.s, p5/m, z7.s, z22.s, 270 \n\t" \ + "fadd z4.s, p5/m, z4.s, z22.s \n\t" \ + "fcadd z8.s, p5/m, z8.s, z23.s, 270 \n\t" \ + "fadd z5.s, p5/m, z5.s, z23.s \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// YP_PROJ +#define YP_PROJ_A64FXf \ +{ \ +asm ( 
\ + "fsub z12.s, p5/m, z12.s, z21.s \n\t" \ + "fsub z13.s, p5/m, z13.s, z22.s \n\t" \ + "fsub z14.s, p5/m, z14.s, z23.s \n\t" \ + "fadd z15.s, p5/m, z15.s, z18.s \n\t" \ + "fadd z16.s, p5/m, z16.s, z19.s \n\t" \ + "fadd z17.s, p5/m, z17.s, z20.s \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// ZP_PROJ +#define ZP_PROJ_A64FXf \ +{ \ +asm ( \ + "fcadd z12.s, p5/m, z12.s, z18.s, 90 \n\t" \ + "fcadd z13.s, p5/m, z13.s, z19.s, 90 \n\t" \ + "fcadd z14.s, p5/m, z14.s, z20.s, 90 \n\t" \ + "fcadd z15.s, p5/m, z15.s, z21.s, 270 \n\t" \ + "fcadd z16.s, p5/m, z16.s, z22.s, 270 \n\t" \ + "fcadd z17.s, p5/m, z17.s, z23.s, 270 \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// TP_PROJ +#define TP_PROJ_A64FXf \ +{ \ +asm ( \ + "fadd z12.s, p5/m, z12.s, z18.s \n\t" \ + "fadd z13.s, p5/m, z13.s, z19.s \n\t" \ + "fadd z14.s, p5/m, z14.s, z20.s \n\t" \ + "fadd z15.s, p5/m, z15.s, z21.s \n\t" \ + "fadd z16.s, p5/m, z16.s, z22.s \n\t" \ + "fadd z17.s, p5/m, z17.s, z23.s \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// XM_PROJ +#define XM_PROJ_A64FXf \ +{ \ +asm ( \ + "fcadd z12.s, p5/m, z12.s, z21.s, 270 \n\t" \ + "fcadd z13.s, p5/m, z13.s, z22.s, 270 \n\t" \ + "fcadd z14.s, p5/m, z14.s, z23.s, 270 \n\t" \ + "fcadd z15.s, p5/m, z15.s, z18.s, 270 \n\t" \ + "fcadd z16.s, p5/m, z16.s, z19.s, 270 \n\t" \ + "fcadd z17.s, p5/m, z17.s, z20.s, 270 \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// XM_RECON +#define XM_RECON_A64FXf \ +asm ( \ + "movprfx z6.s, p5/m, z31.s \n\t" \ + "fcadd z6.s, p5/m, z6.s, z21.s, 90 \n\t" \ + "movprfx z7.s, p5/m, z31.s \n\t" \ + "fcadd z7.s, p5/m, z7.s, z22.s, 90 \n\t" \ + "movprfx z8.s, p5/m, z31.s \n\t" \ + "fcadd z8.s, p5/m, z8.s, z23.s, 90 \n\t" \ + "movprfx z9.s, p5/m, z31.s \n\t" \ + "fcadd z9.s, p5/m, z9.s, z18.s, 90 \n\t" \ + "movprfx z10.s, p5/m, z31.s \n\t" \ + "fcadd z10.s, p5/m, z10.s, z19.s, 90 \n\t" \ + "movprfx z11.s, p5/m, z31.s \n\t" \ + "fcadd z11.s, p5/m, z11.s, z20.s, 90 \n\t" \ + "mov z0.s, p5/m, z18.s \n\t" \ + "mov z1.s, p5/m, z19.s \n\t" \ + "mov z2.s, p5/m, z20.s \n\t" \ + "mov z3.s, p5/m, z21.s \n\t" \ + "mov z4.s, p5/m, z22.s \n\t" \ + "mov z5.s, p5/m, z23.s \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// YM_PROJ +#define YM_PROJ_A64FXf \ +{ \ +asm ( \ + "fadd z12.s, p5/m, z12.s, z21.s \n\t" \ + "fadd z13.s, p5/m, z13.s, z22.s \n\t" \ + "fadd z14.s, p5/m, z14.s, z23.s \n\t" \ + "fsub z15.s, p5/m, z15.s, z18.s \n\t" \ + "fsub z16.s, p5/m, z16.s, z19.s \n\t" \ + "fsub z17.s, p5/m, z17.s, z20.s \n\t" \ + : \ + : \ + : 
"p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// ZM_PROJ +#define ZM_PROJ_A64FXf \ +{ \ +asm ( \ + "fcadd z12.s, p5/m, z12.s, z18.s, 270 \n\t" \ + "fcadd z13.s, p5/m, z13.s, z19.s, 270 \n\t" \ + "fcadd z14.s, p5/m, z14.s, z20.s, 270 \n\t" \ + "fcadd z15.s, p5/m, z15.s, z21.s, 90 \n\t" \ + "fcadd z16.s, p5/m, z16.s, z22.s, 90 \n\t" \ + "fcadd z17.s, p5/m, z17.s, z23.s, 90 \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// TM_PROJ +#define TM_PROJ_A64FXf \ +{ \ +asm ( \ + "ptrue p5.s \n\t" \ + "fsub z12.s, p5/m, z12.s, z18.s \n\t" \ + "fsub z13.s, p5/m, z13.s, z19.s \n\t" \ + "fsub z14.s, p5/m, z14.s, z20.s \n\t" \ + "fsub z15.s, p5/m, z15.s, z21.s \n\t" \ + "fsub z16.s, p5/m, z16.s, z22.s \n\t" \ + "fsub z17.s, p5/m, z17.s, z23.s \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// XM_RECON_ACCUM +#define XM_RECON_ACCUM_A64FXf \ +asm ( \ + "fcadd z9.s, p5/m, z9.s, z18.s, 90 \n\t" \ + "fcadd z10.s, p5/m, z10.s, z19.s, 90 \n\t" \ + "fcadd z11.s, p5/m, z11.s, z20.s, 90 \n\t" \ + "fcadd z6.s, p5/m, z6.s, z21.s, 90 \n\t" \ + "fcadd z7.s, p5/m, z7.s, z22.s, 90 \n\t" \ + "fcadd z8.s, p5/m, z8.s, z23.s, 90 \n\t" \ + "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ + "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ + "fadd z2.s, p5/m, z2.s, z20.s \n\t" \ + "fadd z3.s, p5/m, z3.s, z21.s \n\t" \ + "fadd z4.s, p5/m, z4.s, z22.s \n\t" \ + "fadd z5.s, p5/m, z5.s, z23.s \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// YP_RECON_ACCUM +#define YP_RECON_ACCUM_A64FXf \ +asm ( \ + "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ + "fsub z9.s, p5/m, z9.s, z18.s \n\t" \ + "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ + "fsub z10.s, p5/m, z10.s, z19.s \n\t" \ + "fadd z2.s, p5/m, z2.s, z20.s \n\t" \ + "fsub z11.s, p5/m, z11.s, z20.s \n\t" \ + "fadd z3.s, p5/m, z3.s, z21.s \n\t" \ + "fadd z6.s, p5/m, z6.s, z21.s \n\t" \ + "fadd z4.s, p5/m, z4.s, z22.s \n\t" \ + "fadd z7.s, p5/m, z7.s, z22.s \n\t" \ + "fadd z5.s, p5/m, z5.s, z23.s \n\t" \ + "fadd z8.s, p5/m, z8.s, z23.s \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// YM_RECON_ACCUM +#define YM_RECON_ACCUM_A64FXf \ +asm ( \ + "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ + "fadd z9.s, p5/m, z9.s, z18.s \n\t" \ + "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ + "fadd z10.s, p5/m, z10.s, z19.s \n\t" \ + "fadd z2.s, p5/m, z2.s, z20.s \n\t" \ + "fadd z11.s, p5/m, z11.s, z20.s \n\t" \ + "fadd z3.s, p5/m, z3.s, z21.s \n\t" \ + "fsub z6.s, p5/m, z6.s, z21.s \n\t" \ + "fadd z4.s, p5/m, z4.s, z22.s \n\t" \ + "fsub z7.s, p5/m, z7.s, z22.s \n\t" \ + "fadd z5.s, p5/m, z5.s, z23.s \n\t" \ + "fsub z8.s, p5/m, z8.s, z23.s \n\t" \ + : \ + : \ + : 
"p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// ZP_RECON_ACCUM +#define ZP_RECON_ACCUM_A64FXf \ +asm ( \ + "fcadd z6.s, p5/m, z6.s, z18.s, 270 \n\t" \ + "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ + "fcadd z7.s, p5/m, z7.s, z19.s, 270 \n\t" \ + "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ + "fcadd z8.s, p5/m, z8.s, z20.s, 270 \n\t" \ + "fadd z2.s, p5/m, z2.s, z20.s \n\t" \ + "fcadd z9.s, p5/m, z9.s, z21.s, 90 \n\t" \ + "fadd z3.s, p5/m, z3.s, z21.s \n\t" \ + "fcadd z10.s, p5/m, z10.s, z22.s, 90 \n\t" \ + "fadd z4.s, p5/m, z4.s, z22.s \n\t" \ + "fcadd z11.s, p5/m, z11.s, z23.s, 90 \n\t" \ + "fadd z5.s, p5/m, z5.s, z23.s \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// ZM_RECON_ACCUM +#define ZM_RECON_ACCUM_A64FXf \ +asm ( \ + "fcadd z6.s, p5/m, z6.s, z18.s, 90 \n\t" \ + "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ + "fcadd z7.s, p5/m, z7.s, z19.s, 90 \n\t" \ + "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ + "fcadd z8.s, p5/m, z8.s, z20.s, 90 \n\t" \ + "fadd z2.s, p5/m, z2.s, z20.s \n\t" \ + "fcadd z9.s, p5/m, z9.s, z21.s, 270 \n\t" \ + "fadd z3.s, p5/m, z3.s, z21.s \n\t" \ + "fcadd z10.s, p5/m, z10.s, z22.s, 270 \n\t" \ + "fadd z4.s, p5/m, z4.s, z22.s \n\t" \ + "fcadd z11.s, p5/m, z11.s, z23.s, 270 \n\t" \ + "fadd z5.s, p5/m, z5.s, z23.s \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// TP_RECON_ACCUM +#define TP_RECON_ACCUM_A64FXf \ +asm ( \ + "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ + "fadd z6.s, p5/m, z6.s, z18.s \n\t" \ + "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ + "fadd z7.s, p5/m, z7.s, z19.s \n\t" \ + "fadd z2.s, p5/m, z2.s, z20.s \n\t" \ + "fadd z8.s, p5/m, z8.s, z20.s \n\t" \ + "fadd z3.s, p5/m, z3.s, z21.s \n\t" \ + "fadd z9.s, p5/m, z9.s, z21.s \n\t" \ + "fadd z4.s, p5/m, z4.s, z22.s \n\t" \ + "fadd z10.s, p5/m, z10.s, z22.s \n\t" \ + "fadd z5.s, p5/m, z5.s, z23.s \n\t" \ + "fadd z11.s, p5/m, z11.s, z23.s \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// TM_RECON_ACCUM +#define TM_RECON_ACCUM_A64FXf \ +asm ( \ + "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ + "fsub z6.s, p5/m, z6.s, z18.s \n\t" \ + "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ + "fsub z7.s, p5/m, z7.s, z19.s \n\t" \ + "fadd z2.s, p5/m, z2.s, z20.s \n\t" \ + "fsub z8.s, p5/m, z8.s, z20.s \n\t" \ + "fadd z3.s, p5/m, z3.s, z21.s \n\t" \ + "fsub z9.s, p5/m, z9.s, z21.s \n\t" \ + "fadd z4.s, p5/m, z4.s, z22.s \n\t" \ + "fsub z10.s, p5/m, z10.s, z22.s \n\t" \ + "fadd z5.s, p5/m, z5.s, z23.s \n\t" \ + "fsub z11.s, p5/m, z11.s, z23.s \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// ZERO_PSI +#define ZERO_PSI_A64FXf \ +asm ( \ + "ptrue p5.s \n\t" \ + "fmov z0.s , 0 \n\t" \ + "fmov z1.s , 0 \n\t" \ + "fmov z2.s , 0 \n\t" \ + "fmov z3.s , 0 \n\t" \ + "fmov z4.s , 0 \n\t" \ + "fmov z5.s , 0 \n\t" \ + "fmov z6.s , 0 \n\t" 
\ + "fmov z7.s , 0 \n\t" \ + "fmov z8.s , 0 \n\t" \ + "fmov z9.s , 0 \n\t" \ + "fmov z10.s , 0 \n\t" \ + "fmov z11.s , 0 \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// PREFETCH_RESULT_L2_STORE (prefetch store to L2) +#define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXf(base) \ +{ \ +asm ( \ + "prfd PSTL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ + "prfd PSTL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ + "prfd PSTL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (base) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// PREFETCH_RESULT_L1_STORE (prefetch store to L1) +#define PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXf(base) \ +{ \ +asm ( \ + "prfd PSTL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ + "prfd PSTL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ + "prfd PSTL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (base) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// ADD_RESULT_INTERNAL +#define ADD_RESULT_INTERNAL_A64FXf \ +asm ( \ + "fadd z0.s, p5/m, z0.s, z12.s \n\t" \ + "fadd z1.s, p5/m, z1.s, z13.s \n\t" \ + "fadd z2.s, p5/m, z2.s, z14.s \n\t" \ + "fadd z3.s, p5/m, z3.s, z15.s \n\t" \ + "fadd z4.s, p5/m, z4.s, z16.s \n\t" \ + "fadd z5.s, p5/m, z5.s, z17.s \n\t" \ + "fadd z6.s, p5/m, z6.s, z18.s \n\t" \ + "fadd z7.s, p5/m, z7.s, z19.s \n\t" \ + "fadd z8.s, p5/m, z8.s, z20.s \n\t" \ + "fadd z9.s, p5/m, z9.s, z21.s \n\t" \ + "fadd z10.s, p5/m, z10.s, z22.s \n\t" \ + "fadd z11.s, p5/m, z11.s, z23.s \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + diff --git a/Grid/simd/Fujitsu_A64FX_intrin_double.h b/Grid/simd/Fujitsu_A64FX_intrin_double.h new file mode 100644 index 00000000..232610f2 --- /dev/null +++ b/Grid/simd/Fujitsu_A64FX_intrin_double.h @@ -0,0 +1,601 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: Fujitsu_A64FX_intrin_double.h + + Copyright (C) 2020 + +Author: Nils Meyer + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#define LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_A64FXd(base) +#define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXd(A) +#define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A) +#define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXd(A) +#define PREFETCH_GAUGE_L2(A) PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A) +#define PF_GAUGE(A) +#define PREFETCH_RESULT_L2_STORE(A) PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXd(A) +#define PREFETCH_RESULT_L1_STORE(A) PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXd(A) +#define PREFETCH1_CHIMU(A) PREFETCH_CHIMU_L1(A) +#define PREFETCH_CHIMU(A) PREFETCH_CHIMU_L1(A) +#define LOCK_GAUGE(A) +#define UNLOCK_GAUGE(A) +#define MASK_REGS DECLARATIONS_A64FXd +#define SAVE_RESULT(A,B) RESULT_A64FXd(A); PREFETCH_RESULT_L2_STORE(B) +#define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXd(Dir) +#define MULT_2SPIN_2 MULT_2SPIN_2_A64FXd +#define LOAD_CHI(base) LOAD_CHI_A64FXd(base) +#define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base) +#define XP_PROJ XP_PROJ_A64FXd +#define YP_PROJ YP_PROJ_A64FXd +#define ZP_PROJ ZP_PROJ_A64FXd +#define TP_PROJ TP_PROJ_A64FXd +#define XM_PROJ XM_PROJ_A64FXd +#define YM_PROJ YM_PROJ_A64FXd +#define ZM_PROJ ZM_PROJ_A64FXd +#define TM_PROJ TM_PROJ_A64FXd +#define XP_RECON XP_RECON_A64FXd +#define XM_RECON XM_RECON_A64FXd +#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXd +#define YM_RECON_ACCUM YM_RECON_ACCUM_A64FXd +#define ZM_RECON_ACCUM ZM_RECON_ACCUM_A64FXd +#define TM_RECON_ACCUM TM_RECON_ACCUM_A64FXd +#define XP_RECON_ACCUM XP_RECON_ACCUM_A64FXd +#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXd +#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXd +#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXd +#define PERMUTE_DIR0 0 +#define PERMUTE_DIR1 1 +#define PERMUTE_DIR2 2 +#define PERMUTE_DIR3 3 +#define PERMUTE PERMUTE_A64FXd; +#define LOAD_TABLE(Dir) if (Dir == 0) { LOAD_TABLE0; } else if (Dir == 1) { LOAD_TABLE1; } else if (Dir == 2) { LOAD_TABLE2; } +#define MAYBEPERM(Dir,perm) if (Dir != 3) { if (perm) { PERMUTE; } } +// DECLARATIONS +#define DECLARATIONS_A64FXd \ + const uint64_t lut[4][8] = { \ + {4, 5, 6, 7, 0, 1, 2, 3}, \ + {2, 3, 0, 1, 6, 7, 4, 5}, \ + {1, 0, 3, 2, 5, 4, 7, 6}, \ + {0, 1, 2, 4, 5, 6, 7, 8} };\ + svfloat64_t result_00; \ + svfloat64_t result_01; \ + svfloat64_t result_02; \ + svfloat64_t result_10; \ + svfloat64_t result_11; \ + svfloat64_t result_12; \ + svfloat64_t result_20; \ + svfloat64_t result_21; \ + svfloat64_t result_22; \ + svfloat64_t result_30; \ + svfloat64_t result_31; \ + svfloat64_t result_32; \ + svfloat64_t Chi_00; \ + svfloat64_t Chi_01; \ + svfloat64_t Chi_02; \ + svfloat64_t Chi_10; \ + svfloat64_t Chi_11; \ + svfloat64_t Chi_12; \ + svfloat64_t UChi_00; \ + svfloat64_t UChi_01; \ + svfloat64_t UChi_02; \ + svfloat64_t UChi_10; \ + svfloat64_t UChi_11; \ + svfloat64_t UChi_12; \ + svfloat64_t U_00; \ + svfloat64_t U_10; \ + svfloat64_t U_20; \ + svfloat64_t U_01; \ + svfloat64_t U_11; \ + svfloat64_t U_21; \ + svbool_t pg1; \ + pg1 = svptrue_b64(); \ + svuint64_t table0; \ + svfloat64_t zero0; \ + zero0 = svdup_f64(0.); + +#define Chimu_00 Chi_00 +#define Chimu_01 Chi_01 +#define Chimu_02 Chi_02 +#define Chimu_10 Chi_10 +#define Chimu_11 Chi_11 +#define Chimu_12 Chi_12 +#define Chimu_20 UChi_00 +#define Chimu_21 UChi_01 +#define Chimu_22 UChi_02 +#define Chimu_30 UChi_10 +#define 
Chimu_31 UChi_11 +#define Chimu_32 UChi_12 +// RESULT +#define RESULT_A64FXd(base) \ +{ \ + svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64), result_00); \ + svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64), result_01); \ + svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64), result_02); \ + svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64), result_10); \ + svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64), result_11); \ + svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64), result_12); \ + svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64), result_20); \ + svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64), result_21); \ + svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64), result_22); \ + svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64), result_30); \ + svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64), result_31); \ + svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64), result_32); \ +} +// PREFETCH_CHIMU_L2 (prefetch to L2) +#define PREFETCH_CHIMU_L2_INTERNAL_A64FXd(base) \ +{ \ + svprfd(pg1, (int64_t*)(base + 0), SV_PLDL2STRM); \ + svprfd(pg1, (int64_t*)(base + 256), SV_PLDL2STRM); \ + svprfd(pg1, (int64_t*)(base + 512), SV_PLDL2STRM); \ +} +// PREFETCH_CHIMU_L1 (prefetch to L1) +#define PREFETCH_CHIMU_L1_INTERNAL_A64FXd(base) \ +{ \ + svprfd(pg1, (int64_t*)(base + 0), SV_PLDL1STRM); \ + svprfd(pg1, (int64_t*)(base + 256), SV_PLDL1STRM); \ + svprfd(pg1, (int64_t*)(base + 512), SV_PLDL1STRM); \ +} +// PREFETCH_GAUGE_L2 (prefetch to L2) +#define PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A) \ +{ \ + const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \ + svprfd(pg1, (int64_t*)(baseU + -256), SV_PLDL2STRM); \ + svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL2STRM); \ + svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL2STRM); \ + svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL2STRM); \ + svprfd(pg1, (int64_t*)(baseU + 768), SV_PLDL2STRM); \ + svprfd(pg1, (int64_t*)(baseU + 1024), SV_PLDL2STRM); \ + svprfd(pg1, (int64_t*)(baseU + 1280), SV_PLDL2STRM); \ + svprfd(pg1, (int64_t*)(baseU + 1536), SV_PLDL2STRM); \ + svprfd(pg1, (int64_t*)(baseU + 1792), SV_PLDL2STRM); \ +} +// PREFETCH_GAUGE_L1 (prefetch to L1) +#define PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A) \ +{ \ + const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ + svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL1STRM); \ + svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL1STRM); \ + svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL1STRM); \ +} +// LOAD_CHI +#define LOAD_CHI_A64FXd(base) \ +{ \ + Chi_00 = svld1(pg1, (float64_t*)(base + 0 * 64)); \ + Chi_01 = svld1(pg1, (float64_t*)(base + 1 * 64)); \ + Chi_02 = svld1(pg1, (float64_t*)(base + 2 * 64)); \ + Chi_10 = svld1(pg1, (float64_t*)(base + 3 * 64)); \ + Chi_11 = svld1(pg1, (float64_t*)(base + 4 * 64)); \ + Chi_12 = svld1(pg1, (float64_t*)(base + 5 * 64)); \ +} +// LOAD_CHIMU +#define LOAD_CHIMU_INTERLEAVED_A64FXd(base) \ +{ \ + Chimu_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64)); \ + Chimu_30 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64)); \ + Chimu_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64)); \ + Chimu_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64)); \ + Chimu_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64)); \ + Chimu_31 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64)); \ + Chimu_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64)); \ + Chimu_21 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64)); \ + Chimu_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64)); \ + Chimu_32 = svld1(pg1, 
(float64_t*)(base + 2 * 3 * 64 + 5 * 64)); \ + Chimu_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64)); \ + Chimu_22 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64)); \ +} +// LOAD_CHIMU_0213 +#define LOAD_CHIMU_0213_A64FXd \ +{ \ + const SiteSpinor & ref(in[offset]); \ + Chimu_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64)); \ + Chimu_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64)); \ + Chimu_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64)); \ + Chimu_21 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64)); \ + Chimu_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64)); \ + Chimu_22 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64)); \ + Chimu_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64)); \ + Chimu_30 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64)); \ + Chimu_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64)); \ + Chimu_31 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64)); \ + Chimu_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64)); \ + Chimu_32 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64)); \ +} +// LOAD_CHIMU_0312 +#define LOAD_CHIMU_0312_A64FXd \ +{ \ + const SiteSpinor & ref(in[offset]); \ + Chimu_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64)); \ + Chimu_30 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64)); \ + Chimu_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64)); \ + Chimu_31 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64)); \ + Chimu_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64)); \ + Chimu_32 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64)); \ + Chimu_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64)); \ + Chimu_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64)); \ + Chimu_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64)); \ + Chimu_21 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64)); \ + Chimu_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64)); \ + Chimu_22 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64)); \ +} +// LOAD_TABLE0 +#define LOAD_TABLE0 \ + table0 = svld1(pg1, (uint64_t*)&lut[0]); + +// LOAD_TABLE1 +#define LOAD_TABLE1 \ + table0 = svld1(pg1, (uint64_t*)&lut[1]); + +// LOAD_TABLE2 +#define LOAD_TABLE2 \ + table0 = svld1(pg1, (uint64_t*)&lut[2]); + +// LOAD_TABLE3 +#define LOAD_TABLE3 \ + table0 = svld1(pg1, (uint64_t*)&lut[3]); + +// PERMUTE +#define PERMUTE_A64FXd \ + Chi_00 = svtbl(Chi_00, table0); \ + Chi_01 = svtbl(Chi_01, table0); \ + Chi_02 = svtbl(Chi_02, table0); \ + Chi_10 = svtbl(Chi_10, table0); \ + Chi_11 = svtbl(Chi_11, table0); \ + Chi_12 = svtbl(Chi_12, table0); + +// LOAD_GAUGE +#define LOAD_GAUGE \ + const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ +{ \ + U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \ + U_10 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -3 * 64)); \ + U_20 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 0 * 64)); \ + U_01 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -5 * 64)); \ + U_11 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -2 * 64)); \ + U_21 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \ +} +// MULT_2SPIN +#define MULT_2SPIN_1_A64FXd(A) \ +{ \ + const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ + U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \ + U_10 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -3 * 64)); \ + U_20 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 0 * 64)); \ + U_01 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -5 * 64)); \ + U_11 = 
svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -2 * 64)); \ + U_21 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \ + UChi_00 = svcmla_x(pg1, zero0, U_00, Chi_00, 0); \ + UChi_10 = svcmla_x(pg1, zero0, U_00, Chi_10, 0); \ + UChi_01 = svcmla_x(pg1, zero0, U_10, Chi_00, 0); \ + UChi_11 = svcmla_x(pg1, zero0, U_10, Chi_10, 0); \ + UChi_02 = svcmla_x(pg1, zero0, U_20, Chi_00, 0); \ + UChi_12 = svcmla_x(pg1, zero0, U_20, Chi_10, 0); \ + UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_00, 90); \ + UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_10, 90); \ + UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_00, 90); \ + UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_10, 90); \ + UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_00, 90); \ + UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_10, 90); \ + U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -4 * 64)); \ + U_10 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -1 * 64)); \ + U_20 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 2 * 64)); \ +} +// MULT_2SPIN_BACKEND +#define MULT_2SPIN_2_A64FXd \ +{ \ + UChi_00 = svcmla_x(pg1, UChi_00, U_01, Chi_01, 0); \ + UChi_10 = svcmla_x(pg1, UChi_10, U_01, Chi_11, 0); \ + UChi_01 = svcmla_x(pg1, UChi_01, U_11, Chi_01, 0); \ + UChi_11 = svcmla_x(pg1, UChi_11, U_11, Chi_11, 0); \ + UChi_02 = svcmla_x(pg1, UChi_02, U_21, Chi_01, 0); \ + UChi_12 = svcmla_x(pg1, UChi_12, U_21, Chi_11, 0); \ + UChi_00 = svcmla_x(pg1, UChi_00, U_01, Chi_01, 90); \ + UChi_10 = svcmla_x(pg1, UChi_10, U_01, Chi_11, 90); \ + UChi_01 = svcmla_x(pg1, UChi_01, U_11, Chi_01, 90); \ + UChi_11 = svcmla_x(pg1, UChi_11, U_11, Chi_11, 90); \ + UChi_02 = svcmla_x(pg1, UChi_02, U_21, Chi_01, 90); \ + UChi_12 = svcmla_x(pg1, UChi_12, U_21, Chi_11, 90); \ + UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_02, 0); \ + UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_12, 0); \ + UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_02, 0); \ + UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_12, 0); \ + UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_02, 0); \ + UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_12, 0); \ + UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_02, 90); \ + UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_12, 90); \ + UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_02, 90); \ + UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_12, 90); \ + UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_02, 90); \ + UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_12, 90); \ +} +// XP_PROJ +#define XP_PROJ_A64FXd \ +{ \ + Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_30, 90); \ + Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_31, 90); \ + Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_32, 90); \ + Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_20, 90); \ + Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_21, 90); \ + Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_22, 90); \ +} +// XP_RECON +#define XP_RECON_A64FXd \ + result_20 = svcadd_x(pg1, zero0, UChi_10, 270); \ + result_21 = svcadd_x(pg1, zero0, UChi_11, 270); \ + result_22 = svcadd_x(pg1, zero0, UChi_12, 270); \ + result_30 = svcadd_x(pg1, zero0, UChi_00, 270); \ + result_31 = svcadd_x(pg1, zero0, UChi_01, 270); \ + result_32 = svcadd_x(pg1, zero0, UChi_02, 270); \ + result_00 = UChi_00; \ + result_01 = UChi_01; \ + result_02 = UChi_02; \ + result_10 = UChi_10; \ + result_11 = UChi_11; \ + result_12 = UChi_12; + +// XP_RECON_ACCUM +#define XP_RECON_ACCUM_A64FXd \ + result_30 = svcadd_x(pg1, result_30, UChi_00, 270); \ + result_00 = svadd_x(pg1, result_00, UChi_00); \ + result_31 = svcadd_x(pg1, result_31, UChi_01, 270); \ + result_01 = svadd_x(pg1, result_01, UChi_01); \ + result_32 = svcadd_x(pg1, result_32, UChi_02, 270); \ 
+ result_02 = svadd_x(pg1, result_02, UChi_02); \ + result_20 = svcadd_x(pg1, result_20, UChi_10, 270); \ + result_10 = svadd_x(pg1, result_10, UChi_10); \ + result_21 = svcadd_x(pg1, result_21, UChi_11, 270); \ + result_11 = svadd_x(pg1, result_11, UChi_11); \ + result_22 = svcadd_x(pg1, result_22, UChi_12, 270); \ + result_12 = svadd_x(pg1, result_12, UChi_12); + +// YP_PROJ +#define YP_PROJ_A64FXd \ +{ \ + Chi_00 = svsub_x(pg1, Chimu_00, Chimu_30); \ + Chi_01 = svsub_x(pg1, Chimu_01, Chimu_31); \ + Chi_02 = svsub_x(pg1, Chimu_02, Chimu_32); \ + Chi_10 = svadd_x(pg1, Chimu_10, Chimu_20); \ + Chi_11 = svadd_x(pg1, Chimu_11, Chimu_21); \ + Chi_12 = svadd_x(pg1, Chimu_12, Chimu_22); \ +} +// ZP_PROJ +#define ZP_PROJ_A64FXd \ +{ \ + Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_20, 90); \ + Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_21, 90); \ + Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_22, 90); \ + Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_30, 270); \ + Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_31, 270); \ + Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_32, 270); \ +} +// TP_PROJ +#define TP_PROJ_A64FXd \ +{ \ + Chi_00 = svadd_x(pg1, Chimu_00, Chimu_20); \ + Chi_01 = svadd_x(pg1, Chimu_01, Chimu_21); \ + Chi_02 = svadd_x(pg1, Chimu_02, Chimu_22); \ + Chi_10 = svadd_x(pg1, Chimu_10, Chimu_30); \ + Chi_11 = svadd_x(pg1, Chimu_11, Chimu_31); \ + Chi_12 = svadd_x(pg1, Chimu_12, Chimu_32); \ +} +// XM_PROJ +#define XM_PROJ_A64FXd \ +{ \ + Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_30, 270); \ + Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_31, 270); \ + Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_32, 270); \ + Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_20, 270); \ + Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_21, 270); \ + Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_22, 270); \ +} +// XM_RECON +#define XM_RECON_A64FXd \ + result_20 = svcadd_x(pg1, zero0, UChi_10, 90); \ + result_21 = svcadd_x(pg1, zero0, UChi_11, 90); \ + result_22 = svcadd_x(pg1, zero0, UChi_12, 90); \ + result_30 = svcadd_x(pg1, zero0, UChi_00, 90); \ + result_31 = svcadd_x(pg1, zero0, UChi_01, 90); \ + result_32 = svcadd_x(pg1, zero0, UChi_02, 90); \ + result_00 = UChi_00; \ + result_01 = UChi_01; \ + result_02 = UChi_02; \ + result_10 = UChi_10; \ + result_11 = UChi_11; \ + result_12 = UChi_12; + +// YM_PROJ +#define YM_PROJ_A64FXd \ +{ \ + Chi_00 = svadd_x(pg1, Chimu_00, Chimu_30); \ + Chi_01 = svadd_x(pg1, Chimu_01, Chimu_31); \ + Chi_02 = svadd_x(pg1, Chimu_02, Chimu_32); \ + Chi_10 = svsub_x(pg1, Chimu_10, Chimu_20); \ + Chi_11 = svsub_x(pg1, Chimu_11, Chimu_21); \ + Chi_12 = svsub_x(pg1, Chimu_12, Chimu_22); \ +} +// ZM_PROJ +#define ZM_PROJ_A64FXd \ +{ \ + Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_20, 270); \ + Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_21, 270); \ + Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_22, 270); \ + Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_30, 90); \ + Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_31, 90); \ + Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_32, 90); \ +} +// TM_PROJ +#define TM_PROJ_A64FXd \ +{ \ + Chi_00 = svsub_x(pg1, Chimu_00, Chimu_20); \ + Chi_01 = svsub_x(pg1, Chimu_01, Chimu_21); \ + Chi_02 = svsub_x(pg1, Chimu_02, Chimu_22); \ + Chi_10 = svsub_x(pg1, Chimu_10, Chimu_30); \ + Chi_11 = svsub_x(pg1, Chimu_11, Chimu_31); \ + Chi_12 = svsub_x(pg1, Chimu_12, Chimu_32); \ +} +// XM_RECON_ACCUM +#define XM_RECON_ACCUM_A64FXd \ + result_30 = svcadd_x(pg1, result_30, UChi_00, 90); \ + result_31 = svcadd_x(pg1, result_31, UChi_01, 90); \ + result_32 = svcadd_x(pg1, result_32, UChi_02, 90); \ + result_20 = svcadd_x(pg1, result_20, UChi_10, 90); \ + result_21 = 
svcadd_x(pg1, result_21, UChi_11, 90); \ + result_22 = svcadd_x(pg1, result_22, UChi_12, 90); \ + result_00 = svadd_x(pg1, result_00, UChi_00); \ + result_01 = svadd_x(pg1, result_01, UChi_01); \ + result_02 = svadd_x(pg1, result_02, UChi_02); \ + result_10 = svadd_x(pg1, result_10, UChi_10); \ + result_11 = svadd_x(pg1, result_11, UChi_11); \ + result_12 = svadd_x(pg1, result_12, UChi_12); + +// YP_RECON_ACCUM +#define YP_RECON_ACCUM_A64FXd \ + result_00 = svadd_x(pg1, result_00, UChi_00); \ + result_30 = svsub_x(pg1, result_30, UChi_00); \ + result_01 = svadd_x(pg1, result_01, UChi_01); \ + result_31 = svsub_x(pg1, result_31, UChi_01); \ + result_02 = svadd_x(pg1, result_02, UChi_02); \ + result_32 = svsub_x(pg1, result_32, UChi_02); \ + result_10 = svadd_x(pg1, result_10, UChi_10); \ + result_20 = svadd_x(pg1, result_20, UChi_10); \ + result_11 = svadd_x(pg1, result_11, UChi_11); \ + result_21 = svadd_x(pg1, result_21, UChi_11); \ + result_12 = svadd_x(pg1, result_12, UChi_12); \ + result_22 = svadd_x(pg1, result_22, UChi_12); + +// YM_RECON_ACCUM +#define YM_RECON_ACCUM_A64FXd \ + result_00 = svadd_x(pg1, result_00, UChi_00); \ + result_30 = svadd_x(pg1, result_30, UChi_00); \ + result_01 = svadd_x(pg1, result_01, UChi_01); \ + result_31 = svadd_x(pg1, result_31, UChi_01); \ + result_02 = svadd_x(pg1, result_02, UChi_02); \ + result_32 = svadd_x(pg1, result_32, UChi_02); \ + result_10 = svadd_x(pg1, result_10, UChi_10); \ + result_20 = svsub_x(pg1, result_20, UChi_10); \ + result_11 = svadd_x(pg1, result_11, UChi_11); \ + result_21 = svsub_x(pg1, result_21, UChi_11); \ + result_12 = svadd_x(pg1, result_12, UChi_12); \ + result_22 = svsub_x(pg1, result_22, UChi_12); + +// ZP_RECON_ACCUM +#define ZP_RECON_ACCUM_A64FXd \ + result_20 = svcadd_x(pg1, result_20, UChi_00, 270); \ + result_00 = svadd_x(pg1, result_00, UChi_00); \ + result_21 = svcadd_x(pg1, result_21, UChi_01, 270); \ + result_01 = svadd_x(pg1, result_01, UChi_01); \ + result_22 = svcadd_x(pg1, result_22, UChi_02, 270); \ + result_02 = svadd_x(pg1, result_02, UChi_02); \ + result_30 = svcadd_x(pg1, result_30, UChi_10, 90); \ + result_10 = svadd_x(pg1, result_10, UChi_10); \ + result_31 = svcadd_x(pg1, result_31, UChi_11, 90); \ + result_11 = svadd_x(pg1, result_11, UChi_11); \ + result_32 = svcadd_x(pg1, result_32, UChi_12, 90); \ + result_12 = svadd_x(pg1, result_12, UChi_12); + +// ZM_RECON_ACCUM +#define ZM_RECON_ACCUM_A64FXd \ + result_20 = svcadd_x(pg1, result_20, UChi_00, 90); \ + result_00 = svadd_x(pg1, result_00, UChi_00); \ + result_21 = svcadd_x(pg1, result_21, UChi_01, 90); \ + result_01 = svadd_x(pg1, result_01, UChi_01); \ + result_22 = svcadd_x(pg1, result_22, UChi_02, 90); \ + result_02 = svadd_x(pg1, result_02, UChi_02); \ + result_30 = svcadd_x(pg1, result_30, UChi_10, 270); \ + result_10 = svadd_x(pg1, result_10, UChi_10); \ + result_31 = svcadd_x(pg1, result_31, UChi_11, 270); \ + result_11 = svadd_x(pg1, result_11, UChi_11); \ + result_32 = svcadd_x(pg1, result_32, UChi_12, 270); \ + result_12 = svadd_x(pg1, result_12, UChi_12); + +// TP_RECON_ACCUM +#define TP_RECON_ACCUM_A64FXd \ + result_00 = svadd_x(pg1, result_00, UChi_00); \ + result_20 = svadd_x(pg1, result_20, UChi_00); \ + result_01 = svadd_x(pg1, result_01, UChi_01); \ + result_21 = svadd_x(pg1, result_21, UChi_01); \ + result_02 = svadd_x(pg1, result_02, UChi_02); \ + result_22 = svadd_x(pg1, result_22, UChi_02); \ + result_10 = svadd_x(pg1, result_10, UChi_10); \ + result_30 = svadd_x(pg1, result_30, UChi_10); \ + result_11 = svadd_x(pg1, 
result_11, UChi_11); \ + result_31 = svadd_x(pg1, result_31, UChi_11); \ + result_12 = svadd_x(pg1, result_12, UChi_12); \ + result_32 = svadd_x(pg1, result_32, UChi_12); + +// TM_RECON_ACCUM +#define TM_RECON_ACCUM_A64FXd \ + result_00 = svadd_x(pg1, result_00, UChi_00); \ + result_20 = svsub_x(pg1, result_20, UChi_00); \ + result_01 = svadd_x(pg1, result_01, UChi_01); \ + result_21 = svsub_x(pg1, result_21, UChi_01); \ + result_02 = svadd_x(pg1, result_02, UChi_02); \ + result_22 = svsub_x(pg1, result_22, UChi_02); \ + result_10 = svadd_x(pg1, result_10, UChi_10); \ + result_30 = svsub_x(pg1, result_30, UChi_10); \ + result_11 = svadd_x(pg1, result_11, UChi_11); \ + result_31 = svsub_x(pg1, result_31, UChi_11); \ + result_12 = svadd_x(pg1, result_12, UChi_12); \ + result_32 = svsub_x(pg1, result_32, UChi_12); + +// ZERO_PSI +#define ZERO_PSI_A64FXd \ + result_00 = svdup_f64(0.); \ + result_01 = svdup_f64(0.); \ + result_02 = svdup_f64(0.); \ + result_10 = svdup_f64(0.); \ + result_11 = svdup_f64(0.); \ + result_12 = svdup_f64(0.); \ + result_20 = svdup_f64(0.); \ + result_21 = svdup_f64(0.); \ + result_22 = svdup_f64(0.); \ + result_30 = svdup_f64(0.); \ + result_31 = svdup_f64(0.); \ + result_32 = svdup_f64(0.); + +// PREFETCH_RESULT_L2_STORE (prefetch store to L2) +#define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXd(base) \ +{ \ + svprfd(pg1, (int64_t*)(base + 0), SV_PSTL2STRM); \ + svprfd(pg1, (int64_t*)(base + 256), SV_PSTL2STRM); \ + svprfd(pg1, (int64_t*)(base + 512), SV_PSTL2STRM); \ +} +// PREFETCH_RESULT_L1_STORE (prefetch store to L1) +#define PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXd(base) \ +{ \ + svprfd(pg1, (int64_t*)(base + 0), SV_PSTL1STRM); \ + svprfd(pg1, (int64_t*)(base + 256), SV_PSTL1STRM); \ + svprfd(pg1, (int64_t*)(base + 512), SV_PSTL1STRM); \ +} +// ADD_RESULT_INTERNAL +#define ADD_RESULT_INTERNAL_A64FXd \ + result_00 = svadd_x(pg1, result_00, Chimu_00); \ + result_01 = svadd_x(pg1, result_01, Chimu_01); \ + result_02 = svadd_x(pg1, result_02, Chimu_02); \ + result_10 = svadd_x(pg1, result_10, Chimu_10); \ + result_11 = svadd_x(pg1, result_11, Chimu_11); \ + result_12 = svadd_x(pg1, result_12, Chimu_12); \ + result_20 = svadd_x(pg1, result_20, Chimu_20); \ + result_21 = svadd_x(pg1, result_21, Chimu_21); \ + result_22 = svadd_x(pg1, result_22, Chimu_22); \ + result_30 = svadd_x(pg1, result_30, Chimu_30); \ + result_31 = svadd_x(pg1, result_31, Chimu_31); \ + result_32 = svadd_x(pg1, result_32, Chimu_32); + diff --git a/Grid/simd/Fujitsu_A64FX_intrin_single.h b/Grid/simd/Fujitsu_A64FX_intrin_single.h new file mode 100644 index 00000000..180e5f4f --- /dev/null +++ b/Grid/simd/Fujitsu_A64FX_intrin_single.h @@ -0,0 +1,601 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: Fujitsu_A64FX_intrin_single.h + + Copyright (C) 2020 + +Author: Nils Meyer + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#define LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_A64FXf(base) +#define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXf(A) +#define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A) +#define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXf(A) +#define PREFETCH_GAUGE_L2(A) PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A) +#define PF_GAUGE(A) +#define PREFETCH_RESULT_L2_STORE(A) PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXf(A) +#define PREFETCH_RESULT_L1_STORE(A) PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXf(A) +#define PREFETCH1_CHIMU(A) PREFETCH_CHIMU_L1(A) +#define PREFETCH_CHIMU(A) PREFETCH_CHIMU_L1(A) +#define LOCK_GAUGE(A) +#define UNLOCK_GAUGE(A) +#define MASK_REGS DECLARATIONS_A64FXf +#define SAVE_RESULT(A,B) RESULT_A64FXf(A); PREFETCH_RESULT_L2_STORE(B) +#define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXf(Dir) +#define MULT_2SPIN_2 MULT_2SPIN_2_A64FXf +#define LOAD_CHI(base) LOAD_CHI_A64FXf(base) +#define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base) +#define XP_PROJ XP_PROJ_A64FXf +#define YP_PROJ YP_PROJ_A64FXf +#define ZP_PROJ ZP_PROJ_A64FXf +#define TP_PROJ TP_PROJ_A64FXf +#define XM_PROJ XM_PROJ_A64FXf +#define YM_PROJ YM_PROJ_A64FXf +#define ZM_PROJ ZM_PROJ_A64FXf +#define TM_PROJ TM_PROJ_A64FXf +#define XP_RECON XP_RECON_A64FXf +#define XM_RECON XM_RECON_A64FXf +#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXf +#define YM_RECON_ACCUM YM_RECON_ACCUM_A64FXf +#define ZM_RECON_ACCUM ZM_RECON_ACCUM_A64FXf +#define TM_RECON_ACCUM TM_RECON_ACCUM_A64FXf +#define XP_RECON_ACCUM XP_RECON_ACCUM_A64FXf +#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXf +#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXf +#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXf +#define PERMUTE_DIR0 0 +#define PERMUTE_DIR1 1 +#define PERMUTE_DIR2 2 +#define PERMUTE_DIR3 3 +#define PERMUTE PERMUTE_A64FXf; +#define LOAD_TABLE(Dir) if (Dir == 0) { LOAD_TABLE0; } else if (Dir == 1) { LOAD_TABLE1 } else if (Dir == 2) { LOAD_TABLE2; } else if (Dir == 3) { LOAD_TABLE3; } +#define MAYBEPERM(A,perm) if (perm) { PERMUTE; } +// DECLARATIONS +#define DECLARATIONS_A64FXf \ + const uint32_t lut[4][16] = { \ + {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}, \ + {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}, \ + {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}, \ + {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14} }; \ + svfloat32_t result_00; \ + svfloat32_t result_01; \ + svfloat32_t result_02; \ + svfloat32_t result_10; \ + svfloat32_t result_11; \ + svfloat32_t result_12; \ + svfloat32_t result_20; \ + svfloat32_t result_21; \ + svfloat32_t result_22; \ + svfloat32_t result_30; \ + svfloat32_t result_31; \ + svfloat32_t result_32; \ + svfloat32_t Chi_00; \ + svfloat32_t Chi_01; \ + svfloat32_t Chi_02; \ + svfloat32_t Chi_10; \ + svfloat32_t Chi_11; \ + svfloat32_t Chi_12; \ + svfloat32_t UChi_00; \ + svfloat32_t UChi_01; \ + svfloat32_t UChi_02; \ + svfloat32_t UChi_10; \ + svfloat32_t UChi_11; \ + svfloat32_t UChi_12; \ + svfloat32_t U_00; \ + svfloat32_t U_10; \ + svfloat32_t U_20; \ + svfloat32_t U_01; \ + svfloat32_t U_11; \ + svfloat32_t U_21; \ + svbool_t pg1; \ + pg1 = 
svptrue_b32(); \ + svuint32_t table0; \ + svfloat32_t zero0; \ + zero0 = svdup_f32(0.); + +#define Chimu_00 Chi_00 +#define Chimu_01 Chi_01 +#define Chimu_02 Chi_02 +#define Chimu_10 Chi_10 +#define Chimu_11 Chi_11 +#define Chimu_12 Chi_12 +#define Chimu_20 UChi_00 +#define Chimu_21 UChi_01 +#define Chimu_22 UChi_02 +#define Chimu_30 UChi_10 +#define Chimu_31 UChi_11 +#define Chimu_32 UChi_12 +// RESULT +#define RESULT_A64FXf(base) \ +{ \ + svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64), result_00); \ + svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64), result_01); \ + svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64), result_02); \ + svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64), result_10); \ + svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64), result_11); \ + svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64), result_12); \ + svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64), result_20); \ + svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64), result_21); \ + svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64), result_22); \ + svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64), result_30); \ + svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64), result_31); \ + svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64), result_32); \ +} +// PREFETCH_CHIMU_L2 (prefetch to L2) +#define PREFETCH_CHIMU_L2_INTERNAL_A64FXf(base) \ +{ \ + svprfd(pg1, (int64_t*)(base + 0), SV_PLDL2STRM); \ + svprfd(pg1, (int64_t*)(base + 256), SV_PLDL2STRM); \ + svprfd(pg1, (int64_t*)(base + 512), SV_PLDL2STRM); \ +} +// PREFETCH_CHIMU_L1 (prefetch to L1) +#define PREFETCH_CHIMU_L1_INTERNAL_A64FXf(base) \ +{ \ + svprfd(pg1, (int64_t*)(base + 0), SV_PLDL1STRM); \ + svprfd(pg1, (int64_t*)(base + 256), SV_PLDL1STRM); \ + svprfd(pg1, (int64_t*)(base + 512), SV_PLDL1STRM); \ +} +// PREFETCH_GAUGE_L2 (prefetch to L2) +#define PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A) \ +{ \ + const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \ + svprfd(pg1, (int64_t*)(baseU + -256), SV_PLDL2STRM); \ + svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL2STRM); \ + svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL2STRM); \ + svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL2STRM); \ + svprfd(pg1, (int64_t*)(baseU + 768), SV_PLDL2STRM); \ + svprfd(pg1, (int64_t*)(baseU + 1024), SV_PLDL2STRM); \ + svprfd(pg1, (int64_t*)(baseU + 1280), SV_PLDL2STRM); \ + svprfd(pg1, (int64_t*)(baseU + 1536), SV_PLDL2STRM); \ + svprfd(pg1, (int64_t*)(baseU + 1792), SV_PLDL2STRM); \ +} +// PREFETCH_GAUGE_L1 (prefetch to L1) +#define PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A) \ +{ \ + const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ + svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL1STRM); \ + svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL1STRM); \ + svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL1STRM); \ +} +// LOAD_CHI +#define LOAD_CHI_A64FXf(base) \ +{ \ + Chi_00 = svld1(pg1, (float32_t*)(base + 0 * 64)); \ + Chi_01 = svld1(pg1, (float32_t*)(base + 1 * 64)); \ + Chi_02 = svld1(pg1, (float32_t*)(base + 2 * 64)); \ + Chi_10 = svld1(pg1, (float32_t*)(base + 3 * 64)); \ + Chi_11 = svld1(pg1, (float32_t*)(base + 4 * 64)); \ + Chi_12 = svld1(pg1, (float32_t*)(base + 5 * 64)); \ +} +// LOAD_CHIMU +#define LOAD_CHIMU_INTERLEAVED_A64FXf(base) \ +{ \ + Chimu_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64)); \ + Chimu_30 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64)); \ + Chimu_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64)); \ + Chimu_20 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64)); \ + Chimu_01 = 
svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64)); \ + Chimu_31 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64)); \ + Chimu_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64)); \ + Chimu_21 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64)); \ + Chimu_02 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64)); \ + Chimu_32 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64)); \ + Chimu_12 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64)); \ + Chimu_22 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64)); \ +} +// LOAD_CHIMU_0213 +#define LOAD_CHIMU_0213_A64FXf \ +{ \ + const SiteSpinor & ref(in[offset]); \ + Chimu_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64)); \ + Chimu_20 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64)); \ + Chimu_01 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64)); \ + Chimu_21 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64)); \ + Chimu_02 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64)); \ + Chimu_22 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64)); \ + Chimu_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64)); \ + Chimu_30 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64)); \ + Chimu_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64)); \ + Chimu_31 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64)); \ + Chimu_12 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64)); \ + Chimu_32 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64)); \ +} +// LOAD_CHIMU_0312 +#define LOAD_CHIMU_0312_A64FXf \ +{ \ + const SiteSpinor & ref(in[offset]); \ + Chimu_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64)); \ + Chimu_30 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64)); \ + Chimu_01 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64)); \ + Chimu_31 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64)); \ + Chimu_02 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64)); \ + Chimu_32 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64)); \ + Chimu_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64)); \ + Chimu_20 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64)); \ + Chimu_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64)); \ + Chimu_21 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64)); \ + Chimu_12 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64)); \ + Chimu_22 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64)); \ +} +// LOAD_TABLE0 +#define LOAD_TABLE0 \ + table0 = svld1(pg1, (uint32_t*)&lut[0]); + +// LOAD_TABLE1 +#define LOAD_TABLE1 \ + table0 = svld1(pg1, (uint32_t*)&lut[1]); + +// LOAD_TABLE2 +#define LOAD_TABLE2 \ + table0 = svld1(pg1, (uint32_t*)&lut[2]); + +// LOAD_TABLE3 +#define LOAD_TABLE3 \ + table0 = svld1(pg1, (uint32_t*)&lut[3]); + +// PERMUTE +#define PERMUTE_A64FXf \ + Chi_00 = svtbl(Chi_00, table0); \ + Chi_01 = svtbl(Chi_01, table0); \ + Chi_02 = svtbl(Chi_02, table0); \ + Chi_10 = svtbl(Chi_10, table0); \ + Chi_11 = svtbl(Chi_11, table0); \ + Chi_12 = svtbl(Chi_12, table0); + +// LOAD_GAUGE +#define LOAD_GAUGE \ + const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ +{ \ + U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \ + U_10 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -3 * 64)); \ + U_20 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 0 * 64)); \ + U_01 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -5 * 64)); \ + U_11 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -2 * 64)); \ + U_21 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \ +} +// MULT_2SPIN +#define 
MULT_2SPIN_1_A64FXf(A) \ +{ \ + const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ + U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \ + U_10 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -3 * 64)); \ + U_20 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 0 * 64)); \ + U_01 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -5 * 64)); \ + U_11 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -2 * 64)); \ + U_21 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \ + UChi_00 = svcmla_x(pg1, zero0, U_00, Chi_00, 0); \ + UChi_10 = svcmla_x(pg1, zero0, U_00, Chi_10, 0); \ + UChi_01 = svcmla_x(pg1, zero0, U_10, Chi_00, 0); \ + UChi_11 = svcmla_x(pg1, zero0, U_10, Chi_10, 0); \ + UChi_02 = svcmla_x(pg1, zero0, U_20, Chi_00, 0); \ + UChi_12 = svcmla_x(pg1, zero0, U_20, Chi_10, 0); \ + UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_00, 90); \ + UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_10, 90); \ + UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_00, 90); \ + UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_10, 90); \ + UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_00, 90); \ + UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_10, 90); \ + U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -4 * 64)); \ + U_10 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -1 * 64)); \ + U_20 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 2 * 64)); \ +} +// MULT_2SPIN_BACKEND +#define MULT_2SPIN_2_A64FXf \ +{ \ + UChi_00 = svcmla_x(pg1, UChi_00, U_01, Chi_01, 0); \ + UChi_10 = svcmla_x(pg1, UChi_10, U_01, Chi_11, 0); \ + UChi_01 = svcmla_x(pg1, UChi_01, U_11, Chi_01, 0); \ + UChi_11 = svcmla_x(pg1, UChi_11, U_11, Chi_11, 0); \ + UChi_02 = svcmla_x(pg1, UChi_02, U_21, Chi_01, 0); \ + UChi_12 = svcmla_x(pg1, UChi_12, U_21, Chi_11, 0); \ + UChi_00 = svcmla_x(pg1, UChi_00, U_01, Chi_01, 90); \ + UChi_10 = svcmla_x(pg1, UChi_10, U_01, Chi_11, 90); \ + UChi_01 = svcmla_x(pg1, UChi_01, U_11, Chi_01, 90); \ + UChi_11 = svcmla_x(pg1, UChi_11, U_11, Chi_11, 90); \ + UChi_02 = svcmla_x(pg1, UChi_02, U_21, Chi_01, 90); \ + UChi_12 = svcmla_x(pg1, UChi_12, U_21, Chi_11, 90); \ + UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_02, 0); \ + UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_12, 0); \ + UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_02, 0); \ + UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_12, 0); \ + UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_02, 0); \ + UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_12, 0); \ + UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_02, 90); \ + UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_12, 90); \ + UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_02, 90); \ + UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_12, 90); \ + UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_02, 90); \ + UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_12, 90); \ +} +// XP_PROJ +#define XP_PROJ_A64FXf \ +{ \ + Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_30, 90); \ + Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_31, 90); \ + Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_32, 90); \ + Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_20, 90); \ + Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_21, 90); \ + Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_22, 90); \ +} +// XP_RECON +#define XP_RECON_A64FXf \ + result_20 = svcadd_x(pg1, zero0, UChi_10, 270); \ + result_21 = svcadd_x(pg1, zero0, UChi_11, 270); \ + result_22 = svcadd_x(pg1, zero0, UChi_12, 270); \ + result_30 = svcadd_x(pg1, zero0, UChi_00, 270); \ + result_31 = svcadd_x(pg1, zero0, UChi_01, 270); \ + result_32 = svcadd_x(pg1, zero0, UChi_02, 270); \ + result_00 = UChi_00; \ + result_01 = UChi_01; \ + result_02 = UChi_02; \ + result_10 = UChi_10; 
\ + result_11 = UChi_11; \ + result_12 = UChi_12; + +// XP_RECON_ACCUM +#define XP_RECON_ACCUM_A64FXf \ + result_30 = svcadd_x(pg1, result_30, UChi_00, 270); \ + result_00 = svadd_x(pg1, result_00, UChi_00); \ + result_31 = svcadd_x(pg1, result_31, UChi_01, 270); \ + result_01 = svadd_x(pg1, result_01, UChi_01); \ + result_32 = svcadd_x(pg1, result_32, UChi_02, 270); \ + result_02 = svadd_x(pg1, result_02, UChi_02); \ + result_20 = svcadd_x(pg1, result_20, UChi_10, 270); \ + result_10 = svadd_x(pg1, result_10, UChi_10); \ + result_21 = svcadd_x(pg1, result_21, UChi_11, 270); \ + result_11 = svadd_x(pg1, result_11, UChi_11); \ + result_22 = svcadd_x(pg1, result_22, UChi_12, 270); \ + result_12 = svadd_x(pg1, result_12, UChi_12); + +// YP_PROJ +#define YP_PROJ_A64FXf \ +{ \ + Chi_00 = svsub_x(pg1, Chimu_00, Chimu_30); \ + Chi_01 = svsub_x(pg1, Chimu_01, Chimu_31); \ + Chi_02 = svsub_x(pg1, Chimu_02, Chimu_32); \ + Chi_10 = svadd_x(pg1, Chimu_10, Chimu_20); \ + Chi_11 = svadd_x(pg1, Chimu_11, Chimu_21); \ + Chi_12 = svadd_x(pg1, Chimu_12, Chimu_22); \ +} +// ZP_PROJ +#define ZP_PROJ_A64FXf \ +{ \ + Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_20, 90); \ + Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_21, 90); \ + Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_22, 90); \ + Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_30, 270); \ + Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_31, 270); \ + Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_32, 270); \ +} +// TP_PROJ +#define TP_PROJ_A64FXf \ +{ \ + Chi_00 = svadd_x(pg1, Chimu_00, Chimu_20); \ + Chi_01 = svadd_x(pg1, Chimu_01, Chimu_21); \ + Chi_02 = svadd_x(pg1, Chimu_02, Chimu_22); \ + Chi_10 = svadd_x(pg1, Chimu_10, Chimu_30); \ + Chi_11 = svadd_x(pg1, Chimu_11, Chimu_31); \ + Chi_12 = svadd_x(pg1, Chimu_12, Chimu_32); \ +} +// XM_PROJ +#define XM_PROJ_A64FXf \ +{ \ + Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_30, 270); \ + Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_31, 270); \ + Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_32, 270); \ + Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_20, 270); \ + Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_21, 270); \ + Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_22, 270); \ +} +// XM_RECON +#define XM_RECON_A64FXf \ + result_20 = svcadd_x(pg1, zero0, UChi_10, 90); \ + result_21 = svcadd_x(pg1, zero0, UChi_11, 90); \ + result_22 = svcadd_x(pg1, zero0, UChi_12, 90); \ + result_30 = svcadd_x(pg1, zero0, UChi_00, 90); \ + result_31 = svcadd_x(pg1, zero0, UChi_01, 90); \ + result_32 = svcadd_x(pg1, zero0, UChi_02, 90); \ + result_00 = UChi_00; \ + result_01 = UChi_01; \ + result_02 = UChi_02; \ + result_10 = UChi_10; \ + result_11 = UChi_11; \ + result_12 = UChi_12; + +// YM_PROJ +#define YM_PROJ_A64FXf \ +{ \ + Chi_00 = svadd_x(pg1, Chimu_00, Chimu_30); \ + Chi_01 = svadd_x(pg1, Chimu_01, Chimu_31); \ + Chi_02 = svadd_x(pg1, Chimu_02, Chimu_32); \ + Chi_10 = svsub_x(pg1, Chimu_10, Chimu_20); \ + Chi_11 = svsub_x(pg1, Chimu_11, Chimu_21); \ + Chi_12 = svsub_x(pg1, Chimu_12, Chimu_22); \ +} +// ZM_PROJ +#define ZM_PROJ_A64FXf \ +{ \ + Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_20, 270); \ + Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_21, 270); \ + Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_22, 270); \ + Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_30, 90); \ + Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_31, 90); \ + Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_32, 90); \ +} +// TM_PROJ +#define TM_PROJ_A64FXf \ +{ \ + Chi_00 = svsub_x(pg1, Chimu_00, Chimu_20); \ + Chi_01 = svsub_x(pg1, Chimu_01, Chimu_21); \ + Chi_02 = svsub_x(pg1, Chimu_02, Chimu_22); \ + Chi_10 = svsub_x(pg1, Chimu_10, Chimu_30); \ + Chi_11 = 
svsub_x(pg1, Chimu_11, Chimu_31); \ + Chi_12 = svsub_x(pg1, Chimu_12, Chimu_32); \ +} +// XM_RECON_ACCUM +#define XM_RECON_ACCUM_A64FXf \ + result_30 = svcadd_x(pg1, result_30, UChi_00, 90); \ + result_31 = svcadd_x(pg1, result_31, UChi_01, 90); \ + result_32 = svcadd_x(pg1, result_32, UChi_02, 90); \ + result_20 = svcadd_x(pg1, result_20, UChi_10, 90); \ + result_21 = svcadd_x(pg1, result_21, UChi_11, 90); \ + result_22 = svcadd_x(pg1, result_22, UChi_12, 90); \ + result_00 = svadd_x(pg1, result_00, UChi_00); \ + result_01 = svadd_x(pg1, result_01, UChi_01); \ + result_02 = svadd_x(pg1, result_02, UChi_02); \ + result_10 = svadd_x(pg1, result_10, UChi_10); \ + result_11 = svadd_x(pg1, result_11, UChi_11); \ + result_12 = svadd_x(pg1, result_12, UChi_12); + +// YP_RECON_ACCUM +#define YP_RECON_ACCUM_A64FXf \ + result_00 = svadd_x(pg1, result_00, UChi_00); \ + result_30 = svsub_x(pg1, result_30, UChi_00); \ + result_01 = svadd_x(pg1, result_01, UChi_01); \ + result_31 = svsub_x(pg1, result_31, UChi_01); \ + result_02 = svadd_x(pg1, result_02, UChi_02); \ + result_32 = svsub_x(pg1, result_32, UChi_02); \ + result_10 = svadd_x(pg1, result_10, UChi_10); \ + result_20 = svadd_x(pg1, result_20, UChi_10); \ + result_11 = svadd_x(pg1, result_11, UChi_11); \ + result_21 = svadd_x(pg1, result_21, UChi_11); \ + result_12 = svadd_x(pg1, result_12, UChi_12); \ + result_22 = svadd_x(pg1, result_22, UChi_12); + +// YM_RECON_ACCUM +#define YM_RECON_ACCUM_A64FXf \ + result_00 = svadd_x(pg1, result_00, UChi_00); \ + result_30 = svadd_x(pg1, result_30, UChi_00); \ + result_01 = svadd_x(pg1, result_01, UChi_01); \ + result_31 = svadd_x(pg1, result_31, UChi_01); \ + result_02 = svadd_x(pg1, result_02, UChi_02); \ + result_32 = svadd_x(pg1, result_32, UChi_02); \ + result_10 = svadd_x(pg1, result_10, UChi_10); \ + result_20 = svsub_x(pg1, result_20, UChi_10); \ + result_11 = svadd_x(pg1, result_11, UChi_11); \ + result_21 = svsub_x(pg1, result_21, UChi_11); \ + result_12 = svadd_x(pg1, result_12, UChi_12); \ + result_22 = svsub_x(pg1, result_22, UChi_12); + +// ZP_RECON_ACCUM +#define ZP_RECON_ACCUM_A64FXf \ + result_20 = svcadd_x(pg1, result_20, UChi_00, 270); \ + result_00 = svadd_x(pg1, result_00, UChi_00); \ + result_21 = svcadd_x(pg1, result_21, UChi_01, 270); \ + result_01 = svadd_x(pg1, result_01, UChi_01); \ + result_22 = svcadd_x(pg1, result_22, UChi_02, 270); \ + result_02 = svadd_x(pg1, result_02, UChi_02); \ + result_30 = svcadd_x(pg1, result_30, UChi_10, 90); \ + result_10 = svadd_x(pg1, result_10, UChi_10); \ + result_31 = svcadd_x(pg1, result_31, UChi_11, 90); \ + result_11 = svadd_x(pg1, result_11, UChi_11); \ + result_32 = svcadd_x(pg1, result_32, UChi_12, 90); \ + result_12 = svadd_x(pg1, result_12, UChi_12); + +// ZM_RECON_ACCUM +#define ZM_RECON_ACCUM_A64FXf \ + result_20 = svcadd_x(pg1, result_20, UChi_00, 90); \ + result_00 = svadd_x(pg1, result_00, UChi_00); \ + result_21 = svcadd_x(pg1, result_21, UChi_01, 90); \ + result_01 = svadd_x(pg1, result_01, UChi_01); \ + result_22 = svcadd_x(pg1, result_22, UChi_02, 90); \ + result_02 = svadd_x(pg1, result_02, UChi_02); \ + result_30 = svcadd_x(pg1, result_30, UChi_10, 270); \ + result_10 = svadd_x(pg1, result_10, UChi_10); \ + result_31 = svcadd_x(pg1, result_31, UChi_11, 270); \ + result_11 = svadd_x(pg1, result_11, UChi_11); \ + result_32 = svcadd_x(pg1, result_32, UChi_12, 270); \ + result_12 = svadd_x(pg1, result_12, UChi_12); + +// TP_RECON_ACCUM +#define TP_RECON_ACCUM_A64FXf \ + result_00 = svadd_x(pg1, result_00, UChi_00); \ + 
result_20 = svadd_x(pg1, result_20, UChi_00); \ + result_01 = svadd_x(pg1, result_01, UChi_01); \ + result_21 = svadd_x(pg1, result_21, UChi_01); \ + result_02 = svadd_x(pg1, result_02, UChi_02); \ + result_22 = svadd_x(pg1, result_22, UChi_02); \ + result_10 = svadd_x(pg1, result_10, UChi_10); \ + result_30 = svadd_x(pg1, result_30, UChi_10); \ + result_11 = svadd_x(pg1, result_11, UChi_11); \ + result_31 = svadd_x(pg1, result_31, UChi_11); \ + result_12 = svadd_x(pg1, result_12, UChi_12); \ + result_32 = svadd_x(pg1, result_32, UChi_12); + +// TM_RECON_ACCUM +#define TM_RECON_ACCUM_A64FXf \ + result_00 = svadd_x(pg1, result_00, UChi_00); \ + result_20 = svsub_x(pg1, result_20, UChi_00); \ + result_01 = svadd_x(pg1, result_01, UChi_01); \ + result_21 = svsub_x(pg1, result_21, UChi_01); \ + result_02 = svadd_x(pg1, result_02, UChi_02); \ + result_22 = svsub_x(pg1, result_22, UChi_02); \ + result_10 = svadd_x(pg1, result_10, UChi_10); \ + result_30 = svsub_x(pg1, result_30, UChi_10); \ + result_11 = svadd_x(pg1, result_11, UChi_11); \ + result_31 = svsub_x(pg1, result_31, UChi_11); \ + result_12 = svadd_x(pg1, result_12, UChi_12); \ + result_32 = svsub_x(pg1, result_32, UChi_12); + +// ZERO_PSI +#define ZERO_PSI_A64FXf \ + result_00 = svdup_f32(0.); \ + result_01 = svdup_f32(0.); \ + result_02 = svdup_f32(0.); \ + result_10 = svdup_f32(0.); \ + result_11 = svdup_f32(0.); \ + result_12 = svdup_f32(0.); \ + result_20 = svdup_f32(0.); \ + result_21 = svdup_f32(0.); \ + result_22 = svdup_f32(0.); \ + result_30 = svdup_f32(0.); \ + result_31 = svdup_f32(0.); \ + result_32 = svdup_f32(0.); + +// PREFETCH_RESULT_L2_STORE (prefetch store to L2) +#define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXf(base) \ +{ \ + svprfd(pg1, (int64_t*)(base + 0), SV_PSTL2STRM); \ + svprfd(pg1, (int64_t*)(base + 256), SV_PSTL2STRM); \ + svprfd(pg1, (int64_t*)(base + 512), SV_PSTL2STRM); \ +} +// PREFETCH_RESULT_L1_STORE (prefetch store to L1) +#define PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXf(base) \ +{ \ + svprfd(pg1, (int64_t*)(base + 0), SV_PSTL1STRM); \ + svprfd(pg1, (int64_t*)(base + 256), SV_PSTL1STRM); \ + svprfd(pg1, (int64_t*)(base + 512), SV_PSTL1STRM); \ +} +// ADD_RESULT_INTERNAL +#define ADD_RESULT_INTERNAL_A64FXf \ + result_00 = svadd_x(pg1, result_00, Chimu_00); \ + result_01 = svadd_x(pg1, result_01, Chimu_01); \ + result_02 = svadd_x(pg1, result_02, Chimu_02); \ + result_10 = svadd_x(pg1, result_10, Chimu_10); \ + result_11 = svadd_x(pg1, result_11, Chimu_11); \ + result_12 = svadd_x(pg1, result_12, Chimu_12); \ + result_20 = svadd_x(pg1, result_20, Chimu_20); \ + result_21 = svadd_x(pg1, result_21, Chimu_21); \ + result_22 = svadd_x(pg1, result_22, Chimu_22); \ + result_30 = svadd_x(pg1, result_30, Chimu_30); \ + result_31 = svadd_x(pg1, result_31, Chimu_31); \ + result_32 = svadd_x(pg1, result_32, Chimu_32); + diff --git a/Grid/simd/Fujitsu_A64FX_undef.h b/Grid/simd/Fujitsu_A64FX_undef.h new file mode 100644 index 00000000..81eec37a --- /dev/null +++ b/Grid/simd/Fujitsu_A64FX_undef.h @@ -0,0 +1,76 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: Fujitsu_A64FX_undef.h + + Copyright (C) 2020 + +Author: Nils Meyer + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ + +#undef LOAD_CHIMU +#undef PREFETCH_CHIMU_L1 +#undef PREFETCH_GAUGE_L1 +#undef PREFETCH_CHIMU_L2 +#undef PREFETCH_GAUGE_L2 +#undef PREFETCH_GAUGE_L1_INTERNAL +#undef PREFETCH1_CHIMU +#undef PREFETCH_CHIMU +#undef PREFETCH_RESULT_L2_STORE +#undef PREFETCH_RESULT_L1_STORE +#undef LOAD_GAUGE +#undef LOCK_GAUGE +#undef UNLOCK_GAUGE +#undef MASK_REGS +#undef SAVE_RESULT +#undef ADD_RESULT +#undef MULT_2SPIN_1 +#undef MULT_2SPIN_2 +#undef MAYBEPERM +#undef LOAD_CHI +#undef XP_PROJ +#undef YP_PROJ +#undef ZP_PROJ +#undef TP_PROJ +#undef XM_PROJ +#undef YM_PROJ +#undef ZM_PROJ +#undef TM_PROJ +#undef XP_RECON +#undef XM_RECON +#undef XM_RECON_ACCUM +#undef YM_RECON_ACCUM +#undef ZM_RECON_ACCUM +#undef TM_RECON_ACCUM +#undef XP_RECON_ACCUM +#undef YP_RECON_ACCUM +#undef ZP_RECON_ACCUM +#undef TP_RECON_ACCUM +#undef PERMUTE +#undef PERMUTE_DIR0 +#undef PERMUTE_DIR1 +#undef PERMUTE_DIR2 +#undef PERMUTE_DIR3 +#undef LOAD_TABLE +#undef LOAD_TABLE0 +#undef LOAD_TABLE1 +#undef LOAD_TABLE2 +#undef LOAD_TABLE3 diff --git a/Grid/simd/Grid_a64fx-2.h b/Grid/simd/Grid_a64fx-2.h new file mode 100644 index 00000000..2ad8591c --- /dev/null +++ b/Grid/simd/Grid_a64fx-2.h @@ -0,0 +1,942 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: Grid_a64fx-2.h + + Copyright (C) 2020 + + Author: Nils Meyer + + with support from Arm + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ + +///////////////////////////////////////////////////// +// Using SVE ACLE +///////////////////////////////////////////////////// + +static_assert(GEN_SIMD_WIDTH % 64u == 0, "A64FX SIMD vector size is 64 bytes"); + +NAMESPACE_BEGIN(Grid); +NAMESPACE_BEGIN(Optimization); + + // type traits giving the number of elements for each vector type + template struct W; + template <> struct W { + constexpr static unsigned int c = GEN_SIMD_WIDTH/16u; + constexpr static unsigned int r = GEN_SIMD_WIDTH/8u; + }; + template <> struct W { + constexpr static unsigned int c = GEN_SIMD_WIDTH/8u; + constexpr static unsigned int r = GEN_SIMD_WIDTH/4u; + }; + template <> struct W { + constexpr static unsigned int r = GEN_SIMD_WIDTH/4u; + }; + template <> struct W { + constexpr static unsigned int c = GEN_SIMD_WIDTH/4u; + constexpr static unsigned int r = GEN_SIMD_WIDTH/2u; + }; + template <> struct W { + constexpr static unsigned int c = GEN_SIMD_WIDTH/16u; + constexpr static unsigned int r = GEN_SIMD_WIDTH/8u; + }; + + #ifdef ARMCLANGCOMPAT + // SIMD vector immediate types + template + struct vec_imm { + alignas(GEN_SIMD_WIDTH) T v[W::r]; + }; + + // SIMD vector types + template + struct vec { + alignas(GEN_SIMD_WIDTH) T v[W::r]; + vec() = default; + vec(const vec &rhs) { this->operator=(rhs); } + vec(const vec_imm &rhs) { + // v = rhs.v + svst1(svptrue_b8(), (T*)this, svld1(svptrue_b8(), (T*)rhs.v)); + } + + inline vec &operator=(const vec &rhs) { + // v = rhs.v + svst1(svptrue_b8(), (T*)this, svld1(svptrue_b8(), (T*)rhs.v)); + return *this; + }; + }; + + #else // no ARMCLANGCOMPAT + #define vec_imm vec + // SIMD vector types + template + struct vec { + alignas(GEN_SIMD_WIDTH) T v[W::r]; + }; + #endif + + typedef vec vecf; + typedef vec vecd; + typedef vec vech; // half precision comms + typedef vec veci; + +NAMESPACE_END(Optimization) +NAMESPACE_END(Grid) + +// low-level API +NAMESPACE_BEGIN(Grid); +NAMESPACE_BEGIN(Optimization); + +template +struct acle{}; + +template <> +struct acle{ + typedef svfloat64_t vt; + typedef svfloat64x2_t vt2; + typedef svfloat64x4_t vt4; + typedef float64_t pt; + typedef uint64_t uint; + typedef svuint64_t svuint; + + static inline svbool_t pg1(){return svptrue_b64();} + static inline svbool_t pg2(){return svptrue_pat_b64(SV_VL4);} + static inline svbool_t pg4(){return svptrue_pat_b64(SV_VL2);} + static inline vec tbl_swap(){ + //const vec t = {1, 0, 3, 2, 5, 4, 7, 6}; + const vec_imm t = {1, 0, 3, 2, 5, 4, 7, 6}; + return t; + } + static inline vec tbl0(){ + //const vec t = {4, 5, 6, 7, 0, 1, 2, 3}; + const vec_imm t = {4, 5, 6, 7, 0, 1, 2, 3}; + return t; + } + static inline vec tbl1(){ + //const vec t = {2, 3, 0, 1, 6, 7, 4, 5}; + const vec_imm t = {2, 3, 0, 1, 6, 7, 4, 5}; + return t; + } + static inline vec tbl_exch1a(){ // Exchange1 + //const vec t = {0, 1, 4, 5, 2, 3, 6, 7}; + const vec_imm t = {0, 1, 4, 5, 2, 3, 6, 7}; + return t; + } + static inline vec tbl_exch1b(){ // Exchange1 + //const vec t = {2, 3, 6, 7, 0, 1, 4, 5}; + const vec_imm t = {2, 3, 6, 7, 0, 1, 4, 5}; + return t; + } + static inline vec tbl_exch1c(){ // Exchange1 + //const vec t = {4, 5, 0, 1, 6, 7, 2, 3}; + const vec_imm t = {4, 5, 0, 1, 6, 7, 2, 3}; + return t; + } + static inline svbool_t pg_even(){return svzip1_b64(svptrue_b64(), svpfalse_b());} + static inline svbool_t pg_odd() {return 
svzip1_b64(svpfalse_b(), svptrue_b64());} + static inline svfloat64_t zero(){return svdup_f64(0.);} +}; + +template <> +struct acle{ + typedef svfloat32_t vt; + typedef svfloat32x2_t vt2; + typedef float32_t pt; + typedef uint32_t uint; + typedef svuint32_t svuint; + + static inline svbool_t pg1(){return svptrue_b32();} + static inline svbool_t pg2(){return svptrue_pat_b32(SV_VL8);} + // exchange neighboring elements + static inline vec tbl_swap(){ + //const vec t = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; + const vec_imm t = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; + return t; + } + static inline vec tbl0(){ + //const vec t = {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}; + const vec_imm t = {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}; + return t; + } + static inline vec tbl1(){ + //const vec t = {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}; + const vec_imm t = {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}; + return t; + } + static inline vec tbl2(){ + //const vec t = {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; + const vec_imm t = {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; + return t; + } + static inline vec tbl_exch1a(){ // Exchange1 + //const vec t = {0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 }; + const vec_imm t = {0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 }; + return t; + } + static inline vec tbl_exch1b(){ // Exchange1 + //const vec t = {4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11 }; + const vec_imm t = {4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11 }; + return t; + } + static inline vec tbl_exch1c(){ // Exchange1 + //const vec t = {8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7}; + const vec_imm t = {8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7}; + return t; + } + static inline svbool_t pg_even(){return svzip1_b32(svptrue_b32(), svpfalse_b());} + static inline svbool_t pg_odd() {return svzip1_b32(svpfalse_b(), svptrue_b32());} + static inline svfloat32_t zero(){return svdup_f32(0.);} +}; + +template <> +struct acle{ + typedef svfloat16_t vt; + typedef float16_t pt; + typedef uint16_t uint; + typedef svuint16_t svuint; + + static inline svbool_t pg1(){return svptrue_b16();} + static inline svbool_t pg2(){return svptrue_pat_b16(SV_VL16);} + static inline svbool_t pg_even(){return svzip1_b16(svptrue_b16(), svpfalse_b());} + static inline svbool_t pg_odd() {return svzip1_b16(svpfalse_b(), svptrue_b16());} + static inline svfloat16_t zero(){return svdup_f16(0.);} +}; + +template <> +struct acle{ + typedef svuint32_t vt; + typedef svuint32x2_t vt2; + typedef Integer pt; + typedef uint32_t uint; + typedef svuint32_t svuint; + + //static inline svbool_t pg1(){return svptrue_b16();} + static inline svbool_t pg1(){return svptrue_b32();} + static inline svbool_t pg2(){return svptrue_pat_b32(SV_VL8);} + static inline svbool_t pg_even(){return svzip1_b32(svptrue_b32(), svpfalse_b());} + static inline svbool_t pg_odd() {return svzip1_b32(svpfalse_b(), svptrue_b32());} +}; + +// --------------------------------------------------- + +struct Vsplat{ + // Complex float + inline vecf operator()(float a, float b){ + vecf out; + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svdup_f32(a); + typename acle::vt b_v = svdup_f32(b); + typename acle::vt r_v = svzip1(a_v, b_v); + svst1(pg1, out.v, r_v); + return out; + } + + // Real float + inline vecf operator()(float a){ + vecf out; + svbool_t pg1 = acle::pg1(); + typename acle::vt r_v = svdup_f32(a); + 
svst1(pg1, out.v, r_v); + return out; + } + + // Complex double + inline vecd operator()(double a, double b){ + vecd out; + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svdup_f64(a); + typename acle::vt b_v = svdup_f64(b); + typename acle::vt r_v = svzip1(a_v, b_v); + svst1(pg1, out.v, r_v); + return out; + } + + // Real double + inline vecd operator()(double a){ + vecd out; + svbool_t pg1 = acle::pg1(); + typename acle::vt r_v = svdup_f64(a); + svst1(pg1, out.v, r_v); + return out; + } + + // Integer + inline vec operator()(Integer a){ + vec out; + svbool_t pg1 = acle::pg1(); + // Add check whether Integer is really a uint32_t??? + typename acle::vt r_v = svdup_u32(a); + svst1(pg1, out.v, r_v); + return out; + } +}; + +struct Vstore{ + // Real + template + inline void operator()(vec a, T *D){ + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, (typename acle::pt*)&a.v); + svst1(pg1, D, a_v); + } +}; + +struct Vstream{ + // Real + template + inline void operator()(T * a, vec b){ + svbool_t pg1 = acle::pg1(); + typename acle::vt b_v = svld1(pg1, b.v); + svstnt1(pg1, a, b_v); + //svst1(pg1, a, b_v); + } +}; + + struct Vset{ + // Complex + template + inline vec operator()(std::complex *a){ + vec out; + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, (T*)a); + svst1(pg1, out.v, a_v); + + return out; + } + + // Real + template + inline vec operator()(T *a){ + vec out; + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, a); + svst1(pg1, out.v, a_v); + + return out; + } + }; + +///////////////////////////////////////////////////// +// Arithmetic operations +///////////////////////////////////////////////////// + +struct Sum{ + template + inline vec operator()(vec a, vec b){ + vec out; + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, a.v); + typename acle::vt b_v = svld1(pg1, b.v); + typename acle::vt r_v = svadd_x(pg1, a_v, b_v); + svst1(pg1, out.v, r_v); + + return out; + } +}; + +struct Sub{ + template + inline vec operator()(vec a, vec b){ + vec out; + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, a.v); + typename acle::vt b_v = svld1(pg1, b.v); + typename acle::vt r_v = svsub_x(pg1, a_v, b_v); + svst1(pg1, out.v, r_v); + + return out; + } +}; + +struct Mult{ + template + inline vec operator()(vec a, vec b, vec c){ + vec out; + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, a.v); + typename acle::vt b_v = svld1(pg1, b.v); + typename acle::vt c_v = svld1(pg1, c.v); + typename acle::vt r_v = svmla_x(pg1, c_v, a_v, b_v); + svst1(pg1, out.v, r_v); + + return out; + } + template + inline vec operator()(vec a, vec b){ + vec out; + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, a.v); + typename acle::vt b_v = svld1(pg1, b.v); + typename acle::vt r_v = svmul_x(pg1, a_v, b_v); + svst1(pg1, out.v, r_v); + + return out; + } +}; + +struct MultRealPart{ + template + inline vec operator()(vec a, vec b){ + vec out; + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, a.v); + typename acle::vt b_v = svld1(pg1, b.v); + + // using FCMLA + typename acle::vt z_v = acle::zero(); + typename acle::vt r_v = svcmla_x(pg1, z_v, a_v, b_v, 0); + + svst1(pg1, out.v, r_v); + + return out; + } +}; + +struct MaddRealPart{ + template + inline vec operator()(vec a, vec b, vec c){ + vec out; + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, a.v); + typename acle::vt b_v = svld1(pg1, b.v); + typename acle::vt c_v = svld1(pg1, c.v); + + // using FCMLA + typename acle::vt 
r_v = svcmla_x(pg1, c_v, a_v, b_v, 0); + + svst1(pg1, out.v, r_v); + + return out; + } +}; + +struct MultComplex{ + // Complex a*b + template + inline vec operator()(vec a, vec b){ + vec out; + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, a.v); + typename acle::vt b_v = svld1(pg1, b.v); + typename acle::vt z_v = acle::zero(); + + // using FCMLA + typename acle::vt r_v = svcmla_x(pg1, z_v, a_v, b_v, 0); + r_v = svcmla_x(pg1, r_v, a_v, b_v, 90); + + svst1(pg1, out.v, r_v); + + return out; + } +}; + +struct MultAddComplex{ + // Complex a*b+c + template + inline vec operator()(vec a, vec b, vec c){ + vec out; + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, a.v); + typename acle::vt b_v = svld1(pg1, b.v); + typename acle::vt c_v = svld1(pg1, c.v);; + + // using FCMLA + typename acle::vt r_v = svcmla_x(pg1, c_v, a_v, b_v, 0); + r_v = svcmla_x(pg1, r_v, a_v, b_v, 90); + svst1(pg1, out.v, r_v); + + return out; + } +}; + +struct Div{ + // Real + template + inline vec operator()(vec a, vec b){ + vec out; + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, a.v); + typename acle::vt b_v = svld1(pg1, b.v); + typename acle::vt r_v = svdiv_x(pg1, a_v, b_v); + svst1(pg1, out.v, r_v); + + return out; + } +}; + +struct Conj{ + // Complex + template + inline vec operator()(vec a){ + vec out; + svbool_t pg1 = acle::pg1(); + svbool_t pg_odd = acle::pg_odd(); + typename acle::vt a_v = svld1(pg1, a.v); + //typename acle::vt r_v = svneg_x(pg_odd, a_v); + typename acle::vt r_v = svneg_m(a_v, pg_odd, a_v); + svst1(pg1, out.v, r_v); + + return out; + } +}; + +struct TimesMinusI{ + // Complex + template + inline vec operator()(vec a, vec b){ + vec out; + const vec::uint> tbl_swap = acle::tbl_swap(); + svbool_t pg1 = acle::pg1(); + svbool_t pg_odd = acle::pg_odd(); + + typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); + typename acle::vt a_v = svld1(pg1, a.v); + a_v = svtbl(a_v, tbl_swap_v); + typename acle::vt r_v = svneg_m(a_v, pg_odd, a_v); + svst1(pg1, out.v, r_v); + + return out; + } +}; + +struct TimesI{ + // Complex + template + inline vec operator()(vec a, vec b){ + vec out; + const vec::uint> tbl_swap = acle::tbl_swap(); + svbool_t pg1 = acle::pg1(); + svbool_t pg_even = acle::pg_even(); + + typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); + typename acle::vt a_v = svld1(pg1, a.v); + a_v = svtbl(a_v, tbl_swap_v); + //typename acle::vt r_v = svneg_x(pg_even, a_v); + typename acle::vt r_v = svneg_m(a_v, pg_even, a_v); + svst1(pg1, out.v, r_v); + + return out; + } +}; + +struct PrecisionChange { + static inline vech StoH (const vecf &sa,const vecf &sb) { + vech ret; + svbool_t pg1s = acle::pg1(); + svbool_t pg1h = acle::pg1(); + typename acle::vt sa_v = svld1(pg1s, sa.v); + typename acle::vt sb_v = svld1(pg1s, sb.v); + typename acle::vt ha_v = svcvt_f16_x(pg1s, sa_v); + typename acle::vt hb_v = svcvt_f16_x(pg1s, sb_v); + typename acle::vt r_v = svuzp1(ha_v, hb_v); + svst1(pg1h, (typename acle::pt*)&ret.v, r_v); + + return ret; + } + static inline void HtoS(vech h,vecf &sa,vecf &sb) { + svbool_t pg1h = acle::pg1(); + svbool_t pg1s = acle::pg1(); + typename acle::vt h_v = svld1(pg1h, (typename acle::pt*)&h.v); + typename acle::vt ha_v = svzip1(h_v, h_v); + typename acle::vt hb_v = svzip2(h_v, h_v); + typename acle::vt sa_v = svcvt_f32_x(pg1s, ha_v); + typename acle::vt sb_v = svcvt_f32_x(pg1s, hb_v); + svst1(pg1s, sa.v, sa_v); + svst1(pg1s, sb.v, sb_v); + } + static inline vecf DtoS (vecd a,vecd b) { + vecf ret; + svbool_t pg1d = acle::pg1(); 
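+ // Conversion scheme: each svcvt_f32_x below narrows its doubles into the
+ // even 32-bit lanes of the corresponding 64-bit containers; svuzp1 then
+ // gathers the even lanes of both vectors, so the low half of the result
+ // holds the narrowed elements of a and the high half those of b.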
+ svbool_t pg1s = acle::pg1(); + typename acle::vt a_v = svld1(pg1d, a.v); + typename acle::vt b_v = svld1(pg1d, b.v); + typename acle::vt sa_v = svcvt_f32_x(pg1d, a_v); + typename acle::vt sb_v = svcvt_f32_x(pg1d, b_v); + typename acle::vt r_v = svuzp1(sa_v, sb_v); + svst1(pg1s, ret.v, r_v); + + return ret; + } + static inline void StoD (vecf s,vecd &a,vecd &b) { + svbool_t pg1s = acle::pg1(); + svbool_t pg1d = acle::pg1(); + typename acle::vt s_v = svld1(pg1s, s.v); + typename acle::vt sa_v = svzip1(s_v, s_v); + typename acle::vt sb_v = svzip2(s_v, s_v); + typename acle::vt a_v = svcvt_f64_x(pg1d, sa_v); + typename acle::vt b_v = svcvt_f64_x(pg1d, sb_v); + svst1(pg1d, a.v, a_v); + svst1(pg1d, b.v, b_v); + } + static inline vech DtoH (vecd a,vecd b,vecd c,vecd d) { + vech ret; + svbool_t pg1d = acle::pg1(); + svbool_t pg1h = acle::pg1(); + typename acle::vt a_v = svld1(pg1d, a.v); + typename acle::vt b_v = svld1(pg1d, b.v); + typename acle::vt c_v = svld1(pg1d, c.v); + typename acle::vt d_v = svld1(pg1d, d.v); + typename acle::vt ha_v = svcvt_f16_x(pg1d, a_v); + typename acle::vt hb_v = svcvt_f16_x(pg1d, b_v); + typename acle::vt hc_v = svcvt_f16_x(pg1d, c_v); + typename acle::vt hd_v = svcvt_f16_x(pg1d, d_v); + typename acle::vt hab_v = svuzp1(ha_v, hb_v); + typename acle::vt hcd_v = svuzp1(hc_v, hd_v); + typename acle::vt r_v = svuzp1(hab_v, hcd_v); + svst1(pg1h, (typename acle::pt*)&ret.v, r_v); + + return ret; +/* + vecf sa,sb; + sa = DtoS(a,b); + sb = DtoS(c,d); + return StoH(sa,sb); +*/ + } + static inline void HtoD(vech h,vecd &a,vecd &b,vecd &c,vecd &d) { + svbool_t pg1h = acle::pg1(); + svbool_t pg1d = acle::pg1(); + typename acle::vt h_v = svld1(pg1h, (typename acle::pt*)&h.v); + typename acle::vt sa_v = svzip1(h_v, h_v); + typename acle::vt sb_v = svzip2(h_v, h_v); + typename acle::vt da_v = svzip1(sa_v, sa_v); + typename acle::vt db_v = svzip2(sa_v, sa_v); + typename acle::vt dc_v = svzip1(sb_v, sb_v); + typename acle::vt dd_v = svzip2(sb_v, sb_v); + typename acle::vt a_v = svcvt_f64_x(pg1d, da_v); + typename acle::vt b_v = svcvt_f64_x(pg1d, db_v); + typename acle::vt c_v = svcvt_f64_x(pg1d, dc_v); + typename acle::vt d_v = svcvt_f64_x(pg1d, dd_v); + svst1(pg1d, a.v, a_v); + svst1(pg1d, b.v, b_v); + svst1(pg1d, c.v, c_v); + svst1(pg1d, d.v, d_v); +/* + vecf sa,sb; + HtoS(h,sa,sb); + StoD(sa,a,b); + StoD(sb,c,d); +*/ + } +}; + +struct Exchange{ + + // Exchange0 is valid for arbitrary SVE vector length + template + static inline void Exchange0(vec &out1, vec &out2, const vec &in1, const vec &in2){ + svbool_t pg1 = acle::pg1(); + typename acle::vt a1_v = svld1(pg1, in1.v); + typename acle::vt a2_v = svld1(pg1, in2.v); + typename acle::vt r1_v = svext(a1_v, a1_v, (uint64_t)W::c); + r1_v = svext(r1_v, a2_v, (uint64_t)W::c); + typename acle::vt r2_v = svext(a2_v, a2_v, (uint64_t)W::c); + r2_v = svext(a1_v, r2_v, (uint64_t)W::c); + svst1(pg1, out1.v, r1_v); + svst1(pg1, out2.v, r2_v); + } + + template + static inline void Exchange1(vec &out1, vec &out2, const vec &in1, const vec &in2){ + // this one is tricky; svtrn2q* from SVE2 fits best, but it is not available in SVE1 + // alternative: use 4-el structure; expect translation into ldp + stp -> SFI + svbool_t pg1 = acle::pg1(); + const vec::uint> tbl_exch1a = acle::tbl_exch1a(); + const vec::uint> tbl_exch1b = acle::tbl_exch1b(); + const vec::uint> tbl_exch1c = acle::tbl_exch1c(); + + typename acle::svuint tbl_exch1a_v = svld1(pg1, tbl_exch1a.v); + typename acle::svuint tbl_exch1b_v = svld1(pg1, tbl_exch1b.v); + typename acle::svuint 
tbl_exch1c_v = svld1(pg1, tbl_exch1c.v); + + typename acle::vt in1_v = svld1(pg1, in1.v); + typename acle::vt in2_v = svld1(pg1, in2.v); + + typename acle::vt a1_v = svtbl(in1_v, tbl_exch1a_v); + typename acle::vt a2_v = svtbl(in2_v, tbl_exch1b_v); + typename acle::vt b1_v = svext(a2_v, a1_v, (uint64_t)(W::r / 2u)); + typename acle::vt b2_v = svext(a1_v, a2_v, (uint64_t)(W::r / 2u)); + typename acle::vt out1_v = svtbl(b1_v, tbl_exch1c_v); + typename acle::vt out2_v = svtbl(b2_v, tbl_exch1a_v); + + svst1(pg1, out1.v, out1_v); + svst1(pg1, out2.v, out2_v); + } + + template + static inline void Exchange2(vec &out1, vec &out2, const vec &in1, const vec &in2){ + svbool_t pg1 = acle::pg1(); + typename acle::vt a1_v = svld1(pg1, (typename acle::pt*)in1.v); + typename acle::vt a2_v = svld1(pg1, (typename acle::pt*)in2.v); + typename acle::vt r1_v = svtrn1(a1_v, a2_v); + typename acle::vt r2_v = svtrn2(a1_v, a2_v); + svst1(pg1, (typename acle::pt*)out1.v, r1_v); + svst1(pg1, (typename acle::pt*)out2.v, r2_v); + } + + static inline void Exchange3(vecf &out1, vecf &out2, const vecf &in1, const vecf &in2){ + svbool_t pg1 = acle::pg1(); + typename acle::vt a1_v = svld1(pg1, in1.v); + typename acle::vt a2_v = svld1(pg1, in2.v); + typename acle::vt r1_v = svtrn1(a1_v, a2_v); + typename acle::vt r2_v = svtrn2(a1_v, a2_v); + svst1(pg1, out1.v, r1_v); + svst1(pg1, out2.v, r2_v); + } + + static inline void Exchange3(vecd &out1, vecd &out2, const vecd &in1, const vecd &in2){ + assert(0); + return; + } +}; + +struct Permute{ + + // Permute0 is valid for any SVE vector width + template + static inline vec Permute0(vec in) { + vec out; + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, in.v); + typename acle::vt r_v = svext(a_v, a_v, (uint64_t)(W::r / 2u)); + svst1(pg1, out.v, r_v); + + return out; + } + + static inline vecd Permute1(vecd in) { + vecd out; + const vec::uint> tbl_swap = acle::tbl1(); + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, in.v); + typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); + typename acle::vt r_v = svtbl(a_v, tbl_swap_v); + svst1(pg1, out.v, r_v); + + return out; + } + + static inline vecf Permute1(vecf in) { + vecf out; + const vec::uint> tbl_swap = acle::tbl1(); + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, in.v); + typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); + typename acle::vt r_v = svtbl(a_v, tbl_swap_v); + svst1(pg1, out.v, r_v); + + return out; + } + + static inline vecd Permute2(vecd in) { + vecd out; + const vec::uint> tbl_swap = acle::tbl_swap(); + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, in.v); + typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); + typename acle::vt r_v = svtbl(a_v, tbl_swap_v); + svst1(pg1, out.v, r_v); + + return out; + } + + static inline vecf Permute2(vecf in) { + vecf out; + const vec::uint> tbl_swap = acle::tbl2(); + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, in.v); + typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); + typename acle::vt r_v = svtbl(a_v, tbl_swap_v); + svst1(pg1, out.v, r_v); + + return out; + } + + static inline vecf Permute3(vecf in) { + vecf out; + const vec::uint> tbl_swap = acle::tbl_swap(); + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, in.v); + typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); + typename acle::vt r_v = svtbl(a_v, tbl_swap_v); + svst1(pg1, out.v, r_v); + + return out; + } + + static inline vecd Permute3(vecd in) { + return in; + } + +}; + 
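+// --------------------------------------------------------------------------
+// Illustrative scalar reference for the double-precision Permute levels above
+// (editorial sketch, not used anywhere in Grid; the name permute_ref_sketch
+// is invented here). It assumes GEN_SIMD_WIDTH of 64 bytes, i.e. a 512-bit
+// SVE vector with W<double>::r == 8 lanes: Permute0 rotates by half the
+// vector (svext by 4), Permute1 applies tbl1, Permute2 applies tbl_swap
+// (neighbouring pairs), and Permute3 is the identity for double.
+inline void permute_ref_sketch(const double (&in)[8], double (&out)[8], int level)
+{
+  static const unsigned tbl[4][8] = {
+    {4, 5, 6, 7, 0, 1, 2, 3},  // Permute0: swap the two 256-bit halves
+    {2, 3, 0, 1, 6, 7, 4, 5},  // Permute1: acle<double>::tbl1
+    {1, 0, 3, 2, 5, 4, 7, 6},  // Permute2: acle<double>::tbl_swap
+    {0, 1, 2, 3, 4, 5, 6, 7},  // Permute3: identity for double
+  };
+  for (int i = 0; i < 8; i++) out[i] = in[tbl[level][i]];
+}
+// e.g. permute_ref_sketch(x, y, 2) mirrors Permute2(vecd) on one 8-lane vector.
+// --------------------------------------------------------------------------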
+struct Rotate{ + + template static inline vec tRotate(vec in){ + vec out; + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, in.v); + typename acle::vt r_v = svext(a_v, a_v, (uint64_t)(n%W::r)); + svst1(pg1, out.v, r_v); + + return out; + } + + template + static inline vec rotate(vec in, int n){ + + switch(n){ + case 0: return tRotate<0, T>(in); break; + case 1: return tRotate<1, T>(in); break; + case 2: return tRotate<2, T>(in); break; + case 3: return tRotate<3, T>(in); break; + case 4: return tRotate<4, T>(in); break; + case 5: return tRotate<5, T>(in); break; + case 6: return tRotate<6, T>(in); break; + case 7: return tRotate<7, T>(in); break; + + case 8: return tRotate<8, T>(in); break; + case 9: return tRotate<9, T>(in); break; + case 10: return tRotate<10, T>(in); break; + case 11: return tRotate<11, T>(in); break; + case 12: return tRotate<12, T>(in); break; + case 13: return tRotate<13, T>(in); break; + case 14: return tRotate<14, T>(in); break; + case 15: return tRotate<15, T>(in); break; + default: assert(0); + } + } +}; + +// tree-based reduction +#define svred(pg, v)\ +svaddv(pg, v); + +// left-to-right reduction +// #define svred(pg, v)\ +// svadda(pg, 0, v) + +template +struct Reduce{ + //Need templated class to overload output type + //General form must generate error if compiled + inline Out_type operator()(In_type in){ + printf("Error, using wrong Reduce function\n"); + exit(1); + return 0; + } +}; + +//Complex float Reduce +template <> +inline Grid::ComplexF Reduce::operator()(vecf in){ + svbool_t pg1 = acle::pg1(); + svbool_t pg_even = acle::pg_even(); + svbool_t pg_odd = acle::pg_odd(); + typename acle::vt a_v = svld1(pg1, in.v); + float a = svred(pg_even, a_v); + float b = svred(pg_odd, a_v); + + return Grid::ComplexF(a, b); + +} + +//Real float Reduce +template <> +inline Grid::RealF Reduce::operator()(vecf in){ + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, in.v); + float a = svred(pg1, a_v); + + return a; +} + +//Complex double Reduce +template <> +inline Grid::ComplexD Reduce::operator()(vecd in){ + svbool_t pg1 = acle::pg1(); + svbool_t pg_even = acle::pg_even(); + svbool_t pg_odd = acle::pg_odd(); + typename acle::vt a_v = svld1(pg1, in.v); + double a = svred(pg_even, a_v); + double b = svred(pg_odd, a_v); + + return Grid::ComplexD(a, b); +} + +//Real double Reduce +template <> +inline Grid::RealD Reduce::operator()(vecd in){ + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, in.v); + double a = svred(pg1, a_v); + + return a; +} + +//Integer Reduce +template <> +inline Integer Reduce::operator()(veci in){ + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, in.v); + Integer a = svred(pg1, a_v); + + return a; +} + +#undef svred +#undef vec_imm + +NAMESPACE_END(Optimization) + +////////////////////////////////////////////////////////////////////////////////////// +// Here assign types + +typedef Optimization::vech SIMD_Htype; // Reduced precision type +typedef Optimization::vecf SIMD_Ftype; // Single precision type +typedef Optimization::vecd SIMD_Dtype; // Double precision type +typedef Optimization::veci SIMD_Itype; // Integer type + +// prefetch utilities +inline void v_prefetch0(int size, const char *ptr){}; +inline void prefetch_HINT_T0(const char *ptr){}; + +// Function name aliases +typedef Optimization::Vsplat VsplatSIMD; +typedef Optimization::Vstore VstoreSIMD; +typedef Optimization::Vset VsetSIMD; +typedef Optimization::Vstream VstreamSIMD; +template using ReduceSIMD = 
Optimization::Reduce; + +// Arithmetic operations +typedef Optimization::Sum SumSIMD; +typedef Optimization::Sub SubSIMD; +typedef Optimization::Div DivSIMD; +typedef Optimization::Mult MultSIMD; +typedef Optimization::MultComplex MultComplexSIMD; +typedef Optimization::MultAddComplex MultAddComplexSIMD; +typedef Optimization::MultRealPart MultRealPartSIMD; +typedef Optimization::MaddRealPart MaddRealPartSIMD; +typedef Optimization::Conj ConjSIMD; +typedef Optimization::TimesMinusI TimesMinusISIMD; +typedef Optimization::TimesI TimesISIMD; + +NAMESPACE_END(Grid) diff --git a/Grid/simd/Grid_a64fx-fixedsize.h b/Grid/simd/Grid_a64fx-fixedsize.h new file mode 100644 index 00000000..6b450012 --- /dev/null +++ b/Grid/simd/Grid_a64fx-fixedsize.h @@ -0,0 +1,769 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: Grid_a64fx-fixedsize.h + + Copyright (C) 2020 + + Author: Nils Meyer Regensburg University + + with support from Arm + Richard Sandiford + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ + +///////////////////////////////////////////////////// +// Using SVE ACLE with fixed-size data types +///////////////////////////////////////////////////// + + +// gcc 10 features +#if __ARM_FEATURE_SVE_BITS==512 +/* gcc 10.0.1 and gcc 10.1 bug using ACLE data types CAS-159553-Y1K4C6 + workaround: use gcc's internal data types, bugfix expected for gcc 10.2 +typedef svbool_t pred __attribute__((arm_sve_vector_bits(512))); +typedef svfloat16_t vech __attribute__((arm_sve_vector_bits(512))); +typedef svfloat32_t vecf __attribute__((arm_sve_vector_bits(512))); +typedef svfloat64_t vecd __attribute__((arm_sve_vector_bits(512))); +typedef svuint32_t veci __attribute__((arm_sve_vector_bits(512))); +typedef svuint32_t lutf __attribute__((arm_sve_vector_bits(512))); // LUTs for float +typedef svuint64_t lutd __attribute__((arm_sve_vector_bits(512))); // LUTs for double +*/ +typedef __SVBool_t pred __attribute__((arm_sve_vector_bits(512))); +typedef __SVFloat16_t vech __attribute__((arm_sve_vector_bits(512))); +typedef __SVFloat32_t vecf __attribute__((arm_sve_vector_bits(512))); +typedef __SVFloat64_t vecd __attribute__((arm_sve_vector_bits(512))); +typedef __SVUint32_t veci __attribute__((arm_sve_vector_bits(512))); +typedef __SVUint32_t lutf __attribute__((arm_sve_vector_bits(512))); // LUTs for float +typedef __SVUint64_t lutd __attribute__((arm_sve_vector_bits(512))); // LUTs for double +#else +#pragma error("Oops. 
Illegal SVE vector size!?") +#endif /* __ARM_FEATURE_SVE_BITS */ + +// low-level API +NAMESPACE_BEGIN(Grid); +NAMESPACE_BEGIN(Optimization); + +// convenience union types for tables eliminating loads +union ulutf { + lutf v; + uint32_t s[16]; +}; +union ulutd { + lutd v; + uint64_t s[8]; +}; + +template +struct acle{}; + +template <> +struct acle{ + static inline lutd tbl_swap(){ + const ulutd t = { .s = {1, 0, 3, 2, 5, 4, 7, 6} }; + return t.v; + } + static inline lutd tbl0(){ + const ulutd t = { .s = {4, 5, 6, 7, 0, 1, 2, 3} }; + return t.v; + } + static inline lutd tbl1(){ + const ulutd t = { .s = {2, 3, 0, 1, 6, 7, 4, 5} }; + return t.v; + } + static inline lutd tbl_exch1a(){ // Exchange1 + const ulutd t = { .s = {0, 1, 4, 5, 2, 3, 6, 7} }; + return t.v; + } + static inline lutd tbl_exch1b(){ // Exchange1 + const ulutd t = { .s = {2, 3, 6, 7, 0, 1, 4, 5} }; + return t.v; + } + static inline lutd tbl_exch1c(){ // Exchange1 + const ulutd t = { .s = {4, 5, 0, 1, 6, 7, 2, 3} }; + return t.v; + } + static inline pred pg1(){return svptrue_b64();} + static inline pred pg_even(){return svzip1_b64(svptrue_b64(), svpfalse_b());} + static inline pred pg_odd() {return svzip1_b64(svpfalse_b(), svptrue_b64());} + static inline vecd zero(){return svdup_f64(0.);} +}; + +template <> +struct acle{ + // exchange neighboring elements + static inline lutf tbl_swap(){ + const ulutf t = { .s = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14} }; + return t.v; + } + static inline lutf tbl0(){ + const ulutf t = { .s = {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7} }; + return t.v; + } + static inline lutf tbl1(){ + const ulutf t = { .s = {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11} }; + return t.v; + } + static inline lutf tbl2(){ + const ulutf t = { .s = {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13} }; + return t.v; + } + static inline lutf tbl_exch1a(){ // Exchange1 + const ulutf t = { .s = {0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 } }; + return t.v; + } + static inline lutf tbl_exch1b(){ // Exchange1 + const ulutf t = { .s = {4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11 } }; + return t.v; + } + static inline lutf tbl_exch1c(){ // Exchange1 + const ulutf t = { .s = {8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7} }; + return t.v; + } + static inline pred pg1(){return svptrue_b32();} + static inline pred pg_even(){return svzip1_b32(svptrue_b32(), svpfalse_b());} + static inline pred pg_odd() {return svzip1_b32(svpfalse_b(), svptrue_b32());} + static inline vecf zero(){return svdup_f32(0.);} +}; + +template <> +struct acle{ + static inline pred pg1(){return svptrue_b16();} + static inline pred pg_even(){return svzip1_b16(svptrue_b16(), svpfalse_b());} + static inline pred pg_odd() {return svzip1_b16(svpfalse_b(), svptrue_b16());} + static inline vech zero(){return svdup_f16(0.);} +}; + +template <> +struct acle{ + //static inline svbool_t pg1(){return svptrue_b16();} + static inline pred pg1(){return svptrue_b32();} + static inline pred pg_even(){return svzip1_b32(svptrue_b32(), svpfalse_b());} + static inline pred pg_odd() {return svzip1_b32(svpfalse_b(), svptrue_b32());} +}; + +// --------------------------------------------------- + +struct Vsplat{ + // Complex float + inline vecf operator()(float a, float b){ + vecf a_v = svdup_f32(a); + vecf b_v = svdup_f32(b); + return svzip1(a_v, b_v); + } + // Real float + inline vecf operator()(float a){ + return svdup_f32(a); + } + // Complex double + inline vecd operator()(double a, double b){ + vecd a_v = 
svdup_f64(a); + vecd b_v = svdup_f64(b); + return svzip1(a_v, b_v); + } + // Real double + inline vecd operator()(double a){ + return svdup_f64(a); + } + // Integer + inline veci operator()(Integer a){ + return svdup_u32(a); + } +}; + +struct Vstore{ + // Real float + inline void operator()(vecf a, float *D){ + pred pg1 = acle::pg1(); + svst1(pg1, D, a); + } + // Real double + inline void operator()(vecd a, double *D){ + pred pg1 = acle::pg1(); + svst1(pg1, D, a); + } + // Real float + inline void operator()(veci a, Integer *D){ + pred pg1 = acle::pg1(); + svst1(pg1, D, a); + } +}; + +struct Vstream{ + // Real float + inline void operator()(float * a, vecf b){ + pred pg1 = acle::pg1(); + svstnt1(pg1, a, b); + //svst1(pg1, a, b); + } + // Real double + inline void operator()(double * a, vecd b){ + pred pg1 = acle::pg1(); + svstnt1(pg1, a, b); + //svst1(pg1, a, b); + } +}; + +struct Vset{ + // Complex float + inline vecf operator()(Grid::ComplexF *a){ + pred pg1 = acle::pg1(); + return svld1(pg1, (float*)a); + } + // Complex double + inline vecd operator()(Grid::ComplexD *a){ + pred pg1 = acle::pg1(); + return svld1(pg1, (double*)a); + } + // Real float + inline vecf operator()(float *a){ + pred pg1 = acle::pg1(); + return svld1(pg1, a); + } + // Real double + inline vecd operator()(double *a){ + pred pg1 = acle::pg1(); + return svld1(pg1, a); + } + // Integer + inline veci operator()(Integer *a){ + pred pg1 = acle::pg1(); + return svld1(pg1, a); + } +}; + +///////////////////////////////////////////////////// +// Arithmetic operations +///////////////////////////////////////////////////// + +struct Sum{ + // Complex/real float + inline vecf operator()(vecf a, vecf b){ + pred pg1 = acle::pg1(); + return svadd_x(pg1, a, b); + } + // Complex/real double + inline vecd operator()(vecd a, vecd b){ + pred pg1 = acle::pg1(); + return svadd_x(pg1, a, b); + } + // Integer + inline veci operator()(veci a, veci b){ + pred pg1 = acle::pg1(); + return svadd_x(pg1, a, b); + } +}; + +struct Sub{ + // Complex/real float + inline vecf operator()(vecf a, vecf b){ + pred pg1 = acle::pg1(); + return svsub_x(pg1, a, b); + } + // Complex/real double + inline vecd operator()(vecd a, vecd b){ + pred pg1 = acle::pg1(); + return svsub_x(pg1, a, b); + } + // Integer + inline veci operator()(veci a, veci b){ + pred pg1 = acle::pg1(); + return svsub_x(pg1, a, b); + } + +}; + +struct Mult{ + // Real float fma + inline vecf operator()(vecf a, vecf b, vecf c){ + pred pg1 = acle::pg1(); + return svmad_x(pg1, b, c, a); + } + // Real double fma + inline vecd operator()(vecd a, vecd b, vecd c){ + pred pg1 = acle::pg1(); + return svmad_x(pg1, b, c, a); + } + // Real float + inline vecf operator()(vecf a, vecf b){ + pred pg1 = acle::pg1(); + return svmul_x(pg1, a, b); + } + // Real double + inline vecd operator()(vecd a, vecd b){ + pred pg1 = acle::pg1(); + return svmul_x(pg1, a, b); + } + // Integer + inline veci operator()(veci a, veci b){ + pred pg1 = acle::pg1(); + return svmul_x(pg1, a, b); + } +}; + +struct MultRealPart{ + // Complex float + inline vecf operator()(vecf a, vecf b){ + pred pg1 = acle::pg1(); + // using FCMLA + vecf z_v = acle::zero(); + return svcmla_x(pg1, z_v, a, b, 0); + } + // Complex double + inline vecd operator()(vecd a, vecd b){ + pred pg1 = acle::pg1(); + // using FCMLA + vecd z_v = acle::zero(); + return svcmla_x(pg1, z_v, a, b, 0); + } +}; + +struct MaddRealPart{ + // Complex float + inline vecf operator()(vecf a, vecf b, vecf c){ + pred pg1 = acle::pg1(); + // using FCMLA + return svcmla_x(pg1, 
c, a, b, 0); + } + // Complex double + inline vecd operator()(vecd a, vecd b, vecd c){ + pred pg1 = acle::pg1(); + // using FCMLA + return svcmla_x(pg1, c, a, b, 0); + } +}; + +struct MultComplex{ + // Complex a*b + // Complex float + inline vecf operator()(vecf a, vecf b){ + pred pg1 = acle::pg1(); + vecf z = acle::zero(); + // using FCMLA + vecf r_v = svcmla_x(pg1, z, a, b, 0); + return svcmla_x(pg1, r_v, a, b, 90); + } + // Complex double + inline vecd operator()(vecd a, vecd b){ + pred pg1 = acle::pg1(); + vecd z = acle::zero(); + // using FCMLA + vecd r_v = svcmla_x(pg1, z, a, b, 0); + return svcmla_x(pg1, r_v, a, b, 90); + } +}; + +struct MultAddComplex{ + // Complex a*b+c + // Complex float + inline vecf operator()(vecf a, vecf b, vecf c){ + pred pg1 = acle::pg1(); + // using FCMLA + vecf r_v = svcmla_x(pg1, c, a, b, 0); + return svcmla_x(pg1, r_v, a, b, 90); + } + // Complex double + inline vecd operator()(vecd a, vecd b, vecd c){ + pred pg1 = acle::pg1(); + // using FCMLA + vecd r_v = svcmla_x(pg1, c, a, b, 0); + return svcmla_x(pg1, r_v, a, b, 90); + } +}; + +struct Div{ + // Real float + inline vecf operator()(vecf a, vecf b){ + pred pg1 = acle::pg1(); + return svdiv_x(pg1, a, b); + } + // Real double + inline vecd operator()(vecd a, vecd b){ + pred pg1 = acle::pg1(); + return svdiv_x(pg1, a, b); + } +}; + +struct Conj{ + // Complex float + inline vecf operator()(vecf a){ + pred pg_odd = acle::pg_odd(); + //return svneg_x(pg_odd, a); this is unsafe + return svneg_m(a, pg_odd, a); + } + // Complex double + inline vecd operator()(vecd a){ + pred pg_odd = acle::pg_odd(); + //return svneg_x(pg_odd, a); this is unsafe + return svneg_m(a, pg_odd, a); + } +}; + +struct TimesMinusI{ + // Complex float + inline vecf operator()(vecf a, vecf b){ + lutf tbl_swap = acle::tbl_swap(); + pred pg1 = acle::pg1(); + pred pg_odd = acle::pg_odd(); + + vecf a_v = svtbl(a, tbl_swap); + //return svneg_x(pg_odd, a_v); this is unsafe + return svneg_m(a_v, pg_odd, a_v); + } + // Complex double + inline vecd operator()(vecd a, vecd b){ + lutd tbl_swap = acle::tbl_swap(); + pred pg1 = acle::pg1(); + pred pg_odd = acle::pg_odd(); + + vecd a_v = svtbl(a, tbl_swap); + //return svneg_x(pg_odd, a_v); this is unsafe + return svneg_m(a_v, pg_odd, a_v); + } +}; + +struct TimesI{ + // Complex float + inline vecf operator()(vecf a, vecf b){ + lutf tbl_swap = acle::tbl_swap(); + pred pg1 = acle::pg1(); + pred pg_even = acle::pg_even(); + + vecf a_v = svtbl(a, tbl_swap); + //return svneg_x(pg_even, a_v); this is unsafe + return svneg_m(a_v, pg_even, a_v); + } + // Complex double + inline vecd operator()(vecd a, vecd b){ + lutd tbl_swap = acle::tbl_swap(); + pred pg1 = acle::pg1(); + pred pg_even = acle::pg_even(); + + vecd a_v = svtbl(a, tbl_swap); + //return svneg_x(pg_even, a_v); this is unsafe + return svneg_m(a_v, pg_even, a_v); + } +}; + +struct PrecisionChange { + static inline vech StoH (vecf sa, vecf sb) { + pred pg1s = acle::pg1(); + vech ha_v = svcvt_f16_x(pg1s, sa); + vech hb_v = svcvt_f16_x(pg1s, sb); + return svuzp1(ha_v, hb_v); + } + static inline void HtoS(vech h,vecf &sa,vecf &sb) { + pred pg1s = acle::pg1(); + vech ha_v = svzip1(h, h); + vech hb_v = svzip2(h, h); + sa = svcvt_f32_x(pg1s, ha_v); + sb = svcvt_f32_x(pg1s, hb_v); + } + static inline vecf DtoS (vecd a,vecd b) { + pred pg1d = acle::pg1(); + vecf sa_v = svcvt_f32_x(pg1d, a); + vecf sb_v = svcvt_f32_x(pg1d, b); + return svuzp1(sa_v, sb_v); + } + static inline void StoD (vecf s,vecd &a,vecd &b) { + pred pg1d = acle::pg1(); + vecf sa_v = 
svzip1(s, s); + vecf sb_v = svzip2(s, s); + a = svcvt_f64_x(pg1d, sa_v); + b = svcvt_f64_x(pg1d, sb_v); + } + static inline vech DtoH (vecd a,vecd b,vecd c,vecd d) { + pred pg1d = acle::pg1(); + pred pg1h = acle::pg1(); + vech ha_v = svcvt_f16_x(pg1d, a); + vech hb_v = svcvt_f16_x(pg1d, b); + vech hc_v = svcvt_f16_x(pg1d, c); + vech hd_v = svcvt_f16_x(pg1d, d); + vech hab_v = svuzp1(ha_v, hb_v); + vech hcd_v = svuzp1(hc_v, hd_v); + return svuzp1(hab_v, hcd_v); + +/* + vecf sa,sb; + sa = DtoS(a,b); + sb = DtoS(c,d); + return StoH(sa,sb); +*/ + } + static inline void HtoD(vech h,vecd &a,vecd &b,vecd &c,vecd &d) { + pred pg1h = acle::pg1(); + pred pg1d = acle::pg1(); + vech sa_v = svzip1(h, h); + vech sb_v = svzip2(h, h); + vech da_v = svzip1(sa_v, sa_v); + vech db_v = svzip2(sa_v, sa_v); + vech dc_v = svzip1(sb_v, sb_v); + vech dd_v = svzip2(sb_v, sb_v); + a = svcvt_f64_x(pg1d, da_v); + b = svcvt_f64_x(pg1d, db_v); + c = svcvt_f64_x(pg1d, dc_v); + d = svcvt_f64_x(pg1d, dd_v); + +/* + vecf sa,sb; + HtoS(h,sa,sb); + StoD(sa,a,b); + StoD(sb,c,d); +*/ + } +}; + +struct Exchange{ + // float + static inline void Exchange0(vecf &out1, vecf &out2, vecf in1, vecf in2){ + vecf r1_v = svext(in1, in1, (uint64_t)8u); + vecf r2_v = svext(in2, in2, (uint64_t)8u); + out1 = svext(r1_v, in2, (uint64_t)8u); + out2 = svext(in1, r2_v, (uint64_t)8u); + } + static inline void Exchange1(vecf &out1, vecf &out2, vecf in1, vecf in2){ + // this one is tricky; svtrn2q* from SVE2 fits best, but it is not available in SVE1 + // alternative: use 4-el structure; expect translation into 4x ldp + 4x stp -> SFI + lutf tbl_exch1a = acle::tbl_exch1a(); + lutf tbl_exch1b = acle::tbl_exch1b(); + lutf tbl_exch1c = acle::tbl_exch1c(); + + vecf a1_v = svtbl(in1, tbl_exch1a); + vecf a2_v = svtbl(in2, tbl_exch1b); + vecf b1_v = svext(a2_v, a1_v, (uint64_t)8u); + vecf b2_v = svext(a1_v, a2_v, (uint64_t)8u); + out1 = svtbl(b1_v, tbl_exch1c); + out2 = svtbl(b2_v, tbl_exch1a); + } + static inline void Exchange2(vecf &out1, vecf &out2, vecf in1, vecf in2){ + out1 = (vecf)svtrn1((vecd)in1, (vecd)in2); + out2 = (vecf)svtrn2((vecd)in1, (vecd)in2); + } + static inline void Exchange3(vecf &out1, vecf &out2, vecf in1, vecf in2){ + out1 = svtrn1(in1, in2); + out2 = svtrn2(in1, in2); + } + + // double + static inline void Exchange0(vecd &out1, vecd &out2, vecd in1, vecd in2){ + vecd r1_v = svext(in1, in1, (uint64_t)4u); + vecd r2_v = svext(in2, in2, (uint64_t)4u); + out1 = svext(r1_v, in2, (uint64_t)4u); + out2 = svext(in1, r2_v, (uint64_t)4u); + } + static inline void Exchange1(vecd &out1, vecd &out2, vecd in1, vecd in2){ + // this one is tricky; svtrn2q* from SVE2 fits best, but it is not available in SVE1 + // alternative: use 4-el structure; expect translation into 4x ldp + 4x stp -> SFI + lutd tbl_exch1a = acle::tbl_exch1a(); + lutd tbl_exch1b = acle::tbl_exch1b(); + lutd tbl_exch1c = acle::tbl_exch1c(); + + vecd a1_v = svtbl(in1, tbl_exch1a); + vecd a2_v = svtbl(in2, tbl_exch1b); + vecd b1_v = svext(a2_v, a1_v, (uint64_t)4u); + vecd b2_v = svext(a1_v, a2_v, (uint64_t)4u); + out1 = svtbl(b1_v, tbl_exch1c); + out2 = svtbl(b2_v, tbl_exch1a); + } + static inline void Exchange2(vecd &out1, vecd &out2, vecd in1, vecd in2){ + out1 = svtrn1(in1, in2); + out2 = svtrn2(in1, in2); + } + static inline void Exchange3(vecd &out1, vecd &out2, vecd in1, vecd in2){ + assert(0); + return; + } +}; + +#undef VECTOR_FOR + +struct Permute{ + // float + static inline vecf Permute0(vecf in) { + return svext(in, in, (uint64_t)8u); + } + static inline vecf 
Permute1(vecf in) { + lutf tbl_swap = acle::tbl1(); + return svtbl(in, tbl_swap); + } + static inline vecf Permute2(vecf in) { + lutf tbl_swap = acle::tbl2(); + return svtbl(in, tbl_swap); + } + static inline vecf Permute3(vecf in) { + lutf tbl_swap = acle::tbl_swap(); + return svtbl(in, tbl_swap); + } + + // double + static inline vecd Permute0(vecd in) { + return svext(in, in, (uint64_t)(8u / 2u)); + } + static inline vecd Permute1(vecd in) { + lutd tbl_swap = acle::tbl1(); + return svtbl(in, tbl_swap); + } + static inline vecd Permute2(vecd in) { + lutd tbl_swap = acle::tbl_swap(); + return svtbl(in, tbl_swap); + } + static inline vecd Permute3(vecd in) { + return in; + } +}; + +struct Rotate{ + + static inline vecf rotate(vecf in, int n){ + switch(n){ + case 0: return tRotate<0>(in); break; + case 1: return tRotate<1>(in); break; + case 2: return tRotate<2>(in); break; + case 3: return tRotate<3>(in); break; + case 4: return tRotate<4>(in); break; + case 5: return tRotate<5>(in); break; + case 6: return tRotate<6>(in); break; + case 7: return tRotate<7>(in); break; + + case 8: return tRotate<8>(in); break; + case 9: return tRotate<9>(in); break; + case 10: return tRotate<10>(in); break; + case 11: return tRotate<11>(in); break; + case 12: return tRotate<12>(in); break; + case 13: return tRotate<13>(in); break; + case 14: return tRotate<14>(in); break; + case 15: return tRotate<15>(in); break; + default: assert(0); + } + } + static inline vecd rotate(vecd in, int n){ + switch(n){ + case 0: return tRotate<0>(in); break; + case 1: return tRotate<1>(in); break; + case 2: return tRotate<2>(in); break; + case 3: return tRotate<3>(in); break; + case 4: return tRotate<4>(in); break; + case 5: return tRotate<5>(in); break; + case 6: return tRotate<6>(in); break; + case 7: return tRotate<7>(in); break; + default: assert(0); + } + } + + template static inline vecf tRotate(vecf in){ + return svext(in, in, (uint64_t)n); + } + template static inline vecd tRotate(vecd in){ + return svext(in, in, (uint64_t)n); + } +}; + +// tree-based reduction +#define svred(pg, v)\ +svaddv(pg, v); + +// left-to-right reduction +// #define svred(pg, v)\ +// svadda(pg, 0, v) + +template +struct Reduce{ + //Need templated class to overload output type + //General form must generate error if compiled + inline Out_type operator()(In_type in){ + printf("Error, using wrong Reduce function\n"); + //exit(1); + return 0; + } +}; +//Complex float Reduce +template <> +inline Grid::ComplexF Reduce::operator()(vecf in){ + pred pg_even = acle::pg_even(); + pred pg_odd = acle::pg_odd(); + float a = svred(pg_even, in); + float b = svred(pg_odd, in); + return Grid::ComplexF(a, b); +} +//Real float Reduce +template <> +inline Grid::RealF Reduce::operator()(vecf in){ + pred pg1 = acle::pg1(); + return svred(pg1, in); +} +//Complex double Reduce +template <> +inline Grid::ComplexD Reduce::operator()(vecd in){ + pred pg_even = acle::pg_even(); + pred pg_odd = acle::pg_odd(); + double a = svred(pg_even, in); + double b = svred(pg_odd, in); + return Grid::ComplexD(a, b); +} +//Real double Reduce +template <> +inline Grid::RealD Reduce::operator()(vecd in){ + pred pg1 = acle::pg1(); + return svred(pg1, in); +} +//Integer Reduce +template <> +inline Integer Reduce::operator()(veci in){ + pred pg1 = acle::pg1(); + return svred(pg1, in); +} + +#undef svred + +NAMESPACE_END(Optimization); + +////////////////////////////////////////////////////////////////////////////////////// +// Here assign types + +typedef vech SIMD_Htype; // Reduced 
precision type +typedef vecf SIMD_Ftype; // Single precision type +typedef vecd SIMD_Dtype; // Double precision type +typedef veci SIMD_Itype; // Integer type + +// prefetch utilities +inline void v_prefetch0(int size, const char *ptr){}; +inline void prefetch_HINT_T0(const char *ptr){}; + +// Function name aliases +typedef Optimization::Vsplat VsplatSIMD; +typedef Optimization::Vstore VstoreSIMD; +typedef Optimization::Vset VsetSIMD; +typedef Optimization::Vstream VstreamSIMD; +template using ReduceSIMD = Optimization::Reduce; + +// Arithmetic operations +typedef Optimization::Sum SumSIMD; +typedef Optimization::Sub SubSIMD; +typedef Optimization::Div DivSIMD; +typedef Optimization::Mult MultSIMD; +typedef Optimization::MultComplex MultComplexSIMD; +typedef Optimization::MultAddComplex MultAddComplexSIMD; +typedef Optimization::MultRealPart MultRealPartSIMD; +typedef Optimization::MaddRealPart MaddRealPartSIMD; +typedef Optimization::Conj ConjSIMD; +typedef Optimization::TimesMinusI TimesMinusISIMD; +typedef Optimization::TimesI TimesISIMD; + +NAMESPACE_END(Grid); diff --git a/Grid/simd/Grid_gpu_vec.h b/Grid/simd/Grid_gpu_vec.h index b9c6a81b..8b17f75a 100644 --- a/Grid/simd/Grid_gpu_vec.h +++ b/Grid/simd/Grid_gpu_vec.h @@ -41,6 +41,11 @@ Author: Peter Boyle namespace Grid { +#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) +typedef struct { uint16_t x;} half; +#endif +typedef struct Half2_t { half x; half y; } Half2; + #define COALESCE_GRANULARITY ( GEN_SIMD_WIDTH ) template @@ -125,14 +130,14 @@ inline accelerator GpuVector operator/(const GpuVector l,const } constexpr int NSIMD_RealH = COALESCE_GRANULARITY / sizeof(half); -constexpr int NSIMD_ComplexH = COALESCE_GRANULARITY / sizeof(half2); +constexpr int NSIMD_ComplexH = COALESCE_GRANULARITY / sizeof(Half2); constexpr int NSIMD_RealF = COALESCE_GRANULARITY / sizeof(float); constexpr int NSIMD_ComplexF = COALESCE_GRANULARITY / sizeof(float2); constexpr int NSIMD_RealD = COALESCE_GRANULARITY / sizeof(double); constexpr int NSIMD_ComplexD = COALESCE_GRANULARITY / sizeof(double2); constexpr int NSIMD_Integer = COALESCE_GRANULARITY / sizeof(Integer); -typedef GpuComplex GpuComplexH; +typedef GpuComplex GpuComplexH; typedef GpuComplex GpuComplexF; typedef GpuComplex GpuComplexD; @@ -147,11 +152,9 @@ typedef GpuVector GpuVectorI; accelerator_inline float half2float(half h) { float f; -#ifdef GRID_SIMT +#if defined(GRID_CUDA) || defined(GRID_HIP) f = __half2float(h); #else - //f = __half2float(h); - __half_raw hr(h); Grid_half hh; hh.x = hr.x; f= sfw_half_to_float(hh); @@ -161,13 +164,11 @@ accelerator_inline float half2float(half h) accelerator_inline half float2half(float f) { half h; -#ifdef GRID_SIMT +#if defined(GRID_CUDA) || defined(GRID_HIP) h = __float2half(f); #else Grid_half hh = sfw_float_to_half(f); - __half_raw hr; - hr.x = hh.x; - h = __half(hr); + h.x = hh.x; #endif return h; } @@ -523,7 +524,7 @@ namespace Optimization { //////////////////////////////////////////////////////////////////////////////////// // Single / Half //////////////////////////////////////////////////////////////////////////////////// - static accelerator_inline GpuVectorCH StoH (GpuVectorCF a,GpuVectorCF b) { + static accelerator_inline GpuVectorCH StoH (GpuVectorCF a,GpuVectorCF b) { int N = GpuVectorCF::N; GpuVectorCH h; for(int i=0;i(0x0u); unsigned int sign = f.u & sign_mask; f.u ^= sign; @@ -93,7 +93,7 @@ accelerator_inline Grid_half sfw_float_to_half(float ff) { o.x = static_cast(f.u - denorm_magic.u); } else { unsigned int mant_odd = (f.u >> 13) & 
1; // resulting mantissa is odd - + // update exponent, rounding bias part 1 f.u += ((unsigned int)(15 - 127) << 23) + 0xfff; // rounding bias part 2 @@ -101,7 +101,7 @@ accelerator_inline Grid_half sfw_float_to_half(float ff) { // take the bits! o.x = static_cast(f.u >> 13); } - } + } o.x |= static_cast(sign >> 16); return o; } @@ -110,9 +110,63 @@ accelerator_inline Grid_half sfw_float_to_half(float ff) { #ifdef GPU_VEC #include "Grid_gpu_vec.h" #endif +/* #ifdef GEN #include "Grid_generic.h" #endif +*/ + +#ifdef GEN + #if defined(A64FX) || defined(A64FXFIXEDSIZE) // breakout A64FX SVE ACLE here + #include + #if defined(A64FX) // VLA + #pragma message("building A64FX / SVE ACLE VLA") + #if defined(ARMCLANGCOMPAT) + #pragma message("applying data types patch") + #endif + #include "Grid_a64fx-2.h" + #endif + #if defined(A64FXFIXEDSIZE) // fixed size data types + #pragma message("building for A64FX / SVE ACLE fixed size") + #include "Grid_a64fx-fixedsize.h" + #endif + #else + //#pragma message("building GEN") // generic + #include "Grid_generic.h" + #endif +#endif + +#ifdef A64FX + #include + #ifdef __ARM_FEATURE_SVE_BITS + //#pragma message("building A64FX SVE VLS") + #include "Grid_a64fx-fixedsize.h" + #else + #pragma message("building A64FX SVE VLA") + #if defined(ARMCLANGCOMPAT) + #pragma message("applying data types patch") + #endif + #include "Grid_a64fx-2.h" + #endif +#endif + +/* +#ifdef A64FXVLA +#pragma message("building A64FX VLA") +#if defined(ARMCLANGCOMPAT) + #pragma message("applying data types patch") +#endif +#include +#include "Grid_a64fx-2.h" +#endif + +#ifdef A64FXVLS +#pragma message("building A64FX VLS") +#include +#include "Grid_a64fx-fixedsize.h" +#endif +*/ + #ifdef SSE4 #include "Grid_sse4.h" #endif @@ -163,6 +217,12 @@ template struct is_complex : public std::false_type {}; template <> struct is_complex : public std::true_type {}; template <> struct is_complex : public std::true_type {}; +template struct is_ComplexD : public std::false_type {}; +template <> struct is_ComplexD : public std::true_type {}; + +template struct is_ComplexF : public std::false_type {}; +template <> struct is_ComplexF : public std::true_type {}; + template struct is_real : public std::false_type {}; template struct is_real::value, void>::type> : public std::true_type {}; @@ -170,7 +230,7 @@ template struct is_real struct is_integer : public std::false_type {}; template struct is_integer::value, void>::type> : public std::true_type {}; - + template using IfReal = Invoke::value, int> >; template using IfComplex = Invoke::value, int> >; template using IfInteger = Invoke::value, int> >; @@ -223,6 +283,69 @@ public: return sizeof(Vector_type) / sizeof(Scalar_type); } + #ifdef ARMCLANGCOMPAT + template + accelerator_inline Grid_simd &operator=(const Grid_simd::value, S>::type, Vector_type> &&rhs) { + //v = rhs.v; + svst1(svptrue_b8(), (Scalar_type*)this, svld1(svptrue_b8(), (Scalar_type*)&(rhs.v))); + return *this; + }; + + template + accelerator_inline Grid_simd &operator=(const Grid_simd::value, S>::type, Vector_type> &rhs) { + //v = rhs.v; + svst1(svptrue_b8(), (Scalar_type*)this, svld1(svptrue_b8(), (Scalar_type*)&(rhs.v))); + return *this; + }; + + /* + template + accelerator_inline Grid_simd &operator=(const Grid_simd::value, S>::type, Vector_type> &&rhs) { + //v = rhs.v; + svst1(svptrue_b8(), (int8_t*)this, svld1(svptrue_b8(), (int8_t*)&(rhs.v))); + return *this; + }; + + template + accelerator_inline Grid_simd &operator=(const Grid_simd::value, S>::type, Vector_type> &rhs) { + //v = 
rhs.v; + svst1(svptrue_b8(), (int8_t*)this, svld1(svptrue_b8(), (int8_t*)&(rhs.v))); + return *this; + }; + */ + + // ComplexF + template + accelerator_inline Grid_simd &operator=(const Grid_simd::value, S>::type, Vector_type> &&rhs) { + //v = rhs.v; + svst1(svptrue_b32(), (float*)this, svld1(svptrue_b32(), (float*)&(rhs.v))); + return *this; + }; + + template + accelerator_inline Grid_simd &operator=(const Grid_simd::value, S>::type, Vector_type> &rhs) { + //v = rhs.v; + svst1(svptrue_b32(), (float*)this, svld1(svptrue_b32(), (float*)&(rhs.v))); + return *this; + }; + + // ComplexD + template + accelerator_inline Grid_simd &operator=(const Grid_simd::value, S>::type, Vector_type> &&rhs) { + //v = rhs.v; + svst1(svptrue_b64(), (double*)this, svld1(svptrue_b64(), (double*)&(rhs.v))); + return *this; + }; + + template + accelerator_inline Grid_simd &operator=(const Grid_simd::value, S>::type, Vector_type> &rhs) { + //v = rhs.v; + svst1(svptrue_b64(), (double*)this, svld1(svptrue_b64(), (double*)&(rhs.v))); + return *this; + }; + + #else + accelerator_inline Grid_simd &operator=(const Grid_simd &&rhs) { v = rhs.v; return *this; @@ -232,10 +355,23 @@ public: return *this; }; // faster than not declaring it and leaving to the compiler + #endif accelerator Grid_simd() = default; - accelerator_inline Grid_simd(const Grid_simd &rhs) : v(rhs.v){}; // compiles in movaps - accelerator_inline Grid_simd(const Grid_simd &&rhs) : v(rhs.v){}; + + #ifdef ARMCLANGCOMPAT + template + accelerator_inline Grid_simd(const Grid_simd::value, S>::type, Vector_type> &rhs) { this->operator=(rhs); } + template + accelerator_inline Grid_simd(const Grid_simd::value, S>::type, Vector_type> &&rhs) { this->operator=(rhs); } + template + accelerator_inline Grid_simd(const Grid_simd::value, S>::type, Vector_type> &rhs) { this->operator=(rhs); } + template + accelerator_inline Grid_simd(const Grid_simd::value, S>::type, Vector_type> &&rhs) { this->operator=(rhs); } + #else + accelerator_inline Grid_simd(const Grid_simd &rhs) : v(rhs.v){}; // compiles in movaps + accelerator_inline Grid_simd(const Grid_simd &&rhs) : v(rhs.v){}; + #endif accelerator_inline Grid_simd(const Real a) { vsplat(*this, Scalar_type(a)); }; // Enable if complex type template accelerator_inline @@ -258,12 +394,21 @@ public: /////////////////////////////////////////////// // FIXME -- alias this to an accelerator_inline MAC struct. 
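+ // On A64FX the mac() below is redirected to fxmac() (defined further down in
+ // this file): for complex element types the per-lane update y = a*x + y is
+ // mapped onto two fused FCMLA operations through MultAddComplexSIMD, and for
+ // real/integer types onto the three-operand MultSIMD (svmla_x / svmad_x in
+ // the A64FX backends), avoiding the separate multiply and add of the generic
+ // branch. Sketch of the per-lane semantics (illustrative only, fxmac_ref is
+ // not a Grid function):
+ //   std::complex<double> fxmac_ref(std::complex<double> a,
+ //                                  std::complex<double> x,
+ //                                  std::complex<double> y)
+ //   { return a * x + y; }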
+ + #if defined(A64FX) || defined(A64FXFIXEDSIZE) + friend accelerator_inline void mac(Grid_simd *__restrict__ y, + const Grid_simd *__restrict__ a, + const Grid_simd *__restrict__ x) { + *y = fxmac((*a), (*x), (*y)); + }; + #else friend accelerator_inline void mac(Grid_simd *__restrict__ y, const Grid_simd *__restrict__ a, const Grid_simd *__restrict__ x) { *y = (*a) * (*x) + (*y); }; - + #endif + friend accelerator_inline void mult(Grid_simd *__restrict__ y, const Grid_simd *__restrict__ l, const Grid_simd *__restrict__ r) { @@ -412,7 +557,7 @@ public: Grid_simd ret; Grid_simd::conv_t conv; Grid_simd::scalar_type s; - + conv.v = v.v; for (int i = 0; i < Nsimd(); i++) { s = conv.s[i]; @@ -441,7 +586,7 @@ public: return ret; } /////////////////////// - // Exchange + // Exchange // Al Ah , Bl Bh -> Al Bl Ah,Bh /////////////////////// friend accelerator_inline void exchange(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2,int n) @@ -452,20 +597,20 @@ public: Optimization::Exchange::Exchange2(out1.v,out2.v,in1.v,in2.v); } else if(n==1) { Optimization::Exchange::Exchange1(out1.v,out2.v,in1.v,in2.v); - } else if(n==0) { + } else if(n==0) { Optimization::Exchange::Exchange0(out1.v,out2.v,in1.v,in2.v); } } - friend accelerator_inline void exchange0(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2){ + friend accelerator_inline void exchange0(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2){ Optimization::Exchange::Exchange0(out1.v,out2.v,in1.v,in2.v); } - friend accelerator_inline void exchange1(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2){ + friend accelerator_inline void exchange1(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2){ Optimization::Exchange::Exchange1(out1.v,out2.v,in1.v,in2.v); } - friend accelerator_inline void exchange2(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2){ + friend accelerator_inline void exchange2(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2){ Optimization::Exchange::Exchange2(out1.v,out2.v,in1.v,in2.v); } - friend accelerator_inline void exchange3(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2){ + friend accelerator_inline void exchange3(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2){ Optimization::Exchange::Exchange3(out1.v,out2.v,in1.v,in2.v); } //////////////////////////////////////////////////////////////////// @@ -490,7 +635,7 @@ public: int dist = perm & 0xF; y = rotate(b, dist); return; - } + } else if(perm==3) permute3(y, b); else if(perm==2) permute2(y, b); else if(perm==1) permute1(y, b); @@ -564,29 +709,29 @@ accelerator_inline Grid_simd rotate(Grid_simd b, int nrot) { ret.v = Optimization::Rotate::rotate(b.v, 2 * nrot); return ret; } -template =0> +template =0> accelerator_inline void rotate( Grid_simd &ret,Grid_simd b,int nrot) { nrot = nrot % Grid_simd::Nsimd(); ret.v = Optimization::Rotate::rotate(b.v,nrot); } -template =0> +template =0> accelerator_inline void rotate(Grid_simd &ret,Grid_simd b,int nrot) { nrot = nrot % Grid_simd::Nsimd(); ret.v = Optimization::Rotate::rotate(b.v,2*nrot); } -template +template accelerator_inline void vbroadcast(Grid_simd &ret,const Grid_simd &src,int lane){ S* typepun =(S*) &src; vsplat(ret,typepun[lane]); -} -template =0> +} +template =0> accelerator_inline void rbroadcast(Grid_simd &ret,const Grid_simd &src,int lane){ S* typepun =(S*) &src; ret.v = unary(real(typepun[lane]), VsplatSIMD()); -} +} @@ -741,6 +886,27 @@ accelerator_inline Grid_simd operator*(Grid_simd a, Grid_simd return ret; }; +// 
---------------- A64FX MAC ------------------- +// Distinguish between complex types and others +#if defined(A64FX) || defined(A64FXFIXEDSIZE) +template = 0> +accelerator_inline Grid_simd fxmac(Grid_simd a, Grid_simd b, Grid_simd c) { + Grid_simd ret; + ret.v = trinary(a.v, b.v, c.v, MultAddComplexSIMD()); + return ret; +}; + +// Real/Integer types +template = 0> +accelerator_inline Grid_simd fxmac(Grid_simd a, Grid_simd b, Grid_simd c) { + Grid_simd ret; + ret.v = trinary(a.v, b.v, c.v, MultSIMD()); + return ret; +}; +#endif +// ---------------------------------------------- + + // Distinguish between complex types and others template = 0> accelerator_inline Grid_simd operator/(Grid_simd a, Grid_simd b) { @@ -877,7 +1043,7 @@ accelerator_inline typename toComplexMapper::Complexified toComplex(const conv.v = in.v; for (int i = 0; i < Rsimd::Nsimd(); i += 2) { - assert(conv.s[i + 1] == conv.s[i]); + assert(conv.s[i + 1] == conv.s[i]); // trap any cases where real was not duplicated // indicating the SIMD grids of real and imag assignment did not correctly // match @@ -919,6 +1085,14 @@ accelerator_inline void precisionChange(vRealD *out,vRealF *in,int nvec) for(int m=0;m*2 25% peak FMA +# +# In: 3x 512 bits = 192 bytes +# Out: 1x 512 bits = 64 bytes +# Tot: 4x 512 bits = 256 bytes +# +# 256 bytes * 2.2 GHz = 563.2 GB/s (base 10), 524 GB/s (base 2) + +OPT = """ +* interleave prefetching and compute in MULT_2SPIN +* could test storing U's in MULT_2SPIN to L1d for cache line update +* structure reordering: MAYBEPERM after MULT_2SPIN ? +""" + +filename = 'XXX' +LEGAL = """/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: {} + + Copyright (C) 2020 + +Author: Nils Meyer + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +""" + +class Register: + + def __init__(self, variable, asmreg='X', predication=False): + global d + x = 'Y' + if predication == False: + x = asmreg # + d['asmsuffix'] + else: + x = asmreg + self.asmreg = x + self.asmregwithsuffix = asmreg + d['asmsuffix'] + self.asmregbyte = asmreg + '.b' + self.name = variable + self.asmname = variable + self.asmnamebyte = variable + '.b' + self.predication = predication + + d['registers'] += 1 + + def define(self, statement): + global d + d['C'] += F'#define {self.name} {statement}' + #d['A'] += F'#define {self.name} {statement}' + + def declare(self, predication=False): + global d + + if self.predication == False: + d['C'] += F' Simd {self.name}; \\\n' + + predtype = 'svfloat64_t' + if PRECISION == 'single': + predtype = 'svfloat32_t' + + d['I'] += F' {predtype} {self.name}; \\\n' + else: + d['I'] += F' svbool_t {self.name}; \\\n' + #d['A'] += F'#define {self.name} {self.asmreg} \n' + + def loadpredication(self, target='A'): + global d + if (target == 'A'): + d['A'] += F' "ptrue {self.asmregwithsuffix} \\n\\t" \\\n' + d['asmclobber'].append(F'"{self.asmreg}"') + + def loadtable(self, t): + global d + d['load'] += d['factor'] + gpr = d['asmtableptr'] + + cast = 'uint64_t' + #asm_opcode = 'ld1d' + #if PRECISION == 'single': + # asm_opcode = 'ld1w' + # cast = 'uint32_t' + asm_opcode = 'ldr' + if PRECISION == 'single': + asm_opcode = 'ldr' + cast = 'uint32_t' + + d['I'] += F' {self.name} = svld1(pg1, ({cast}*)&lut[{t}]); \\\n' + + # using immediate index break-out works + if asm_opcode == 'ldr': + # ldr version + d['A'] += F' "{asm_opcode} {self.asmreg}, [%[tableptr], %[index], mul vl] \\n\\t" \\\n' + else: + # ld1 version + d['A'] += F' "{asm_opcode} {{ {self.asmregwithsuffix} }}, {pg1.asmreg}/z, [%[tableptr], %[index], mul vl] \\n\\t" \\\n' + + d['asminput'].append(F'[tableptr] "r" (&lut[0])') + d['asminput'].append(F'[index] "i" ({t})') + d['asmclobber'].append(F'"memory"') + d['asmclobber'].append(F'"cc"') + + def load(self, address, target='ALL', cast='float64_t', colors=3, offset=FETCH_BASE_PTR_COLOR_OFFSET): + global d + d['load'] += d['factor'] + indices = re.findall(r'\d+', address) + index = (int(indices[0]) - offset) * colors + int(indices[1]) + + #asm_opcode = 'ld1d' + #if PRECISION == 'single': + #asm_opcode = 'ld1w' + # cast = 'float32_t' + + asm_opcode = 'ldr' + if PRECISION == 'single': + asm_opcode = 'ldr' + cast = 'float32_t' + + gpr = d['asmfetchbaseptr'] + intrinfetchbase = d['intrinfetchbase'] + if (target in ['ALL', 'C']): + d['C'] += F' {self.name} = {address}; \\\n' + if (target in ['ALL', 'I']): +# d['I'] += F' {self.name} = svldnt1(pg1, ({cast}*)({intrinfetchbase} + {index} * 64)); \\\n' + d['I'] += F' {self.name} = svld1(pg1, ({cast}*)({intrinfetchbase} + {index} * 64)); \\\n' + if (target in ['ALL', 'A']): + if asm_opcode == 'ldr': + d['A'] += F' "{asm_opcode} {self.asmreg}, [%[fetchptr], {index}, mul vl] \\n\\t" \\\n' + else: + d['A'] += F' "{asm_opcode} {{ {self.asmregwithsuffix} }}, {pg1.asmreg}/z, [%[fetchptr], {index}, mul vl] \\n\\t" \\\n' + + def store(self, address, cast='float64_t', colors=3, offset=STORE_BASE_PTR_COLOR_OFFSET): + global d + d['store'] += d['factor'] + indices = re.findall(r'\d+', address) + index = (int(indices[0]) - offset) * colors + int(indices[1]) + + #asm_opcode = 'stnt1d' + #if PRECISION == 'single': + # 
asm_opcode = 'stnt1w' + # cast = 'float32_t' + asm_opcode = 'str' + if PRECISION == 'single': + asm_opcode = 'str' + cast = 'float32_t' + + intrinstorebase = d['intrinstorebase'] + + d['C'] += F' {address} = {self.name}; \\\n' + #d['I'] += F' svstnt1(pg1, ({cast}*)({intrinstorebase} + {index} * 64), {self.name}); \\\n' + d['I'] += F' svst1(pg1, ({cast}*)({intrinstorebase} + {index} * 64), {self.name}); \\\n' + if asm_opcode == 'str': + d['A'] += F' "{asm_opcode} {self.asmreg}, [%[storeptr], {index}, mul vl] \\n\\t" \\\n' + else: + d['A'] += F' "{asm_opcode} {{ {self.asmregwithsuffix} }}, {pg1.asmreg}, [%[storeptr], {index}, mul vl] \\n\\t" \\\n' + + def movestr(self, str): + global d + #d['move'] += d['factor'] + d['I'] += F' {self.name} = {str}; \\\n' + + def move(self, op1): + global d + d['move'] += d['factor'] + d['C'] += F' {self.name} = {op1.name}; \\\n' + d['I'] += F' {self.name} = {op1.name}; \\\n' + d['A'] += F' "mov {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix} \\n\\t" \\\n' + + # a = a + b , a = b + c + def add(self, op1, op2=None): + global d + d['add'] += d['factor'] + if op2 is None: + d['C'] += F' {self.name} = {self.name} + {op1.name}; \\\n' + d['I'] += F' {self.name} = svadd_x(pg1, {self.name}, {op1.name}); \\\n' + d['A'] += F' "fadd {self.asmregwithsuffix}, {pg1.asmreg}/m, {self.asmregwithsuffix}, {op1.asmregwithsuffix} \\n\\t" \\\n' + else: + d['C'] += F' {self.name} = {op1.name} + {op2.name}; \\\n' + d['I'] += F' {self.name} = svadd_x(pg1, {op1.name}, {op2.name}); \\\n' + d['A'] += F' "fadd {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix} \\n\\t" \\\n' + + # a = a -b , a = b - c + def sub(self, op1, op2=None): + global d + d['sub'] += d['factor'] + if op2 is None: + d['C'] += F' {self.name} = {self.name} - {op1.name}; \\\n' + d['I'] += F' {self.name} = svsub_x(pg1, {self.name}, {op1.name}); \\\n' + d['A'] += F' "fsub {self.asmregwithsuffix}, {pg1.asmreg}/m, {self.asmregwithsuffix}, {op1.asmregwithsuffix} \\n\\t" \\\n' + else: + d['C'] += F' {self.name} = {op1.name} - {op2.name}; \\\n' + d['I'] += F' {self.name} = svsub_x(pg1, {op1.name}, {op2.name}); \\\n' + d['A'] += F' "fsub {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix} \\n\\t" \\\n' + + # a = a * b , a = b * c + def mul(self, op1, op2): + global d + d['mul'] += 2 * d['factor'] + d['C'] += F' {self.name} = {op1.name} * {op2.name}; \\\n' + d['I'] += F' {self.name} = __svzero({self.name}); \\\n' + d['I'] += F' {self.name} = svcmla_x(pg1, {self.name}, {op1.name}, {op2.name}, 0); \\\n' + d['I'] += F' {self.name} = svcmla_x(pg1, {self.name}, {op1.name}, {op2.name}, 90); \\\n' + d['A'] += F' "mov {self.asmregwithsuffix} , 0 \\n\\t" \\\n' + d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 0 \\n\\t" \\\n' + d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 90 \\n\\t" \\\n' + + def mul0(self, op1, op2, op3=None, constructive=False): + global d + d['mul'] += d['factor'] + + # no movprfx intrinsics support + if constructive == True: + d['movprfx'] += d['factor'] + d['I'] += F' {self.name} = svcmla_x(pg1, {op1.name}, {op2.name}, {op3.name}, 0); \\\n' + d['A'] += F' "movprfx {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix} \\n\\t" \\\n' + d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op2.asmregwithsuffix}, {op3.asmregwithsuffix}, 0 \\n\\t" \\\n' + else: + d['C'] += F' {self.name} = 
{op1.name} * {op2.name}; \\\n' + d['I'] += F' {self.name} = svcmla_x(pg1, {self.name}, {op1.name}, {op2.name}, 0); \\\n' + d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 0 \\n\\t" \\\n' + + def mul1(self, op1, op2): + global d + d['mul'] += d['factor'] + d['I'] += F' {self.name} = svcmla_x(pg1, {self.name}, {op1.name}, {op2.name}, 90); \\\n' + d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 90 \\n\\t" \\\n' + + def mac(self, op1, op2): + global d + d['mac'] += 2 * d['factor'] + d['C'] += F' {self.name} = {self.name} + {op1.name} * {op2.name}; \\\n' + d['I'] += F' {self.name} = svcmla_x(pg1, {self.name}, {op1.name}, {op2.name}, 0); \\\n' + d['I'] += F' {self.name} = svcmla_x(pg1, {self.name}, {op1.name}, {op2.name}, 90); \\\n' + d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 0 \\n\\t" \\\n' + d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 90 \\n\\t" \\\n' + + def mac0(self, op1, op2): + global d + d['mac'] += d['factor'] + d['C'] += F' {self.name} = {self.name} + {op1.name} * {op2.name}; \\\n' + d['I'] += F' {self.name} = svcmla_x(pg1, {self.name}, {op1.name}, {op2.name}, 0); \\\n' + d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 0 \\n\\t" \\\n' + + def mac1(self, op1, op2): + global d + d['mac'] += d['factor'] + d['I'] += F' {self.name} = svcmla_x(pg1, {self.name}, {op1.name}, {op2.name}, 90); \\\n' + d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 90 \\n\\t" \\\n' + + def zero(self, zeroreg=False): + d['zero'] += d['factor'] + d['C'] += F' {self.name} = 0; \\\n' + #d['I'] += F' {self.name} = __svzero({self.name}); \\\n' only armclang + + if PRECISION == 'double': + d['I'] += F' {self.name} = svdup_f64(0.); \\\n' + else: + d['I'] += F' {self.name} = svdup_f32(0.); \\\n' + + if zeroreg == True: + d['A'] += F' "fmov {self.asmregwithsuffix} , 0 \\n\\t" \\\n' + else: + #using mov z, zero0 issue 1c, FLA, latency 6c + #d['A'] += F' "mov {self.asmregwithsuffix} , {zero0.asmregwithsuffix} \\n\\t" \\\n' + + #using mov z, 0 issue 1c, FLA, latency 6c + d['A'] += F' "fmov {self.asmregwithsuffix} , 0 \\n\\t" \\\n' + + #using xor z, z, z issue 0.5c, FL*, latency 4c + #d['A'] += F' "eor {self.asmregwithsuffix}, {pg1.asmreg}/m, {self.asmregwithsuffix}, {self.asmregwithsuffix} \\n\\t" \\\n' + + #using and z, z, zero0 issue 0.5c, FL*, latency 4c + #d['A'] += F' "and {self.asmregwithsuffix}, {self.asmregwithsuffix} , {zero0.asmregwithsuffix} \\n\\t" \\\n' + + #using sub z, z, z issue 0.5c, FL*, latency 9c + #d['A'] += F' "sub {self.asmregwithsuffix}, {self.asmregwithsuffix}, {self.asmregwithsuffix} \\n\\t" \\\n' + + # without table + def timesI(self, op1, tempreg=None, tablereg=None): + global d + d['timesI'] += d['factor'] + d['C'] += F' {self.name} = timesI({op1.name}); \\\n' + # correct if DEBUG enabled, wrong if DEBUG disabled; no idea what's causing this + #table.load('table2', target='I', cast='uint64_t') + #d['I'] += F' {self.name} = svtbl({op1.name}, {tablereg.name}); \\\n' + #d['I'] += F' {self.name} = svneg_x(pg2, {self.name}); \\\n' + # timesI using trn tested, works but tbl should be faster + d['I'] += F' {tempreg.name} = svtrn2({op1.name}, {op1.name}); \\\n' + d['I'] += F' {tempreg.name} = svneg_x(pg1, {tempreg.name}); \\\n' + 
d['I'] += F' {self.name} = svtrn1({tempreg.name}, {op1.name}); \\\n' + d['A'] += F' "trn2 {tempreg.asmregwithsuffix}, {op1.asmregwithsuffix}, {op1.asmregwithsuffix} \\n\\t" \\\n' + d['A'] += F' "fneg {tempreg.asmregwithsuffix}, {pg1.asmreg}/m, {tempreg.asmregwithsuffix} \\n\\t" \\\n' + d['A'] += F' "trn1 {self.asmregwithsuffix}, {tempreg.asmregwithsuffix}, {op1.asmregwithsuffix} \\n\\t" \\\n' + + def addTimesI(self, op1, op2=None, constructive=False): + global d + d['addTimesI'] += d['factor'] + + # emit the C reference statement once here; it covers both branches below + if op2 is None: + d['C'] += F' {self.name} = {self.name} + timesI({op1.name}); \\\n' + else: + d['C'] += F' {self.name} = {op1.name} + timesI({op2.name}); \\\n' + + # no movprfx intrinsics support + if constructive == True: + d['movprfx'] += d['factor'] + d['I'] += F' {self.name} = svcadd_x(pg1, {op1.name}, {op2.name}, 90); \\\n' + d['A'] += F' "movprfx {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix} \\n\\t" \\\n' + d['A'] += F' "fcadd {self.asmregwithsuffix}, {pg1.asmreg}/m, {self.asmregwithsuffix}, {op2.asmregwithsuffix}, 90 \\n\\t" \\\n' + else: + if op2 is None: + d['I'] += F' {self.name} = svcadd_x(pg1, {self.name}, {op1.name}, 90); \\\n' + d['A'] += F' "fcadd {self.asmregwithsuffix}, {pg1.asmreg}/m, {self.asmregwithsuffix}, {op1.asmregwithsuffix}, 90 \\n\\t" \\\n' + else: + d['I'] += F' {self.name} = svcadd_x(pg1, {op1.name}, {op2.name}, 90); \\\n' + d['A'] += F' "fcadd {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 90 \\n\\t" \\\n' + + def subTimesI(self, op1, op2=None, constructive=False): + global d + d['subTimesI'] += d['factor'] + + # no movprfx intrinsics support + if constructive == True: + d['movprfx'] += d['factor'] + d['I'] += F' {self.name} = svcadd_x(pg1, {op1.name}, {op2.name}, 270); \\\n' + d['A'] += F' "movprfx {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix} \\n\\t" \\\n' + d['A'] += F' "fcadd {self.asmregwithsuffix}, {pg1.asmreg}/m, {self.asmregwithsuffix}, {op2.asmregwithsuffix}, 270 \\n\\t" \\\n' + else: + if op2 is None: + d['C'] += F' {self.name} = {self.name} - timesI({op1.name}); \\\n' + d['I'] += F' {self.name} = svcadd_x(pg1, {self.name}, {op1.name}, 270); \\\n' + d['A'] += F' "fcadd {self.asmregwithsuffix}, {pg1.asmreg}/m, {self.asmregwithsuffix}, {op1.asmregwithsuffix}, 270 \\n\\t" \\\n' + else: + d['C'] += F' {self.name} = {op1.name} - timesI({op2.name}); \\\n' + d['I'] += F' {self.name} = svcadd_x(pg1, {op1.name}, {op2.name}, 270); \\\n' + d['A'] += F' "fcadd {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 270 \\n\\t" \\\n' + + # timesMinusI is not used, def is probably wrong !!!!
OPTIMIZATION with table + def timesMinusI(self, op1): + global d + d['timesMinusI'] += d['factor'] + d['C'] += F' {self.name} = timesMinusI({self.name}); \\\n' + d['I'] += F' {self.name} = svtrn1({op1.name}, {op1.name}); \\\n' + d['I'] += F' {self.name} = svneg_x(pg1, {self.name}); \\\n' + d['I'] += F' {self.name} = svtrn1({op1.name}, {self.name}); \\\n' + + def permute(self, dir, tablereg=None): + global d + d['permutes'] += d['factor'] + + d['C'] += F' permute{dir}({self.name}, {self.name}); \\\n' + + d['I'] += F' {self.name} = svtbl({self.name}, {tablereg.name}); \\\n' + d['A'] += F' "tbl {self.asmregwithsuffix}, {{ {self.asmregwithsuffix} }}, {tablereg.asmregwithsuffix} \\n\\t" \\\n' + + # if dir == 0: + # d['I'] += F' {self.name} = svext({self.name}, {self.name}, 4); \\\n' + # # this might not work, see intrinsics assembly + # # d['A'] += F' ext {self.name}, {self.name}, {self.name}, #4 \\\n' + # # use registers directly + # d['A'] += F' "ext {self.asmregbyte}, {self.asmregbyte}, {self.asmregbyte}, 32 \\n\\t" \\\n' + # + # elif dir in [1, 2]: + # d['I'] += F' {self.name} = svtbl({self.name}, {tablereg.name}); \\\n' + # d['A'] += F' "tbl {self.asmregwithsuffix}, {{ {self.asmregwithsuffix} }}, {tablereg.asmregwithsuffix} \\n\\t" \\\n' + + def debug(self): + global d + typecast = d['cfloat'] + gpr = d['asmdebugptr'] + vregs = d['asmclobberlist'] + if (d['debug'] == True): + d['C'] += F'std::cout << "{self.name} -- " << {self.name} << std::endl; \\\n' + + d['I'] += F'svst1(pg1, ({typecast}*)&debugreg.v, {self.name}); \\\n' + d['I'] += F'std::cout << "{self.name} -- " << debugreg << std::endl; \\\n' + #d['I'] += F'std::cout << "{self.name} -- " << {self.name} << std::endl; \\\n' + + d['A'] += F'asm ( \\\n' + d['A'] += F' " DMB SY \\n\\t " " DSB SY \\n\\t " " ISB SY \\n\\t " \\\n' # memory barrier + d['A'] += F' "str {self.asmreg}, [%[ptr]] \\n\\t" \\\n' + d['A'] += F' " DMB SY \\n\\t " " DSB SY \\n\\t " " ISB SY \\n\\t " \\\n' # memory barrier + d['A'] += F' : "=m" (debugreg.v) \\\n' + d['A'] += F' : [ptr] "r" (&debugreg.v) \\\n' + d['A'] += F' : "p5", "cc", "memory" \\\n' + d['A'] += F'); \\\n' + d['A'] += F'std::cout << "{self.name} -- " << debugreg << std::endl; \\\n' + # this form of addressing is not valid! 
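A plain-Python model (not part of the generator; times_i is an illustrative name) of the trn2 / fneg / trn1 sequence emitted by timesI() above, for one (re, im) lane pair; the same model also makes it easy to check the timesMinusI() definition flagged above, which should produce (im, -re):

def times_i(re, im):
    # trn2: duplicate the odd (imaginary) lane into both positions
    t_even, t_odd = im, im
    # fneg: negate the temporary
    t_even, t_odd = -t_even, -t_odd
    # trn1: even lane from the temporary, odd lane from the source's real part
    return (t_even, re)

assert times_i(1.0, 2.0) == (-2.0, 1.0)   # i * (1 + 2i) = -2 + i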
+ #d['A'] += F' "str {self.asmreg}, %[ptr] \\n\\t" \\\n' +# end Register + +def define(s, target='ALL'): + x = F'#define {s} \n' + global d + if (target in ['ALL', 'C']): + d['C'] += x + if (target in ['ALL', 'I']): + d['I'] += x + if (target in ['ALL', 'A']): + d['A'] += x + +def definemultiline(s): + x = F'#define {s} \\\n' + global d + d['C'] += x + d['I'] += x + d['A'] += x + +def write(s, target='ALL'): + x = F'{s}\n' + global d + if (target in ['ALL', 'C']): + d['C'] += x + if (target in ['ALL', 'I']): + d['I'] += x + if (target in ['ALL', 'A']): + d['A'] += x + +def curlyopen(): + write(F'{{ \\') + +def curlyclose(): + write(F'}}') + +def newline(target='ALL'): + global d + + if target == 'A': + if d['A'][-2:] == '\\\n': + d['A'] = d['A'][:-2] + '\n\n' + else: + if d['C'][-2:] == '\\\n': + d['C'] = d['C'][:-2] + '\n\n' + if d['I'][-2:] == '\\\n': + d['I'] = d['I'][:-2] + '\n\n' + if d['A'][-2:] == '\\\n': + d['A'] = d['A'][:-2] + '\n\n' + +# load the base pointer for fetches +def fetch_base_ptr(address, target='A'): + global d + #d['load'] += d['factor'] + + # DEBUG + #colors=3 + #indices = re.findall(r'\d+', address) + #index = (int(indices[0]) - FETCH_BASE_PTR_COLOR_OFFSET) * colors + int(indices[1]) + #print(F'{address} (base)') + + vregs = d['asmclobberlist'] + if target == 'A': + d['asminput'].append(F'[fetchptr] "r" ({address})') + d['asmclobber'].extend(vregs) + d['asmclobber'].append(F'"memory"') + d['asmclobber'].append(F'"cc"') + if target == 'I': + #print("intrinfetchbase = ", address) + d['intrinfetchbase'] = address + +# load the base pointer for stores +def store_base_ptr(address, target='A'): + global d + #d['load'] += d['factor'] + gpr = d['asmstorebaseptr'] + vregs = d['asmclobberlist'] + if target == 'A': + d['asminput'].append(F'[storeptr] "r" ({address})') + d['asmclobber'].extend(vregs) + d['asmclobber'].append(F'"memory"') + d['asmclobber'].append(F'"cc"') + if target == 'I': + d['intrinstorebase'] = address + +def prefetch_L1(address, offset): + global d + multiplier = 4 # offset in CL, have to multiply by 4 + policy = "PLDL1STRM" # weak + #policy = "PLDL1KEEP" # strong + + d['I'] += F' svprfd(pg1, (int64_t*)({address} + {offset * multiplier * 64}), SV_{policy}); \\\n' + d['A'] += F' "prfd {policy}, {pg1.asmreg}, [%[fetchptr], {offset * multiplier}, mul vl] \\n\\t" \\\n' + +def prefetch_L2(address, offset): + global d + multiplier = 4 # offset in CL, have to multiply by 4 + policy = "PLDL2STRM" # weak + #policy = "PLDL2KEEP" # strong + + d['I'] += F' svprfd(pg1, (int64_t*)({address} + {offset * multiplier * 64}), SV_{policy}); \\\n' + d['A'] += F' "prfd {policy}, {pg1.asmreg}, [%[fetchptr], {offset * multiplier}, mul vl] \\n\\t" \\\n' + #d['A'] += + +def prefetch_L2_store(address, offset): + global d + multiplier = 4 # offset in CL, have to multiply by 4 + policy = "PSTL2STRM" # weak + #policy = "PSTL2KEEP" # strong + + d['I'] += F' svprfd(pg1, (int64_t*)({address} + {offset * multiplier * 64}), SV_{policy}); \\\n' + d['A'] += F' "prfd {policy}, {pg1.asmreg}, [%[fetchptr], {offset * multiplier}, mul vl] \\n\\t" \\\n' + +def prefetch_L1_store(address, offset): + global d + multiplier = 4 # offset in CL, have to multiply by 4 + policy = "PSTL1STRM" # weak + #policy = "PSTL2KEEP" # strong + + d['I'] += F' svprfd(pg1, (int64_t*)({address} + {offset * multiplier * 64}), SV_{policy}); \\\n' + d['A'] += F' "prfd {policy}, {pg1.asmreg}, [%[fetchptr], {offset * multiplier}, mul vl] \\n\\t" \\\n' + + +def asmopen(): + #write('asm volatile ( \\', target='A') + write('asm 
( \\', target='A') + + # DEBUG + #write(F' " DMB SY \\n\\t " " DSB SY \\n\\t " " ISB SY \\n\\t " \\', target='A') # memory barrier + #write('asm volatile ( \\', target='A') + +def asmclose(): + global d + + #print(d['asminput']) + + asmin = d['asminput'] + asmin_s = '' + if len(asmin) > 0: + asmin = list(dict.fromkeys(asmin)) # remove duplicates + #print(asmin) + for el in asmin: + asmin_s += el + ',' + asmin_s = asmin_s[:-1] + #print("-> ", asmin_s) + + d['asminput'] = [] + + asmout = d['asmoutput'] + asmout_s = '' + if len(asmout) > 0: + asmout = list(dict.fromkeys(asmout)) # remove duplicates + for el in asmout: + asmout_s += el + ',' + asmout_s = asmout_s[:-1] + + d['asmoutput'] = [] + + # DEBUG put all regs into clobber by default + d['asmclobber'].extend(d['asmclobberlist']) + + asmclobber = d['asmclobber'] + asmclobber_s = '' + #print(asmclobber) + if len(asmclobber) > 0: + asmclobber = list(dict.fromkeys(asmclobber)) # remove duplicates + for el in asmclobber: + asmclobber_s += el + ',' + asmclobber_s = asmclobber_s[:-1] + + d['asmclobber'] = [] + + # DEBUG + #write(F' " DMB SY \\n\\t " " DSB SY \\n\\t " " ISB SY \\n\\t " \\', target='A') # memory barrier + + + write(F' : {asmout_s} \\', target='A') + write(F' : {asmin_s} \\', target='A') + write(F' : {asmclobber_s} \\', target='A') + write('); \\', target='A') + +# -------------------------------------------------------------------------------- + +# string of vector registers to be used in clobber list +#clobberlist = ['"p0"'] +clobberlist = ['"p5"'] +clobberlist.append('"cc"') +for i in range(0, 32): + clobberlist.append(F'"z{i}"') + +d = { +'debug': _DEBUG, +'C': '', +'I': '', +'A': '', +'asmsuffix': '.d', # double precision by default +'cfloat': 'float64_t', +'registers': 0, +'load': 0, +'store': 0, +'move': 0, +'movprfx': 0, +'zero': 0, +'add': 0, +'sub': 0, +'mul': 0, +'mac': 0, +'permutes': 0, +'neg': 0, +'addTimesI': 0, +'subTimesI': 0, +'timesI': 0, +'timesMinusI': 0, +'flops': 0, +'factor': 1, # multiplicity +'asmtableptr': 'x30', +'asmfetchbaseptr': 'x29', +'asmstorebaseptr': 'x28', +'asmdebugptr': 'r12', +'asminput': [], +'asmoutput': [], +'asmclobber': [], +'asmclobberlist': clobberlist, +'intrinfetchbase': '', +'intrinstorebase': '', +'cycles_LOAD_CHIMU': 0, +'cycles_PROJ': 0, +'cycles_PERM': 0, +'cycles_MULT_2SPIN': 0, +'cycles_RECON': 0, +'cycles_RESULT': 0, +'cycles_ZERO_PSI': 0, +'cycles_PREFETCH_L1': 0, +'cycles_PREFETCH_L2': 0 +} + +if PRECISION == 'single': + d['asmsuffix'] = '.s' + d['cfloat'] = 'float32_t' + +# -------------------------------------------------------------------------------- +# Grid +# -------------------------------------------------------------------------------- + +# Variables / Registers +result_00 = Register('result_00', asmreg='z0') +result_01 = Register('result_01', asmreg='z1') +result_02 = Register('result_02', asmreg='z2') +result_10 = Register('result_10', asmreg='z3') +result_11 = Register('result_11', asmreg='z4') +result_12 = Register('result_12', asmreg='z5') +result_20 = Register('result_20', asmreg='z6') +result_21 = Register('result_21', asmreg='z7') +result_22 = Register('result_22', asmreg='z8') +result_30 = Register('result_30', asmreg='z9') +result_31 = Register('result_31', asmreg='z10') +result_32 = Register('result_32', asmreg='z11') # 12 Regs +Chi_00 = Register('Chi_00', asmreg='z12') +Chi_01 = Register('Chi_01', asmreg='z13') +Chi_02 = Register('Chi_02', asmreg='z14') +Chi_10 = Register('Chi_10', asmreg='z15') +Chi_11 = Register('Chi_11', asmreg='z16') +Chi_12 = 
Register('Chi_12', asmreg='z17') # 6 +UChi_00 = Register('UChi_00', asmreg='z18') +UChi_01 = Register('UChi_01', asmreg='z19') +UChi_02 = Register('UChi_02', asmreg='z20') +UChi_10 = Register('UChi_10', asmreg='z21') +UChi_11 = Register('UChi_11', asmreg='z22') +UChi_12 = Register('UChi_12', asmreg='z23') # 6 +U_00 = Register('U_00', asmreg='z24') +U_10 = Register('U_10', asmreg='z25') +U_20 = Register('U_20', asmreg='z26') +U_01 = Register('U_01', asmreg='z27') +U_11 = Register('U_11', asmreg='z28') +U_21 = Register('U_21', asmreg='z29') # 6 -> 30 Registers + +table0 = Register('table0', asmreg='z30') +zero0 = Register('zero0', asmreg='z31') # 2 -> 32 Registers +# can't overload temp1 / table due to type mismatch using intrinsics :( +# typecasting SVE intrinsics variables is not allowed + +pg1 = Register('pg1', predication=True, asmreg='p5') +#pg2 = Register('pg2', predication=True, asmreg='p1') + +# Overloaded with Chi_* and UChi_* +Chimu_00 = Register('Chimu_00', asmreg=Chi_00.asmreg) +Chimu_01 = Register('Chimu_01', asmreg=Chi_01.asmreg) +Chimu_02 = Register('Chimu_02', asmreg=Chi_02.asmreg) +Chimu_10 = Register('Chimu_10', asmreg=Chi_10.asmreg) +Chimu_11 = Register('Chimu_11', asmreg=Chi_11.asmreg) +Chimu_12 = Register('Chimu_12', asmreg=Chi_12.asmreg) +if ALTERNATIVE_REGISTER_MAPPING == False: + Chimu_20 = Register('Chimu_20', asmreg=UChi_00.asmreg) + Chimu_21 = Register('Chimu_21', asmreg=UChi_01.asmreg) + Chimu_22 = Register('Chimu_22', asmreg=UChi_02.asmreg) + Chimu_30 = Register('Chimu_30', asmreg=UChi_10.asmreg) + Chimu_31 = Register('Chimu_31', asmreg=UChi_11.asmreg) + Chimu_32 = Register('Chimu_32', asmreg=UChi_12.asmreg) # 12 Registers +else: # wilson4.h + Chimu_20 = Register('Chimu_20', asmreg=U_00.asmreg) + Chimu_21 = Register('Chimu_21', asmreg=U_10.asmreg) + Chimu_22 = Register('Chimu_22', asmreg=U_20.asmreg) + Chimu_30 = Register('Chimu_30', asmreg=U_01.asmreg) + Chimu_31 = Register('Chimu_31', asmreg=U_11.asmreg) + Chimu_32 = Register('Chimu_32', asmreg=U_21.asmreg) + +# debugging output +def debugall(msg=None, group='ALL'): + global d + if (d['debug'] == False): + return + write(F'std::cout << std::endl << "DEBUG -- {msg}" << std::endl; \\') + if (group in ['ALL', 'result']): + result_00.debug() + result_01.debug() + result_02.debug() + result_10.debug() + result_11.debug() + result_12.debug() + result_20.debug() + result_21.debug() + result_22.debug() + result_30.debug() + result_31.debug() + result_32.debug() + if (group in ['ALL', 'Chi']): + Chi_00.debug() + Chi_01.debug() + Chi_02.debug() + Chi_10.debug() + Chi_11.debug() + Chi_12.debug() + if (group in ['ALL', 'UChi']): + UChi_00.debug() + UChi_01.debug() + UChi_02.debug() + UChi_10.debug() + UChi_11.debug() + UChi_12.debug() + if (group in ['ALL', 'U']): + U_00.debug() + U_10.debug() + U_20.debug() + U_01.debug() + U_11.debug() + U_21.debug() + if (group in ['ALL', 'Chimu']): + Chimu_00.debug() + Chimu_01.debug() + Chimu_02.debug() + Chimu_10.debug() + Chimu_11.debug() + Chimu_12.debug() + Chimu_20.debug() + Chimu_21.debug() + Chimu_22.debug() + Chimu_30.debug() + Chimu_31.debug() + Chimu_32.debug() + +# -------------------------------------------------------------------------------- +# Output +# -------------------------------------------------------------------------------- + +if ALTERNATIVE_LOADS == True: + define(F'LOAD_CHIMU_0213_PLUG LOAD_CHIMU_0213_{PRECSUFFIX}') + define(F'LOAD_CHIMU_0312_PLUG LOAD_CHIMU_0312_{PRECSUFFIX}') + define(F'LOAD_CHIMU(x)') +else: + #define(F'LOAD_CHIMU_{PRECSUFFIX}(x) 
LOAD_CHIMU_INTERLEAVED_{PRECSUFFIX}(x)') + define(F'LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_{PRECSUFFIX}(base)') + +if PREFETCH: + define(F'PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_{PRECSUFFIX}(A)') + define(F'PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_{PRECSUFFIX}(A)') + define(F'PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_{PRECSUFFIX}(A)') + define(F'PREFETCH_GAUGE_L2(A) PREFETCH_GAUGE_L2_INTERNAL_{PRECSUFFIX}(A)') + define(F'PF_GAUGE(A)') + define(F'PREFETCH_RESULT_L2_STORE(A) PREFETCH_RESULT_L2_STORE_INTERNAL_{PRECSUFFIX}(A)') + define(F'PREFETCH_RESULT_L1_STORE(A) PREFETCH_RESULT_L1_STORE_INTERNAL_{PRECSUFFIX}(A)') + define(F'PREFETCH1_CHIMU(A) PREFETCH_CHIMU_L1(A)') +# define(F'PREFETCH1_CHIMU(A)') + define(F'PREFETCH_CHIMU(A) PREFETCH_CHIMU_L1(A)') +# define(F'PREFETCH_CHIMU(A)') +else: + define(F'PREFETCH_CHIMU_L1(A)') + define(F'PREFETCH_GAUGE_L1(A)') + define(F'PREFETCH_CHIMU_L2(A)') + define(F'PREFETCH_GAUGE_L2(A)') + define(F'PF_GAUGE(A)') + define(F'PREFETCH1_CHIMU(A)') + define(F'PREFETCH_CHIMU(A)') + define(F'PREFETCH_RESULT_L2_STORE(A)') + +# standard defines +define(F'LOCK_GAUGE(A)') +define(F'UNLOCK_GAUGE(A)') +define(F'MASK_REGS DECLARATIONS_{PRECSUFFIX}') +define(F'SAVE_RESULT(A,B) RESULT_{PRECSUFFIX}(A); PREFETCH_RESULT_L2_STORE(B)') +define(F'MULT_2SPIN_1(Dir) MULT_2SPIN_1_{PRECSUFFIX}(Dir)') +define(F'MULT_2SPIN_2 MULT_2SPIN_2_{PRECSUFFIX}') +define(F'LOAD_CHI(base) LOAD_CHI_{PRECSUFFIX}(base)') +# don't need zero psi, everything is done in recons +#define(F'ZERO_PSI ZERO_PSI_{PRECSUFFIX}') +define(F'ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_{PRECSUFFIX}; RESULT_{PRECSUFFIX}(base)') +# loads projections +define(F'XP_PROJ XP_PROJ_{PRECSUFFIX}') +define(F'YP_PROJ YP_PROJ_{PRECSUFFIX}') +define(F'ZP_PROJ ZP_PROJ_{PRECSUFFIX}') +define(F'TP_PROJ TP_PROJ_{PRECSUFFIX}') +define(F'XM_PROJ XM_PROJ_{PRECSUFFIX}') +define(F'YM_PROJ YM_PROJ_{PRECSUFFIX}') +define(F'ZM_PROJ ZM_PROJ_{PRECSUFFIX}') +define(F'TM_PROJ TM_PROJ_{PRECSUFFIX}') +# recons +define(F'XP_RECON XP_RECON_{PRECSUFFIX}') +define(F'XM_RECON XM_RECON_{PRECSUFFIX}') +define(F'XM_RECON_ACCUM XM_RECON_ACCUM_{PRECSUFFIX}') +define(F'YM_RECON_ACCUM YM_RECON_ACCUM_{PRECSUFFIX}') +define(F'ZM_RECON_ACCUM ZM_RECON_ACCUM_{PRECSUFFIX}') +define(F'TM_RECON_ACCUM TM_RECON_ACCUM_{PRECSUFFIX}') +define(F'XP_RECON_ACCUM XP_RECON_ACCUM_{PRECSUFFIX}') +define(F'YP_RECON_ACCUM YP_RECON_ACCUM_{PRECSUFFIX}') +define(F'ZP_RECON_ACCUM ZP_RECON_ACCUM_{PRECSUFFIX}') +define(F'TP_RECON_ACCUM TP_RECON_ACCUM_{PRECSUFFIX}') +# new permutes +define(F'PERMUTE_DIR0 0') +define(F'PERMUTE_DIR1 1') +define(F'PERMUTE_DIR2 2') +define(F'PERMUTE_DIR3 3') +define(F'PERMUTE PERMUTE_{PRECSUFFIX};') +# load table +#define(F'MAYBEPERM(A,perm) if (perm) {{ A ; }}') +if PRECISION == 'double': + define(F'LOAD_TABLE(Dir) if (Dir == 0) {{ LOAD_TABLE0; }} else if (Dir == 1) {{ LOAD_TABLE1; }} else if (Dir == 2) {{ LOAD_TABLE2; }}') + define(F'MAYBEPERM(Dir,perm) if (Dir != 3) {{ if (perm) {{ PERMUTE; }} }}') +else: + define(F'LOAD_TABLE(Dir) if (Dir == 0) {{ LOAD_TABLE0; }} else if (Dir == 1) {{ LOAD_TABLE1 }} else if (Dir == 2) {{ LOAD_TABLE2; }} else if (Dir == 3) {{ LOAD_TABLE3; }}') + define(F'MAYBEPERM(A,perm) if (perm) {{ PERMUTE; }}') + + + +write('// DECLARATIONS') +definemultiline(F'DECLARATIONS_{PRECSUFFIX}') +# debugging register +if d['debug'] == True: + write(' Simd debugreg; \\') +# perm tables +if PRECISION == 'double': + write(' const uint64_t lut[4][8] = { \\') + write(' {4, 5, 6, 7, 0, 1, 2, 3}, \\') #0 = 
swap register halves + write(' {2, 3, 0, 1, 6, 7, 4, 5}, \\') #1 = swap halves of halves + write(' {1, 0, 3, 2, 5, 4, 7, 6}, \\') #2 = swap re/im + write(' {0, 1, 2, 4, 5, 6, 7, 8} };\\') #3 = identity +else: + write(' const uint32_t lut[4][16] = { \\') + write(' {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}, \\') #0 = swap register halves + write(' {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}, \\') #1 = swap halves of halves + write(' {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}, \\') #2 = swap halves of halves of halves + write(' {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14} }; \\') #3 = swap re/im + +#newline(target='A') +result_00.declare() +result_01.declare() +result_02.declare() +result_10.declare() +result_11.declare() +result_12.declare() +result_20.declare() +result_21.declare() +result_22.declare() +result_30.declare() +result_31.declare() +result_32.declare() # 12 +Chi_00.declare() +Chi_01.declare() +Chi_02.declare() +Chi_10.declare() +Chi_11.declare() +Chi_12.declare() # 6 +UChi_00.declare() +UChi_01.declare() +UChi_02.declare() +UChi_10.declare() +UChi_11.declare() +UChi_12.declare() # 6 +U_00.declare() +U_10.declare() +U_20.declare() +U_01.declare() +U_11.declare() +U_21.declare() # 6 -> 30 regs + +# all predications true +pg1.declare() +if PRECISION == 'double': + pg1.movestr('svptrue_b64()') +else: + pg1.movestr('svptrue_b32()') + +# tables +if PRECISION == 'double': + write(' svuint64_t table0; \\', target='I') # -> 31 regs +else: + write(' svuint32_t table0; \\', target='I') # -> 31 regs + +zero0.declare() + +# zero register +asmopen() +zero0.zero(zeroreg=True) +asmclose() +newline() + +define('Chimu_00 Chi_00', target='I') +define('Chimu_01 Chi_01', target='I') +define('Chimu_02 Chi_02', target='I') +define('Chimu_10 Chi_10', target='I') +define('Chimu_11 Chi_11', target='I') +define('Chimu_12 Chi_12', target='I') +if ALTERNATIVE_REGISTER_MAPPING == False: + define('Chimu_20 UChi_00', target='I') + define('Chimu_21 UChi_01', target='I') + define('Chimu_22 UChi_02', target='I') + define('Chimu_30 UChi_10', target='I') + define('Chimu_31 UChi_11', target='I') + define('Chimu_32 UChi_12', target='I') +else: # wilson4.h + define('Chimu_20 U_00', target='I') + define('Chimu_21 U_10', target='I') + define('Chimu_22 U_20', target='I') + define('Chimu_30 U_01', target='I') + define('Chimu_31 U_11', target='I') + define('Chimu_32 U_21', target='I') +newline() + + +d['cycles_RESULT'] += 12 +write('// RESULT') +definemultiline(F'RESULT_{PRECSUFFIX}(base)') +if ASM_STORE: + curlyopen() + #write(' SiteSpinor & ref(out[ss]); \\') + asmopen() + #pg1.loadpredication() + #store_base_ptr("&ref[0][0]") + #store_base_ptr(F"&ref[{STORE_BASE_PTR_COLOR_OFFSET}][0]") + store_base_ptr(F"base + {STORE_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='I') + store_base_ptr(F"base + {STORE_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='A') + result_00.store("ref[0][0]") + result_01.store("ref[0][1]") + result_02.store("ref[0][2]") + result_10.store("ref[1][0]") + result_11.store("ref[1][1]") + result_12.store("ref[1][2]") + result_20.store("ref[2][0]") + result_21.store("ref[2][1]") + result_22.store("ref[2][2]") + result_30.store("ref[3][0]") + result_31.store("ref[3][1]") + result_32.store("ref[3][2]") + asmclose() + debugall('RESULT', group='result') + curlyclose() +newline() + +# prefetch spinors from memory into L2 cache +d['factor'] = 0 +d['cycles_PREFETCH_L2'] += 0 * d['factor'] +write('// PREFETCH_CHIMU_L2 (prefetch to L2)') 
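The load, store and prefetch macros that follow all assume the same spinor layout, sketched here in plain Python (not generator code; vector_offset is an illustrative name, and base_offset stands for the *_BASE_PTR_COLOR_OFFSET defaults used by Register.load()/store() above):

# One SVE vector (64 B) holds one colour component of one spin, so a full
# spinor is 4 spins x 3 colours x 64 B = 768 B, i.e. three 256-byte A64FX
# cache lines -- hence the CHIMU prefetchers touch cache-line offsets 0, 1, 2
# (multiplied by 4 to convert to vector-length units).
def vector_offset(spin, colour, base_offset=0, colors=3):
    # mirrors Register.load()/store(): index = (spin - offset) * colors + colour
    index = (spin - base_offset) * colors + colour
    return index * 64   # byte offset from the fetch/store base pointer

assert vector_offset(1, 2) == 5 * 64   # "ref[1][2]" is the sixth vector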
+definemultiline(F'PREFETCH_CHIMU_L2_INTERNAL_{PRECSUFFIX}(base)') +curlyopen() +fetch_base_ptr(F"base") +asmopen() +#pg1.loadpredication() +#fetch_base_ptr(F"&ref[{FETCH_BASE_PTR_COLOR_OFFSET}][0]") +fetch_base_ptr(F"base", target='A') +prefetch_L2(F"base", 0) +prefetch_L2(F"base", 1) +prefetch_L2(F"base", 2) +asmclose() +curlyclose() +newline() + +# prefetch spinors from memory into L1 cache +d['factor'] = 0 +d['cycles_PREFETCH_L1'] += 0 * d['factor'] +write('// PREFETCH_CHIMU_L1 (prefetch to L1)') +definemultiline(F'PREFETCH_CHIMU_L1_INTERNAL_{PRECSUFFIX}(base)') +curlyopen() +fetch_base_ptr(F"base") +asmopen() +#pg1.loadpredication() +fetch_base_ptr(F"base", target='A') +prefetch_L1(F"base", 0) +prefetch_L1(F"base", 1) +prefetch_L1(F"base", 2) +asmclose() +curlyclose() +newline() + +# prefetch gauge from memory into L2 cache +d['factor'] = 0 +d['cycles_PREFETCH_L2'] += 0 * d['factor'] +write('// PREFETCH_GAUGE_L2 (prefetch to L2)') +definemultiline(F'PREFETCH_GAUGE_L2_INTERNAL_{PRECSUFFIX}(A)') +curlyopen() +if GRIDBENCH: # referencing differs in Grid and GridBench + write(' const auto & ref(U[sUn][A]); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \\') +else: + write(' const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \\') +asmopen() +#pg1.loadpredication() +#fetch_base_ptr(F"&ref[{FETCH_BASE_PTR_COLOR_OFFSET}][0]") +fetch_base_ptr(F"baseU", target='A') +prefetch_L2(F"baseU", -1) +prefetch_L2(F"baseU", 0) +prefetch_L2(F"baseU", 1) +prefetch_L2(F"baseU", 2) +prefetch_L2(F"baseU", 3) +prefetch_L2(F"baseU", 4) +prefetch_L2(F"baseU", 5) +prefetch_L2(F"baseU", 6) +prefetch_L2(F"baseU", 7) +#prefetch_L2(F"baseU", 8) +asmclose() +curlyclose() +newline() + +# prefetch gauge from memory into L1 cache +d['factor'] = 0 +d['cycles_PREFETCH_L1'] += 0 * d['factor'] +write('// PREFETCH_GAUGE_L1 (prefetch to L1)') +definemultiline(F'PREFETCH_GAUGE_L1_INTERNAL_{PRECSUFFIX}(A)') +curlyopen() +if GRIDBENCH: # referencing differs in Grid and GridBench + write(' const auto & ref(U[sU][A]); uint64_t baseU = (uint64_t)&ref; \\') +else: + write(' const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \\') +asmopen() +#pg1.loadpredication() +#fetch_base_ptr(F"&ref[{FETCH_BASE_PTR_COLOR_OFFSET}][0]") +fetch_base_ptr(F"baseU", target='A') +prefetch_L1(F"baseU", 0) +prefetch_L1(F"baseU", 1) +prefetch_L1(F"baseU", 2) +asmclose() +curlyclose() +newline() + +d['factor'] = 0 +write('// LOAD_CHI') +definemultiline(F'LOAD_CHI_{PRECSUFFIX}(base)') +if ASM_LOAD_CHIMU: + curlyopen() + #write(' const SiteSpinor & ref(in[offset]); \\') + asmopen() + #fetch_base_ptr(F"base + {FETCH_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='I') + #fetch_base_ptr(F"base + {FETCH_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='A') + fetch_base_ptr(F"base", target='I') + fetch_base_ptr(F"base", target='A') + + Chi_00.load("ref[0][0]", offset=0) + Chi_01.load("ref[0][1]", offset=0) + Chi_02.load("ref[0][2]", offset=0) + Chi_10.load("ref[1][0]", offset=0) + Chi_11.load("ref[1][1]", offset=0) + Chi_12.load("ref[1][2]", offset=0) + asmclose() + debugall('LOAD_CHI', group='Chi') + curlyclose() +newline() + + + +d['factor'] = 8 +# 12 loads = 12 issues, load latency = 8+1 cycles +# (not perfectly clear to me from docs) +d['cycles_LOAD_CHIMU'] += 11 * d['factor'] +write('// LOAD_CHIMU') +definemultiline(F'LOAD_CHIMU_INTERLEAVED_{PRECSUFFIX}(base)') +if ASM_LOAD_CHIMU: + curlyopen() + #write(' const SiteSpinor & ref(in[offset]); \\') + asmopen() + pg1.loadpredication() + #fetch_base_ptr("&ref[0][0]") + 
#fetch_base_ptr(F"&ref[{FETCH_BASE_PTR_COLOR_OFFSET}][0]") + fetch_base_ptr(F"base + {FETCH_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='I') + fetch_base_ptr(F"base + {FETCH_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='A') + # Chimu_00.load("ref[0][0]") + # Chimu_01.load("ref[0][1]") + # Chimu_02.load("ref[0][2]") + # Chimu_10.load("ref[1][0]") + # Chimu_11.load("ref[1][1]") + # Chimu_12.load("ref[1][2]") + # Chimu_20.load("ref[2][0]") + # Chimu_21.load("ref[2][1]") + # Chimu_22.load("ref[2][2]") + # Chimu_30.load("ref[3][0]") + # Chimu_31.load("ref[3][1]") + # Chimu_32.load("ref[3][2]") + + Chimu_00.load("ref[0][0]") # minimum penalty for all directions + Chimu_30.load("ref[3][0]") + Chimu_10.load("ref[1][0]") + Chimu_20.load("ref[2][0]") + + Chimu_01.load("ref[0][1]") + Chimu_31.load("ref[3][1]") + Chimu_11.load("ref[1][1]") + Chimu_21.load("ref[2][1]") + + Chimu_02.load("ref[0][2]") + Chimu_32.load("ref[3][2]") + Chimu_12.load("ref[1][2]") + Chimu_22.load("ref[2][2]") + asmclose() + debugall('LOAD_CHIMU', group='Chimu') + curlyclose() +newline() + +# alternative load chimu: dirac order 0213 +# placed into asm (...) +d['factor'] = 0 +d['cycles_LOAD_CHIMU'] += 11 * d['factor'] +write('// LOAD_CHIMU_0213') +definemultiline(F'LOAD_CHIMU_0213_{PRECSUFFIX}') +if ASM_LOAD_CHIMU: + curlyopen() + write(' const SiteSpinor & ref(in[offset]); \\') + asmopen() + pg1.loadpredication() + fetch_base_ptr(F"&ref[{FETCH_BASE_PTR_COLOR_OFFSET}][0]") + Chimu_00.load("ref[0][0]") # reordered + Chimu_20.load("ref[2][0]") + + Chimu_01.load("ref[0][1]") + Chimu_21.load("ref[2][1]") + + Chimu_02.load("ref[0][2]") + Chimu_22.load("ref[2][2]") + + Chimu_10.load("ref[1][0]") + Chimu_30.load("ref[3][0]") + + Chimu_11.load("ref[1][1]") + Chimu_31.load("ref[3][1]") + + Chimu_12.load("ref[1][2]") + Chimu_32.load("ref[3][2]") + asmclose() + debugall('LOAD_CHIMU_0213', group='Chimu') + curlyclose() +newline() + +# alternative load chimu: dirac order 0312 +# placed into asm (...) 
+d['factor'] = 0 +d['cycles_LOAD_CHIMU'] += 11 * d['factor'] +write('// LOAD_CHIMU_0312') +definemultiline(F'LOAD_CHIMU_0312_{PRECSUFFIX}') +if ASM_LOAD_CHIMU: + curlyopen() + write(' const SiteSpinor & ref(in[offset]); \\') + asmopen() + pg1.loadpredication() + fetch_base_ptr(F"&ref[{FETCH_BASE_PTR_COLOR_OFFSET}][0]") + Chimu_00.load("ref[0][0]") # reordered + Chimu_30.load("ref[3][0]") + + Chimu_01.load("ref[0][1]") + Chimu_31.load("ref[3][1]") + + Chimu_02.load("ref[0][2]") + Chimu_32.load("ref[3][2]") + + Chimu_10.load("ref[1][0]") + Chimu_20.load("ref[2][0]") + + Chimu_11.load("ref[1][1]") + Chimu_21.load("ref[2][1]") + + Chimu_12.load("ref[1][2]") + Chimu_22.load("ref[2][2]") + asmclose() + debugall('LOAD_CHIMU_0312', group='Chimu') + curlyclose() +newline() + +d['factor'] = 2 +d['cycles_PERM'] += 1 * d['factor'] +write('// LOAD_TABLE0') +definemultiline(F'LOAD_TABLE0') +asmopen() +table0.loadtable(0) +asmclose() +newline() + +d['factor'] = 2 +d['cycles_PERM'] += 1 * d['factor'] +write('// LOAD_TABLE1') +definemultiline(F'LOAD_TABLE1') +asmopen() +table0.loadtable(1) +asmclose() +newline() + +d['factor'] = 2 +d['cycles_PERM'] += 1 * d['factor'] +write('// LOAD_TABLE2') +definemultiline(F'LOAD_TABLE2') +asmopen() +table0.loadtable(2) +asmclose() +newline() + +d['factor'] = 0 +d['cycles_PERM'] += 1 * d['factor'] +write('// LOAD_TABLE3') +definemultiline(F'LOAD_TABLE3') +asmopen() +table0.loadtable(3) +asmclose() +newline() + +d['factor'] = 2 # factor is 2 +d['cycles_PERM'] += 6 * d['factor'] +write('// PERMUTE') +definemultiline(F'PERMUTE_{PRECSUFFIX}') +debugall('PERM PRE', group='Chi') +asmopen() +#table0.loadtable(2) +Chi_00.permute(2, table0) +Chi_01.permute(2, table0) +Chi_02.permute(2, table0) +Chi_10.permute(2, table0) +Chi_11.permute(2, table0) +Chi_12.permute(2, table0) +asmclose() +debugall('PERM POST', group='Chi') +newline() + +write('// LOAD_GAUGE') +definemultiline(F'LOAD_GAUGE') +if GRIDBENCH: # referencing differs in Grid and GridBench + write(' const auto & ref(U[sU][A]); uint64_t baseU = (uint64_t)&ref; \\') +else: + write(' const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \\') +curlyopen() +asmopen() +pg1.loadpredication() +fetch_base_ptr(F"baseU + {FETCH_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='I') +if ASM_LOAD_GAUGE: + fetch_base_ptr(F"baseU + {FETCH_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='A') + U_00.load("ref[0][0]") + U_10.load("ref[1][0]") + U_20.load("ref[2][0]") + U_01.load("ref[0][1]") + U_11.load("ref[1][1]") + U_21.load("ref[2][1]") +asmclose() +curlyclose() +newline() + +d['factor'] = 8 # MULT_2SPIN executes 1 time per direction = 8 times total +# assume all U loads are hidden +# FCMLA issue latency = 2 cycles +# measurement: latency = 16 cycles if FULLY pipelined !? 
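For reference, a scalar sketch (plain Python, fcmla_pair is an illustrative name; assumes the even/odd = real/imaginary lane layout used throughout) of what one rotation-0 plus rotation-90 FCMLA pair, as emitted by the mul*/mac* helpers above, accumulates:

def fcmla_pair(acc, a, b):
    # acc, a, b: (re, im) tuples standing in for one complex lane pair
    re, im = acc
    re += a[0] * b[0]   # rotation 0:  acc.re += a.re * b.re
    im += a[0] * b[1]   #              acc.im += a.re * b.im
    re -= a[1] * b[1]   # rotation 90: acc.re -= a.im * b.im
    im += a[1] * b[0]   #              acc.im += a.im * b.re
    return (re, im)     # == acc + a * b, one full complex multiply-accumulate

This is also why mul() and mac() above book two issues per call, while the split mul0/mul1 and mac0/mac1 variants book one each.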
+# spec says 6+6+9 cycles +# 6 rounds of FCMLA, each with 6 FCMLA -> 21 - 6*2 = 9 +d['cycles_MULT_2SPIN'] += 6 * 21 * d['factor'] +write('// MULT_2SPIN') +definemultiline(F'MULT_2SPIN_1_{PRECSUFFIX}(A)') +curlyopen() +#write(' const auto & ref(U[sU][A]); \\') +if GRIDBENCH: # referencing differs in Grid and GridBench + write(' const auto & ref(U[sU][A]); uint64_t baseU = (uint64_t)&ref; \\') +else: + write(' const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \\') +asmopen() +#pg1.loadpredication() +#fetch_base_ptr("&ref[0][0]") +fetch_base_ptr(F"baseU + {FETCH_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='I') +fetch_base_ptr(F"baseU + {FETCH_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='A') +#fetch_base_ptr(F"(uint64_t)&ref[{FETCH_BASE_PTR_COLOR_OFFSET}][0]", target='I') +#fetch_base_ptr(F"(uint64_t)&ref[{FETCH_BASE_PTR_COLOR_OFFSET}][0]", target='A') +#fetch_base_ptr(F"&ref[0][{FETCH_BASE_PTR_COLOR_OFFSET}]") +if ASM_LOAD_GAUGE: + U_00.load("ref[0][0]") + U_10.load("ref[1][0]") + U_20.load("ref[2][0]") + U_01.load("ref[0][1]") + U_11.load("ref[1][1]") + U_21.load("ref[2][1]") + +if MOVPRFX == False: + UChi_00.zero() # implementation specific + UChi_10.zero() + UChi_01.zero() + UChi_11.zero() + UChi_02.zero() + UChi_12.zero() + + # round 1 + UChi_00.mul0(U_00, Chi_00) # FCMLA latency is 6+6+9 cycles + UChi_10.mul0(U_00, Chi_10) + UChi_01.mul0(U_10, Chi_00) + UChi_11.mul0(U_10, Chi_10) + UChi_02.mul0(U_20, Chi_00) + UChi_12.mul0(U_20, Chi_10) +else: + # round 1 + UChi_00.mul0(zero0, U_00, Chi_00, constructive=True) # FCMLA latency is 6+6+9 cycles + UChi_10.mul0(zero0, U_00, Chi_10, constructive=True) + UChi_01.mul0(zero0, U_10, Chi_00, constructive=True) + UChi_11.mul0(zero0, U_10, Chi_10, constructive=True) + UChi_02.mul0(zero0, U_20, Chi_00, constructive=True) + UChi_12.mul0(zero0, U_20, Chi_10, constructive=True) + +# round 2 +UChi_00.mul1(U_00, Chi_00) +UChi_10.mul1(U_00, Chi_10) +UChi_01.mul1(U_10, Chi_00) +UChi_11.mul1(U_10, Chi_10) +UChi_02.mul1(U_20, Chi_00) +UChi_12.mul1(U_20, Chi_10) # Chi_00 and Chi_10 available from here + +if ASM_LOAD_GAUGE: + U_00.load("ref[0][2]") # U_00, U_10, U_20 overloaded + U_10.load("ref[1][2]") # early load + U_20.load("ref[2][2]") # A --> +asmclose() +debugall('MULT_2SPIN_1', group='UChi') +curlyclose() +newline() + +write('// MULT_2SPIN_BACKEND') +definemultiline(F'MULT_2SPIN_2_{PRECSUFFIX}') +curlyopen() +asmopen() +# round 3 +UChi_00.mac0(U_01, Chi_01) # armclang separates fcmla(..., 0) and +UChi_10.mac0(U_01, Chi_11) # fcmla(..., 90) +UChi_01.mac0(U_11, Chi_01) # autonomously using intrinsics +UChi_11.mac0(U_11, Chi_11) +UChi_02.mac0(U_21, Chi_01) +UChi_12.mac0(U_21, Chi_11) +# round 4 +UChi_00.mac1(U_01, Chi_01) +UChi_10.mac1(U_01, Chi_11) +UChi_01.mac1(U_11, Chi_01) +UChi_11.mac1(U_11, Chi_11) +UChi_02.mac1(U_21, Chi_01) +UChi_12.mac1(U_21, Chi_11) +# round 5 +UChi_00.mac0(U_00, Chi_02) # <-- A +UChi_10.mac0(U_00, Chi_12) +UChi_01.mac0(U_10, Chi_02) +UChi_11.mac0(U_10, Chi_12) +UChi_02.mac0(U_20, Chi_02) +UChi_12.mac0(U_20, Chi_12) +# round 6 +UChi_00.mac1(U_00, Chi_02) +UChi_10.mac1(U_00, Chi_12) +UChi_01.mac1(U_10, Chi_02) +UChi_11.mac1(U_10, Chi_12) +UChi_02.mac1(U_20, Chi_02) +UChi_12.mac1(U_20, Chi_12) +asmclose() +debugall('MULT_2SPIN_2', group='UChi') +curlyclose() +newline() + + +#// hspin(0)=fspin(0)+timesI(fspin(3)); +#// hspin(1)=fspin(1)+timesI(fspin(2)); +d['factor'] = 1 +# FCADD issue latency = 1, latency is 6+9 +d['cycles_PROJ'] += 15 * d['factor'] +write('// XP_PROJ') +definemultiline(F'XP_PROJ_{PRECSUFFIX}') +if 
ALTERNATIVE_LOADS == True: + write(' LOAD_CHIMU_0312_PLUG \\') +curlyopen() +asmopen() +#pg1.loadpredication() +Chi_00.addTimesI(Chimu_00, Chimu_30) +Chi_01.addTimesI(Chimu_01, Chimu_31) +Chi_02.addTimesI(Chimu_02, Chimu_32) +Chi_10.addTimesI(Chimu_10, Chimu_20) +Chi_11.addTimesI(Chimu_11, Chimu_21) +Chi_12.addTimesI(Chimu_12, Chimu_22) +asmclose() +debugall('XP_PROJ', group='Chi') +curlyclose() +newline() + +#// fspin(0)=hspin(0); +#// fspin(1)=hspin(1); +#// fspin(2)=timesMinusI(hspin(1)); +#// fspin(3)=timesMinusI(hspin(0)); +# does not occur in GridBench +d['factor'] = 0 +d['cycles_RECON'] += 15 * d['factor'] +write('// XP_RECON') +definemultiline(F'XP_RECON_{PRECSUFFIX}') +asmopen() +#pg1.loadpredication() +if MOVPRFX == False: + result_20.zero() + result_21.zero() + result_22.zero() + result_30.zero() + result_31.zero() + result_32.zero() + + result_20.subTimesI(UChi_10) + result_21.subTimesI(UChi_11) + result_22.subTimesI(UChi_12) + result_30.subTimesI(UChi_00) + result_31.subTimesI(UChi_01) + result_32.subTimesI(UChi_02) +else: + result_20.subTimesI(zero0, UChi_10, constructive=True) + result_21.subTimesI(zero0, UChi_11, constructive=True) + result_22.subTimesI(zero0, UChi_12, constructive=True) + result_30.subTimesI(zero0, UChi_00, constructive=True) + result_31.subTimesI(zero0, UChi_01, constructive=True) + result_32.subTimesI(zero0, UChi_02, constructive=True) + +result_00.move(UChi_00) # don't reorder ! +result_01.move(UChi_01) +result_02.move(UChi_02) +result_10.move(UChi_10) +result_11.move(UChi_11) +result_12.move(UChi_12) + +# result_00.add(UChi_00) # faster than move? +# result_01.add(UChi_01) +# result_02.add(UChi_02) +# result_10.add(UChi_10) +# result_11.add(UChi_11) +# result_12.add(UChi_12) +asmclose() +debugall('XP_RECON', group='result') +newline() + + +d['factor'] = 1 +# FCADD issue latency = 1, latency is 6+9 +d['cycles_RECON'] += 15 * d['factor'] +write('// XP_RECON_ACCUM') +definemultiline(F'XP_RECON_ACCUM_{PRECSUFFIX}') +asmopen() +#pg1.loadpredication() +# result_20.subTimesI(UChi_10) +# result_21.subTimesI(UChi_11) +# result_22.subTimesI(UChi_12) +# result_30.subTimesI(UChi_00) +# result_31.subTimesI(UChi_01) +# result_32.subTimesI(UChi_02) +# +# result_00.add(UChi_00) # reordered +# result_01.add(UChi_01) +# result_02.add(UChi_02) +# result_10.add(UChi_10) +# result_11.add(UChi_11) +# result_12.add(UChi_12) + +result_30.subTimesI(UChi_00) # reordered +result_00.add(UChi_00) + +result_31.subTimesI(UChi_01) +result_01.add(UChi_01) + +result_32.subTimesI(UChi_02) +result_02.add(UChi_02) + +result_20.subTimesI(UChi_10) +result_10.add(UChi_10) + +result_21.subTimesI(UChi_11) +result_11.add(UChi_11) + +result_22.subTimesI(UChi_12) +result_12.add(UChi_12) +asmclose() +debugall('XP_RECON_ACCUM', group='result') +newline() + +d['factor'] = 1 +# add/sub issue latency = 1, latency is 9 +d['cycles_PROJ'] += 9 * d['factor'] +write('// YP_PROJ') +definemultiline(F'YP_PROJ_{PRECSUFFIX}') +if ALTERNATIVE_LOADS == True: + write(' LOAD_CHIMU_0312_PLUG \\') +curlyopen() +asmopen() +#pg1.loadpredication() +Chi_00.sub(Chimu_00, Chimu_30) +Chi_01.sub(Chimu_01, Chimu_31) +Chi_02.sub(Chimu_02, Chimu_32) +Chi_10.add(Chimu_10, Chimu_20) +Chi_11.add(Chimu_11, Chimu_21) +Chi_12.add(Chimu_12, Chimu_22) +asmclose() +debugall('YP_PROJ', group='Chi') +curlyclose() +newline() + +d['factor'] = 1 +# FCADD issue latency = 1, latency is 6+9 +d['cycles_PROJ'] += 15 * d['factor'] +write('// ZP_PROJ') +definemultiline(F'ZP_PROJ_{PRECSUFFIX}') +if ALTERNATIVE_LOADS == True: + write(' 
LOAD_CHIMU_0213_PLUG \\') +curlyopen() +asmopen() +#pg1.loadpredication() +Chi_00.addTimesI(Chimu_00, Chimu_20) +Chi_01.addTimesI(Chimu_01, Chimu_21) +Chi_02.addTimesI(Chimu_02, Chimu_22) +Chi_10.subTimesI(Chimu_10, Chimu_30) +Chi_11.subTimesI(Chimu_11, Chimu_31) +Chi_12.subTimesI(Chimu_12, Chimu_32) +asmclose() +debugall('ZP_PROJ', group='Chi') +curlyclose() +newline() + +d['factor'] = 1 +# add/sub issue latency = 1, latency is 9 +d['cycles_PROJ'] += 9 * d['factor'] +write('// TP_PROJ') +definemultiline(F'TP_PROJ_{PRECSUFFIX}') +if ALTERNATIVE_LOADS == True: + write(' LOAD_CHIMU_0213_PLUG \\') +curlyopen() +asmopen() +#pg1.loadpredication() +Chi_00.add(Chimu_00, Chimu_20) +Chi_01.add(Chimu_01, Chimu_21) +Chi_02.add(Chimu_02, Chimu_22) +Chi_10.add(Chimu_10, Chimu_30) +Chi_11.add(Chimu_11, Chimu_31) +Chi_12.add(Chimu_12, Chimu_32) +asmclose() +debugall('TP_PROJ', group='Chi') +curlyclose() +newline() + +#// hspin(0)=fspin(0)-timesI(fspin(3)); +#// hspin(1)=fspin(1)-timesI(fspin(2)); + +d['factor'] = 1 +# FCADD issue latency = 1, latency is 6+9 +d['cycles_PROJ'] += 15 * d['factor'] +write('// XM_PROJ') +definemultiline(F'XM_PROJ_{PRECSUFFIX}') +if ALTERNATIVE_LOADS == True: + write(' LOAD_CHIMU_0312_PLUG \\') +curlyopen() +asmopen() +#pg1.loadpredication() +Chi_00.subTimesI(Chimu_00, Chimu_30) +Chi_01.subTimesI(Chimu_01, Chimu_31) +Chi_02.subTimesI(Chimu_02, Chimu_32) +Chi_10.subTimesI(Chimu_10, Chimu_20) +Chi_11.subTimesI(Chimu_11, Chimu_21) +Chi_12.subTimesI(Chimu_12, Chimu_22) +asmclose() +debugall('XM_PROJ sub', group='Chi') +curlyclose() +newline() + +d['factor'] = 1 +d['cycles_RECON'] += 15 * d['factor'] +write('// XM_RECON') +definemultiline(F'XM_RECON_{PRECSUFFIX}') +asmopen() +#pg1.loadpredication() + +# only necessary if not zeroed before +if MOVPRFX == False: + result_20.zero() + result_21.zero() + result_22.zero() + result_30.zero() + result_31.zero() + result_32.zero() + + result_20.addTimesI(UChi_10) # <-- + result_21.addTimesI(UChi_11) + result_22.addTimesI(UChi_12) + result_30.addTimesI(UChi_00) + result_31.addTimesI(UChi_01) + result_32.addTimesI(UChi_02) +else: + result_20.addTimesI(zero0, UChi_10, constructive=True) # <-- + result_21.addTimesI(zero0, UChi_11, constructive=True) + result_22.addTimesI(zero0, UChi_12, constructive=True) + result_30.addTimesI(zero0, UChi_00, constructive=True) + result_31.addTimesI(zero0, UChi_01, constructive=True) + result_32.addTimesI(zero0, UChi_02, constructive=True) + +result_00.move(UChi_00) +result_01.move(UChi_01) +result_02.move(UChi_02) +result_10.move(UChi_10) +result_11.move(UChi_11) +result_12.move(UChi_12) +asmclose() +debugall('XM_RECON result', group='result') +newline() + +d['factor'] = 1 +# add/sub issue latency = 1, latency is 9 +d['cycles_PROJ'] += 9 * d['factor'] +write('// YM_PROJ') +definemultiline(F'YM_PROJ_{PRECSUFFIX}') +if ALTERNATIVE_LOADS == True: + write(' LOAD_CHIMU_0312_PLUG \\') +curlyopen() +asmopen() +#pg1.loadpredication() +Chi_00.add(Chimu_00, Chimu_30) +Chi_01.add(Chimu_01, Chimu_31) +Chi_02.add(Chimu_02, Chimu_32) +Chi_10.sub(Chimu_10, Chimu_20) +Chi_11.sub(Chimu_11, Chimu_21) +Chi_12.sub(Chimu_12, Chimu_22) +asmclose() +debugall('YM_PROJ', group='Chi') +curlyclose() +newline() + +d['factor'] = 1 +# FCADD issue latency = 1, latency is 6+9 +d['cycles_PROJ'] += 15 * d['factor'] +write('// ZM_PROJ') +definemultiline(F'ZM_PROJ_{PRECSUFFIX}') +if ALTERNATIVE_LOADS == True: + write(' LOAD_CHIMU_0213_PLUG \\') +curlyopen() +asmopen() +#pg1.loadpredication() +Chi_00.subTimesI(Chimu_00, Chimu_20) 
+Chi_01.subTimesI(Chimu_01, Chimu_21) +Chi_02.subTimesI(Chimu_02, Chimu_22) +Chi_10.addTimesI(Chimu_10, Chimu_30) +Chi_11.addTimesI(Chimu_11, Chimu_31) +Chi_12.addTimesI(Chimu_12, Chimu_32) +asmclose() +debugall('ZM_PROJ', group='Chi') +curlyclose() +newline() + +d['factor'] = 1 +# add/sub issue latency = 1, latency is 9 +d['cycles_PROJ'] += 9 * d['factor'] +write('// TM_PROJ') +definemultiline(F'TM_PROJ_{PRECSUFFIX}') +if ALTERNATIVE_LOADS == True: + write(' LOAD_CHIMU_0213_PLUG \\') +curlyopen() +asmopen() +pg1.loadpredication() +Chi_00.sub(Chimu_00, Chimu_20) +Chi_01.sub(Chimu_01, Chimu_21) +Chi_02.sub(Chimu_02, Chimu_22) +Chi_10.sub(Chimu_10, Chimu_30) +Chi_11.sub(Chimu_11, Chimu_31) +Chi_12.sub(Chimu_12, Chimu_32) +asmclose() +debugall('TM_PROJ', group='Chi') +curlyclose() +newline() + +# does not occur in GridBench +d['factor'] = 0 +# add/sub issue latency = 1, latency is 9 +d['cycles_RECON'] += 15 * d['factor'] +write('// XM_RECON_ACCUM') +definemultiline(F'XM_RECON_ACCUM_{PRECSUFFIX}') +asmopen() +# result_20.addTimesI(UChi_10) +# result_21.addTimesI(UChi_11) +# result_22.addTimesI(UChi_12) +# result_30.addTimesI(UChi_00) +# result_31.addTimesI(UChi_01) +# result_32.addTimesI(UChi_02) +# +# # result_00.move(UChi_00) +# # result_01.move(UChi_01) +# # result_02.move(UChi_02) +# # result_10.move(UChi_10) +# # result_11.move(UChi_11) +# # result_12.move(UChi_12) +# +# # faster than move ? +# result_00.add(UChi_00) +# result_01.add(UChi_01) +# result_02.add(UChi_02) +# result_10.add(UChi_10) +# result_11.add(UChi_11) +# result_12.add(UChi_12) + +result_30.addTimesI(UChi_00) # reordered +result_31.addTimesI(UChi_01) +result_32.addTimesI(UChi_02) + +result_20.addTimesI(UChi_10) +result_21.addTimesI(UChi_11) +result_22.addTimesI(UChi_12) + +result_00.add(UChi_00) +result_01.add(UChi_01) +result_02.add(UChi_02) +result_10.add(UChi_10) +result_11.add(UChi_11) +result_12.add(UChi_12) +asmclose() +debugall('XM_RECON_ACCUM', group='result') +newline() + + + +d['factor'] = 1 +d['cycles_RECON'] += 9 * d['factor'] +write('// YP_RECON_ACCUM') +definemultiline(F'YP_RECON_ACCUM_{PRECSUFFIX}') +asmopen() +#pg1.loadpredication() +# result_00.add(UChi_00) +# result_01.add(UChi_01) +# result_02.add(UChi_02) +# result_10.add(UChi_10) +# result_11.add(UChi_11) +# result_12.add(UChi_12) +# result_20.add(UChi_10) +# result_21.add(UChi_11) +# result_22.add(UChi_12) +# result_30.sub(UChi_00) +# result_31.sub(UChi_01) +# result_32.sub(UChi_02) + +result_00.add(UChi_00) # reordered +result_30.sub(UChi_00) + +result_01.add(UChi_01) +result_31.sub(UChi_01) + +result_02.add(UChi_02) +result_32.sub(UChi_02) + +result_10.add(UChi_10) +result_20.add(UChi_10) + +result_11.add(UChi_11) +result_21.add(UChi_11) + +result_12.add(UChi_12) +result_22.add(UChi_12) +asmclose() +debugall('YP_RECON_ACCUM', group='result') +newline() + +d['factor'] = 1 +d['cycles_RECON'] += 9 * d['factor'] +write('// YM_RECON_ACCUM') +definemultiline(F'YM_RECON_ACCUM_{PRECSUFFIX}') +asmopen() +#pg1.loadpredication() +# result_00.add(UChi_00) +# result_01.add(UChi_01) +# result_02.add(UChi_02) +# result_10.add(UChi_10) +# result_11.add(UChi_11) +# result_12.add(UChi_12) +# result_20.sub(UChi_10) +# result_21.sub(UChi_11) +# result_22.sub(UChi_12) +# result_30.add(UChi_00) +# result_31.add(UChi_01) +# result_32.add(UChi_02) + +result_00.add(UChi_00) # reordered +result_30.add(UChi_00) + +result_01.add(UChi_01) +result_31.add(UChi_01) + +result_02.add(UChi_02) +result_32.add(UChi_02) + +result_10.add(UChi_10) +result_20.sub(UChi_10) + 
+result_11.add(UChi_11) +result_21.sub(UChi_11) + +result_12.add(UChi_12) +result_22.sub(UChi_12) +asmclose() +debugall('YM_RECON_ACCUM', group='result') +newline() + +d['factor'] = 1 +d['cycles_RECON'] += 15 * d['factor'] +write('// ZP_RECON_ACCUM') +definemultiline(F'ZP_RECON_ACCUM_{PRECSUFFIX}') +asmopen() +#pg1.loadpredication() +# result_20.subTimesI(UChi_00) +# result_21.subTimesI(UChi_01) +# result_22.subTimesI(UChi_02) +# result_30.addTimesI(UChi_10) +# result_31.addTimesI(UChi_11) +# result_32.addTimesI(UChi_12) +# +# result_00.add(UChi_00) +# result_01.add(UChi_01) +# result_02.add(UChi_02) +# result_10.add(UChi_10) +# result_11.add(UChi_11) +# result_12.add(UChi_12) +result_20.subTimesI(UChi_00) # reordered +result_00.add(UChi_00) + +result_21.subTimesI(UChi_01) +result_01.add(UChi_01) + +result_22.subTimesI(UChi_02) +result_02.add(UChi_02) + +result_30.addTimesI(UChi_10) +result_10.add(UChi_10) + +result_31.addTimesI(UChi_11) +result_11.add(UChi_11) + +result_32.addTimesI(UChi_12) +result_12.add(UChi_12) +asmclose() +debugall('ZP_RECON_ACCUM', group='result') +newline() + +d['factor'] = 1 +d['cycles_RECON'] += 15 * d['factor'] +write('// ZM_RECON_ACCUM') +definemultiline(F'ZM_RECON_ACCUM_{PRECSUFFIX}') +asmopen() +#pg1.loadpredication() +# result_20.addTimesI(UChi_00) +# result_21.addTimesI(UChi_01) +# result_22.addTimesI(UChi_02) +# result_30.subTimesI(UChi_10) +# result_31.subTimesI(UChi_11) +# result_32.subTimesI(UChi_12) +# +# result_00.add(UChi_00) +# result_01.add(UChi_01) +# result_02.add(UChi_02) +# result_10.add(UChi_10) +# result_11.add(UChi_11) +# result_12.add(UChi_12) +result_20.addTimesI(UChi_00) # reordered +result_00.add(UChi_00) + +result_21.addTimesI(UChi_01) +result_01.add(UChi_01) + +result_22.addTimesI(UChi_02) +result_02.add(UChi_02) + +result_30.subTimesI(UChi_10) +result_10.add(UChi_10) + +result_31.subTimesI(UChi_11) +result_11.add(UChi_11) + +result_32.subTimesI(UChi_12) +result_12.add(UChi_12) +asmclose() +debugall('ZM_RECON_ACCUM', group='result') +newline() + +d['factor'] = 1 +d['cycles_RECON'] += 9 * d['factor'] +write('// TP_RECON_ACCUM') +definemultiline(F'TP_RECON_ACCUM_{PRECSUFFIX}') +asmopen() +#pg1.loadpredication() +# result_00.add(UChi_00) +# result_01.add(UChi_01) +# result_02.add(UChi_02) +# result_10.add(UChi_10) +# result_11.add(UChi_11) +# result_12.add(UChi_12) +# result_20.add(UChi_00) +# result_21.add(UChi_01) +# result_22.add(UChi_02) +# result_30.add(UChi_10) +# result_31.add(UChi_11) +# result_32.add(UChi_12) + +result_00.add(UChi_00) # reordered +result_20.add(UChi_00) + +result_01.add(UChi_01) +result_21.add(UChi_01) + +result_02.add(UChi_02) +result_22.add(UChi_02) + +result_10.add(UChi_10) +result_30.add(UChi_10) + +result_11.add(UChi_11) +result_31.add(UChi_11) + +result_12.add(UChi_12) +result_32.add(UChi_12) +asmclose() +debugall('TP_RECON_ACCUM', group='result') +newline() + +d['factor'] = 1 +d['cycles_RECON'] += 9 * d['factor'] +write('// TM_RECON_ACCUM') +definemultiline(F'TM_RECON_ACCUM_{PRECSUFFIX}') +asmopen() +#pg1.loadpredication() +# result_00.add(UChi_00) +# result_01.add(UChi_01) +# result_02.add(UChi_02) +# result_10.add(UChi_10) +# result_11.add(UChi_11) +# result_12.add(UChi_12) +# result_20.sub(UChi_00) +# result_21.sub(UChi_01) +# result_22.sub(UChi_02) +# result_30.sub(UChi_10) +# result_31.sub(UChi_11) +# result_32.sub(UChi_12) + +result_00.add(UChi_00) # reordered +result_20.sub(UChi_00) + +result_01.add(UChi_01) +result_21.sub(UChi_01) + +result_02.add(UChi_02) +result_22.sub(UChi_02) + 
+result_10.add(UChi_10) +result_30.sub(UChi_10) + +result_11.add(UChi_11) +result_31.sub(UChi_11) + +result_12.add(UChi_12) +result_32.sub(UChi_12) +asmclose() +debugall('TM_RECON_ACCUM', group='result') +newline() + +d['factor'] = 0 +# have 12 instructions +# picking dual issue versions +d['cycles_ZERO_PSI'] += 6 * d['factor'] +write('// ZERO_PSI') +definemultiline(F'ZERO_PSI_{PRECSUFFIX}') +asmopen() +pg1.loadpredication() +result_00.zero() +result_01.zero() +result_02.zero() +result_10.zero() +result_11.zero() +result_12.zero() +result_20.zero() +result_21.zero() +result_22.zero() +result_30.zero() +result_31.zero() +result_32.zero() +asmclose() +#debugall('ZERO_PSI', group='result') +newline() + +# prefetch store spinors to L2 cache +d['factor'] = 0 +d['cycles_PREFETCH_L2'] += 0 * d['factor'] +write('// PREFETCH_RESULT_L2_STORE (prefetch store to L2)') +definemultiline(F'PREFETCH_RESULT_L2_STORE_INTERNAL_{PRECSUFFIX}(base)') +curlyopen() +fetch_base_ptr(F"base") +asmopen() +fetch_base_ptr(F"base", target='A') +prefetch_L2_store(F"base", 0) +prefetch_L2_store(F"base", 1) +prefetch_L2_store(F"base", 2) +asmclose() +curlyclose() +newline() + +# prefetch store spinors to L1 cache +d['factor'] = 0 +d['cycles_PREFETCH_L1'] += 0 * d['factor'] +write('// PREFETCH_RESULT_L1_STORE (prefetch store to L1)') +definemultiline(F'PREFETCH_RESULT_L1_STORE_INTERNAL_{PRECSUFFIX}(base)') +curlyopen() +fetch_base_ptr(F"base") +asmopen() +fetch_base_ptr(F"base", target='A') +prefetch_L1_store(F"base", 0) +prefetch_L1_store(F"base", 1) +prefetch_L1_store(F"base", 2) +asmclose() +curlyclose() +newline() + + +d['factor'] = 0 +write('// ADD_RESULT_INTERNAL') +definemultiline(F'ADD_RESULT_INTERNAL_{PRECSUFFIX}') +asmopen() +result_00.add(Chimu_00) +result_01.add(Chimu_01) +result_02.add(Chimu_02) +result_10.add(Chimu_10) +result_11.add(Chimu_11) +result_12.add(Chimu_12) +result_20.add(Chimu_20) +result_21.add(Chimu_21) +result_22.add(Chimu_22) +result_30.add(Chimu_30) +result_31.add(Chimu_31) +result_32.add(Chimu_32) +asmclose() +#debugall('ZERO_PSI', group='result') +newline() + +# -------------------------------------------------------------------------------- + +# C +f = open('w.h', 'w') +f.write(d['C']) +f.close() + +# intrin +f = open('wi.h', 'w') +f.write(d['I']) +f.close() + +filename = '' +if PRECISION == 'double': + filename = "Fujitsu_A64FX_intrin_double.h" +else: + filename = "Fujitsu_A64FX_intrin_single.h" +f = open(filename, 'w') +f.write(LEGAL.format(filename)) +f.write(d['I']) +f.close() + + +# asm +f = open('wa.h', 'w') +f.write(d['A']) +f.close() + +filename = '' +if PRECISION == 'double': + filename = "Fujitsu_A64FX_asm_double.h" +else: + filename = "Fujitsu_A64FX_asm_single.h" +f = open(filename, 'w') +f.write(LEGAL.format(filename)) +f.write(d['A']) +f.close() + + +# arithmetics instruction count, mul/mac = 2 instructions each +d['acount'] = d['add'] + d['sub'] + \ + d['mul'] + d['mac'] + d['addTimesI'] + d['subTimesI'] + +# permutations +d['permutes'] += 2*d['timesI'] + 1*d['timesMinusI'] +d['neg'] = 1*d['timesI'] + 1*d['timesMinusI'] + +# instruction count, mul/mac = 2 instructions each, +/- *i = 3 instructions each +d['icount'] = d['load'] + d['store'] + d['move'] + d['add'] + d['sub'] + \ + d['mul'] + d['mac'] + d['permutes'] + d['neg'] + \ + d['addTimesI'] + d['subTimesI'] + d['zero'] + d['movprfx'] + +# flops +d['flops'] = 4*d['mac'] + 3*d['mul'] + d['add'] + d['sub'] + \ + d['addTimesI'] + d['subTimesI'] + + + + + +print('Statistics') +print('') +print('Type Occurences Total / Arith 
instructions') +print('-------------------------------------------------------------------') +print('Variables {:4d}'.format(d['registers'])) +print('') +print('load {:4d}'.format(d['load'])) +print('store {:4d}'.format(d['store'])) +print('move {:4d}'.format(d['move'])) +print('movprfx {:4d}'.format(d['movprfx'])) +print('zero {:4d}'.format(d['zero'])) +print('negate {:4d}'.format(d['neg'])) + + +print('add {:4d} {:0.2f} / {:0.2f}'.\ + format(d['add'], d['add'] / d['icount'], d['add'] / d['acount'])) +print('sub {:4d} {:0.2f} / {:0.2f}'.\ + format(d['sub'], d['sub'] / d['icount'], d['sub'] / d['acount'])) +print('mul {:4d} {:0.2f} / {:0.2f}'.\ + format(d['mul'], 2*d['mul'] / d['icount'], 2*d['mul'] / d['acount'])) +print('mac {:4d} {:0.2f} / {:0.2f}'.\ + format(d['mac'], 2*d['mac'] / d['icount'], 2*d['mac'] / d['acount'])) +print('addTimesI {:4d} {:0.2f} / {:0.2f}'.\ + format(d['addTimesI'], 2*d['addTimesI'] / d['icount'], 2*d['addTimesI'] / d['acount'])) +print('subTimesI {:4d} {:0.2f} / {:0.2f}'.\ + format(d['subTimesI'], 2*d['subTimesI'] / d['icount'], 2*d['subTimesI'] / d['acount'])) + +print('timesI {:4d}'.format(d['timesI'])) +print('timesMinusI {:4d}'.format(d['timesMinusI'])) +print('permutes {:4d} {:0.2f}'.\ + format(d['permutes'], d['permutes'] / d['icount'])) +print('') +print('flops {:4d}'.format(d['flops'])) +print('instruction count {:4d}'.format(d['icount'])) +print('arith. instruction count {:4d} {:0.2f}'.\ + format(d['acount'], d['acount'] / d['icount'])) + + +# ---- static pipeline resources consumption ---- +FLA = 0 +FLA += 2 * d['mac'] + 2 * d['mul'] +FLA += 1 * d['addTimesI'] + 1 * d['subTimesI'] +FLA += 1 * d['move'] +FLA += 1 * d['permutes'] +FLA += 1 * d['store'] +FLA += 1 * d['zero'] + +FLB = 0 +FLB += 1 * d['addTimesI'] + 1 * d['subTimesI'] + +FLAB = 0 +FLAB += 1 * d['mac'] + 1 * d['mul'] +FLAB += 1 * d['add'] + 1 * d['sub'] +FLAB += 1 * d['neg'] + 1 * d['movprfx'] +#FLAB += 1 * d['zero'] + + +FL_slots = 2 * d['icount'] +FL_micro_ops = FLA + FLB + FLAB + +print('') +print('------------------------------------------------------------------') +print('') +print('Static FL slot usage') +print('') +print(' FLA {:4d}'.format(FLA)) +print(' FLB {:4d}'.format(FLB)) +print(' FLA/B {:4d}'.format(FLAB)) + +print('') +print('Static FL slot efficiency') +print('') +print(' Total FL slots {:4d}'.format(FL_slots)) +print(' FL slots occupied {:4d}'.format(FL_micro_ops)) +print(' FL slot efficiency {:0.2f}'.format(FL_micro_ops / FL_slots)) + +cycles_total = d['cycles_ZERO_PSI'] + d['cycles_LOAD_CHIMU'] + \ + d['cycles_PROJ'] + d['cycles_PERM'] + d['cycles_MULT_2SPIN'] + \ + d['cycles_RECON'] + d['cycles_RESULT'] +cycles_total_hidden = d['cycles_ZERO_PSI'] + \ + d['cycles_PROJ'] + d['cycles_MULT_2SPIN'] + \ + d['cycles_RECON'] + +# ---- dynamic estimate ---- + +print('') +print('Dynamic cycles estimate (incl. 
latencies)') +print('') +print(' ZERO_PSI {:4d}'.format(d['cycles_ZERO_PSI'])) +print(' LOAD_CHIMU {:4d}'.format(d['cycles_LOAD_CHIMU'])) +print(' PROJ {:4d}'.format(d['cycles_PROJ'])) +print(' PERM {:4d}'.format(d['cycles_PERM'])) +print(' MULT_2SPIN {:4d}'.format(d['cycles_MULT_2SPIN'])) +print(' RECON {:4d}'.format(d['cycles_RECON'])) +print(' STORE {:4d}'.format(d['cycles_RESULT'])) +print('') +print(' Sum {:4d}'.format(cycles_total)) +print('') +print(' Sum* {:4d}'.format(cycles_total_hidden)) +print(' Total FL slots* {:4d}'.format(cycles_total_hidden * 2)) +print(' FL slots occupied* {:4d}'.format(FL_micro_ops)) +print(' FL slot efficiency* {:0.2f}'.format(FL_micro_ops / (2*cycles_total_hidden))) +print('') +print(' *load/store/PERM hidden') + +estimated_cycles = cycles_total_hidden +# Estimate percent peak DP; dual issue, fma +pp = 100 * 4 * d['flops'] / (2*2*8*estimated_cycles) +print('') +print('Model prediction') +print('') +print(' Cycles* {:4d}'.format(estimated_cycles)) +print(' Percent peak* {:4.1f} %'.format(pp)) + +# estimated RF throughput in GB/s @ 2.2 GHz +tp10 = (d['load'] + d['store']) * 64 * 2.2 / estimated_cycles +tp2 = (d['load'] + d['store']) * 64 * 1000.**3 * 2.2 / 1024.**3 / estimated_cycles +print('') +print(' Estimated RF throughput* {:4.1f} GB/s'.\ + format(tp10)) +print(' Estimated RF throughput* {:4.1f} GiB/s'.\ + format(tp2)) + +# ---- dynamic pipeline resources consumption ---- + +runtime = measured_cycles # runtime in cycles +pp_runtime = 100 * 4 * d['flops'] / (2*2*8*runtime) +runtime_FL_slots = 2 * runtime +delta = runtime - estimated_cycles + + +print('') +print('------------------------------------------------------------------') +print('') +print('Dynamic runtime analysis (cycles from measurements)') +print('') +print(' Cycles {:4d}'.format(runtime)) +print(' Percent peak {:4.1f} %'.format(pp_runtime)) +print(' Deviation from estimate {:4d} {:4.2f} %'.\ + format(delta, 100. * abs(delta/runtime))) +print(' Deviation per direction {:4.1f}'.format(delta/8)) + +# estimated RF throughput in GB/s @ 2.2 GHz +tp10_rt = (d['load'] + d['store']) * 64 * 2.2 / runtime +tp2_rt = (d['load'] + d['store']) * 64 * 1000.**3 * 2.2 / 1024.**3 / runtime +print('') +print(' RF throughput {:4.1f} GB/s'.\ + format(tp10_rt)) +print(' RF throughput {:4.1f} GiB/s'.\ + format(tp2_rt)) +print('') +print(' Total FL slots {:4d}'.format(runtime_FL_slots)) +print(' FL slots occupied {:4d}'.format(FL_micro_ops)) +print(' FL slot efficiency {:0.2f}'.format(FL_micro_ops / runtime_FL_slots)) +print('') diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index 3b9ae08e..1e198972 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -1,6 +1,6 @@ /************************************************************************************* - Grid physics library, www.github.com/paboyle/Grid + Grid physics library, www.github.com/paboyle/Grid Source file: ./lib/Stencil.h @@ -41,13 +41,13 @@ // Stencil based code will exchange haloes and use a table lookup for neighbours. // This will be done with generality to allow easier efficient implementations. // Overlap of comms and compute is enabled by tabulating off-node connected, -// +// // Generic services // 0) Prebuild neighbour tables // 1) Compute sizes of all haloes/comms buffers; allocate them. // 2) Gather all faces, and communicate. 
// 3) Loop over result sites, giving nbr index/offnode info for each -// +// ////////////////////////////////////////////////////////////////////////////////////////// NAMESPACE_BEGIN(Grid); @@ -59,10 +59,10 @@ NAMESPACE_BEGIN(Grid); void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask, int off,Vector > & table); -template +template void Gather_plane_simple_table (Vector >& table,const Lattice &rhs,cobj *buffer,compressor &compress, int off,int so) __attribute__((noinline)); -template +template void Gather_plane_simple_table (Vector >& table,const Lattice &rhs,cobj *buffer,compressor &compress, int off,int so) { int num=table.size(); @@ -94,13 +94,13 @@ void Gather_plane_exchange_table(Vector >& table,const Lattic { assert( (table.size()&0x1)==0); int num=table.size()/2; - int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane + int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane auto rhs_v = rhs.View(AcceleratorRead); auto p0=&pointers[0][0]; auto p1=&pointers[1][0]; auto tp=&table[0]; - accelerator_forNB(j, num, 1, { + accelerator_forNB(j, num, 1, { compress.CompressExchange(p0,p1, &rhs_v[0], j, so+tp[2*j ].second, so+tp[2*j+1].second, @@ -109,20 +109,20 @@ void Gather_plane_exchange_table(Vector >& table,const Lattic rhs_v.ViewClose(); } -struct StencilEntry { +struct StencilEntry { #ifdef GRID_CUDA - uint64_t _byte_offset; // 8 bytes - uint32_t _offset; // 4 bytes + uint64_t _byte_offset; // 8 bytes + uint32_t _offset; // 4 bytes #else - uint64_t _byte_offset; // 8 bytes + uint64_t _byte_offset; // 8 bytes uint64_t _offset; // 8 bytes (8 ever required?) #endif - uint8_t _is_local; // 1 bytes + uint8_t _is_local; // 1 bytes uint8_t _permute; // 1 bytes uint8_t _around_the_world; // 1 bytes uint8_t _pad; // 1 bytes }; -// Could pack to 8 + 4 + 4 = 128 bit and use +// Could pack to 8 + 4 + 4 = 128 bit and use template class CartesianStencilAccelerator { @@ -149,18 +149,18 @@ class CartesianStencilAccelerator { accelerator_inline cobj *CommBuf(void) { return u_recv_buf_p; } - accelerator_inline int GetNodeLocal(int osite,int point) { + accelerator_inline int GetNodeLocal(int osite,int point) { return this->_entries_p[point+this->_npoints*osite]._is_local; } - accelerator_inline StencilEntry * GetEntry(int &ptype,int point,int osite) { - ptype = this->_permute_type[point]; return & this->_entries_p[point+this->_npoints*osite]; + accelerator_inline StencilEntry * GetEntry(int &ptype,int point,int osite) { + ptype = this->_permute_type[point]; return & this->_entries_p[point+this->_npoints*osite]; } accelerator_inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) { uint64_t cbase = (uint64_t)&u_recv_buf_p[0]; local = this->_entries_p[ent]._is_local; perm = this->_entries_p[ent]._permute; - if (perm) ptype = this->_permute_type[point]; + if (perm) ptype = this->_permute_type[point]; if (local) { return base + this->_entries_p[ent]._byte_offset; } else { @@ -175,14 +175,14 @@ class CartesianStencilAccelerator { else return cbase + this->_entries_p[ent]._byte_offset; } - accelerator_inline void iCoorFromIindex(Coordinate &coor,int lane) + accelerator_inline void iCoorFromIindex(Coordinate &coor,int lane) { Lexicographic::CoorFromIndex(coor,lane,this->_simd_layout); } }; template -class CartesianStencilView : public CartesianStencilAccelerator +class CartesianStencilView : public CartesianStencilAccelerator { private: int *closed; @@ -192,7 +192,7 @@ class 
CartesianStencilView : public CartesianStencilAccelerator &refer_to_me,ViewMode _mode) + CartesianStencilView (const CartesianStencilAccelerator &refer_to_me,ViewMode _mode) : CartesianStencilAccelerator(refer_to_me), cpu_ptr(this->_entries_p), mode(_mode) @@ -201,14 +201,14 @@ class CartesianStencilView : public CartesianStencilAccelerator_entries_p, this->_npoints*this->_osites*sizeof(StencilEntry), mode, - AdviseDefault); + AdviseDefault); } - + void ViewClose(void) { - MemoryManager::ViewClose(this->cpu_ptr,this->mode); + MemoryManager::ViewClose(this->cpu_ptr,this->mode); } - + }; //////////////////////////////////////// @@ -245,12 +245,12 @@ public: cobj * mpi_p; Integer buffer_size; }; - + protected: GridBase * _grid; -public: +public: GridBase *Grid(void) const { return _grid; } //////////////////////////////////////////////////////////////////////// @@ -264,7 +264,7 @@ public: View_type accessor(*( (View_type *) this),mode); return accessor; } - + int face_table_computed; std::vector > > face_table ; Vector surface_list; @@ -314,7 +314,7 @@ public: //////////////////////////////////////// // Stencil query //////////////////////////////////////// - inline int SameNode(int point) { + inline int SameNode(int point) { int dimension = this->_directions[point]; int displacement = this->_distances[point]; @@ -338,7 +338,7 @@ public: // FIXME this logic needs to be sorted for three link term // assert( (displacement==1) || (displacement==-1)); // Present hack only works for >= 4^4 subvol per node - _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); + _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); void *shm = (void *) _grid->ShmBufferTranslate(recv_from_rank,this->u_recv_buf_p); @@ -378,7 +378,7 @@ public: comm_time_thr[mythread] += comm_leave_thr[mythread] - comm_enter_thr[mythread]; } } - + void CollateThreads(void) { int nthreads = CartesianCommunicator::nCommThreads; @@ -402,7 +402,7 @@ public: if ( (t0 > 0.0) && ( t0 < first ) ) first = t0; // min time seen if ( t1 > last ) last = t1; // max time seen - + } commtime+= last-first; } @@ -464,30 +464,30 @@ public: this->CommunicateBegin(reqs); this->CommunicateComplete(reqs); } - } - - template void HaloExchange(const Lattice &source,compressor &compress) + } + + template void HaloExchange(const Lattice &source,compressor &compress) { Prepare(); HaloGather(source,compress); Communicate(); - CommsMergeSHM(compress); - CommsMerge(compress); + CommsMergeSHM(compress); + CommsMerge(compress); } - + template int HaloGatherDir(const Lattice &source,compressor &compress,int point,int & face_idx) { int dimension = this->_directions[point]; int displacement = this->_distances[point]; - + int fd = _grid->_fdimensions[dimension]; int rd = _grid->_rdimensions[dimension]; - + // Map to always positive shift modulo global full dimension. 
int shift = (displacement+fd)%fd; assert (source.Checkerboard()== this->_checkerboard); - + // the permute type int simd_layout = _grid->_simd_layout[dimension]; int comm_dim = _grid->_processors[dimension] >1 ; @@ -505,7 +505,7 @@ public: auto tmp = GatherSimd(source,dimension,shift,0x3,compress,face_idx); is_same_node = is_same_node && tmp; splicetime+=usecond(); - } else { + } else { nosplicetime-=usecond(); auto tmp = Gather(source,dimension,shift,0x3,compress,face_idx); is_same_node = is_same_node && tmp; @@ -531,7 +531,7 @@ public: } return is_same_node; } - + template void HaloGather(const Lattice &source,compressor &compress) { @@ -542,9 +542,9 @@ public: // conformable(source.Grid(),_grid); assert(source.Grid()==_grid); halogtime-=usecond(); - + u_comm_offset=0; - + // Gather all comms buffers int face_idx=0; for(int point = 0 ; point < this->_npoints; point++) { @@ -557,16 +557,16 @@ public: accelerator_barrier(); halogtime+=usecond(); } - + ///////////////////////// // Implementation ///////////////////////// void Prepare(void) { - Decompressions.resize(0); - DecompressionsSHM.resize(0); - Mergers.resize(0); - MergersSHM.resize(0); + Decompressions.resize(0); + DecompressionsSHM.resize(0); + Mergers.resize(0); + MergersSHM.resize(0); Packets.resize(0); calls++; } @@ -595,22 +595,22 @@ public: mv.push_back(m); } template void CommsMerge(decompressor decompress) { - CommsMerge(decompress,Mergers,Decompressions); + CommsMerge(decompress,Mergers,Decompressions); } template void CommsMergeSHM(decompressor decompress) { - mpi3synctime-=usecond(); + mpi3synctime-=usecond(); _grid->StencilBarrier();// Synch shared memory on a single nodes - mpi3synctime+=usecond(); - shmmergetime-=usecond(); + mpi3synctime+=usecond(); + shmmergetime-=usecond(); CommsMerge(decompress,MergersSHM,DecompressionsSHM); - shmmergetime+=usecond(); + shmmergetime+=usecond(); } template - void CommsMerge(decompressor decompress,std::vector &mm,std::vector &dd) { + void CommsMerge(decompressor decompress,std::vector &mm,std::vector &dd) { mergetime-=usecond(); - for(int i=0;i_npoints;point++){ this->same_node[point] = this->SameNode(point); } - + for(int site = 0 ;site< vol4;site++){ int local = 1; for(int point=0;point_npoints;point++){ - if( (!this->GetNodeLocal(site*Ls,point)) && (!this->same_node[point]) ){ + if( (!this->GetNodeLocal(site*Ls,point)) && (!this->same_node[point]) ){ local = 0; } } - if(local == 0) { + if(local == 0) { surface_list.push_back(site); } } @@ -672,11 +672,11 @@ public: int checkerboard, const std::vector &directions, const std::vector &distances, - Parameters p) - : shm_bytes_thr(npoints), - comm_bytes_thr(npoints), + Parameters p) + : shm_bytes_thr(npoints), + comm_bytes_thr(npoints), comm_enter_thr(npoints), - comm_leave_thr(npoints), + comm_leave_thr(npoints), comm_time_thr(npoints) { face_table_computed=0; @@ -687,7 +687,7 @@ public: ///////////////////////////////////// this->_npoints = npoints; this->_comm_buf_size.resize(npoints), - this->_permute_type.resize(npoints), + this->_permute_type.resize(npoints), this->_simd_layout = _grid->_simd_layout; // copy simd_layout to give access to Accelerator Kernels this->_directions = StencilVector(directions); this->_distances = StencilVector(distances); @@ -697,24 +697,24 @@ public: surface_list.resize(0); this->_osites = _grid->oSites(); - + _entries.resize(this->_npoints* this->_osites); this->_entries_p = &_entries[0]; for(int ii=0;ii_fdimensions[dimension]; int rd = _grid->_rdimensions[dimension]; 
this->_permute_type[point]=_grid->PermuteType(dimension); - + this->_checkerboard = checkerboard; - + ////////////////////////// // the permute type ////////////////////////// @@ -724,25 +724,25 @@ public: int rotate_dim = _grid->_simd_layout[dimension]>2; assert ( (rotate_dim && comm_dim) == false) ; // Do not think spread out is supported - + int sshift[2]; - + ////////////////////////// // Underlying approach. For each local site build - // up a table containing the npoint "neighbours" and whether they + // up a table containing the npoint "neighbours" and whether they // live in lattice or a comms buffer. ////////////////////////// if ( !comm_dim ) { sshift[0] = _grid->CheckerBoardShiftForCB(this->_checkerboard,dimension,shift,Even); sshift[1] = _grid->CheckerBoardShiftForCB(this->_checkerboard,dimension,shift,Odd); - + if ( sshift[0] == sshift[1] ) { Local(point,dimension,shift,0x3); } else { Local(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes Local(point,dimension,shift,0x2);// both with block stride loop iteration } - } else { + } else { // All permute extract done in comms phase prior to Stencil application // So tables are the same whether comm_dim or splice_dim sshift[0] = _grid->CheckerBoardShiftForCB(this->_checkerboard,dimension,shift,Even); @@ -784,23 +784,23 @@ public: int ld = _grid->_ldimensions[dimension]; int gd = _grid->_gdimensions[dimension]; int ly = _grid->_simd_layout[dimension]; - + // Map to always positive shift modulo global full dimension. int shift = (shiftpm+fd)%fd; // the permute type int permute_dim =_grid->PermuteDim(dimension); - - for(int x=0;x_ostride[dimension]; - + int cb= (cbmask==0x2)? Odd : Even; - + int sshift = _grid->CheckerBoardShiftForCB(this->_checkerboard,dimension,shift,cb); int sx = (x+sshift)%rd; - + int wraparound=0; if ( (shiftpm==-1) && (sx>x) ) { wraparound = 1; @@ -808,7 +808,7 @@ public: if ( (shiftpm== 1) && (sxNsimd(); - + int fd = _grid->_fdimensions[dimension]; int ld = _grid->_ldimensions[dimension]; int rd = _grid->_rdimensions[dimension]; int pd = _grid->_processors[dimension]; int simd_layout = _grid->_simd_layout[dimension]; int comm_dim = _grid->_processors[dimension] >1 ; - + assert(comm_dim==1); int shift = (shiftpm + fd) %fd; assert(shift>=0); assert(shift_slice_nblock[dimension]*_grid->_slice_block[dimension]; + int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension]; this->_comm_buf_size[point] = buffer_size; // Size of _one_ plane. Multiple planes may be gathered and // send to one or more remote nodes. - + int cb= (cbmask==0x2)? Odd : Even; int sshift= _grid->CheckerBoardShiftForCB(this->_checkerboard,dimension,shift,cb); - - for(int x=0;xPermuteType(dimension); - + int sx = (x+sshift)%rd; - + int offnode = 0; if ( simd_layout > 1 ) { - + for(int i=0;i>(permute_type+1)); int ic= (i&inner_bit)? 
1:0; int my_coor = rd*ic + x; int nbr_coor = my_coor+sshift; int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors - - if ( nbr_proc ) { + + if ( nbr_proc ) { offnode =1; } } - - } else { + + } else { int comm_proc = ((x+sshift)/rd)%pd; offnode = (comm_proc!= 0); } - + int wraparound=0; if ( (shiftpm==-1) && (sx>x) && (grid->_processor_coor[dimension]==0) ) { wraparound = 1; @@ -884,24 +884,24 @@ public: wraparound = 1; } if (!offnode) { - + int permute_slice=0; - CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound); - + CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound); + } else { int words = buffer_size; if (cbmask != 0x3) words=words>>1; - + // int rank = grid->_processor; // int recv_from_rank; // int xmit_to_rank; - + int unified_buffer_offset = _unified_buffer_size; _unified_buffer_size += words; - + ScatterPlane(point,dimension,x,cbmask,unified_buffer_offset,wraparound); // permute/extract/merge is done in comms phase - + } } } @@ -909,13 +909,13 @@ public: void CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute,int wrap) { int rd = _grid->_rdimensions[dimension]; - + if ( !_grid->CheckerBoarded(dimension) ) { - + int o = 0; // relative offset to base within plane - int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane + int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane int lo = lplane*_grid->_ostride[dimension]; // offset in buffer - + // Simple block stride gather of SIMD objects for(int n=0;n<_grid->_slice_nblock[dimension];n++){ for(int b=0;b<_grid->_slice_block[dimension];b++){ @@ -927,18 +927,18 @@ public: } o +=_grid->_slice_stride[dimension]; } - + } else { - - int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane - int lo = lplane*_grid->_ostride[dimension]; // base offset for start of plane + + int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane + int lo = lplane*_grid->_ostride[dimension]; // base offset for start of plane int o = 0; // relative offset to base within plane - + for(int n=0;n<_grid->_slice_nblock[dimension];n++){ for(int b=0;b<_grid->_slice_block[dimension];b++){ - + int ocb=1<<_grid->CheckerBoardFromOindex(o+b); - + if ( ocb&cbmask ) { int idx = point+(lo+o+b)*this->_npoints; _entries[idx]._offset =ro+o+b; @@ -946,24 +946,24 @@ public: _entries[idx]._permute=permute; _entries[idx]._around_the_world=wrap; } - + } o +=_grid->_slice_stride[dimension]; } - + } } // Routine builds up integer table for each site in _offsets, _is_local, _permute void ScatterPlane (int point,int dimension,int plane,int cbmask,int offset, int wrap) { int rd = _grid->_rdimensions[dimension]; - + if ( !_grid->CheckerBoarded(dimension) ) { - - int so = plane*_grid->_ostride[dimension]; // base offset for start of plane + + int so = plane*_grid->_ostride[dimension]; // base offset for start of plane int o = 0; // relative offset to base within plane int bo = 0; // offset in buffer - + // Simple block stride gather of SIMD objects for(int n=0;n<_grid->_slice_nblock[dimension];n++){ for(int b=0;b<_grid->_slice_block[dimension];b++){ @@ -975,16 +975,16 @@ public: } o +=_grid->_slice_stride[dimension]; } - - } else { - - int so = plane*_grid->_ostride[dimension]; // base offset for start of plane + + } else { + + int so = plane*_grid->_ostride[dimension]; // base offset for start of plane int o = 0; // relative offset to base within plane int bo = 0; // offset in buffer - + for(int 
n=0;n<_grid->_slice_nblock[dimension];n++){ for(int b=0;b<_grid->_slice_block[dimension];b++){ - + int ocb=1<<_grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup if ( ocb & cbmask ) { int idx = point+(so+o+b)*this->_npoints; @@ -998,16 +998,16 @@ public: } } } - + template int Gather(const Lattice &rhs,int dimension,int shift,int cbmask,compressor & compress,int &face_idx) { typedef typename cobj::vector_type vector_type; typedef typename cobj::scalar_type scalar_type; - + assert(rhs.Grid()==_grid); // conformable(_grid,rhs.Grid()); - + int fd = _grid->_fdimensions[dimension]; int rd = _grid->_rdimensions[dimension]; int pd = _grid->_processors[dimension]; @@ -1019,37 +1019,37 @@ public: assert(shift_slice_nblock[dimension]*_grid->_slice_block[dimension]; - + int cb= (cbmask==0x2)? Odd : Even; int sshift= _grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); - + int shm_receive_only = 1; - for(int x=0;x>1; - + int bytes = words * compress.CommDatumSize(); - - int so = sx*rhs.Grid()->_ostride[dimension]; // base offset for start of plane + + int so = sx*rhs.Grid()->_ostride[dimension]; // base offset for start of plane if ( !face_table_computed ) { face_table.resize(face_idx+1); Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset,face_table[face_idx]); } - + // int rank = _grid->_processor; int recv_from_rank; int xmit_to_rank; _grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); - + assert (xmit_to_rank != _grid->ThisRank()); assert (recv_from_rank != _grid->ThisRank()); - + ///////////////////////////////////////////////////////// // try the direct copy if possible ///////////////////////////////////////////////////////// @@ -1062,13 +1062,13 @@ public: } send_buf = (cobj *)_grid->ShmBufferTranslate(xmit_to_rank,recv_buf); - if ( send_buf==NULL ) { + if ( send_buf==NULL ) { send_buf = this->u_send_buf_p; - } - + } + // Find out if we get the direct copy. void *success = (void *) _grid->ShmBufferTranslate(recv_from_rank,this->u_send_buf_p); - if (success==NULL) { + if (success==NULL) { // we found a packet that comes from MPI and contributes to this leg of stencil shm_receive_only = 0; } @@ -1077,9 +1077,9 @@ public: assert(send_buf!=NULL); Gather_plane_simple_table(face_table[face_idx],rhs,send_buf,compress,u_comm_offset,so); face_idx++; gathertime+=usecond(); - + if ( compress.DecompressionStep() ) { - + if ( shm_receive_only ) { // Early decompress before MPI is finished is possible AddDecompress(&this->u_recv_buf_p[u_comm_offset], &recv_buf[u_comm_offset], @@ -1108,7 +1108,7 @@ public: } return shm_receive_only; } - + template int GatherSimd(const Lattice &rhs,int dimension,int shift,int cbmask,compressor &compress,int & face_idx) { @@ -1136,7 +1136,7 @@ public: /////////////////////////////////////////////// int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension]; // int words = sizeof(cobj)/sizeof(vector_type); - + assert(cbmask==0x3); // Fixme think there is a latent bug if not true // This assert will trap it if ever hit. Not hit normally so far int reduced_buffer_size = buffer_size; @@ -1152,22 +1152,22 @@ public: /////////////////////////////////////////// // Work out what to send where /////////////////////////////////////////// - + int cb = (cbmask==0x2)? 
Odd : Even; int sshift= _grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); - + // loop over outer coord planes orthog to dim int shm_receive_only = 1; - for(int x=0;x= rd ); if ( any_offnode ) { - - for(int i=0;iShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); - + + _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); + // shm == receive pointer if offnode // shm == Translate[send pointer] if on node -- my view of his send pointer cobj *shm = (cobj *) _grid->ShmBufferTranslate(recv_from_rank,sp); - if (shm==NULL) { + if (shm==NULL) { shm = rp; // we found a packet that comes from MPI and contributes to this shift. // is_same_node is only used in the WilsonStencil, and gets set for this point in the stencil. @@ -1222,15 +1222,15 @@ public: AddPacket((void *)sp,(void *)rp,xmit_to_rank,recv_from_rank,bytes); - - } else { - + + } else { + rpointers[i] = sp; - + } } - if ( shm_receive_only ) { + if ( shm_receive_only ) { AddMerge(&this->u_recv_buf_p[u_comm_offset],rpointers,reduced_buffer_size,permute_type,MergersSHM); } else { AddMerge(&this->u_recv_buf_p[u_comm_offset],rpointers,reduced_buffer_size,permute_type,Mergers); @@ -1265,9 +1265,9 @@ public: shm_bytes = 0.; calls = 0.; }; - + void Report(void) { -#define AVERAGE(A) +#define AVERAGE(A) #define PRINTIT(A) AVERAGE(A); std::cout << GridLogMessage << " Stencil " << #A << " "<< A/calls<_Nprocessors; RealD NN = _grid->NodeCount(); @@ -1284,7 +1284,7 @@ public: } } if (threaded) commtime += t; - + _grid->GlobalSum(commtime); commtime/=NP; if ( calls > 0. ) { std::cout << GridLogMessage << " Stencil calls "< arg) -> iScalar { iScalar ret; - ret._internal=Zero(); + zeroit(ret); for(int i=0;i struct GridTypeMapper > : public GridTypeMapper_Base { + typedef std::complex scalar_type; + typedef std::complex scalar_typeD; + typedef scalar_type vector_type; + typedef scalar_typeD vector_typeD; + typedef scalar_type tensor_reduced; + typedef scalar_type scalar_object; + typedef scalar_typeD scalar_objectD; + typedef scalar_type Complexified; + typedef RealF Realified; + typedef scalar_typeD DoublePrecision; + typedef scalar_typeD DoublePrecision2; + }; + template<> struct GridTypeMapper > : public GridTypeMapper_Base { + typedef std::complex scalar_type; + typedef std::complex scalar_typeD; + typedef scalar_type vector_type; + typedef scalar_typeD vector_typeD; + typedef scalar_type tensor_reduced; + typedef scalar_type scalar_object; + typedef scalar_typeD scalar_objectD; + typedef scalar_type Complexified; + typedef RealD Realified; + typedef scalar_typeD DoublePrecision; + typedef scalar_typeD DoublePrecision2; + }; +#endif + template<> struct GridTypeMapper : public GridTypeMapper_Base { typedef ComplexD2 scalar_type; typedef ComplexD2 scalar_typeD; diff --git a/Grid/threads/Accelerator.cc b/Grid/threads/Accelerator.cc index 2c4ad9df..2134d158 100644 --- a/Grid/threads/Accelerator.cc +++ b/Grid/threads/Accelerator.cc @@ -16,40 +16,54 @@ void acceleratorInit(void) char * localRankStr = NULL; int rank = 0, world_rank=0; #define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK" -#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK" #define ENV_RANK_OMPI "OMPI_COMM_WORLD_RANK" +#define ENV_LOCAL_RANK_SLURM "SLURM_LOCALID" +#define ENV_RANK_SLURM "SLURM_PROCID" +#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK" #define ENV_RANK_MVAPICH "MV2_COMM_WORLD_RANK" // We extract the local rank initialization using an environment variable - if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL) - 
{ + if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL) { + printf("OPENMPI detected\n"); rank = atoi(localRankStr); - } - if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL) - { + } else if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL) { + printf("MVAPICH detected\n"); rank = atoi(localRankStr); + } else if ((localRankStr = getenv(ENV_LOCAL_RANK_SLURM)) != NULL) { + printf("SLURM detected\n"); + rank = atoi(localRankStr); + } else { + printf("MPI version is unknown - bad things may happen\n"); } if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);} if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);} + if ((localRankStr = getenv(ENV_RANK_SLURM )) != NULL) { world_rank = atoi(localRankStr);} size_t totalDeviceMem=0; for (int i = 0; i < nDevices; i++) { -#define GPU_PROP_FMT(canMapHostMemory,FMT) printf("AcceleratorCudaInit: " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory); +#define GPU_PROP_FMT(canMapHostMemory,FMT) printf("AcceleratorCudaInit[%d]: " #canMapHostMemory ": " FMT" \n",rank,prop.canMapHostMemory); #define GPU_PROP(canMapHostMemory) GPU_PROP_FMT(canMapHostMemory,"%d"); cudaGetDeviceProperties(&gpu_props[i], i); cudaDeviceProp prop; prop = gpu_props[i]; totalDeviceMem = prop.totalGlobalMem; if ( world_rank == 0) { - printf("AcceleratorCudaInit: ========================\n"); - printf("AcceleratorCudaInit: Device Number : %d\n", i); - printf("AcceleratorCudaInit: ========================\n"); - printf("AcceleratorCudaInit: Device identifier: %s\n", prop.name); +#ifndef GRID_IBM_SUMMIT + if ( i==rank ) { + printf("AcceleratorCudaInit[%d]: ========================\n",rank); + printf("AcceleratorCudaInit[%d]: Device Number : %d\n", rank,i); + printf("AcceleratorCudaInit[%d]: ========================\n",rank); + printf("AcceleratorCudaInit[%d]: Device identifier: %s\n",rank, prop.name); - GPU_PROP_FMT(totalGlobalMem,"%lld"); - GPU_PROP(managedMemory); - GPU_PROP(isMultiGpuBoard); - GPU_PROP(warpSize); + + GPU_PROP_FMT(totalGlobalMem,"%lld"); + GPU_PROP(managedMemory); + GPU_PROP(isMultiGpuBoard); + GPU_PROP(warpSize); + GPU_PROP(pciBusID); + GPU_PROP(pciDeviceID); + } +#endif // GPU_PROP(unifiedAddressing); // GPU_PROP(l2CacheSize); // GPU_PROP(singleToDoublePrecisionPerfRatio); @@ -61,9 +75,9 @@ void acceleratorInit(void) #ifdef GRID_IBM_SUMMIT // IBM Jsrun makes cuda Device numbering screwy and not match rank - if ( world_rank == 0 ) printf("AcceleratorCudaInit: IBM Summit or similar - NOT setting device to node rank\n"); + if ( world_rank == 0 ) printf("AcceleratorCudaInit: IBM Summit or similar - use default device\n"); #else - if ( world_rank == 0 ) printf("AcceleratorCudaInit: setting device to node rank\n"); + printf("AcceleratorCudaInit: rank %d setting device to node rank %d\n",world_rank,rank); cudaSetDevice(rank); #endif if ( world_rank == 0 ) printf("AcceleratorCudaInit: ================================================\n"); @@ -96,20 +110,24 @@ void acceleratorInit(void) if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);} if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);} + printf("world_rank %d has %d devices\n",world_rank,nDevices); + size_t totalDeviceMem=0; for (int i = 0; i < nDevices; i++) { #define GPU_PROP_FMT(canMapHostMemory,FMT) printf("AcceleratorHipInit: " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory); #define GPU_PROP(canMapHostMemory) 
GPU_PROP_FMT(canMapHostMemory,"%d"); hipGetDeviceProperties(&gpu_props[i], i); + hipDeviceProp_t prop; + prop = gpu_props[i]; + totalDeviceMem = prop.totalGlobalMem; if ( world_rank == 0) { - hipDeviceProp_t prop; - prop = gpu_props[i]; printf("AcceleratorHipInit: ========================\n"); printf("AcceleratorHipInit: Device Number : %d\n", i); printf("AcceleratorHipInit: ========================\n"); printf("AcceleratorHipInit: Device identifier: %s\n", prop.name); + GPU_PROP_FMT(totalGlobalMem,"%lu"); // GPU_PROP(managedMemory); GPU_PROP(isMultiGpuBoard); GPU_PROP(warpSize); @@ -118,6 +136,7 @@ void acceleratorInit(void) // GPU_PROP(singleToDoublePrecisionPerfRatio); } } + MemoryManager::DeviceMaxBytes = (8*totalDeviceMem)/10; // Assume 80% ours #undef GPU_PROP_FMT #undef GPU_PROP #ifdef GRID_IBM_SUMMIT diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index 1e618e30..eb1cfb94 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -70,6 +70,7 @@ NAMESPACE_BEGIN(Grid); // // Memory management: // +// int acceleratorIsCommunicable(void *pointer); // void *acceleratorAllocShared(size_t bytes); // void acceleratorFreeShared(void *ptr); // @@ -90,6 +91,7 @@ void acceleratorInit(void); ////////////////////////////////////////////// #ifdef GRID_CUDA +#include #ifdef __CUDA_ARCH__ #define GRID_SIMT @@ -171,6 +173,16 @@ inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);}; inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);}; inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);} +inline int acceleratorIsCommunicable(void *ptr) +{ + int uvm; + auto + cuerr = cuPointerGetAttribute( &uvm, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr) ptr); + assert(cuerr == cudaSuccess ); + if(uvm) return 0; + else return 1; +} + #endif ////////////////////////////////////////////// @@ -225,6 +237,15 @@ inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);}; inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);}; inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();} +inline int acceleratorIsCommunicable(void *ptr) +{ +#if 0 + auto uvm = cl::sycl::usm::get_pointer_type(ptr, theGridAccelerator->get_context()); + if ( uvm = cl::sycl::usm::alloc::shared ) return 1; + else return 0; +#endif + return 1; +} #endif @@ -292,18 +313,15 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda) inline void *acceleratorAllocShared(size_t bytes) { -#if 0 void *ptr=NULL; auto err = hipMallocManaged((void **)&ptr,bytes); if( err != hipSuccess ) { ptr = (void *) NULL; - printf(" hipMallocManaged failed for %d %s \n",bytes,hipGetErrorString(err)); + printf(" hipMallocManaged failed for %ld %s \n",bytes,hipGetErrorString(err)); } return ptr; -#else - return malloc(bytes); -#endif }; +inline int acceleratorIsCommunicable(void *ptr){ return 1; } inline void *acceleratorAllocDevice(size_t bytes) { @@ -311,7 +329,7 @@ inline void *acceleratorAllocDevice(size_t bytes) auto err = hipMalloc((void **)&ptr,bytes); if( err != hipSuccess ) { ptr = (void *) NULL; - printf(" hipMalloc failed for %d 
%s \n",bytes,hipGetErrorString(err)); + printf(" hipMalloc failed for %ld %s \n",bytes,hipGetErrorString(err)); } return ptr; }; @@ -358,6 +376,7 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA spec inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { memcpy(to,from,bytes);} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ memcpy(to,from,bytes);} +inline int acceleratorIsCommunicable(void *ptr){ return 1; } #ifdef HAVE_MM_MALLOC_H inline void *acceleratorAllocShared(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);}; inline void *acceleratorAllocDevice(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);}; diff --git a/Grid/util/Coordinate.h b/Grid/util/Coordinate.h index 7f1d31c0..004fbc72 100644 --- a/Grid/util/Coordinate.h +++ b/Grid/util/Coordinate.h @@ -99,10 +99,10 @@ inline std::ostream & operator<<(std::ostream &os, const AcceleratorVector 0) { - os << "\b"; + os << v[s]; + if( s < (v.size()-1) ){ + os << " "; + } } os << "]"; return os; diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc index c0725345..d81fafb3 100644 --- a/Grid/util/Init.cc +++ b/Grid/util/Init.cc @@ -325,6 +325,13 @@ void Grid_init(int *argc,char ***argv) Grid_debug_handler_init(); } +#if defined(A64FX) + if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-overlap") ){ + std::cout << "Option --comms-overlap currently not supported on QPACE4. Exiting." << std::endl; + exit(EXIT_FAILURE); + } +#endif + ////////////////////////////////////////////////////////// // Memory manager ////////////////////////////////////////////////////////// @@ -377,9 +384,7 @@ void Grid_init(int *argc,char ***argv) std::cout << GridLogMessage << "Mapped stencil comms buffers as MAP_HUGETLB "< still correct ? + +-------------------------------------------------------- + +* Fujitsu fcc + +../configure --with-lime=$HOME/grid-a64fx/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp --with-mpfr=/home/users/gre/gre-1/grid-a64fx/mpfr-build/install CXX=FCC CC=fcc CXXFLAGS="-Nclang -Kfast -DA64FX -DA64FXASM -DDSLASHINTRIN" + + +* Fujitsu fcc w/ MPI + +../configure --with-lime=$HOME/grid-a64fx/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=mpi --enable-openmp --with-mpfr=/home/users/gre/gre-1/grid-a64fx/mpfr-build/install CXX=mpiFCC CC=mpifcc CXXFLAGS="-Nclang -Kfast -DA64FX -DA64FXASM -DDSLASHINTRIN -DTOFU" diff --git a/benchmarks/Benchmark_comms.cc b/benchmarks/Benchmark_comms.cc index 44ccbd19..232030c8 100644 --- a/benchmarks/Benchmark_comms.cc +++ b/benchmarks/Benchmark_comms.cc @@ -74,90 +74,6 @@ int main (int argc, char ** argv) std::vector t_time(Nloop); time_statistics timestat; - std::cout< > xbuf(8); - std::vector > rbuf(8); - - int ncomm; - int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD); - for(int mu=0;mu<8;mu++){ - xbuf[mu].resize(lat*lat*lat*Ls); - rbuf[mu].resize(lat*lat*lat*Ls); - // std::cout << " buffers " << std::hex << (uint64_t)&xbuf[mu][0] <<" " << (uint64_t)&rbuf[mu][0] < requests; - - ncomm=0; - for(int mu=0;mu<4;mu++){ - - if (mpi_layout[mu]>1 ) { - - ncomm++; - int comm_proc=1; - int xmit_to_rank; - int recv_from_rank; - Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); - Grid.SendToRecvFromBegin(requests, - (void *)&xbuf[mu][0], - xmit_to_rank, - (void *)&rbuf[mu][0], - recv_from_rank, - bytes); - - comm_proc = mpi_layout[mu]-1; - - 
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); - Grid.SendToRecvFromBegin(requests, - (void *)&xbuf[mu+4][0], - xmit_to_rank, - (void *)&rbuf[mu+4][0], - recv_from_rank, - bytes); - - } - } - Grid.SendToRecvFromComplete(requests); - Grid.Barrier(); - double stop=usecond(); - t_time[i] = stop-start; // microseconds - } - - timestat.statistics(t_time); - - double dbytes = bytes*ppn; - double xbytes = dbytes*2.0*ncomm; - double rbytes = xbytes; - double bidibytes = xbytes+rbytes; - - std::cout< requests; Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); - Grid.SendToRecvFromBegin(requests, - (void *)&xbuf[mu][0], - xmit_to_rank, - (void *)&rbuf[mu][0], - recv_from_rank, - bytes); - Grid.SendToRecvFromComplete(requests); + Grid.SendToRecvFrom((void *)&xbuf[mu][0], + xmit_to_rank, + (void *)&rbuf[mu][0], + recv_from_rank, + bytes); } comm_proc = mpi_layout[mu]-1; { std::vector requests; Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); - Grid.SendToRecvFromBegin(requests, - (void *)&xbuf[mu+4][0], - xmit_to_rank, - (void *)&rbuf[mu+4][0], - recv_from_rank, - bytes); - Grid.SendToRecvFromComplete(requests); + Grid.SendToRecvFrom((void *)&xbuf[mu+4][0], + xmit_to_rank, + (void *)&rbuf[mu+4][0], + recv_from_rank, + bytes); } } } diff --git a/benchmarks/Benchmark_dwf.cc b/benchmarks/Benchmark_dwf.cc index b5cbe42f..2ef5921d 100644 --- a/benchmarks/Benchmark_dwf.cc +++ b/benchmarks/Benchmark_dwf.cc @@ -1,5 +1,5 @@ /************************************************************************************* - Grid physics library, www.github.com/paboyle/Grid + Grid physics library, www.github.com/paboyle/Grid Source file: ./benchmarks/Benchmark_dwf.cc Copyright (C) 2015 @@ -77,7 +77,7 @@ int main (int argc, char ** argv) std::vector seeds4({1,2,3,4}); std::vector seeds5({5,6,7,8}); - + std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl; GridParallelRNG RNG4(UGrid); RNG4.SeedUniqueString(std::string("The 4D RNG")); std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl; @@ -107,8 +107,8 @@ int main (int argc, char ** argv) LatticeFermion err(FGrid); std::cout << GridLogMessage << "Drawing gauge field" << std::endl; - LatticeGaugeField Umu(UGrid); - SU3::HotConfiguration(RNG4,Umu); + LatticeGaugeField Umu(UGrid); + SU3::HotConfiguration(RNG4,Umu); std::cout << GridLogMessage << "Random gauge initialised " << std::endl; #if 0 Umu=1.0; @@ -126,7 +126,7 @@ int main (int argc, char ** argv) // Naive wilson implementation //////////////////////////////////// // replicate across fifth dimension - LatticeGaugeField Umu5d(FGrid); + LatticeGaugeField Umu5d(FGrid); std::vector U(4,FGrid); { autoView( Umu5d_v, Umu5d, CpuWrite); @@ -197,24 +197,38 @@ int main (int argc, char ** argv) } double t1=usecond(); FGrid->Barrier(); - + double volume=Ls; for(int mu=0;mu1.0e-4) ) { + if(( norm2(err)>1.0e-4) ) { + /* std::cout << "RESULT\n " << result<Barrier(); exit(-1); } @@ -235,7 +249,7 @@ int main (int argc, char ** argv) } double t1=usecond(); FGrid->Barrier(); - + double volume=Ls; for(int mu=0;mu1.0e-4)){ +/* std::cout<< "DAG RESULT\n " <Barrier(); - + double volume=Ls; for(int mu=0;mu1.0e-4)){ + /* std::cout<< "Deo RESULT\n " <({45,12,81,9})); + + LatticeColourMatrix z(&Grid); random(pRNG,z); + LatticeColourMatrix x(&Grid); random(pRNG,x); + LatticeColourMatrix y(&Grid); random(pRNG,y); + + for(int64_t i=0;i(Umu,mu); } - + { // Naive wilson implementation ref = Zero(); for(int mu=0;mu, array, std::vector >, twodimarray, - std::vector > >, cmplx3darray, + 
std::vector> > >, cmplx3darray, SpinColourMatrix, scm ); myclass() {} myclass(int i) : array(4,5.1) , twodimarray(3,std::vector(5, 1.23456)) - , cmplx3darray(3,std::vector>(5, std::vector(7, Complex(1.2, 3.4)))) + , cmplx3darray(3,std::vector>>(5, std::vector>(7, std::complex(1.2, 3.4)))) , ve(2, myenum::blue) { e=myenum::red; @@ -121,11 +121,11 @@ namespace Eigen { // Perform I/O tests on a range of tensor types // Test coverage: scalars, complex and GridVectors in single, double and default precision class TensorIO : public Serializable { - using TestScalar = ComplexD; + using TestScalar = std::complex; using SR3 = Eigen::Sizes<9,4,2>; using SR5 = Eigen::Sizes<5,4,3,2,1>; using ESO = Eigen::StorageOptions; - using TensorRank3 = Eigen::Tensor; + using TensorRank3 = Eigen::Tensor, 3, ESO::RowMajor>; using TensorR5 = Eigen::TensorFixedSize; using TensorR5Alt = Eigen::TensorFixedSize; using Tensor942 = Eigen::TensorFixedSize; @@ -134,8 +134,8 @@ class TensorIO : public Serializable { using LSCTensor = Eigen::TensorFixedSize>; static const Real FlagR; - static const Complex Flag; - static const ComplexF FlagF; + static const std::complex Flag; + static const std::complex FlagF; static const TestScalar FlagTS; static const char * const pszFilePrefix; @@ -230,8 +230,8 @@ public: }; const Real TensorIO::FlagR {1}; -const Complex TensorIO::Flag {1,-1}; -const ComplexF TensorIO::FlagF {1,-1}; +const std::complex TensorIO::Flag {1,-1}; +const std::complex TensorIO::FlagF {1,-1}; const TensorIO::TestScalar TensorIO::FlagTS{1,-1}; const char * const TensorIO::pszFilePrefix = "tensor_"; diff --git a/tests/Test_simd.cc b/tests/Test_simd.cc index 9b0fa02b..468bc982 100644 --- a/tests/Test_simd.cc +++ b/tests/Test_simd.cc @@ -101,14 +101,14 @@ public: // FIXME still to test: // // innerProduct, -// norm2, +// norm2, // Reduce, // // mac,mult,sub,add, vone,vzero,vcomplex_i, =Zero(), // vset,vsplat,vstore,vstream,vload, scalar*vec, vec*scalar // unary -, // *= , -=, += -// outerproduct, +// outerproduct, // zeroit // permute class funcReduce { @@ -119,12 +119,12 @@ template void sfunc(reduce &rr,scal &i1,scal &i2) con std::string name(void) const { return std::string("Reduce"); } }; -template +template void Tester(const functor &func) { GridSerialRNG sRNG; sRNG.SeedFixedIntegers(std::vector({45,12,81,9})); - + int Nsimd = vec::Nsimd(); ExtractBuffer input1(Nsimd); @@ -172,6 +172,8 @@ void Tester(const functor &func) } if ( ok==0 ) { std::cout< +template void ReductionTester(const functor &func) { GridSerialRNG sRNG; sRNG.SeedFixedIntegers(std::vector({45,12,81,9})); - + int Nsimd = vec::Nsimd(); ExtractBuffer input1(Nsimd); @@ -278,12 +282,14 @@ void ReductionTester(const functor &func) } if ( ok==0 ) { std::cout< +template void IntReductionTester(const functor &func) { int Nsimd = vec::Nsimd(); @@ -323,6 +329,8 @@ void IntReductionTester(const functor &func) } if ( ok==0 ) { std::cout< void operator()(vec &rr,vec &i1,vec &i2) const { permute(rr,i1,n);} - template void apply(ExtractBuffer &rr,ExtractBuffer &in) const { + template void apply(ExtractBuffer &rr,ExtractBuffer &in) const { int sz=in.size(); int msk = sz>>(n+1); for(int i=0;i void apply(ExtractBuffer &r1, ExtractBuffer &r2, ExtractBuffer &in1, - ExtractBuffer &in2) const - { + ExtractBuffer &in2) const + { int sz=in1.size(); int msk = sz>>(n+1); @@ -364,7 +372,7 @@ public: if ( (i&msk) == 0 ) { r2[i]=in1[j2];} else { r2[i]=in2[j2];} - } + } } std::string name(void) const { return std::string("Exchange"); } }; @@ -374,7 +382,7 @@ public: int n; 
funcRotate(int _n) { n=_n;}; template void operator()(vec &rr,vec &i1,vec &i2) const { rr=rotate(i1,n);} - template void apply(ExtractBuffer &rr,ExtractBuffer &in) const { + template void apply(ExtractBuffer &rr,ExtractBuffer &in) const { int sz = in.size(); for(int i=0;i +template void PermTester(const functor &func) { GridSerialRNG sRNG; sRNG.SeedFixedIntegers(std::vector({45,12,81,9})); - + int Nsimd = vec::Nsimd(); ExtractBuffer input1(Nsimd); @@ -425,37 +433,39 @@ void PermTester(const functor &func) for(int i=0;i1.0e-7){ - std::cout< +template void ExchangeTester(const functor &func) { GridSerialRNG sRNG; sRNG.SeedFixedIntegers(std::vector({45,12,81,9})); - + int Nsimd = vec::Nsimd(); ExtractBuffer input1(Nsimd); @@ -566,7 +576,7 @@ int main (int argc, char ** argv) std::cout << " Test {1,2,3,4} " << Test < seeds({1,2,3,4}); @@ -742,7 +752,7 @@ int main (int argc, char ** argv) for(int r=0;r(funcRotate(r)); } - + std::cout<oSites();i++){ + { + autoView( check , Check, AcceleratorWrite); + autoView( foo , Foo, AcceleratorRead); + autoView(st_v ,myStencil,AcceleratorRead); + auto CBp=myStencil.CommBuf(); + accelerator_for(i,Check.Grid()->oSites(), 1, { - int permute_type; - StencilEntry *SE; - SE = myStencil.GetEntry(permute_type,0,i); - - autoView( check , Check, CpuWrite); - autoView( foo , Foo, CpuRead); - if ( SE->_is_local && SE->_permute ) - permute(check[i],foo[SE->_offset],permute_type); - else if (SE->_is_local) - check[i] = foo[SE->_offset]; - else { - check[i] = myStencil.CommBuf()[SE->_offset]; - // std::cout << " receive "<_is_local && SE->_permute ) + permute(check[i],foo[SE->_offset],permute_type); + else if (SE->_is_local) + check[i] = foo[SE->_offset]; + else { + check[i] = CBp[SE->_offset]; + } + }); + } Real nrmC = norm2(Check); Real nrmB = norm2(Bar); @@ -204,36 +206,42 @@ int main(int argc, char ** argv) { // Implement a stencil code that should agree with that darn cshift! 
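// Each stencil entry resolves either to a local neighbour (permuted across
// SIMD lanes when required) or to an offset into the comm buffer; the field
// reconstructed below must agree with the corresponding Cshift.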
EStencil.HaloExchange(EFoo,compress); - for(int i=0;ioSites();i++){ - int permute_type; - StencilEntry *SE; - SE = EStencil.GetEntry(permute_type,0,i); - // std::cout << "Even source "<< i<<" -> " <_offset << " "<< SE->_is_local<oSites(),1,{ + int permute_type; + StencilEntry *SE; + SE = Est.GetEntry(permute_type,0,i); - autoView( ocheck , OCheck, CpuWrite); - autoView( efoo , EFoo, CpuRead); - if ( SE->_is_local && SE->_permute ) - permute(ocheck[i],efoo[SE->_offset],permute_type); - else if (SE->_is_local) - ocheck[i] = efoo[SE->_offset]; - else - ocheck[i] = EStencil.CommBuf()[SE->_offset]; + if ( SE->_is_local && SE->_permute ) + permute(ocheck[i],efoo[SE->_offset],permute_type); + else if (SE->_is_local) + ocheck[i] = efoo[SE->_offset]; + else + ocheck[i] = ECBp[SE->_offset]; + }); } OStencil.HaloExchange(OFoo,compress); - for(int i=0;ioSites();i++){ - int permute_type; - StencilEntry *SE; - SE = OStencil.GetEntry(permute_type,0,i); - // std::cout << "ODD source "<< i<<" -> " <_offset << " "<< SE->_is_local<oSites(),1,{ + int permute_type; + StencilEntry *SE; + SE = Ost.GetEntry(permute_type,0,i); - autoView( echeck , ECheck, CpuWrite); - autoView( ofoo , OFoo, CpuRead); - if ( SE->_is_local && SE->_permute ) - permute(echeck[i],ofoo[SE->_offset],permute_type); - else if (SE->_is_local) - echeck[i] = ofoo[SE->_offset]; - else - echeck[i] = OStencil.CommBuf()[SE->_offset]; + if ( SE->_is_local && SE->_permute ) + permute(echeck[i],ofoo[SE->_offset],permute_type); + else if (SE->_is_local) + echeck[i] = ofoo[SE->_offset]; + else + echeck[i] = OCBp[SE->_offset]; + }); } setCheckerboard(Check,ECheck); diff --git a/tests/core/Test_main.cc b/tests/core/Test_main.cc index 08752a46..af8b747b 100644 --- a/tests/core/Test_main.cc +++ b/tests/core/Test_main.cc @@ -137,7 +137,6 @@ int main(int argc, char **argv) { LatticeReal iscalar(&Fine); SpinMatrix GammaFive; - iSpinMatrix iGammaFive; ColourMatrix cmat; random(FineRNG, Foo); @@ -283,7 +282,6 @@ int main(int argc, char **argv) { cMat = mydouble * cMat; sMat = adj(sMat); // LatticeSpinMatrix adjoint - sMat = iGammaFive * sMat; // SpinMatrix * LatticeSpinMatrix sMat = GammaFive * sMat; // SpinMatrix * LatticeSpinMatrix scMat = adj(scMat); cMat = adj(cMat); diff --git a/tests/core/Test_staggered_naive.cc b/tests/core/Test_staggered_naive.cc index f96bac93..9fe35a54 100644 --- a/tests/core/Test_staggered_naive.cc +++ b/tests/core/Test_staggered_naive.cc @@ -261,11 +261,11 @@ int main (int argc, char ** argv) pickCheckerboard(Odd ,phi_o,phi); SchurDiagMooeeOperator HermOpEO(Ds); - HermOpEO.MpcDagMpc(chi_e,dchi_e,t1,t2); - HermOpEO.MpcDagMpc(chi_o,dchi_o,t1,t2); + HermOpEO.MpcDagMpc(chi_e,dchi_e); + HermOpEO.MpcDagMpc(chi_o,dchi_o); - HermOpEO.MpcDagMpc(phi_e,dphi_e,t1,t2); - HermOpEO.MpcDagMpc(phi_o,dphi_o,t1,t2); + HermOpEO.MpcDagMpc(phi_e,dphi_e); + HermOpEO.MpcDagMpc(phi_o,dphi_o); pDce = innerProduct(phi_e,dchi_e); pDco = innerProduct(phi_o,dchi_o); diff --git a/tests/core/Test_where.cc b/tests/core/Test_where.cc new file mode 100644 index 00000000..050b711b --- /dev/null +++ b/tests/core/Test_where.cc @@ -0,0 +1,80 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_poisson_fft.cc + + Copyright (C) 2015 + +Author: Azusa Yamaguchi +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software 
Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace Grid; + ; + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + int threads = GridThread::GetThreads(); + std::cout< latt_size ({N,4,4}); + std::vector simd_layout({vComplexD::Nsimd(),1,1}); + std::vector mpi_layout ({1,1,1}); + + int vol = 1; + int nd = latt_size.size(); + for(int d=0;d({45,12,81,9})); + gaussian(RNG,rn); + + RealD nn=norm2(rn); + for(int mu=0;mu Subspace; Subspace Aggregates(Coarse5d,FGrid,cb); - Aggregates.CreateSubspaceRandom(RNG5); + // Aggregates.CreateSubspaceRandom(RNG5); subspace=Aggregates.subspace; @@ -163,7 +163,7 @@ int main (int argc, char ** argv) LittleDiracOp.M(c_src,c_res); std::cout<= Nm1); - const int nbasis= 70; + const int nbasis= 32; CoarseFineIRL IRL(FrbGrid,CoarseGrid5rb,HermOp,Odd); std::cout << GridLogMessage << "Constructed CoarseFine IRL" << std::endl; diff --git a/tests/solver/Test_dwf_hdcr_2level.cc b/tests/solver/Test_dwf_hdcr_2level.cc new file mode 100644 index 00000000..df24c9d2 --- /dev/null +++ b/tests/solver/Test_dwf_hdcr_2level.cc @@ -0,0 +1,420 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_dwf_hdcr.cc + + Copyright (C) 2015 + +Author: Antonin Portelli +Author: Peter Boyle +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
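Editor's note on the new tests/core/Test_where.cc above: its per-direction loop is truncated in this excerpt, so the following is a hedged sketch of the kind of check such a test typically performs, not the actual test body. It masks one coordinate slice at a time with where() and confirms that the sliced norms add back up to norm2 of the full field.

#include <Grid/Grid.h>
using namespace Grid;

int main(int argc, char **argv)
{
  Grid_init(&argc, &argv);

  GridCartesian Fine(GridDefaultLatt(), GridDefaultSimd(Nd, vComplexD::Nsimd()), GridDefaultMpi());
  GridParallelRNG RNG(&Fine);  RNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));

  LatticeComplexD rn(&Fine);  gaussian(RNG, rn);   // random field whose norm we slice up
  LatticeComplexD zz(&Fine);  zz = Zero();
  LatticeComplexD sl(&Fine);

  LatticeInteger coor(&Fine);
  LatticeCoordinate(coor, 0);                      // x-coordinate at every site

  Coordinate latt = GridDefaultLatt();
  RealD total = 0.0;
  for(int t=0; t<latt[0]; t++){
    sl = where(coor == Integer(t), rn, zz);        // keep the x==t slice, zero elsewhere
    total += norm2(sl);
  }
  std::cout << GridLogMessage << norm2(rn) << " should match " << total << std::endl;

  Grid_finalize();
}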
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include +#include + +using namespace std; +using namespace Grid; +/* Params + * Grid: + * block1(4) + * block2(4) + * + * Subspace + * * Fine : Subspace(nbasis,hi,lo,order,first,step) -- 32, 60,0.02,500,100,100 + * * Coarse: Subspace(nbasis,hi,lo,order,first,step) -- 32, 18,0.02,500,100,100 + + * Smoother: + * * Fine: Cheby(hi, lo, order) -- 60,0.5,10 + * * Coarse: Cheby(hi, lo, order) -- 12,0.1,4 + + * Lanczos: + * CoarseCoarse IRL( Nk, Nm, Nstop, poly(lo,hi,order)) 24,36,24,0.002,4.0,61 + */ +RealD InverseApproximation(RealD x){ + return 1.0/x; +} + +template class ChebyshevSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & _SmootherMatrix; + FineOperator & _SmootherOperator; + + Chebyshev Cheby; + + ChebyshevSmoother(RealD _lo,RealD _hi,int _ord, FineOperator &SmootherOperator,Matrix &SmootherMatrix) : + _SmootherOperator(SmootherOperator), + _SmootherMatrix(SmootherMatrix), + Cheby(_lo,_hi,_ord,InverseApproximation) + {}; + + void operator() (const Field &in, Field &out) + { + Field tmp(in.Grid()); + MdagMLinearOperator MdagMOp(_SmootherMatrix); + _SmootherOperator.AdjOp(in,tmp); + Cheby(MdagMOp,tmp,out); + } +}; +template class MirsSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & SmootherMatrix; + FineOperator & SmootherOperator; + RealD tol; + RealD shift; + int maxit; + + MirsSmoother(RealD _shift,RealD _tol,int _maxit,FineOperator &_SmootherOperator,Matrix &_SmootherMatrix) : + shift(_shift),tol(_tol),maxit(_maxit), + SmootherOperator(_SmootherOperator), + SmootherMatrix(_SmootherMatrix) + {}; + + void operator() (const Field &in, Field &out) + { + ZeroGuesser Guess; + ConjugateGradient CG(tol,maxit,false); + + Field src(in.Grid()); + + ShiftedMdagMLinearOperator,Field> MdagMOp(SmootherMatrix,shift); + SmootherOperator.AdjOp(in,src); + Guess(src,out); + CG(MdagMOp,src,out); + } +}; +template class RedBlackSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & SmootherMatrix; + RealD tol; + RealD shift; + int maxit; + + RedBlackSmoother(RealD _shift,RealD _tol,int _maxit,Matrix &_SmootherMatrix) : + shift(_shift),tol(_tol),maxit(_maxit), + SmootherMatrix(_SmootherMatrix) + {}; + + void operator() (const Field &in, Field &out) + { + std::cout << " Red Black Smootheer "< CG(tol,maxit,false); + out =Zero(); + SchurRedBlackDiagMooeeSolve RBSolver(CG); + RBSolver(SmootherMatrix,in,out); + std::cout << " Red Black Smootheer "< +class MultiGridPreconditioner : public LinearFunction< Lattice > { +public: + + typedef Aggregation Aggregates; + typedef CoarsenedMatrix CoarseOperator; + typedef typename Aggregation::CoarseVector CoarseVector; + typedef typename Aggregation::CoarseMatrix CoarseMatrix; + typedef typename Aggregation::FineField FineField; + typedef LinearOperatorBase FineOperator; + typedef LinearFunction FineSmoother; + + Aggregates & _Aggregates; + CoarseOperator & _CoarseOperator; + Matrix & _FineMatrix; + FineOperator & _FineOperator; + Guesser & _Guess; + FineSmoother & _Smoother1; + FineSmoother & _Smoother2; + CoarseSolver & _CoarseSolve; + + int level; void Level(int lv) {level = lv; }; + +#define GridLogLevel std::cout << GridLogMessage < block ({2,2,2,2}); + const int nbasis= 32; + + auto clatt = GridDefaultLatt(); + for(int 
d=0;d seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + std::vector cseeds({5,6,7,8}); + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + GridParallelRNG CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds); + LatticeFermion src(FGrid); gaussian(RNG5,src);// src=src+g5*src; + LatticeFermion result(FGrid); + LatticeGaugeField Umu(UGrid); + + FieldMetaData header; + std::string file("./ckpoint_lat"); + NerscIO::readConfiguration(Umu,header,file); + + std::cout< Subspace; + typedef CoarsenedMatrix CoarseOperator; + typedef CoarseOperator::CoarseVector CoarseVector; + typedef CoarseOperator::siteVector siteVector; + std::cout< HermDefOp(Ddwf); + + Subspace Aggregates(Coarse5d,FGrid,0); + + assert ( (nbasis & 0x1)==0); + { + int nb=nbasis/2; + // Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.05,500,200,100,0.0);// 18s + // rAggregates.CreateSubspaceChebyshev(RNG5,rHermDefOp,nb,60.0,0.05,500,200,150,0.0);// 15.7 23iter + Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.05,500,200,150,0.0);// + // pad out the rAggregates. + + // Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.05,500,500,150,0.0);// 19s + + // Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.05,500,200,200,0.0); 15.2s + // Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.05,500,500,200,0.0); 16.3s + + for(int n=0;n Level1Op; + + Gamma5R5HermitianLinearOperator HermIndefOp(Ddwf); + + Level1Op LDOp(*Coarse5d,1); LDOp.CoarsenOperator(FGrid,HermIndefOp,Aggregates); + + std::cout< IRLHermOp(LDOp); + Chebyshev IRLCheby(0.002,12.,151); + FunctionHermOp IRLOpCheby(IRLCheby,IRLHermOp); + PlainHermOp IRLOp (IRLHermOp); + int Nk=48; + int Nm=64; + int Nstop=48; + int Nconv; + ImplicitlyRestartedLanczos IRL(IRLOpCheby,IRLOp,Nstop,Nk,Nm,1.0e-3,20); + + std::vector eval(Nm); + std::vector evec(Nm,Coarse5d); + CoarseVector c_src(Coarse5d); + gaussian(CRNG,c_src); + IRL.calc(eval,evec,c_src,Nconv); + + // ConjugateGradient CoarseCG(0.01,1000); + + ConjugateGradient CoarseCG(0.02,1000);// 14.7s + DeflatedGuesser DeflCoarseGuesser(evec,eval); + NormalEquations DeflCoarseCGNE(LDOp,CoarseCG,DeflCoarseGuesser); + + c_src=1.0; + std::cout< PM; PM(HermDefOp,src); + std::cout< PosdefLdop(LDOp); + PowerMethod cPM; cPM(PosdefLdop,c_src); + + std::cout< , NormalEquations > TwoLevelMG; + + // MultiGrid preconditioner acting on the coarse space <-> coarsecoarse space + // ChebyshevSmoother FineSmoother(0.5,60.0,14,HermIndefOp,Ddwf); // 72 iter 63s + // ChebyshevSmoother FineSmoother(0.1,60.0,20,HermIndefOp,Ddwf); // 66 iter 69s + // ChebyshevSmoother FineSmoother(0.5,60.0,20,HermIndefOp,Ddwf); // 63 iter 65 s + // ChebyshevSmoother FineSmoother(1.0,60.0,20,HermIndefOp,Ddwf); // 69, 70 + // ChebyshevSmoother FineSmoother(1.0,60.0,14,HermIndefOp,Ddwf); // 77 + + // ChebyshevSmoother FineSmoother(0.5,60.0,10,HermIndefOp,Ddwf); // 23 iter 15.9s + // ChebyshevSmoother FineSmoother(0.5,60.0,14,HermIndefOp,Ddwf); // 20, 16.9s + ChebyshevSmoother FineSmoother(0.5,60.0,12,HermIndefOp,Ddwf); // 21, 15.6s + + // MirsSmoother FineCGSmoother(0.05,0.01,20,HermIndefOp,Ddwf); + // RedBlackSmoother FineRBSmoother(0.00,0.001,100,Ddwf); + + // Wrap the 2nd level solver in a MultiGrid preconditioner acting on the fine space + ZeroGuesser CoarseZeroGuesser; + TwoLevelMG TwoLevelPrecon(Aggregates, LDOp, + HermIndefOp,Ddwf, + FineSmoother, + DeflCoarseGuesser, + DeflCoarseCGNE); + TwoLevelPrecon.Level(1); + + // Apply the fine-coarse-coarsecoarse 2 
deep MG preconditioner in an outer PGCR on the fine fgrid + PrecGeneralisedConjugateResidual l1PGCR(1.0e-8,1000,HermIndefOp,TwoLevelPrecon,16,16); + l1PGCR.Level(1); + + std::cout< FineCG(1.0e-8,10000); + SchurDiagMooeeOperator FineDiagMooee(Ddwf); // M_ee - Meo Moo^-1 Moe + LatticeFermion f_src_e(FrbGrid); f_src_e=1.0; + LatticeFermion f_res_e(FrbGrid); f_res_e=Zero(); + FineCG(FineDiagMooee,f_src_e,f_res_e); + + std::cout<
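Editor's note, tying the new Test_dwf_hdcr_2level.cc together: the parameter block quoted at the top of the file maps onto the objects built in main() roughly as follows. The ChebyshevSmoother constructor takes (lo, hi, order), so the header line "Cheby(hi, lo, order) -- 60,0.5,10" corresponds to a (0.5, 60.0, ...) call; the deflated coarse CG and the fine smoother are handed to the two-level preconditioner, which the outer PGCR applies at every iteration, and the plain red-black CG at the end serves as the unpreconditioned baseline. A condensed, hedged sketch of that composition is below: the names follow the test, the template parameters are restored by hand where this excerpt dropped them and should be treated as indicative, and the final l1PGCR(src,result) call is assumed from the usual LinearFunction-style interface.

// Fine smoother: order-12 Chebyshev approximation to 1/x on [0.5,60] for MdagM,
// assuming Ddwf is the usual DomainWallFermionR action.
ChebyshevSmoother<LatticeFermion,DomainWallFermionR> FineSmoother(0.5, 60.0, 12, HermIndefOp, Ddwf);

// Coarse level: CG on the coarsened operator, deflated with the IRL eigenpairs.
ConjugateGradient<CoarseVector>  CoarseCG(0.02, 1000);
DeflatedGuesser<CoarseVector>    DeflCoarseGuesser(evec, eval);
NormalEquations<CoarseVector>    DeflCoarseCGNE(LDOp, CoarseCG, DeflCoarseGuesser);

// Two-level V-cycle object, used as the preconditioner of the outer fine-grid PGCR.
TwoLevelMG TwoLevelPrecon(Aggregates, LDOp, HermIndefOp, Ddwf,
                          FineSmoother, DeflCoarseGuesser, DeflCoarseCGNE);

PrecGeneralisedConjugateResidual<LatticeFermion> l1PGCR(1.0e-8, 1000, HermIndefOp, TwoLevelPrecon, 16, 16);

result = Zero();
l1PGCR(src, result);        // assumed call signature: preconditioned outer Krylov solve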