mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-09 23:45:36 +00:00
ARM neon intrinsics support
This commit is contained in:
parent
4a8c4ccfba
commit
3d04dc33c6
@ -244,6 +244,9 @@ case ${ax_cv_cxx_compiler_vendor} in
|
|||||||
[generic SIMD vector width (in bytes)])
|
[generic SIMD vector width (in bytes)])
|
||||||
SIMD_GEN_WIDTH_MSG=" (width= $ac_gen_simd_width)"
|
SIMD_GEN_WIDTH_MSG=" (width= $ac_gen_simd_width)"
|
||||||
SIMD_FLAGS='';;
|
SIMD_FLAGS='';;
|
||||||
|
NEONv8)
|
||||||
|
AC_DEFINE([NEONV8],[1],[ARMv8 NEON])
|
||||||
|
SIMD_FLAGS='';;
|
||||||
QPX|BGQ)
|
QPX|BGQ)
|
||||||
AC_DEFINE([QPX],[1],[QPX intrinsics for BG/Q])
|
AC_DEFINE([QPX],[1],[QPX intrinsics for BG/Q])
|
||||||
SIMD_FLAGS='';;
|
SIMD_FLAGS='';;
|
||||||
|
@ -26,7 +26,7 @@ Author: Antonin Portelli <antonin.portelli@me.com>
|
|||||||
See the full license in the file "LICENSE" in the top level distribution directory
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
*************************************************************************************/
|
*************************************************************************************/
|
||||||
/* END LEGAL */
|
/* END LEGAL */
|
||||||
|
#define GEN_SIMD_WIDTH 16
|
||||||
static_assert(GEN_SIMD_WIDTH % 16u == 0, "SIMD vector size is not an integer multiple of 16 bytes");
|
static_assert(GEN_SIMD_WIDTH % 16u == 0, "SIMD vector size is not an integer multiple of 16 bytes");
|
||||||
|
|
||||||
//#define VECTOR_LOOPS
|
//#define VECTOR_LOOPS
|
||||||
|
@ -1,11 +1,12 @@
|
|||||||
/*************************************************************************************
|
/*************************************************************************************
|
||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
Source file: ./lib/simd/Grid_neon.h
|
Source file: ./lib/simd/Grid_neon.h
|
||||||
|
|
||||||
Copyright (C) 2015
|
Copyright (C) 2015
|
||||||
|
|
||||||
|
Author: Nils Meyer <nils.meyer@ur.de>
|
||||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
Author: neo <cossu@post.kek.jp>
|
Author: neo <cossu@post.kek.jp>
|
||||||
|
|
||||||
@ -27,18 +28,23 @@ Author: neo <cossu@post.kek.jp>
|
|||||||
*************************************************************************************/
|
*************************************************************************************/
|
||||||
/* END LEGAL */
|
/* END LEGAL */
|
||||||
//----------------------------------------------------------------------
|
//----------------------------------------------------------------------
|
||||||
/*! @file Grid_sse4.h
|
/*
|
||||||
@brief Optimization libraries for NEON (ARM) instructions set ARMv8
|
|
||||||
|
ARMv8 NEON intrinsics layer by
|
||||||
|
|
||||||
|
Nils Meyer <nils.meyer@ur.de>,
|
||||||
|
University of Regensburg, Germany
|
||||||
|
SFB/TRR55
|
||||||
|
|
||||||
Experimental - Using intrinsics - DEVELOPING!
|
|
||||||
*/
|
*/
|
||||||
// Time-stamp: <2015-07-10 17:45:09 neo>
|
|
||||||
//----------------------------------------------------------------------
|
//----------------------------------------------------------------------
|
||||||
|
//#ifndef ARM_NEON
|
||||||
|
//#define ARM_NEON
|
||||||
|
|
||||||
|
#include "Grid_generic_types.h"
|
||||||
#include <arm_neon.h>
|
#include <arm_neon.h>
|
||||||
|
|
||||||
// ARMv8 supports double precision
|
namespace Grid {
|
||||||
|
|
||||||
namespace Optimization {
|
namespace Optimization {
|
||||||
|
|
||||||
template<class vtype>
|
template<class vtype>
|
||||||
@ -46,16 +52,20 @@ namespace Optimization {
|
|||||||
float32x4_t f;
|
float32x4_t f;
|
||||||
vtype v;
|
vtype v;
|
||||||
};
|
};
|
||||||
|
|
||||||
union u128f {
|
union u128f {
|
||||||
float32x4_t v;
|
float32x4_t v;
|
||||||
float f[4];
|
float f[4];
|
||||||
};
|
};
|
||||||
union u128d {
|
union u128d {
|
||||||
float64x2_t v;
|
float64x2_t v;
|
||||||
double f[4];
|
double f[2];
|
||||||
};
|
};
|
||||||
|
// half precision
|
||||||
|
union u128h {
|
||||||
|
float16x8_t v;
|
||||||
|
uint16_t f[8];
|
||||||
|
};
|
||||||
|
|
||||||
struct Vsplat{
|
struct Vsplat{
|
||||||
//Complex float
|
//Complex float
|
||||||
inline float32x4_t operator()(float a, float b){
|
inline float32x4_t operator()(float a, float b){
|
||||||
@ -64,31 +74,31 @@ namespace Optimization {
|
|||||||
}
|
}
|
||||||
// Real float
|
// Real float
|
||||||
inline float32x4_t operator()(float a){
|
inline float32x4_t operator()(float a){
|
||||||
return vld1q_dup_f32(&a);
|
return vdupq_n_f32(a);
|
||||||
}
|
}
|
||||||
//Complex double
|
//Complex double
|
||||||
inline float32x4_t operator()(double a, double b){
|
inline float64x2_t operator()(double a, double b){
|
||||||
float tmp[4]={(float)a,(float)b,(float)a,(float)b};
|
double tmp[2]={a,b};
|
||||||
return vld1q_f32(tmp);
|
return vld1q_f64(tmp);
|
||||||
}
|
}
|
||||||
//Real double
|
//Real double // N:tbc
|
||||||
inline float32x4_t operator()(double a){
|
inline float64x2_t operator()(double a){
|
||||||
return vld1q_dup_f32(&a);
|
return vdupq_n_f64(a);
|
||||||
}
|
}
|
||||||
//Integer
|
//Integer // N:tbc
|
||||||
inline uint32x4_t operator()(Integer a){
|
inline uint32x4_t operator()(Integer a){
|
||||||
return vld1q_dup_u32(&a);
|
return vdupq_n_u32(a);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
struct Vstore{
|
struct Vstore{
|
||||||
//Float
|
//Float
|
||||||
inline void operator()(float32x4_t a, float* F){
|
inline void operator()(float32x4_t a, float* F){
|
||||||
vst1q_f32(F, a);
|
vst1q_f32(F, a);
|
||||||
}
|
}
|
||||||
//Double
|
//Double
|
||||||
inline void operator()(float32x4_t a, double* D){
|
inline void operator()(float64x2_t a, double* D){
|
||||||
vst1q_f32((float*)D, a);
|
vst1q_f64(D, a);
|
||||||
}
|
}
|
||||||
//Integer
|
//Integer
|
||||||
inline void operator()(uint32x4_t a, Integer* I){
|
inline void operator()(uint32x4_t a, Integer* I){
|
||||||
@ -97,54 +107,54 @@ namespace Optimization {
|
|||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
struct Vstream{
|
struct Vstream{ // N:equivalents to _mm_stream_p* in NEON?
|
||||||
//Float
|
//Float // N:generic
|
||||||
inline void operator()(float * a, float32x4_t b){
|
inline void operator()(float * a, float32x4_t b){
|
||||||
|
memcpy(a,&b,4*sizeof(float));
|
||||||
}
|
}
|
||||||
//Double
|
//Double // N:generic
|
||||||
inline void operator()(double * a, float32x4_t b){
|
inline void operator()(double * a, float64x2_t b){
|
||||||
|
memcpy(a,&b,2*sizeof(double));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Nils: Vset untested; not used currently in Grid at all;
|
||||||
|
// git commit 4a8c4ccfba1d05159348d21a9698028ea847e77b
|
||||||
struct Vset{
|
struct Vset{
|
||||||
// Complex float
|
// Complex float // N:ok
|
||||||
inline float32x4_t operator()(Grid::ComplexF *a){
|
inline float32x4_t operator()(Grid::ComplexF *a){
|
||||||
float32x4_t foo;
|
float tmp[4]={a[1].imag(),a[1].real(),a[0].imag(),a[0].real()};
|
||||||
return foo;
|
return vld1q_f32(tmp);
|
||||||
}
|
}
|
||||||
// Complex double
|
// Complex double // N:ok
|
||||||
inline float32x4_t operator()(Grid::ComplexD *a){
|
inline float64x2_t operator()(Grid::ComplexD *a){
|
||||||
float32x4_t foo;
|
double tmp[2]={a[0].imag(),a[0].real()};
|
||||||
return foo;
|
return vld1q_f64(tmp);
|
||||||
}
|
}
|
||||||
// Real float
|
// Real float // N:ok
|
||||||
inline float32x4_t operator()(float *a){
|
inline float32x4_t operator()(float *a){
|
||||||
float32x4_t foo;
|
float tmp[4]={a[3],a[2],a[1],a[0]};
|
||||||
return foo;
|
return vld1q_f32(tmp);
|
||||||
}
|
}
|
||||||
// Real double
|
// Real double // N:ok
|
||||||
inline float32x4_t operator()(double *a){
|
inline float64x2_t operator()(double *a){
|
||||||
float32x4_t foo;
|
double tmp[2]={a[1],a[0]};
|
||||||
return foo;
|
return vld1q_f64(tmp);
|
||||||
}
|
}
|
||||||
// Integer
|
// Integer // N:ok
|
||||||
inline uint32x4_t operator()(Integer *a){
|
inline uint32x4_t operator()(Integer *a){
|
||||||
uint32x4_t foo;
|
return vld1q_dup_u32(a);
|
||||||
return foo;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// N:leaving as is
|
||||||
template <typename Out_type, typename In_type>
|
template <typename Out_type, typename In_type>
|
||||||
struct Reduce{
|
struct Reduce{
|
||||||
//Need templated class to overload output type
|
//Need templated class to overload output type
|
||||||
//General form must generate error if compiled
|
//General form must generate error if compiled
|
||||||
inline Out_type operator()(In_type in){
|
inline Out_type operator()(In_type in){
|
||||||
printf("Error, using wrong Reduce function\n");
|
printf("Error, using wrong Reduce function\n");
|
||||||
exit(1);
|
exit(1);
|
||||||
return 0;
|
return 0;
|
||||||
@ -184,26 +194,98 @@ namespace Optimization {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct MultRealPart{
|
||||||
|
inline float32x4_t operator()(float32x4_t a, float32x4_t b){
|
||||||
|
float32x4_t re = vtrn1q_f32(a, a);
|
||||||
|
return vmulq_f32(re, b);
|
||||||
|
}
|
||||||
|
inline float64x2_t operator()(float64x2_t a, float64x2_t b){
|
||||||
|
float64x2_t re = vzip1q_f64(a, a);
|
||||||
|
return vmulq_f64(re, b);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct MaddRealPart{
|
||||||
|
inline float32x4_t operator()(float32x4_t a, float32x4_t b, float32x4_t c){
|
||||||
|
float32x4_t re = vtrn1q_f32(a, a);
|
||||||
|
return vfmaq_f32(c, re, b);
|
||||||
|
}
|
||||||
|
inline float64x2_t operator()(float64x2_t a, float64x2_t b, float64x2_t c){
|
||||||
|
float64x2_t re = vzip1q_f64(a, a);
|
||||||
|
return vfmaq_f64(c, re, b);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct Div{
|
||||||
|
// Real float
|
||||||
|
inline float32x4_t operator()(float32x4_t a, float32x4_t b){
|
||||||
|
return vdivq_f32(a, b);
|
||||||
|
}
|
||||||
|
// Real double
|
||||||
|
inline float64x2_t operator()(float64x2_t a, float64x2_t b){
|
||||||
|
return vdivq_f64(a, b);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
struct MultComplex{
|
struct MultComplex{
|
||||||
// Complex float
|
// Complex float
|
||||||
inline float32x4_t operator()(float32x4_t a, float32x4_t b){
|
inline float32x4_t operator()(float32x4_t a, float32x4_t b){
|
||||||
float32x4_t foo;
|
|
||||||
return foo;
|
float32x4_t r0, r1, r2, r3, r4;
|
||||||
|
|
||||||
|
// a = ar ai Ar Ai
|
||||||
|
// b = br bi Br Bi
|
||||||
|
// collect real/imag part, negate bi and Bi
|
||||||
|
r0 = vtrn1q_f32(b, b); // br br Br Br
|
||||||
|
r1 = vnegq_f32(b); // -br -bi -Br -Bi
|
||||||
|
r2 = vtrn2q_f32(b, r1); // bi -bi Bi -Bi
|
||||||
|
|
||||||
|
// the fun part
|
||||||
|
r3 = vmulq_f32(r2, a); // bi*ar -bi*ai ...
|
||||||
|
r4 = vrev64q_f32(r3); // -bi*ai bi*ar ...
|
||||||
|
|
||||||
|
// fma(a,b,c) = a+b*c
|
||||||
|
return vfmaq_f32(r4, r0, a); // ar*br-ai*bi ai*br+ar*bi ...
|
||||||
|
|
||||||
|
// no fma, use mul and add
|
||||||
|
//float32x4_t r5;
|
||||||
|
//r5 = vmulq_f32(r0, a);
|
||||||
|
//return vaddq_f32(r4, r5);
|
||||||
}
|
}
|
||||||
// Complex double
|
// Complex double
|
||||||
inline float64x2_t operator()(float64x2_t a, float64x2_t b){
|
inline float64x2_t operator()(float64x2_t a, float64x2_t b){
|
||||||
float32x4_t foo;
|
|
||||||
return foo;
|
float64x2_t r0, r1, r2, r3, r4;
|
||||||
|
|
||||||
|
// b = br bi
|
||||||
|
// collect real/imag part, negate bi
|
||||||
|
r0 = vtrn1q_f64(b, b); // br br
|
||||||
|
r1 = vnegq_f64(b); // -br -bi
|
||||||
|
r2 = vtrn2q_f64(b, r1); // bi -bi
|
||||||
|
|
||||||
|
// the fun part
|
||||||
|
r3 = vmulq_f64(r2, a); // bi*ar -bi*ai
|
||||||
|
r4 = vextq_f64(r3,r3,1); // -bi*ai bi*ar
|
||||||
|
|
||||||
|
// fma(a,b,c) = a+b*c
|
||||||
|
return vfmaq_f64(r4, r0, a); // ar*br-ai*bi ai*br+ar*bi
|
||||||
|
|
||||||
|
// no fma, use mul and add
|
||||||
|
//float64x2_t r5;
|
||||||
|
//r5 = vmulq_f64(r0, a);
|
||||||
|
//return vaddq_f64(r4, r5);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
struct Mult{
|
struct Mult{
|
||||||
// Real float
|
// Real float
|
||||||
inline float32x4_t mac(float32x4_t a, float32x4_t b, float32x4_t c){
|
inline float32x4_t mac(float32x4_t a, float32x4_t b, float32x4_t c){
|
||||||
return vaddq_f32(vmulq_f32(b,c),a);
|
//return vaddq_f32(vmulq_f32(b,c),a);
|
||||||
|
return vfmaq_f32(a, b, c);
|
||||||
}
|
}
|
||||||
inline float64x2_t mac(float64x2_t a, float64x2_t b, float64x2_t c){
|
inline float64x2_t mac(float64x2_t a, float64x2_t b, float64x2_t c){
|
||||||
return vaddq_f64(vmulq_f64(b,c),a);
|
//return vaddq_f64(vmulq_f64(b,c),a);
|
||||||
|
return vfmaq_f64(a, b, c);
|
||||||
}
|
}
|
||||||
inline float32x4_t operator()(float32x4_t a, float32x4_t b){
|
inline float32x4_t operator()(float32x4_t a, float32x4_t b){
|
||||||
return vmulq_f32(a,b);
|
return vmulq_f32(a,b);
|
||||||
@ -221,89 +303,275 @@ namespace Optimization {
|
|||||||
struct Conj{
|
struct Conj{
|
||||||
// Complex single
|
// Complex single
|
||||||
inline float32x4_t operator()(float32x4_t in){
|
inline float32x4_t operator()(float32x4_t in){
|
||||||
return in;
|
// ar ai br bi -> ar -ai br -bi
|
||||||
|
float32x4_t r0, r1;
|
||||||
|
r0 = vnegq_f32(in); // -ar -ai -br -bi
|
||||||
|
r1 = vrev64q_f32(r0); // -ai -ar -bi -br
|
||||||
|
return vtrn1q_f32(in, r1); // ar -ai br -bi
|
||||||
}
|
}
|
||||||
// Complex double
|
// Complex double
|
||||||
//inline float32x4_t operator()(float32x4_t in){
|
inline float64x2_t operator()(float64x2_t in){
|
||||||
// return 0;
|
|
||||||
//}
|
float64x2_t r0, r1;
|
||||||
|
r0 = vextq_f64(in, in, 1); // ai ar
|
||||||
|
r1 = vnegq_f64(r0); // -ai -ar
|
||||||
|
return vextq_f64(r0, r1, 1); // ar -ai
|
||||||
|
}
|
||||||
// do not define for integer input
|
// do not define for integer input
|
||||||
};
|
};
|
||||||
|
|
||||||
struct TimesMinusI{
|
struct TimesMinusI{
|
||||||
//Complex single
|
//Complex single
|
||||||
inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
|
inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
|
||||||
return in;
|
// ar ai br bi -> ai -ar bi -br
|
||||||
|
float32x4_t r0, r1;
|
||||||
|
r0 = vnegq_f32(in); // -ar -ai -br -bi
|
||||||
|
r1 = vrev64q_f32(in); // ai ar bi br
|
||||||
|
return vtrn1q_f32(r1, r0); // ai -ar bi -br
|
||||||
}
|
}
|
||||||
//Complex double
|
//Complex double
|
||||||
//inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
|
inline float64x2_t operator()(float64x2_t in, float64x2_t ret){
|
||||||
// return in;
|
// a ib -> b -ia
|
||||||
//}
|
float64x2_t tmp;
|
||||||
|
tmp = vnegq_f64(in);
|
||||||
|
return vextq_f64(in, tmp, 1);
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
struct TimesI{
|
struct TimesI{
|
||||||
//Complex single
|
//Complex single
|
||||||
inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
|
inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
|
||||||
//need shuffle
|
// ar ai br bi -> -ai ar -bi br
|
||||||
return in;
|
float32x4_t r0, r1;
|
||||||
|
r0 = vnegq_f32(in); // -ar -ai -br -bi
|
||||||
|
r1 = vrev64q_f32(r0); // -ai -ar -bi -br
|
||||||
|
return vtrn1q_f32(r1, in); // -ai ar -bi br
|
||||||
}
|
}
|
||||||
//Complex double
|
//Complex double
|
||||||
//inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
|
inline float64x2_t operator()(float64x2_t in, float64x2_t ret){
|
||||||
// return 0;
|
// a ib -> -b ia
|
||||||
//}
|
float64x2_t tmp;
|
||||||
|
tmp = vnegq_f64(in);
|
||||||
|
return vextq_f64(tmp, in, 1);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct Permute{
|
||||||
|
|
||||||
|
static inline float32x4_t Permute0(float32x4_t in){ // N:ok
|
||||||
|
// AB CD -> CD AB
|
||||||
|
return vextq_f32(in, in, 2);
|
||||||
|
};
|
||||||
|
static inline float32x4_t Permute1(float32x4_t in){ // N:ok
|
||||||
|
// AB CD -> BA DC
|
||||||
|
return vrev64q_f32(in);
|
||||||
|
};
|
||||||
|
static inline float32x4_t Permute2(float32x4_t in){ // N:not used by Boyle
|
||||||
|
return in;
|
||||||
|
};
|
||||||
|
static inline float32x4_t Permute3(float32x4_t in){ // N:not used by Boyle
|
||||||
|
return in;
|
||||||
|
};
|
||||||
|
|
||||||
|
static inline float64x2_t Permute0(float64x2_t in){ // N:ok
|
||||||
|
// AB -> BA
|
||||||
|
return vextq_f64(in, in, 1);
|
||||||
|
};
|
||||||
|
static inline float64x2_t Permute1(float64x2_t in){ // N:not used by Boyle
|
||||||
|
return in;
|
||||||
|
};
|
||||||
|
static inline float64x2_t Permute2(float64x2_t in){ // N:not used by Boyle
|
||||||
|
return in;
|
||||||
|
};
|
||||||
|
static inline float64x2_t Permute3(float64x2_t in){ // N:not used by Boyle
|
||||||
|
return in;
|
||||||
|
};
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
|
struct Rotate{
|
||||||
|
|
||||||
|
static inline float32x4_t rotate(float32x4_t in,int n){ // N:ok
|
||||||
|
switch(n){
|
||||||
|
case 0: // AB CD -> AB CD
|
||||||
|
return tRotate<0>(in);
|
||||||
|
break;
|
||||||
|
case 1: // AB CD -> BC DA
|
||||||
|
return tRotate<1>(in);
|
||||||
|
break;
|
||||||
|
case 2: // AB CD -> CD AB
|
||||||
|
return tRotate<2>(in);
|
||||||
|
break;
|
||||||
|
case 3: // AB CD -> DA BC
|
||||||
|
return tRotate<3>(in);
|
||||||
|
break;
|
||||||
|
default: assert(0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
static inline float64x2_t rotate(float64x2_t in,int n){ // N:ok
|
||||||
|
switch(n){
|
||||||
|
case 0: // AB -> AB
|
||||||
|
return tRotate<0>(in);
|
||||||
|
break;
|
||||||
|
case 1: // AB -> BA
|
||||||
|
return tRotate<1>(in);
|
||||||
|
break;
|
||||||
|
default: assert(0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// working, but no restriction on n
|
||||||
|
// template<int n> static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n); };
|
||||||
|
// template<int n> static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n); };
|
||||||
|
|
||||||
|
// restriction on n
|
||||||
|
template<int n> static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n%4); };
|
||||||
|
template<int n> static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n%2); };
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
|
struct PrecisionChange {
|
||||||
|
|
||||||
|
static inline float16x8_t StoH (const float32x4_t &a,const float32x4_t &b) {
|
||||||
|
float16x4_t h = vcvt_f16_f32(a);
|
||||||
|
return vcvt_high_f16_f32(h, b);
|
||||||
|
}
|
||||||
|
static inline void HtoS (float16x8_t h,float32x4_t &sa,float32x4_t &sb) {
|
||||||
|
sb = vcvt_high_f32_f16(h);
|
||||||
|
// vcvt_high_f32_f16 converts only the upper four half-precision lanes, so the lower four lanes are rotated into the upper half and converted the same way
|
||||||
|
// vextq_f16 not supported by clang 3.8 / 4.0 / arm clang
|
||||||
|
//float16x8_t h1 = vextq_f16(h, h, 4); // correct, but not supported by clang
|
||||||
|
// workaround for clang
|
||||||
|
uint32x4_t h1u = reinterpret_cast<uint32x4_t>(h);
|
||||||
|
float16x8_t h1 = reinterpret_cast<float16x8_t>(vextq_u32(h1u, h1u, 2));
|
||||||
|
sa = vcvt_high_f32_f16(h1);
|
||||||
|
}
|
||||||
|
static inline float32x4_t DtoS (float64x2_t a,float64x2_t b) {
|
||||||
|
float32x2_t s = vcvt_f32_f64(a);
|
||||||
|
return vcvt_high_f32_f64(s, b);
|
||||||
|
|
||||||
|
}
|
||||||
|
static inline void StoD (float32x4_t s,float64x2_t &a,float64x2_t &b) {
|
||||||
|
b = vcvt_high_f64_f32(s);
|
||||||
|
// vcvt_high_f64_f32 widens only the upper two single-precision lanes, so the lower two lanes are rotated into the upper half and converted the same way
|
||||||
|
float32x4_t s1 = vextq_f32(s, s, 2);
|
||||||
|
a = vcvt_high_f64_f32(s1);
|
||||||
|
|
||||||
|
}
|
||||||
|
static inline float16x8_t DtoH (float64x2_t a,float64x2_t b,float64x2_t c,float64x2_t d) {
|
||||||
|
float32x4_t s1 = DtoS(a, b);
|
||||||
|
float32x4_t s2 = DtoS(c, d);
|
||||||
|
return StoH(s1, s2);
|
||||||
|
}
|
||||||
|
static inline void HtoD (float16x8_t h,float64x2_t &a,float64x2_t &b,float64x2_t &c,float64x2_t &d) {
|
||||||
|
float32x4_t s1, s2;
|
||||||
|
HtoS(h, s1, s2);
|
||||||
|
StoD(s1, a, b);
|
||||||
|
StoD(s2, c, d);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
//////////////////////////////////////////////
|
||||||
|
// Exchange support
|
||||||
|
|
||||||
|
struct Exchange{
|
||||||
|
static inline void Exchange0(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
|
||||||
|
// in1: ABCD -> out1: ABEF
|
||||||
|
// in2: EFGH -> out2: CDGH
|
||||||
|
|
||||||
|
// z: CDAB
|
||||||
|
float32x4_t z = vextq_f32(in1, in1, 2);
|
||||||
|
// out1: ABEF
|
||||||
|
out1 = vextq_f32(z, in2, 2);
|
||||||
|
|
||||||
|
// z: GHEF
|
||||||
|
z = vextq_f32(in2, in2, 2);
|
||||||
|
// out2: CDGH
|
||||||
|
out2 = vextq_f32(in1, z, 2);
|
||||||
|
};
|
||||||
|
|
||||||
|
static inline void Exchange1(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
|
||||||
|
// in1: ABCD -> out1: AECG
|
||||||
|
// in2: EFGH -> out2: BFDH
|
||||||
|
out1 = vtrn1q_f32(in1, in2);
|
||||||
|
out2 = vtrn2q_f32(in1, in2);
|
||||||
|
};
|
||||||
|
static inline void Exchange2(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
|
||||||
|
assert(0);
|
||||||
|
return;
|
||||||
|
};
|
||||||
|
static inline void Exchange3(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
|
||||||
|
assert(0);
|
||||||
|
return;
|
||||||
|
};
|
||||||
|
// double precision
|
||||||
|
static inline void Exchange0(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
|
||||||
|
// in1: AB -> out1: AC
|
||||||
|
// in2: CD -> out2: BD
|
||||||
|
out1 = vzip1q_f64(in1, in2);
|
||||||
|
out2 = vzip2q_f64(in1, in2);
|
||||||
|
};
|
||||||
|
static inline void Exchange1(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
|
||||||
|
assert(0);
|
||||||
|
return;
|
||||||
|
};
|
||||||
|
static inline void Exchange2(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
|
||||||
|
assert(0);
|
||||||
|
return;
|
||||||
|
};
|
||||||
|
static inline void Exchange3(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
|
||||||
|
assert(0);
|
||||||
|
return;
|
||||||
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
//////////////////////////////////////////////
|
//////////////////////////////////////////////
|
||||||
// Some Template specialization
|
// Some Template specialization
|
||||||
template < typename vtype >
|
|
||||||
void permute(vtype &a, vtype b, int perm) {
|
|
||||||
|
|
||||||
};
|
|
||||||
|
|
||||||
//Complex float Reduce
|
//Complex float Reduce
|
||||||
template<>
|
template<>
|
||||||
inline Grid::ComplexF Reduce<Grid::ComplexF, float32x4_t>::operator()(float32x4_t in){
|
inline Grid::ComplexF Reduce<Grid::ComplexF, float32x4_t>::operator()(float32x4_t in){
|
||||||
return 0;
|
float32x4_t v1; // two complex
|
||||||
|
v1 = Optimization::Permute::Permute0(in);
|
||||||
|
v1 = vaddq_f32(v1,in);
|
||||||
|
u128f conv; conv.v=v1;
|
||||||
|
return Grid::ComplexF(conv.f[0],conv.f[1]);
|
||||||
}
|
}
|
||||||
//Real float Reduce
|
//Real float Reduce
|
||||||
template<>
|
template<>
|
||||||
inline Grid::RealF Reduce<Grid::RealF, float32x4_t>::operator()(float32x4_t in){
|
inline Grid::RealF Reduce<Grid::RealF, float32x4_t>::operator()(float32x4_t in){
|
||||||
float32x2_t high = vget_high_f32(in);
|
return vaddvq_f32(in);
|
||||||
float32x2_t low = vget_low_f32(in);
|
|
||||||
float32x2_t tmp = vadd_f32(low, high);
|
|
||||||
float32x2_t sum = vpadd_f32(tmp, tmp);
|
|
||||||
return vget_lane_f32(sum,0);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
//Complex double Reduce
|
//Complex double Reduce
|
||||||
template<>
|
template<> // N:by Boyle
|
||||||
inline Grid::ComplexD Reduce<Grid::ComplexD, float64x2_t>::operator()(float64x2_t in){
|
inline Grid::ComplexD Reduce<Grid::ComplexD, float64x2_t>::operator()(float64x2_t in){
|
||||||
return 0;
|
u128d conv; conv.v = in;
|
||||||
|
return Grid::ComplexD(conv.f[0],conv.f[1]);
|
||||||
}
|
}
|
||||||
|
|
||||||
//Real double Reduce
|
//Real double Reduce
|
||||||
template<>
|
template<>
|
||||||
inline Grid::RealD Reduce<Grid::RealD, float64x2_t>::operator()(float64x2_t in){
|
inline Grid::RealD Reduce<Grid::RealD, float64x2_t>::operator()(float64x2_t in){
|
||||||
float64x2_t sum = vpaddq_f64(in, in);
|
return vaddvq_f64(in);
|
||||||
return vgetq_lane_f64(sum,0);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
//Integer Reduce
|
//Integer Reduce
|
||||||
template<>
|
template<>
|
||||||
inline Integer Reduce<Integer, uint32x4_t>::operator()(uint32x4_t in){
|
inline Integer Reduce<Integer, uint32x4_t>::operator()(uint32x4_t in){
|
||||||
// FIXME unimplemented
|
// FIXME unimplemented
|
||||||
printf("Reduce : Missing integer implementation -> FIX\n");
|
printf("Reduce : Missing integer implementation -> FIX\n");
|
||||||
assert(0);
|
assert(0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Here assign types
|
// Here assign types
|
||||||
namespace Grid {
|
|
||||||
|
|
||||||
|
// typedef Optimization::vech SIMD_Htype; // Reduced precision type
|
||||||
|
typedef float16x8_t SIMD_Htype; // Half precision type
|
||||||
typedef float32x4_t SIMD_Ftype; // Single precision type
|
typedef float32x4_t SIMD_Ftype; // Single precision type
|
||||||
typedef float64x2_t SIMD_Dtype; // Double precision type
|
typedef float64x2_t SIMD_Dtype; // Double precision type
|
||||||
typedef uint32x4_t SIMD_Itype; // Integer type
|
typedef uint32x4_t SIMD_Itype; // Integer type
|
||||||
@ -312,13 +580,6 @@ namespace Grid {
|
|||||||
inline void prefetch_HINT_T0(const char *ptr){};
|
inline void prefetch_HINT_T0(const char *ptr){};
|
||||||
|
|
||||||
|
|
||||||
// Gpermute function
|
|
||||||
template < typename VectorSIMD >
|
|
||||||
inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) {
|
|
||||||
Optimization::permute(y.v,b.v,perm);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// Function name aliases
|
// Function name aliases
|
||||||
typedef Optimization::Vsplat VsplatSIMD;
|
typedef Optimization::Vsplat VsplatSIMD;
|
||||||
typedef Optimization::Vstore VstoreSIMD;
|
typedef Optimization::Vstore VstoreSIMD;
|
||||||
@ -326,16 +587,21 @@ namespace Grid {
|
|||||||
typedef Optimization::Vstream VstreamSIMD;
|
typedef Optimization::Vstream VstreamSIMD;
|
||||||
template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
|
template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// Arithmetic operations
|
// Arithmetic operations
|
||||||
typedef Optimization::Sum SumSIMD;
|
typedef Optimization::Sum SumSIMD;
|
||||||
typedef Optimization::Sub SubSIMD;
|
typedef Optimization::Sub SubSIMD;
|
||||||
|
typedef Optimization::Div DivSIMD;
|
||||||
typedef Optimization::Mult MultSIMD;
|
typedef Optimization::Mult MultSIMD;
|
||||||
typedef Optimization::MultComplex MultComplexSIMD;
|
typedef Optimization::MultComplex MultComplexSIMD;
|
||||||
|
typedef Optimization::MultRealPart MultRealPartSIMD;
|
||||||
|
typedef Optimization::MaddRealPart MaddRealPartSIMD;
|
||||||
typedef Optimization::Conj ConjSIMD;
|
typedef Optimization::Conj ConjSIMD;
|
||||||
typedef Optimization::TimesMinusI TimesMinusISIMD;
|
typedef Optimization::TimesMinusI TimesMinusISIMD;
|
||||||
typedef Optimization::TimesI TimesISIMD;
|
typedef Optimization::TimesI TimesISIMD;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//#endif // ARM_NEON
|
||||||
|
@ -328,15 +328,15 @@ class Grid_simd {
|
|||||||
///////////////////////////////////////
|
///////////////////////////////////////
|
||||||
|
|
||||||
//#if (__GNUC__ == 5 ) || ( ( __GNUC__ == 6 ) && __GNUC_MINOR__ < 3 )
|
//#if (__GNUC__ == 5 ) || ( ( __GNUC__ == 6 ) && __GNUC_MINOR__ < 3 )
|
||||||
//#pragma GCC push_options
|
//#pragma GCC push_options
|
||||||
//#pragma GCC optimize ("O0")
|
//#pragma GCC optimize ("O0")
|
||||||
//#endif
|
//#endif
|
||||||
template <class functor>
|
template <class functor>
|
||||||
friend inline Grid_simd SimdApply(const functor &func, const Grid_simd &v) {
|
friend inline Grid_simd SimdApply(const functor &func, const Grid_simd &v) {
|
||||||
Grid_simd ret;
|
Grid_simd ret;
|
||||||
Grid_simd::conv_t conv;
|
Grid_simd::conv_t conv;
|
||||||
Grid_simd::scalar_type s;
|
Grid_simd::scalar_type s;
|
||||||
|
|
||||||
conv.v = v.v;
|
conv.v = v.v;
|
||||||
for (int i = 0; i < Nsimd(); i++) {
|
for (int i = 0; i < Nsimd(); i++) {
|
||||||
s = conv.s[i];
|
s = conv.s[i];
|
||||||
@ -368,7 +368,7 @@ class Grid_simd {
|
|||||||
//#pragma GCC pop_options
|
//#pragma GCC pop_options
|
||||||
//#endif
|
//#endif
|
||||||
///////////////////////
|
///////////////////////
|
||||||
// Exchange
|
// Exchange
|
||||||
// Al Ah , Bl Bh -> Al Bl Ah,Bh
|
// Al Ah , Bl Bh -> Al Bl Ah,Bh
|
||||||
///////////////////////
|
///////////////////////
|
||||||
friend inline void exchange(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2,int n)
|
friend inline void exchange(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2,int n)
|
||||||
@ -379,7 +379,7 @@ class Grid_simd {
|
|||||||
Optimization::Exchange::Exchange2(out1.v,out2.v,in1.v,in2.v);
|
Optimization::Exchange::Exchange2(out1.v,out2.v,in1.v,in2.v);
|
||||||
} else if(n==1) {
|
} else if(n==1) {
|
||||||
Optimization::Exchange::Exchange1(out1.v,out2.v,in1.v,in2.v);
|
Optimization::Exchange::Exchange1(out1.v,out2.v,in1.v,in2.v);
|
||||||
} else if(n==0) {
|
} else if(n==0) {
|
||||||
Optimization::Exchange::Exchange0(out1.v,out2.v,in1.v,in2.v);
|
Optimization::Exchange::Exchange0(out1.v,out2.v,in1.v,in2.v);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -406,7 +406,7 @@ class Grid_simd {
|
|||||||
int dist = perm & 0xF;
|
int dist = perm & 0xF;
|
||||||
y = rotate(b, dist);
|
y = rotate(b, dist);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
else if(perm==3) permute3(y, b);
|
else if(perm==3) permute3(y, b);
|
||||||
else if(perm==2) permute2(y, b);
|
else if(perm==2) permute2(y, b);
|
||||||
else if(perm==1) permute1(y, b);
|
else if(perm==1) permute1(y, b);
|
||||||
@ -425,7 +425,7 @@ class Grid_simd {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}; // end of Grid_simd class definition
|
}; // end of Grid_simd class definition
|
||||||
|
|
||||||
|
|
||||||
@ -451,29 +451,29 @@ inline Grid_simd<S, V> rotate(Grid_simd<S, V> b, int nrot) {
|
|||||||
ret.v = Optimization::Rotate::rotate(b.v, 2 * nrot);
|
ret.v = Optimization::Rotate::rotate(b.v, 2 * nrot);
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
template <class S, class V, IfNotComplex<S> =0>
|
template <class S, class V, IfNotComplex<S> =0>
|
||||||
inline void rotate( Grid_simd<S,V> &ret,Grid_simd<S,V> b,int nrot)
|
inline void rotate( Grid_simd<S,V> &ret,Grid_simd<S,V> b,int nrot)
|
||||||
{
|
{
|
||||||
nrot = nrot % Grid_simd<S,V>::Nsimd();
|
nrot = nrot % Grid_simd<S,V>::Nsimd();
|
||||||
ret.v = Optimization::Rotate::rotate(b.v,nrot);
|
ret.v = Optimization::Rotate::rotate(b.v,nrot);
|
||||||
}
|
}
|
||||||
template <class S, class V, IfComplex<S> =0>
|
template <class S, class V, IfComplex<S> =0>
|
||||||
inline void rotate(Grid_simd<S,V> &ret,Grid_simd<S,V> b,int nrot)
|
inline void rotate(Grid_simd<S,V> &ret,Grid_simd<S,V> b,int nrot)
|
||||||
{
|
{
|
||||||
nrot = nrot % Grid_simd<S,V>::Nsimd();
|
nrot = nrot % Grid_simd<S,V>::Nsimd();
|
||||||
ret.v = Optimization::Rotate::rotate(b.v,2*nrot);
|
ret.v = Optimization::Rotate::rotate(b.v,2*nrot);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class S, class V>
|
template <class S, class V>
|
||||||
inline void vbroadcast(Grid_simd<S,V> &ret,const Grid_simd<S,V> &src,int lane){
|
inline void vbroadcast(Grid_simd<S,V> &ret,const Grid_simd<S,V> &src,int lane){
|
||||||
S* typepun =(S*) &src;
|
S* typepun =(S*) &src;
|
||||||
vsplat(ret,typepun[lane]);
|
vsplat(ret,typepun[lane]);
|
||||||
}
|
}
|
||||||
template <class S, class V, IfComplex<S> =0>
|
template <class S, class V, IfComplex<S> =0>
|
||||||
inline void rbroadcast(Grid_simd<S,V> &ret,const Grid_simd<S,V> &src,int lane){
|
inline void rbroadcast(Grid_simd<S,V> &ret,const Grid_simd<S,V> &src,int lane){
|
||||||
S* typepun =(S*) &src;
|
S* typepun =(S*) &src;
|
||||||
ret.v = unary<V>(real(typepun[lane]), VsplatSIMD());
|
ret.v = unary<V>(real(typepun[lane]), VsplatSIMD());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@ -604,13 +604,27 @@ inline Grid_simd<S, V> real_mult(Grid_simd<S, V> a, Grid_simd<S, V> b) {
|
|||||||
ret.v = binary<V>(a.v, b.v, MultRealPartSIMD());
|
ret.v = binary<V>(a.v, b.v, MultRealPartSIMD());
|
||||||
return ret;
|
return ret;
|
||||||
};
|
};
|
||||||
|
// TEST for Test_simd
|
||||||
|
template <class S, class V, IfComplex<S> = 0>
|
||||||
|
inline Grid_simd<S, V> real_mult(std::complex<S> a, std::complex<S> b) {
|
||||||
|
Grid_simd<S, V> ret;
|
||||||
|
//ret.v = binary<V>(a.v, b.v, MultRealPartSIMD());
|
||||||
|
return ret;
|
||||||
|
};
|
||||||
|
|
||||||
template <class S, class V, IfComplex<S> = 0>
|
template <class S, class V, IfComplex<S> = 0>
|
||||||
inline Grid_simd<S, V> real_madd(Grid_simd<S, V> a, Grid_simd<S, V> b, Grid_simd<S,V> c) {
|
inline Grid_simd<S, V> real_madd(Grid_simd<S, V> a, Grid_simd<S, V> b, Grid_simd<S,V> c) {
|
||||||
Grid_simd<S, V> ret;
|
Grid_simd<S, V> ret;
|
||||||
ret.v = trinary<V>(a.v, b.v, c.v, MaddRealPartSIMD());
|
ret.v = trinary<V>(a.v, b.v, c.v, MaddRealPartSIMD());
|
||||||
return ret;
|
return ret;
|
||||||
};
|
};
|
||||||
|
// TEST for Test_simd
|
||||||
|
template <class S, class V, IfComplex<S> = 0>
|
||||||
|
inline Grid_simd<S, V> real_madd(std::complex<S> a, std::complex<S> b) {
|
||||||
|
Grid_simd<S, V> ret;
|
||||||
|
//ret.v = binary<V>(a.v, b.v, MultRealPartSIMD());
|
||||||
|
return ret;
|
||||||
|
};
|
||||||
|
|
||||||
// Distinguish between complex types and others
|
// Distinguish between complex types and others
|
||||||
template <class S, class V, IfComplex<S> = 0>
|
template <class S, class V, IfComplex<S> = 0>
|
||||||
@ -640,7 +654,7 @@ inline Grid_simd<S, V> operator/(Grid_simd<S, V> a, Grid_simd<S, V> b) {
|
|||||||
ret = a * conjugate(b) ;
|
ret = a * conjugate(b) ;
|
||||||
den = b * conjugate(b) ;
|
den = b * conjugate(b) ;
|
||||||
|
|
||||||
|
|
||||||
auto real_den = toReal(den);
|
auto real_den = toReal(den);
|
||||||
|
|
||||||
ret.v=binary<V>(ret.v, real_den.v, DivSIMD());
|
ret.v=binary<V>(ret.v, real_den.v, DivSIMD());
|
||||||
|
Loading…
Reference in New Issue
Block a user