mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-10 07:55:35 +00:00
Merge branch 'develop' of https://github.com/paboyle/Grid into develop
This commit is contained in:
commit
a0be3f7330
14
README.md
14
README.md
@ -18,7 +18,7 @@
|
|||||||
|
|
||||||
License: GPL v2.
|
License: GPL v2.
|
||||||
|
|
||||||
Last update Nov 2016.
|
Last update June 2017.
|
||||||
|
|
||||||
_Please do not send pull requests to the `master` branch which is reserved for releases._
|
_Please do not send pull requests to the `master` branch which is reserved for releases._
|
||||||
|
|
||||||
@ -78,13 +78,16 @@ optimally use MPI, OpenMP and SIMD parallelism under the hood. This is a signifi
|
|||||||
for most programmers.
|
for most programmers.
|
||||||
|
|
||||||
The layout transformations are parametrised by the SIMD vector length. This adapts according to the architecture.
|
The layout transformations are parametrised by the SIMD vector length. This adapts according to the architecture.
|
||||||
Presently SSE4 (128 bit) AVX, AVX2, QPX (256 bit), IMCI, and AVX512 (512 bit) targets are supported (ARM NEON on the way).
|
Presently SSE4, ARM NEON (128 bits) AVX, AVX2, QPX (256 bits), IMCI and AVX512 (512 bits) targets are supported.
|
||||||
|
|
||||||
These are presented as `vRealF`, `vRealD`, `vComplexF`, and `vComplexD` internal vector data types. These may be useful in themselves for other programmers.
|
These are presented as `vRealF`, `vRealD`, `vComplexF`, and `vComplexD` internal vector data types.
|
||||||
The corresponding scalar types are named `RealF`, `RealD`, `ComplexF` and `ComplexD`.
|
The corresponding scalar types are named `RealF`, `RealD`, `ComplexF` and `ComplexD`.
|
||||||
|
|
||||||
MPI, OpenMP, and SIMD parallelism are present in the library.
|
MPI, OpenMP, and SIMD parallelism are present in the library.
|
||||||
Please see https://arxiv.org/abs/1512.03487 for more detail.
|
Please see [this paper](https://arxiv.org/abs/1512.03487) for more detail.
|
||||||
|
|
||||||
|
### Required libraries
|
||||||
|
Grid requires [GMP](https://gmplib.org/), [MPFR](http://www.mpfr.org/) and optionally [HDF5](https://support.hdfgroup.org/HDF5/) and [LIME](http://usqcd-software.github.io/c-lime/) (for ILDG file format support) to be installed.
|
||||||
|
|
||||||
### Quick start
|
### Quick start
|
||||||
First, start by cloning the repository:
|
First, start by cloning the repository:
|
||||||
@ -173,7 +176,8 @@ The following options can be use with the `--enable-simd=` option to target diff
|
|||||||
| `AVXFMA4` | AVX (256 bit) + FMA4 |
|
| `AVXFMA4` | AVX (256 bit) + FMA4 |
|
||||||
| `AVX2` | AVX 2 (256 bit) |
|
| `AVX2` | AVX 2 (256 bit) |
|
||||||
| `AVX512` | AVX 512 bit |
|
| `AVX512` | AVX 512 bit |
|
||||||
| `QPX` | QPX (256 bit) |
|
| `NEONv8` | [ARM NEON](http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.den0024a/ch07s03.html) (128 bit) |
|
||||||
|
| `QPX` | IBM QPX (256 bit) |
|
||||||
|
|
||||||
Alternatively, some CPU codenames can be directly used:
|
Alternatively, some CPU codenames can be directly used:
|
||||||
|
|
||||||
|
@ -249,6 +249,9 @@ case ${ax_cv_cxx_compiler_vendor} in
|
|||||||
[generic SIMD vector width (in bytes)])
|
[generic SIMD vector width (in bytes)])
|
||||||
SIMD_GEN_WIDTH_MSG=" (width= $ac_gen_simd_width)"
|
SIMD_GEN_WIDTH_MSG=" (width= $ac_gen_simd_width)"
|
||||||
SIMD_FLAGS='';;
|
SIMD_FLAGS='';;
|
||||||
|
NEONv8)
|
||||||
|
AC_DEFINE([NEONV8],[1],[ARMv8 NEON])
|
||||||
|
SIMD_FLAGS='-march=armv8-a';;
|
||||||
QPX|BGQ)
|
QPX|BGQ)
|
||||||
AC_DEFINE([QPX],[1],[QPX intrinsics for BG/Q])
|
AC_DEFINE([QPX],[1],[QPX intrinsics for BG/Q])
|
||||||
SIMD_FLAGS='';;
|
SIMD_FLAGS='';;
|
||||||
|
@ -108,7 +108,7 @@ void WilsonFlow<Gimpl>::evolve_step_adaptive(typename Gimpl::GaugeField &U, Real
|
|||||||
if (maxTau - taus < epsilon){
|
if (maxTau - taus < epsilon){
|
||||||
epsilon = maxTau-taus;
|
epsilon = maxTau-taus;
|
||||||
}
|
}
|
||||||
std::cout << GridLogMessage << "Integration epsilon : " << epsilon << std::endl;
|
//std::cout << GridLogMessage << "Integration epsilon : " << epsilon << std::endl;
|
||||||
GaugeField Z(U._grid);
|
GaugeField Z(U._grid);
|
||||||
GaugeField Zprime(U._grid);
|
GaugeField Zprime(U._grid);
|
||||||
GaugeField tmp(U._grid), Uprime(U._grid);
|
GaugeField tmp(U._grid), Uprime(U._grid);
|
||||||
@ -138,10 +138,10 @@ void WilsonFlow<Gimpl>::evolve_step_adaptive(typename Gimpl::GaugeField &U, Real
|
|||||||
// adjust integration step
|
// adjust integration step
|
||||||
|
|
||||||
taus += epsilon;
|
taus += epsilon;
|
||||||
std::cout << GridLogMessage << "Adjusting integration step with distance: " << diff << std::endl;
|
//std::cout << GridLogMessage << "Adjusting integration step with distance: " << diff << std::endl;
|
||||||
|
|
||||||
epsilon = epsilon*0.95*std::pow(1e-4/diff,1./3.);
|
epsilon = epsilon*0.95*std::pow(1e-4/diff,1./3.);
|
||||||
std::cout << GridLogMessage << "New epsilon : " << epsilon << std::endl;
|
//std::cout << GridLogMessage << "New epsilon : " << epsilon << std::endl;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -166,7 +166,6 @@ void WilsonFlow<Gimpl>::smear(GaugeField& out, const GaugeField& in) const {
|
|||||||
out = in;
|
out = in;
|
||||||
for (unsigned int step = 1; step <= Nstep; step++) {
|
for (unsigned int step = 1; step <= Nstep; step++) {
|
||||||
auto start = std::chrono::high_resolution_clock::now();
|
auto start = std::chrono::high_resolution_clock::now();
|
||||||
std::cout << GridLogMessage << "Evolution time :"<< tau(step) << std::endl;
|
|
||||||
evolve_step(out);
|
evolve_step(out);
|
||||||
auto end = std::chrono::high_resolution_clock::now();
|
auto end = std::chrono::high_resolution_clock::now();
|
||||||
std::chrono::duration<double> diff = end - start;
|
std::chrono::duration<double> diff = end - start;
|
||||||
@ -191,7 +190,7 @@ void WilsonFlow<Gimpl>::smear_adaptive(GaugeField& out, const GaugeField& in, Re
|
|||||||
unsigned int step = 0;
|
unsigned int step = 0;
|
||||||
do{
|
do{
|
||||||
step++;
|
step++;
|
||||||
std::cout << GridLogMessage << "Evolution time :"<< taus << std::endl;
|
//std::cout << GridLogMessage << "Evolution time :"<< taus << std::endl;
|
||||||
evolve_step_adaptive(out, maxTau);
|
evolve_step_adaptive(out, maxTau);
|
||||||
std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : "
|
std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : "
|
||||||
<< step << " "
|
<< step << " "
|
||||||
|
@ -6,6 +6,7 @@
|
|||||||
|
|
||||||
Copyright (C) 2015
|
Copyright (C) 2015
|
||||||
|
|
||||||
|
Author: Nils Meyer <nils.meyer@ur.de>
|
||||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
Author: neo <cossu@post.kek.jp>
|
Author: neo <cossu@post.kek.jp>
|
||||||
|
|
||||||
@ -26,19 +27,25 @@ Author: neo <cossu@post.kek.jp>
|
|||||||
See the full license in the file "LICENSE" in the top level distribution directory
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
*************************************************************************************/
|
*************************************************************************************/
|
||||||
/* END LEGAL */
|
/* END LEGAL */
|
||||||
//----------------------------------------------------------------------
|
|
||||||
/*! @file Grid_sse4.h
|
|
||||||
@brief Optimization libraries for NEON (ARM) instructions set ARMv8
|
|
||||||
|
|
||||||
Experimental - Using intrinsics - DEVELOPING!
|
/*
|
||||||
|
|
||||||
|
ARMv8 NEON intrinsics layer by
|
||||||
|
|
||||||
|
Nils Meyer <nils.meyer@ur.de>,
|
||||||
|
University of Regensburg, Germany
|
||||||
|
SFB/TRR55
|
||||||
|
|
||||||
*/
|
*/
|
||||||
// Time-stamp: <2015-07-10 17:45:09 neo>
|
|
||||||
//----------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
#ifndef GEN_SIMD_WIDTH
|
||||||
|
#define GEN_SIMD_WIDTH 16u
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include "Grid_generic_types.h"
|
||||||
#include <arm_neon.h>
|
#include <arm_neon.h>
|
||||||
|
|
||||||
// ARMv8 supports double precision
|
namespace Grid {
|
||||||
|
|
||||||
namespace Optimization {
|
namespace Optimization {
|
||||||
|
|
||||||
template<class vtype>
|
template<class vtype>
|
||||||
@ -46,14 +53,18 @@ namespace Optimization {
|
|||||||
float32x4_t f;
|
float32x4_t f;
|
||||||
vtype v;
|
vtype v;
|
||||||
};
|
};
|
||||||
|
|
||||||
union u128f {
|
union u128f {
|
||||||
float32x4_t v;
|
float32x4_t v;
|
||||||
float f[4];
|
float f[4];
|
||||||
};
|
};
|
||||||
union u128d {
|
union u128d {
|
||||||
float64x2_t v;
|
float64x2_t v;
|
||||||
double f[4];
|
double f[2];
|
||||||
|
};
|
||||||
|
// half precision
|
||||||
|
union u128h {
|
||||||
|
float16x8_t v;
|
||||||
|
uint16_t f[8];
|
||||||
};
|
};
|
||||||
|
|
||||||
struct Vsplat{
|
struct Vsplat{
|
||||||
@ -64,20 +75,20 @@ namespace Optimization {
|
|||||||
}
|
}
|
||||||
// Real float
|
// Real float
|
||||||
inline float32x4_t operator()(float a){
|
inline float32x4_t operator()(float a){
|
||||||
return vld1q_dup_f32(&a);
|
return vdupq_n_f32(a);
|
||||||
}
|
}
|
||||||
//Complex double
|
//Complex double
|
||||||
inline float32x4_t operator()(double a, double b){
|
inline float64x2_t operator()(double a, double b){
|
||||||
float tmp[4]={(float)a,(float)b,(float)a,(float)b};
|
double tmp[2]={a,b};
|
||||||
return vld1q_f32(tmp);
|
return vld1q_f64(tmp);
|
||||||
}
|
}
|
||||||
//Real double
|
//Real double // N:tbc
|
||||||
inline float32x4_t operator()(double a){
|
inline float64x2_t operator()(double a){
|
||||||
return vld1q_dup_f32(&a);
|
return vdupq_n_f64(a);
|
||||||
}
|
}
|
||||||
//Integer
|
//Integer // N:tbc
|
||||||
inline uint32x4_t operator()(Integer a){
|
inline uint32x4_t operator()(Integer a){
|
||||||
return vld1q_dup_u32(&a);
|
return vdupq_n_u32(a);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -87,8 +98,8 @@ namespace Optimization {
|
|||||||
vst1q_f32(F, a);
|
vst1q_f32(F, a);
|
||||||
}
|
}
|
||||||
//Double
|
//Double
|
||||||
inline void operator()(float32x4_t a, double* D){
|
inline void operator()(float64x2_t a, double* D){
|
||||||
vst1q_f32((float*)D, a);
|
vst1q_f64(D, a);
|
||||||
}
|
}
|
||||||
//Integer
|
//Integer
|
||||||
inline void operator()(uint32x4_t a, Integer* I){
|
inline void operator()(uint32x4_t a, Integer* I){
|
||||||
@ -97,49 +108,49 @@ namespace Optimization {
|
|||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
struct Vstream{
|
struct Vstream{ // N:equivalents to _mm_stream_p* in NEON?
|
||||||
//Float
|
//Float // N:generic
|
||||||
inline void operator()(float * a, float32x4_t b){
|
inline void operator()(float * a, float32x4_t b){
|
||||||
|
memcpy(a,&b,4*sizeof(float));
|
||||||
}
|
}
|
||||||
//Double
|
//Double // N:generic
|
||||||
inline void operator()(double * a, float32x4_t b){
|
inline void operator()(double * a, float64x2_t b){
|
||||||
|
memcpy(a,&b,2*sizeof(double));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Nils: Vset untested; not used currently in Grid at all;
|
||||||
|
// git commit 4a8c4ccfba1d05159348d21a9698028ea847e77b
|
||||||
struct Vset{
|
struct Vset{
|
||||||
// Complex float
|
// Complex float // N:ok
|
||||||
inline float32x4_t operator()(Grid::ComplexF *a){
|
inline float32x4_t operator()(Grid::ComplexF *a){
|
||||||
float32x4_t foo;
|
float tmp[4]={a[1].imag(),a[1].real(),a[0].imag(),a[0].real()};
|
||||||
return foo;
|
return vld1q_f32(tmp);
|
||||||
}
|
}
|
||||||
// Complex double
|
// Complex double // N:ok
|
||||||
inline float32x4_t operator()(Grid::ComplexD *a){
|
inline float64x2_t operator()(Grid::ComplexD *a){
|
||||||
float32x4_t foo;
|
double tmp[2]={a[0].imag(),a[0].real()};
|
||||||
return foo;
|
return vld1q_f64(tmp);
|
||||||
}
|
}
|
||||||
// Real float
|
// Real float // N:ok
|
||||||
inline float32x4_t operator()(float *a){
|
inline float32x4_t operator()(float *a){
|
||||||
float32x4_t foo;
|
float tmp[4]={a[3],a[2],a[1],a[0]};
|
||||||
return foo;
|
return vld1q_f32(tmp);
|
||||||
}
|
}
|
||||||
// Real double
|
// Real double // N:ok
|
||||||
inline float32x4_t operator()(double *a){
|
inline float64x2_t operator()(double *a){
|
||||||
float32x4_t foo;
|
double tmp[2]={a[1],a[0]};
|
||||||
return foo;
|
return vld1q_f64(tmp);
|
||||||
}
|
}
|
||||||
// Integer
|
// Integer // N:ok
|
||||||
inline uint32x4_t operator()(Integer *a){
|
inline uint32x4_t operator()(Integer *a){
|
||||||
uint32x4_t foo;
|
return vld1q_dup_u32(a);
|
||||||
return foo;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// N:leaving as is
|
||||||
template <typename Out_type, typename In_type>
|
template <typename Out_type, typename In_type>
|
||||||
struct Reduce{
|
struct Reduce{
|
||||||
//Need templated class to overload output type
|
//Need templated class to overload output type
|
||||||
@ -184,26 +195,98 @@ namespace Optimization {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct MultRealPart{
|
||||||
|
inline float32x4_t operator()(float32x4_t a, float32x4_t b){
|
||||||
|
float32x4_t re = vtrn1q_f32(a, a);
|
||||||
|
return vmulq_f32(re, b);
|
||||||
|
}
|
||||||
|
inline float64x2_t operator()(float64x2_t a, float64x2_t b){
|
||||||
|
float64x2_t re = vzip1q_f64(a, a);
|
||||||
|
return vmulq_f64(re, b);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct MaddRealPart{
|
||||||
|
inline float32x4_t operator()(float32x4_t a, float32x4_t b, float32x4_t c){
|
||||||
|
float32x4_t re = vtrn1q_f32(a, a);
|
||||||
|
return vfmaq_f32(c, re, b);
|
||||||
|
}
|
||||||
|
inline float64x2_t operator()(float64x2_t a, float64x2_t b, float64x2_t c){
|
||||||
|
float64x2_t re = vzip1q_f64(a, a);
|
||||||
|
return vfmaq_f64(c, re, b);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct Div{
|
||||||
|
// Real float
|
||||||
|
inline float32x4_t operator()(float32x4_t a, float32x4_t b){
|
||||||
|
return vdivq_f32(a, b);
|
||||||
|
}
|
||||||
|
// Real double
|
||||||
|
inline float64x2_t operator()(float64x2_t a, float64x2_t b){
|
||||||
|
return vdivq_f64(a, b);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
struct MultComplex{
|
struct MultComplex{
|
||||||
// Complex float
|
// Complex float
|
||||||
inline float32x4_t operator()(float32x4_t a, float32x4_t b){
|
inline float32x4_t operator()(float32x4_t a, float32x4_t b){
|
||||||
float32x4_t foo;
|
|
||||||
return foo;
|
float32x4_t r0, r1, r2, r3, r4;
|
||||||
|
|
||||||
|
// a = ar ai Ar Ai
|
||||||
|
// b = br bi Br Bi
|
||||||
|
// collect real/imag part, negate bi and Bi
|
||||||
|
r0 = vtrn1q_f32(b, b); // br br Br Br
|
||||||
|
r1 = vnegq_f32(b); // -br -bi -Br -Bi
|
||||||
|
r2 = vtrn2q_f32(b, r1); // bi -bi Bi -Bi
|
||||||
|
|
||||||
|
// the fun part
|
||||||
|
r3 = vmulq_f32(r2, a); // bi*ar -bi*ai ...
|
||||||
|
r4 = vrev64q_f32(r3); // -bi*ai bi*ar ...
|
||||||
|
|
||||||
|
// fma(a,b,c) = a+b*c
|
||||||
|
return vfmaq_f32(r4, r0, a); // ar*br-ai*bi ai*br+ar*bi ...
|
||||||
|
|
||||||
|
// no fma, use mul and add
|
||||||
|
//float32x4_t r5;
|
||||||
|
//r5 = vmulq_f32(r0, a);
|
||||||
|
//return vaddq_f32(r4, r5);
|
||||||
}
|
}
|
||||||
// Complex double
|
// Complex double
|
||||||
inline float64x2_t operator()(float64x2_t a, float64x2_t b){
|
inline float64x2_t operator()(float64x2_t a, float64x2_t b){
|
||||||
float32x4_t foo;
|
|
||||||
return foo;
|
float64x2_t r0, r1, r2, r3, r4;
|
||||||
|
|
||||||
|
// b = br bi
|
||||||
|
// collect real/imag part, negate bi
|
||||||
|
r0 = vtrn1q_f64(b, b); // br br
|
||||||
|
r1 = vnegq_f64(b); // -br -bi
|
||||||
|
r2 = vtrn2q_f64(b, r1); // bi -bi
|
||||||
|
|
||||||
|
// the fun part
|
||||||
|
r3 = vmulq_f64(r2, a); // bi*ar -bi*ai
|
||||||
|
r4 = vextq_f64(r3,r3,1); // -bi*ai bi*ar
|
||||||
|
|
||||||
|
// fma(a,b,c) = a+b*c
|
||||||
|
return vfmaq_f64(r4, r0, a); // ar*br-ai*bi ai*br+ar*bi
|
||||||
|
|
||||||
|
// no fma, use mul and add
|
||||||
|
//float64x2_t r5;
|
||||||
|
//r5 = vmulq_f64(r0, a);
|
||||||
|
//return vaddq_f64(r4, r5);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
struct Mult{
|
struct Mult{
|
||||||
// Real float
|
// Real float
|
||||||
inline float32x4_t mac(float32x4_t a, float32x4_t b, float32x4_t c){
|
inline float32x4_t mac(float32x4_t a, float32x4_t b, float32x4_t c){
|
||||||
return vaddq_f32(vmulq_f32(b,c),a);
|
//return vaddq_f32(vmulq_f32(b,c),a);
|
||||||
|
return vfmaq_f32(a, b, c);
|
||||||
}
|
}
|
||||||
inline float64x2_t mac(float64x2_t a, float64x2_t b, float64x2_t c){
|
inline float64x2_t mac(float64x2_t a, float64x2_t b, float64x2_t c){
|
||||||
return vaddq_f64(vmulq_f64(b,c),a);
|
//return vaddq_f64(vmulq_f64(b,c),a);
|
||||||
|
return vfmaq_f64(a, b, c);
|
||||||
}
|
}
|
||||||
inline float32x4_t operator()(float32x4_t a, float32x4_t b){
|
inline float32x4_t operator()(float32x4_t a, float32x4_t b){
|
||||||
return vmulq_f32(a,b);
|
return vmulq_f32(a,b);
|
||||||
@ -221,74 +304,259 @@ namespace Optimization {
|
|||||||
struct Conj{
|
struct Conj{
|
||||||
// Complex single
|
// Complex single
|
||||||
inline float32x4_t operator()(float32x4_t in){
|
inline float32x4_t operator()(float32x4_t in){
|
||||||
return in;
|
// ar ai br bi -> ar -ai br -bi
|
||||||
|
float32x4_t r0, r1;
|
||||||
|
r0 = vnegq_f32(in); // -ar -ai -br -bi
|
||||||
|
r1 = vrev64q_f32(r0); // -ai -ar -bi -br
|
||||||
|
return vtrn1q_f32(in, r1); // ar -ai br -bi
|
||||||
}
|
}
|
||||||
// Complex double
|
// Complex double
|
||||||
//inline float32x4_t operator()(float32x4_t in){
|
inline float64x2_t operator()(float64x2_t in){
|
||||||
// return 0;
|
|
||||||
//}
|
float64x2_t r0, r1;
|
||||||
|
r0 = vextq_f64(in, in, 1); // ai ar
|
||||||
|
r1 = vnegq_f64(r0); // -ai -ar
|
||||||
|
return vextq_f64(r0, r1, 1); // ar -ai
|
||||||
|
}
|
||||||
// do not define for integer input
|
// do not define for integer input
|
||||||
};
|
};
|
||||||
|
|
||||||
struct TimesMinusI{
|
struct TimesMinusI{
|
||||||
//Complex single
|
//Complex single
|
||||||
inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
|
inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
|
||||||
return in;
|
// ar ai br bi -> ai -ar ai -br
|
||||||
|
float32x4_t r0, r1;
|
||||||
|
r0 = vnegq_f32(in); // -ar -ai -br -bi
|
||||||
|
r1 = vrev64q_f32(in); // ai ar bi br
|
||||||
|
return vtrn1q_f32(r1, r0); // ar -ai br -bi
|
||||||
}
|
}
|
||||||
//Complex double
|
//Complex double
|
||||||
//inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
|
inline float64x2_t operator()(float64x2_t in, float64x2_t ret){
|
||||||
// return in;
|
// a ib -> b -ia
|
||||||
//}
|
float64x2_t tmp;
|
||||||
|
tmp = vnegq_f64(in);
|
||||||
|
return vextq_f64(in, tmp, 1);
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
struct TimesI{
|
struct TimesI{
|
||||||
//Complex single
|
//Complex single
|
||||||
inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
|
inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
|
||||||
//need shuffle
|
// ar ai br bi -> -ai ar -bi br
|
||||||
return in;
|
float32x4_t r0, r1;
|
||||||
|
r0 = vnegq_f32(in); // -ar -ai -br -bi
|
||||||
|
r1 = vrev64q_f32(r0); // -ai -ar -bi -br
|
||||||
|
return vtrn1q_f32(r1, in); // -ai ar -bi br
|
||||||
}
|
}
|
||||||
//Complex double
|
//Complex double
|
||||||
//inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
|
inline float64x2_t operator()(float64x2_t in, float64x2_t ret){
|
||||||
// return 0;
|
// a ib -> -b ia
|
||||||
//}
|
float64x2_t tmp;
|
||||||
|
tmp = vnegq_f64(in);
|
||||||
|
return vextq_f64(tmp, in, 1);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct Permute{
|
||||||
|
|
||||||
|
static inline float32x4_t Permute0(float32x4_t in){ // N:ok
|
||||||
|
// AB CD -> CD AB
|
||||||
|
return vextq_f32(in, in, 2);
|
||||||
|
};
|
||||||
|
static inline float32x4_t Permute1(float32x4_t in){ // N:ok
|
||||||
|
// AB CD -> BA DC
|
||||||
|
return vrev64q_f32(in);
|
||||||
|
};
|
||||||
|
static inline float32x4_t Permute2(float32x4_t in){ // N:not used by Boyle
|
||||||
|
return in;
|
||||||
|
};
|
||||||
|
static inline float32x4_t Permute3(float32x4_t in){ // N:not used by Boyle
|
||||||
|
return in;
|
||||||
|
};
|
||||||
|
|
||||||
|
static inline float64x2_t Permute0(float64x2_t in){ // N:ok
|
||||||
|
// AB -> BA
|
||||||
|
return vextq_f64(in, in, 1);
|
||||||
|
};
|
||||||
|
static inline float64x2_t Permute1(float64x2_t in){ // N:not used by Boyle
|
||||||
|
return in;
|
||||||
|
};
|
||||||
|
static inline float64x2_t Permute2(float64x2_t in){ // N:not used by Boyle
|
||||||
|
return in;
|
||||||
|
};
|
||||||
|
static inline float64x2_t Permute3(float64x2_t in){ // N:not used by Boyle
|
||||||
|
return in;
|
||||||
|
};
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
|
struct Rotate{
|
||||||
|
|
||||||
|
static inline float32x4_t rotate(float32x4_t in,int n){ // N:ok
|
||||||
|
switch(n){
|
||||||
|
case 0: // AB CD -> AB CD
|
||||||
|
return tRotate<0>(in);
|
||||||
|
break;
|
||||||
|
case 1: // AB CD -> BC DA
|
||||||
|
return tRotate<1>(in);
|
||||||
|
break;
|
||||||
|
case 2: // AB CD -> CD AB
|
||||||
|
return tRotate<2>(in);
|
||||||
|
break;
|
||||||
|
case 3: // AB CD -> DA BC
|
||||||
|
return tRotate<3>(in);
|
||||||
|
break;
|
||||||
|
default: assert(0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
static inline float64x2_t rotate(float64x2_t in,int n){ // N:ok
|
||||||
|
switch(n){
|
||||||
|
case 0: // AB -> AB
|
||||||
|
return tRotate<0>(in);
|
||||||
|
break;
|
||||||
|
case 1: // AB -> BA
|
||||||
|
return tRotate<1>(in);
|
||||||
|
break;
|
||||||
|
default: assert(0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// working, but no restriction on n
|
||||||
|
// template<int n> static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n); };
|
||||||
|
// template<int n> static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n); };
|
||||||
|
|
||||||
|
// restriction on n
|
||||||
|
template<int n> static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n%4); };
|
||||||
|
template<int n> static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n%2); };
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
|
struct PrecisionChange {
|
||||||
|
|
||||||
|
static inline float16x8_t StoH (const float32x4_t &a,const float32x4_t &b) {
|
||||||
|
float16x4_t h = vcvt_f16_f32(a);
|
||||||
|
return vcvt_high_f16_f32(h, b);
|
||||||
|
}
|
||||||
|
static inline void HtoS (float16x8_t h,float32x4_t &sa,float32x4_t &sb) {
|
||||||
|
sb = vcvt_high_f32_f16(h);
|
||||||
|
// there is no direct conversion from lower float32x4_t to float64x2_t
|
||||||
|
// vextq_f16 not supported by clang 3.8 / 4.0 / arm clang
|
||||||
|
//float16x8_t h1 = vextq_f16(h, h, 4); // correct, but not supported by clang
|
||||||
|
// workaround for clang
|
||||||
|
uint32x4_t h1u = reinterpret_cast<uint32x4_t>(h);
|
||||||
|
float16x8_t h1 = reinterpret_cast<float16x8_t>(vextq_u32(h1u, h1u, 2));
|
||||||
|
sa = vcvt_high_f32_f16(h1);
|
||||||
|
}
|
||||||
|
static inline float32x4_t DtoS (float64x2_t a,float64x2_t b) {
|
||||||
|
float32x2_t s = vcvt_f32_f64(a);
|
||||||
|
return vcvt_high_f32_f64(s, b);
|
||||||
|
|
||||||
|
}
|
||||||
|
static inline void StoD (float32x4_t s,float64x2_t &a,float64x2_t &b) {
|
||||||
|
b = vcvt_high_f64_f32(s);
|
||||||
|
// there is no direct conversion from lower float32x4_t to float64x2_t
|
||||||
|
float32x4_t s1 = vextq_f32(s, s, 2);
|
||||||
|
a = vcvt_high_f64_f32(s1);
|
||||||
|
|
||||||
|
}
|
||||||
|
static inline float16x8_t DtoH (float64x2_t a,float64x2_t b,float64x2_t c,float64x2_t d) {
|
||||||
|
float32x4_t s1 = DtoS(a, b);
|
||||||
|
float32x4_t s2 = DtoS(c, d);
|
||||||
|
return StoH(s1, s2);
|
||||||
|
}
|
||||||
|
static inline void HtoD (float16x8_t h,float64x2_t &a,float64x2_t &b,float64x2_t &c,float64x2_t &d) {
|
||||||
|
float32x4_t s1, s2;
|
||||||
|
HtoS(h, s1, s2);
|
||||||
|
StoD(s1, a, b);
|
||||||
|
StoD(s2, c, d);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
//////////////////////////////////////////////
|
||||||
|
// Exchange support
|
||||||
|
|
||||||
|
struct Exchange{
|
||||||
|
static inline void Exchange0(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
|
||||||
|
// in1: ABCD -> out1: ABEF
|
||||||
|
// in2: EFGH -> out2: CDGH
|
||||||
|
|
||||||
|
// z: CDAB
|
||||||
|
float32x4_t z = vextq_f32(in1, in1, 2);
|
||||||
|
// out1: ABEF
|
||||||
|
out1 = vextq_f32(z, in2, 2);
|
||||||
|
|
||||||
|
// z: GHEF
|
||||||
|
z = vextq_f32(in2, in2, 2);
|
||||||
|
// out2: CDGH
|
||||||
|
out2 = vextq_f32(in1, z, 2);
|
||||||
|
};
|
||||||
|
|
||||||
|
static inline void Exchange1(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
|
||||||
|
// in1: ABCD -> out1: AECG
|
||||||
|
// in2: EFGH -> out2: BFDH
|
||||||
|
out1 = vtrn1q_f32(in1, in2);
|
||||||
|
out2 = vtrn2q_f32(in1, in2);
|
||||||
|
};
|
||||||
|
static inline void Exchange2(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
|
||||||
|
assert(0);
|
||||||
|
return;
|
||||||
|
};
|
||||||
|
static inline void Exchange3(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
|
||||||
|
assert(0);
|
||||||
|
return;
|
||||||
|
};
|
||||||
|
// double precision
|
||||||
|
static inline void Exchange0(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
|
||||||
|
// in1: AB -> out1: AC
|
||||||
|
// in2: CD -> out2: BD
|
||||||
|
out1 = vzip1q_f64(in1, in2);
|
||||||
|
out2 = vzip2q_f64(in1, in2);
|
||||||
|
};
|
||||||
|
static inline void Exchange1(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
|
||||||
|
assert(0);
|
||||||
|
return;
|
||||||
|
};
|
||||||
|
static inline void Exchange2(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
|
||||||
|
assert(0);
|
||||||
|
return;
|
||||||
|
};
|
||||||
|
static inline void Exchange3(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
|
||||||
|
assert(0);
|
||||||
|
return;
|
||||||
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
//////////////////////////////////////////////
|
//////////////////////////////////////////////
|
||||||
// Some Template specialization
|
// Some Template specialization
|
||||||
template < typename vtype >
|
|
||||||
void permute(vtype &a, vtype b, int perm) {
|
|
||||||
|
|
||||||
};
|
|
||||||
|
|
||||||
//Complex float Reduce
|
//Complex float Reduce
|
||||||
template<>
|
template<>
|
||||||
inline Grid::ComplexF Reduce<Grid::ComplexF, float32x4_t>::operator()(float32x4_t in){
|
inline Grid::ComplexF Reduce<Grid::ComplexF, float32x4_t>::operator()(float32x4_t in){
|
||||||
return 0;
|
float32x4_t v1; // two complex
|
||||||
|
v1 = Optimization::Permute::Permute0(in);
|
||||||
|
v1 = vaddq_f32(v1,in);
|
||||||
|
u128f conv; conv.v=v1;
|
||||||
|
return Grid::ComplexF(conv.f[0],conv.f[1]);
|
||||||
}
|
}
|
||||||
//Real float Reduce
|
//Real float Reduce
|
||||||
template<>
|
template<>
|
||||||
inline Grid::RealF Reduce<Grid::RealF, float32x4_t>::operator()(float32x4_t in){
|
inline Grid::RealF Reduce<Grid::RealF, float32x4_t>::operator()(float32x4_t in){
|
||||||
float32x2_t high = vget_high_f32(in);
|
return vaddvq_f32(in);
|
||||||
float32x2_t low = vget_low_f32(in);
|
|
||||||
float32x2_t tmp = vadd_f32(low, high);
|
|
||||||
float32x2_t sum = vpadd_f32(tmp, tmp);
|
|
||||||
return vget_lane_f32(sum,0);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
//Complex double Reduce
|
//Complex double Reduce
|
||||||
template<>
|
template<> // N:by Boyle
|
||||||
inline Grid::ComplexD Reduce<Grid::ComplexD, float64x2_t>::operator()(float64x2_t in){
|
inline Grid::ComplexD Reduce<Grid::ComplexD, float64x2_t>::operator()(float64x2_t in){
|
||||||
return 0;
|
u128d conv; conv.v = in;
|
||||||
|
return Grid::ComplexD(conv.f[0],conv.f[1]);
|
||||||
}
|
}
|
||||||
|
|
||||||
//Real double Reduce
|
//Real double Reduce
|
||||||
template<>
|
template<>
|
||||||
inline Grid::RealD Reduce<Grid::RealD, float64x2_t>::operator()(float64x2_t in){
|
inline Grid::RealD Reduce<Grid::RealD, float64x2_t>::operator()(float64x2_t in){
|
||||||
float64x2_t sum = vpaddq_f64(in, in);
|
return vaddvq_f64(in);
|
||||||
return vgetq_lane_f64(sum,0);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
//Integer Reduce
|
//Integer Reduce
|
||||||
@ -302,8 +570,9 @@ namespace Optimization {
|
|||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Here assign types
|
// Here assign types
|
||||||
namespace Grid {
|
|
||||||
|
|
||||||
|
// typedef Optimization::vech SIMD_Htype; // Reduced precision type
|
||||||
|
typedef float16x8_t SIMD_Htype; // Half precision type
|
||||||
typedef float32x4_t SIMD_Ftype; // Single precision type
|
typedef float32x4_t SIMD_Ftype; // Single precision type
|
||||||
typedef float64x2_t SIMD_Dtype; // Double precision type
|
typedef float64x2_t SIMD_Dtype; // Double precision type
|
||||||
typedef uint32x4_t SIMD_Itype; // Integer type
|
typedef uint32x4_t SIMD_Itype; // Integer type
|
||||||
@ -312,13 +581,6 @@ namespace Grid {
|
|||||||
inline void prefetch_HINT_T0(const char *ptr){};
|
inline void prefetch_HINT_T0(const char *ptr){};
|
||||||
|
|
||||||
|
|
||||||
// Gpermute function
|
|
||||||
template < typename VectorSIMD >
|
|
||||||
inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) {
|
|
||||||
Optimization::permute(y.v,b.v,perm);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// Function name aliases
|
// Function name aliases
|
||||||
typedef Optimization::Vsplat VsplatSIMD;
|
typedef Optimization::Vsplat VsplatSIMD;
|
||||||
typedef Optimization::Vstore VstoreSIMD;
|
typedef Optimization::Vstore VstoreSIMD;
|
||||||
@ -332,8 +594,11 @@ namespace Grid {
|
|||||||
// Arithmetic operations
|
// Arithmetic operations
|
||||||
typedef Optimization::Sum SumSIMD;
|
typedef Optimization::Sum SumSIMD;
|
||||||
typedef Optimization::Sub SubSIMD;
|
typedef Optimization::Sub SubSIMD;
|
||||||
|
typedef Optimization::Div DivSIMD;
|
||||||
typedef Optimization::Mult MultSIMD;
|
typedef Optimization::Mult MultSIMD;
|
||||||
typedef Optimization::MultComplex MultComplexSIMD;
|
typedef Optimization::MultComplex MultComplexSIMD;
|
||||||
|
typedef Optimization::MultRealPart MultRealPartSIMD;
|
||||||
|
typedef Optimization::MaddRealPart MaddRealPartSIMD;
|
||||||
typedef Optimization::Conj ConjSIMD;
|
typedef Optimization::Conj ConjSIMD;
|
||||||
typedef Optimization::TimesMinusI TimesMinusISIMD;
|
typedef Optimization::TimesMinusI TimesMinusISIMD;
|
||||||
typedef Optimization::TimesI TimesISIMD;
|
typedef Optimization::TimesI TimesISIMD;
|
||||||
|
@ -53,7 +53,7 @@ directory
|
|||||||
#if defined IMCI
|
#if defined IMCI
|
||||||
#include "Grid_imci.h"
|
#include "Grid_imci.h"
|
||||||
#endif
|
#endif
|
||||||
#ifdef NEONv8
|
#ifdef NEONV8
|
||||||
#include "Grid_neon.h"
|
#include "Grid_neon.h"
|
||||||
#endif
|
#endif
|
||||||
#if defined QPX
|
#if defined QPX
|
||||||
|
Loading…
Reference in New Issue
Block a user