1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-09-20 09:15:38 +01:00

Merge branch 'develop' of https://github.com/paboyle/Grid into develop

This commit is contained in:
Peter Boyle 2017-06-30 10:53:50 +01:00
commit a0be3f7330
5 changed files with 385 additions and 114 deletions

View File

@ -18,7 +18,7 @@
License: GPL v2. License: GPL v2.
Last update Nov 2016. Last update June 2017.
_Please do not send pull requests to the `master` branch which is reserved for releases._ _Please do not send pull requests to the `master` branch which is reserved for releases._
@ -78,13 +78,16 @@ optimally use MPI, OpenMP and SIMD parallelism under the hood. This is a signifi
for most programmers. for most programmers.
The layout transformations are parametrised by the SIMD vector length. This adapts according to the architecture. The layout transformations are parametrised by the SIMD vector length. This adapts according to the architecture.
Presently SSE4 (128 bit) AVX, AVX2, QPX (256 bit), IMCI, and AVX512 (512 bit) targets are supported (ARM NEON on the way). Presently SSE4, ARM NEON (128 bits) AVX, AVX2, QPX (256 bits), IMCI and AVX512 (512 bits) targets are supported.
These are presented as `vRealF`, `vRealD`, `vComplexF`, and `vComplexD` internal vector data types. These may be useful in themselves for other programmers. These are presented as `vRealF`, `vRealD`, `vComplexF`, and `vComplexD` internal vector data types.
The corresponding scalar types are named `RealF`, `RealD`, `ComplexF` and `ComplexD`. The corresponding scalar types are named `RealF`, `RealD`, `ComplexF` and `ComplexD`.
MPI, OpenMP, and SIMD parallelism are present in the library. MPI, OpenMP, and SIMD parallelism are present in the library.
Please see https://arxiv.org/abs/1512.03487 for more detail. Please see [this paper](https://arxiv.org/abs/1512.03487) for more detail.
### Required libraries
Grid requires [GMP](https://gmplib.org/), [MPFR](http://www.mpfr.org/) and optionally [HDF5](https://support.hdfgroup.org/HDF5/) and [LIME](http://usqcd-software.github.io/c-lime/) (for ILDG file format support) to be installed.
### Quick start ### Quick start
First, start by cloning the repository: First, start by cloning the repository:
@ -173,7 +176,8 @@ The following options can be use with the `--enable-simd=` option to target diff
| `AVXFMA4` | AVX (256 bit) + FMA4 | | `AVXFMA4` | AVX (256 bit) + FMA4 |
| `AVX2` | AVX 2 (256 bit) | | `AVX2` | AVX 2 (256 bit) |
| `AVX512` | AVX 512 bit | | `AVX512` | AVX 512 bit |
| `QPX` | QPX (256 bit) | | `NEONv8` | [ARM NEON](http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.den0024a/ch07s03.html) (128 bit) |
| `QPX` | IBM QPX (256 bit) |
Alternatively, some CPU codenames can be directly used: Alternatively, some CPU codenames can be directly used:

View File

@ -249,6 +249,9 @@ case ${ax_cv_cxx_compiler_vendor} in
[generic SIMD vector width (in bytes)]) [generic SIMD vector width (in bytes)])
SIMD_GEN_WIDTH_MSG=" (width= $ac_gen_simd_width)" SIMD_GEN_WIDTH_MSG=" (width= $ac_gen_simd_width)"
SIMD_FLAGS='';; SIMD_FLAGS='';;
NEONv8)
AC_DEFINE([NEONV8],[1],[ARMv8 NEON])
SIMD_FLAGS='-march=armv8-a';;
QPX|BGQ) QPX|BGQ)
AC_DEFINE([QPX],[1],[QPX intrinsics for BG/Q]) AC_DEFINE([QPX],[1],[QPX intrinsics for BG/Q])
SIMD_FLAGS='';; SIMD_FLAGS='';;

View File

@ -108,7 +108,7 @@ void WilsonFlow<Gimpl>::evolve_step_adaptive(typename Gimpl::GaugeField &U, Real
if (maxTau - taus < epsilon){ if (maxTau - taus < epsilon){
epsilon = maxTau-taus; epsilon = maxTau-taus;
} }
std::cout << GridLogMessage << "Integration epsilon : " << epsilon << std::endl; //std::cout << GridLogMessage << "Integration epsilon : " << epsilon << std::endl;
GaugeField Z(U._grid); GaugeField Z(U._grid);
GaugeField Zprime(U._grid); GaugeField Zprime(U._grid);
GaugeField tmp(U._grid), Uprime(U._grid); GaugeField tmp(U._grid), Uprime(U._grid);
@ -138,10 +138,10 @@ void WilsonFlow<Gimpl>::evolve_step_adaptive(typename Gimpl::GaugeField &U, Real
// adjust integration step // adjust integration step
taus += epsilon; taus += epsilon;
std::cout << GridLogMessage << "Adjusting integration step with distance: " << diff << std::endl; //std::cout << GridLogMessage << "Adjusting integration step with distance: " << diff << std::endl;
epsilon = epsilon*0.95*std::pow(1e-4/diff,1./3.); epsilon = epsilon*0.95*std::pow(1e-4/diff,1./3.);
std::cout << GridLogMessage << "New epsilon : " << epsilon << std::endl; //std::cout << GridLogMessage << "New epsilon : " << epsilon << std::endl;
} }
@ -166,7 +166,6 @@ void WilsonFlow<Gimpl>::smear(GaugeField& out, const GaugeField& in) const {
out = in; out = in;
for (unsigned int step = 1; step <= Nstep; step++) { for (unsigned int step = 1; step <= Nstep; step++) {
auto start = std::chrono::high_resolution_clock::now(); auto start = std::chrono::high_resolution_clock::now();
std::cout << GridLogMessage << "Evolution time :"<< tau(step) << std::endl;
evolve_step(out); evolve_step(out);
auto end = std::chrono::high_resolution_clock::now(); auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> diff = end - start; std::chrono::duration<double> diff = end - start;
@ -191,7 +190,7 @@ void WilsonFlow<Gimpl>::smear_adaptive(GaugeField& out, const GaugeField& in, Re
unsigned int step = 0; unsigned int step = 0;
do{ do{
step++; step++;
std::cout << GridLogMessage << "Evolution time :"<< taus << std::endl; //std::cout << GridLogMessage << "Evolution time :"<< taus << std::endl;
evolve_step_adaptive(out, maxTau); evolve_step_adaptive(out, maxTau);
std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : " std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : "
<< step << " " << step << " "

View File

@ -1,4 +1,4 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -6,8 +6,9 @@
Copyright (C) 2015 Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk> Author: Nils Meyer <nils.meyer@ur.de>
Author: neo <cossu@post.kek.jp> Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: neo <cossu@post.kek.jp>
This program is free software; you can redistribute it and/or modify This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
@ -26,19 +27,25 @@ Author: neo <cossu@post.kek.jp>
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
//----------------------------------------------------------------------
/*! @file Grid_sse4.h
@brief Optimization libraries for NEON (ARM) instructions set ARMv8
Experimental - Using intrinsics - DEVELOPING! /*
ARMv8 NEON intrinsics layer by
Nils Meyer <nils.meyer@ur.de>,
University of Regensburg, Germany
SFB/TRR55
*/ */
// Time-stamp: <2015-07-10 17:45:09 neo>
//----------------------------------------------------------------------
#ifndef GEN_SIMD_WIDTH
#define GEN_SIMD_WIDTH 16u
#endif
#include "Grid_generic_types.h"
#include <arm_neon.h> #include <arm_neon.h>
// ARMv8 supports double precision namespace Grid {
namespace Optimization { namespace Optimization {
template<class vtype> template<class vtype>
@ -46,14 +53,18 @@ namespace Optimization {
float32x4_t f; float32x4_t f;
vtype v; vtype v;
}; };
union u128f { union u128f {
float32x4_t v; float32x4_t v;
float f[4]; float f[4];
}; };
union u128d { union u128d {
float64x2_t v; float64x2_t v;
double f[4]; double f[2];
};
// half precision
union u128h {
float16x8_t v;
uint16_t f[8];
}; };
struct Vsplat{ struct Vsplat{
@ -64,20 +75,20 @@ namespace Optimization {
} }
// Real float // Real float
inline float32x4_t operator()(float a){ inline float32x4_t operator()(float a){
return vld1q_dup_f32(&a); return vdupq_n_f32(a);
} }
//Complex double //Complex double
inline float32x4_t operator()(double a, double b){ inline float64x2_t operator()(double a, double b){
float tmp[4]={(float)a,(float)b,(float)a,(float)b}; double tmp[2]={a,b};
return vld1q_f32(tmp); return vld1q_f64(tmp);
} }
//Real double //Real double // N:tbc
inline float32x4_t operator()(double a){ inline float64x2_t operator()(double a){
return vld1q_dup_f32(&a); return vdupq_n_f64(a);
} }
//Integer //Integer // N:tbc
inline uint32x4_t operator()(Integer a){ inline uint32x4_t operator()(Integer a){
return vld1q_dup_u32(&a); return vdupq_n_u32(a);
} }
}; };
@ -87,8 +98,8 @@ namespace Optimization {
vst1q_f32(F, a); vst1q_f32(F, a);
} }
//Double //Double
inline void operator()(float32x4_t a, double* D){ inline void operator()(float64x2_t a, double* D){
vst1q_f32((float*)D, a); vst1q_f64(D, a);
} }
//Integer //Integer
inline void operator()(uint32x4_t a, Integer* I){ inline void operator()(uint32x4_t a, Integer* I){
@ -97,54 +108,54 @@ namespace Optimization {
}; };
struct Vstream{ struct Vstream{ // N:equivalents to _mm_stream_p* in NEON?
//Float //Float // N:generic
inline void operator()(float * a, float32x4_t b){ inline void operator()(float * a, float32x4_t b){
memcpy(a,&b,4*sizeof(float));
} }
//Double //Double // N:generic
inline void operator()(double * a, float32x4_t b){ inline void operator()(double * a, float64x2_t b){
memcpy(a,&b,2*sizeof(double));
} }
}; };
// Nils: Vset untested; not used currently in Grid at all;
// git commit 4a8c4ccfba1d05159348d21a9698028ea847e77b
struct Vset{ struct Vset{
// Complex float // Complex float // N:ok
inline float32x4_t operator()(Grid::ComplexF *a){ inline float32x4_t operator()(Grid::ComplexF *a){
float32x4_t foo; float tmp[4]={a[1].imag(),a[1].real(),a[0].imag(),a[0].real()};
return foo; return vld1q_f32(tmp);
} }
// Complex double // Complex double // N:ok
inline float32x4_t operator()(Grid::ComplexD *a){ inline float64x2_t operator()(Grid::ComplexD *a){
float32x4_t foo; double tmp[2]={a[0].imag(),a[0].real()};
return foo; return vld1q_f64(tmp);
} }
// Real float // Real float // N:ok
inline float32x4_t operator()(float *a){ inline float32x4_t operator()(float *a){
float32x4_t foo; float tmp[4]={a[3],a[2],a[1],a[0]};
return foo; return vld1q_f32(tmp);
} }
// Real double // Real double // N:ok
inline float32x4_t operator()(double *a){ inline float64x2_t operator()(double *a){
float32x4_t foo; double tmp[2]={a[1],a[0]};
return foo; return vld1q_f64(tmp);
} }
// Integer // Integer // N:ok
inline uint32x4_t operator()(Integer *a){ inline uint32x4_t operator()(Integer *a){
uint32x4_t foo; return vld1q_dup_u32(a);
return foo;
} }
}; };
// N:leaving as is
template <typename Out_type, typename In_type> template <typename Out_type, typename In_type>
struct Reduce{ struct Reduce{
//Need templated class to overload output type //Need templated class to overload output type
//General form must generate error if compiled //General form must generate error if compiled
inline Out_type operator()(In_type in){ inline Out_type operator()(In_type in){
printf("Error, using wrong Reduce function\n"); printf("Error, using wrong Reduce function\n");
exit(1); exit(1);
return 0; return 0;
@ -184,26 +195,98 @@ namespace Optimization {
} }
}; };
struct MultRealPart{
inline float32x4_t operator()(float32x4_t a, float32x4_t b){
float32x4_t re = vtrn1q_f32(a, a);
return vmulq_f32(re, b);
}
inline float64x2_t operator()(float64x2_t a, float64x2_t b){
float64x2_t re = vzip1q_f64(a, a);
return vmulq_f64(re, b);
}
};
struct MaddRealPart{
inline float32x4_t operator()(float32x4_t a, float32x4_t b, float32x4_t c){
float32x4_t re = vtrn1q_f32(a, a);
return vfmaq_f32(c, re, b);
}
inline float64x2_t operator()(float64x2_t a, float64x2_t b, float64x2_t c){
float64x2_t re = vzip1q_f64(a, a);
return vfmaq_f64(c, re, b);
}
};
struct Div{
// Real float
inline float32x4_t operator()(float32x4_t a, float32x4_t b){
return vdivq_f32(a, b);
}
// Real double
inline float64x2_t operator()(float64x2_t a, float64x2_t b){
return vdivq_f64(a, b);
}
};
struct MultComplex{ struct MultComplex{
// Complex float // Complex float
inline float32x4_t operator()(float32x4_t a, float32x4_t b){ inline float32x4_t operator()(float32x4_t a, float32x4_t b){
float32x4_t foo;
return foo; float32x4_t r0, r1, r2, r3, r4;
// a = ar ai Ar Ai
// b = br bi Br Bi
// collect real/imag part, negate bi and Bi
r0 = vtrn1q_f32(b, b); // br br Br Br
r1 = vnegq_f32(b); // -br -bi -Br -Bi
r2 = vtrn2q_f32(b, r1); // bi -bi Bi -Bi
// the fun part
r3 = vmulq_f32(r2, a); // bi*ar -bi*ai ...
r4 = vrev64q_f32(r3); // -bi*ai bi*ar ...
// fma(a,b,c) = a+b*c
return vfmaq_f32(r4, r0, a); // ar*br-ai*bi ai*br+ar*bi ...
// no fma, use mul and add
//float32x4_t r5;
//r5 = vmulq_f32(r0, a);
//return vaddq_f32(r4, r5);
} }
// Complex double // Complex double
inline float64x2_t operator()(float64x2_t a, float64x2_t b){ inline float64x2_t operator()(float64x2_t a, float64x2_t b){
float32x4_t foo;
return foo; float64x2_t r0, r1, r2, r3, r4;
// b = br bi
// collect real/imag part, negate bi
r0 = vtrn1q_f64(b, b); // br br
r1 = vnegq_f64(b); // -br -bi
r2 = vtrn2q_f64(b, r1); // bi -bi
// the fun part
r3 = vmulq_f64(r2, a); // bi*ar -bi*ai
r4 = vextq_f64(r3,r3,1); // -bi*ai bi*ar
// fma(a,b,c) = a+b*c
return vfmaq_f64(r4, r0, a); // ar*br-ai*bi ai*br+ar*bi
// no fma, use mul and add
//float64x2_t r5;
//r5 = vmulq_f64(r0, a);
//return vaddq_f64(r4, r5);
} }
}; };
struct Mult{ struct Mult{
// Real float // Real float
inline float32x4_t mac(float32x4_t a, float32x4_t b, float32x4_t c){ inline float32x4_t mac(float32x4_t a, float32x4_t b, float32x4_t c){
return vaddq_f32(vmulq_f32(b,c),a); //return vaddq_f32(vmulq_f32(b,c),a);
return vfmaq_f32(a, b, c);
} }
inline float64x2_t mac(float64x2_t a, float64x2_t b, float64x2_t c){ inline float64x2_t mac(float64x2_t a, float64x2_t b, float64x2_t c){
return vaddq_f64(vmulq_f64(b,c),a); //return vaddq_f64(vmulq_f64(b,c),a);
return vfmaq_f64(a, b, c);
} }
inline float32x4_t operator()(float32x4_t a, float32x4_t b){ inline float32x4_t operator()(float32x4_t a, float32x4_t b){
return vmulq_f32(a,b); return vmulq_f32(a,b);
@ -221,89 +304,275 @@ namespace Optimization {
struct Conj{ struct Conj{
// Complex single // Complex single
inline float32x4_t operator()(float32x4_t in){ inline float32x4_t operator()(float32x4_t in){
return in; // ar ai br bi -> ar -ai br -bi
float32x4_t r0, r1;
r0 = vnegq_f32(in); // -ar -ai -br -bi
r1 = vrev64q_f32(r0); // -ai -ar -bi -br
return vtrn1q_f32(in, r1); // ar -ai br -bi
} }
// Complex double // Complex double
//inline float32x4_t operator()(float32x4_t in){ inline float64x2_t operator()(float64x2_t in){
// return 0;
//} float64x2_t r0, r1;
r0 = vextq_f64(in, in, 1); // ai ar
r1 = vnegq_f64(r0); // -ai -ar
return vextq_f64(r0, r1, 1); // ar -ai
}
// do not define for integer input // do not define for integer input
}; };
struct TimesMinusI{ struct TimesMinusI{
//Complex single //Complex single
inline float32x4_t operator()(float32x4_t in, float32x4_t ret){ inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
return in; // ar ai br bi -> ai -ar ai -br
float32x4_t r0, r1;
r0 = vnegq_f32(in); // -ar -ai -br -bi
r1 = vrev64q_f32(in); // ai ar bi br
return vtrn1q_f32(r1, r0); // ar -ai br -bi
} }
//Complex double //Complex double
//inline float32x4_t operator()(float32x4_t in, float32x4_t ret){ inline float64x2_t operator()(float64x2_t in, float64x2_t ret){
// return in; // a ib -> b -ia
//} float64x2_t tmp;
tmp = vnegq_f64(in);
return vextq_f64(in, tmp, 1);
}
}; };
struct TimesI{ struct TimesI{
//Complex single //Complex single
inline float32x4_t operator()(float32x4_t in, float32x4_t ret){ inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
//need shuffle // ar ai br bi -> -ai ar -bi br
return in; float32x4_t r0, r1;
r0 = vnegq_f32(in); // -ar -ai -br -bi
r1 = vrev64q_f32(r0); // -ai -ar -bi -br
return vtrn1q_f32(r1, in); // -ai ar -bi br
} }
//Complex double //Complex double
//inline float32x4_t operator()(float32x4_t in, float32x4_t ret){ inline float64x2_t operator()(float64x2_t in, float64x2_t ret){
// return 0; // a ib -> -b ia
//} float64x2_t tmp;
tmp = vnegq_f64(in);
return vextq_f64(tmp, in, 1);
}
};
struct Permute{
static inline float32x4_t Permute0(float32x4_t in){ // N:ok
// AB CD -> CD AB
return vextq_f32(in, in, 2);
};
static inline float32x4_t Permute1(float32x4_t in){ // N:ok
// AB CD -> BA DC
return vrev64q_f32(in);
};
static inline float32x4_t Permute2(float32x4_t in){ // N:not used by Boyle
return in;
};
static inline float32x4_t Permute3(float32x4_t in){ // N:not used by Boyle
return in;
};
static inline float64x2_t Permute0(float64x2_t in){ // N:ok
// AB -> BA
return vextq_f64(in, in, 1);
};
static inline float64x2_t Permute1(float64x2_t in){ // N:not used by Boyle
return in;
};
static inline float64x2_t Permute2(float64x2_t in){ // N:not used by Boyle
return in;
};
static inline float64x2_t Permute3(float64x2_t in){ // N:not used by Boyle
return in;
};
};
struct Rotate{
static inline float32x4_t rotate(float32x4_t in,int n){ // N:ok
switch(n){
case 0: // AB CD -> AB CD
return tRotate<0>(in);
break;
case 1: // AB CD -> BC DA
return tRotate<1>(in);
break;
case 2: // AB CD -> CD AB
return tRotate<2>(in);
break;
case 3: // AB CD -> DA BC
return tRotate<3>(in);
break;
default: assert(0);
}
}
static inline float64x2_t rotate(float64x2_t in,int n){ // N:ok
switch(n){
case 0: // AB -> AB
return tRotate<0>(in);
break;
case 1: // AB -> BA
return tRotate<1>(in);
break;
default: assert(0);
}
}
// working, but no restriction on n
// template<int n> static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n); };
// template<int n> static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n); };
// restriction on n
template<int n> static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n%4); };
template<int n> static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n%2); };
};
struct PrecisionChange {
static inline float16x8_t StoH (const float32x4_t &a,const float32x4_t &b) {
float16x4_t h = vcvt_f16_f32(a);
return vcvt_high_f16_f32(h, b);
}
static inline void HtoS (float16x8_t h,float32x4_t &sa,float32x4_t &sb) {
sb = vcvt_high_f32_f16(h);
// there is no direct conversion from lower float32x4_t to float64x2_t
// vextq_f16 not supported by clang 3.8 / 4.0 / arm clang
//float16x8_t h1 = vextq_f16(h, h, 4); // correct, but not supported by clang
// workaround for clang
uint32x4_t h1u = reinterpret_cast<uint32x4_t>(h);
float16x8_t h1 = reinterpret_cast<float16x8_t>(vextq_u32(h1u, h1u, 2));
sa = vcvt_high_f32_f16(h1);
}
static inline float32x4_t DtoS (float64x2_t a,float64x2_t b) {
float32x2_t s = vcvt_f32_f64(a);
return vcvt_high_f32_f64(s, b);
}
static inline void StoD (float32x4_t s,float64x2_t &a,float64x2_t &b) {
b = vcvt_high_f64_f32(s);
// there is no direct conversion from lower float32x4_t to float64x2_t
float32x4_t s1 = vextq_f32(s, s, 2);
a = vcvt_high_f64_f32(s1);
}
static inline float16x8_t DtoH (float64x2_t a,float64x2_t b,float64x2_t c,float64x2_t d) {
float32x4_t s1 = DtoS(a, b);
float32x4_t s2 = DtoS(c, d);
return StoH(s1, s2);
}
static inline void HtoD (float16x8_t h,float64x2_t &a,float64x2_t &b,float64x2_t &c,float64x2_t &d) {
float32x4_t s1, s2;
HtoS(h, s1, s2);
StoD(s1, a, b);
StoD(s2, c, d);
}
};
//////////////////////////////////////////////
// Exchange support
struct Exchange{
static inline void Exchange0(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
// in1: ABCD -> out1: ABEF
// in2: EFGH -> out2: CDGH
// z: CDAB
float32x4_t z = vextq_f32(in1, in1, 2);
// out1: ABEF
out1 = vextq_f32(z, in2, 2);
// z: GHEF
z = vextq_f32(in2, in2, 2);
// out2: CDGH
out2 = vextq_f32(in1, z, 2);
};
static inline void Exchange1(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
// in1: ABCD -> out1: AECG
// in2: EFGH -> out2: BFDH
out1 = vtrn1q_f32(in1, in2);
out2 = vtrn2q_f32(in1, in2);
};
static inline void Exchange2(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
assert(0);
return;
};
static inline void Exchange3(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
assert(0);
return;
};
// double precision
static inline void Exchange0(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
// in1: AB -> out1: AC
// in2: CD -> out2: BD
out1 = vzip1q_f64(in1, in2);
out2 = vzip2q_f64(in1, in2);
};
static inline void Exchange1(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
assert(0);
return;
};
static inline void Exchange2(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
assert(0);
return;
};
static inline void Exchange3(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
assert(0);
return;
};
}; };
////////////////////////////////////////////// //////////////////////////////////////////////
// Some Template specialization // Some Template specialization
template < typename vtype >
void permute(vtype &a, vtype b, int perm) {
};
//Complex float Reduce //Complex float Reduce
template<> template<>
inline Grid::ComplexF Reduce<Grid::ComplexF, float32x4_t>::operator()(float32x4_t in){ inline Grid::ComplexF Reduce<Grid::ComplexF, float32x4_t>::operator()(float32x4_t in){
return 0; float32x4_t v1; // two complex
v1 = Optimization::Permute::Permute0(in);
v1 = vaddq_f32(v1,in);
u128f conv; conv.v=v1;
return Grid::ComplexF(conv.f[0],conv.f[1]);
} }
//Real float Reduce //Real float Reduce
template<> template<>
inline Grid::RealF Reduce<Grid::RealF, float32x4_t>::operator()(float32x4_t in){ inline Grid::RealF Reduce<Grid::RealF, float32x4_t>::operator()(float32x4_t in){
float32x2_t high = vget_high_f32(in); return vaddvq_f32(in);
float32x2_t low = vget_low_f32(in);
float32x2_t tmp = vadd_f32(low, high);
float32x2_t sum = vpadd_f32(tmp, tmp);
return vget_lane_f32(sum,0);
} }
//Complex double Reduce //Complex double Reduce
template<> template<> // N:by Boyle
inline Grid::ComplexD Reduce<Grid::ComplexD, float64x2_t>::operator()(float64x2_t in){ inline Grid::ComplexD Reduce<Grid::ComplexD, float64x2_t>::operator()(float64x2_t in){
return 0; u128d conv; conv.v = in;
return Grid::ComplexD(conv.f[0],conv.f[1]);
} }
//Real double Reduce //Real double Reduce
template<> template<>
inline Grid::RealD Reduce<Grid::RealD, float64x2_t>::operator()(float64x2_t in){ inline Grid::RealD Reduce<Grid::RealD, float64x2_t>::operator()(float64x2_t in){
float64x2_t sum = vpaddq_f64(in, in); return vaddvq_f64(in);
return vgetq_lane_f64(sum,0);
} }
//Integer Reduce //Integer Reduce
template<> template<>
inline Integer Reduce<Integer, uint32x4_t>::operator()(uint32x4_t in){ inline Integer Reduce<Integer, uint32x4_t>::operator()(uint32x4_t in){
// FIXME unimplemented // FIXME unimplemented
printf("Reduce : Missing integer implementation -> FIX\n"); printf("Reduce : Missing integer implementation -> FIX\n");
assert(0); assert(0);
} }
} }
////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////
// Here assign types // Here assign types
namespace Grid {
// typedef Optimization::vech SIMD_Htype; // Reduced precision type
typedef float16x8_t SIMD_Htype; // Half precision type
typedef float32x4_t SIMD_Ftype; // Single precision type typedef float32x4_t SIMD_Ftype; // Single precision type
typedef float64x2_t SIMD_Dtype; // Double precision type typedef float64x2_t SIMD_Dtype; // Double precision type
typedef uint32x4_t SIMD_Itype; // Integer type typedef uint32x4_t SIMD_Itype; // Integer type
@ -312,13 +581,6 @@ namespace Grid {
inline void prefetch_HINT_T0(const char *ptr){}; inline void prefetch_HINT_T0(const char *ptr){};
// Gpermute function
template < typename VectorSIMD >
inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) {
Optimization::permute(y.v,b.v,perm);
}
// Function name aliases // Function name aliases
typedef Optimization::Vsplat VsplatSIMD; typedef Optimization::Vsplat VsplatSIMD;
typedef Optimization::Vstore VstoreSIMD; typedef Optimization::Vstore VstoreSIMD;
@ -332,8 +594,11 @@ namespace Grid {
// Arithmetic operations // Arithmetic operations
typedef Optimization::Sum SumSIMD; typedef Optimization::Sum SumSIMD;
typedef Optimization::Sub SubSIMD; typedef Optimization::Sub SubSIMD;
typedef Optimization::Div DivSIMD;
typedef Optimization::Mult MultSIMD; typedef Optimization::Mult MultSIMD;
typedef Optimization::MultComplex MultComplexSIMD; typedef Optimization::MultComplex MultComplexSIMD;
typedef Optimization::MultRealPart MultRealPartSIMD;
typedef Optimization::MaddRealPart MaddRealPartSIMD;
typedef Optimization::Conj ConjSIMD; typedef Optimization::Conj ConjSIMD;
typedef Optimization::TimesMinusI TimesMinusISIMD; typedef Optimization::TimesMinusI TimesMinusISIMD;
typedef Optimization::TimesI TimesISIMD; typedef Optimization::TimesI TimesISIMD;

View File

@ -53,7 +53,7 @@ directory
#if defined IMCI #if defined IMCI
#include "Grid_imci.h" #include "Grid_imci.h"
#endif #endif
#ifdef NEONv8 #ifdef NEONV8
#include "Grid_neon.h" #include "Grid_neon.h"
#endif #endif
#if defined QPX #if defined QPX