diff --git a/README.md b/README.md index 9432abe1..1f0b450c 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ License: GPL v2. -Last update Nov 2016. +Last update June 2017. _Please do not send pull requests to the `master` branch which is reserved for releases._ @@ -78,13 +78,16 @@ optimally use MPI, OpenMP and SIMD parallelism under the hood. This is a signifi for most programmers. The layout transformations are parametrised by the SIMD vector length. This adapts according to the architecture. -Presently SSE4 (128 bit) AVX, AVX2, QPX (256 bit), IMCI, and AVX512 (512 bit) targets are supported (ARM NEON on the way). +Presently SSE4, ARM NEON (128 bits) AVX, AVX2, QPX (256 bits), IMCI and AVX512 (512 bits) targets are supported. -These are presented as `vRealF`, `vRealD`, `vComplexF`, and `vComplexD` internal vector data types. These may be useful in themselves for other programmers. +These are presented as `vRealF`, `vRealD`, `vComplexF`, and `vComplexD` internal vector data types. The corresponding scalar types are named `RealF`, `RealD`, `ComplexF` and `ComplexD`. MPI, OpenMP, and SIMD parallelism are present in the library. -Please see https://arxiv.org/abs/1512.03487 for more detail. +Please see [this paper](https://arxiv.org/abs/1512.03487) for more detail. + +### Required libraries +Grid requires [GMP](https://gmplib.org/), [MPFR](http://www.mpfr.org/) and optionally [HDF5](https://support.hdfgroup.org/HDF5/) and [LIME](http://usqcd-software.github.io/c-lime/) (for ILDG file format support) to be installed. ### Quick start First, start by cloning the repository: @@ -173,7 +176,8 @@ The following options can be use with the `--enable-simd=` option to target diff | `AVXFMA4` | AVX (256 bit) + FMA4 | | `AVX2` | AVX 2 (256 bit) | | `AVX512` | AVX 512 bit | -| `QPX` | QPX (256 bit) | +| `NEONv8` | [ARM NEON](http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.den0024a/ch07s03.html) (128 bit) | +| `QPX` | IBM QPX (256 bit) | Alternatively, some CPU codenames can be directly used: @@ -212,4 +216,4 @@ where `` is the UNIX prefix where GMP and MPFR are installed. If you are w --with-mpfr= \ --enable-mkl \ CXX=CC CC=cc -``` \ No newline at end of file +``` diff --git a/configure.ac b/configure.ac index 8175e8b0..8c43d67a 100644 --- a/configure.ac +++ b/configure.ac @@ -249,6 +249,9 @@ case ${ax_cv_cxx_compiler_vendor} in [generic SIMD vector width (in bytes)]) SIMD_GEN_WIDTH_MSG=" (width= $ac_gen_simd_width)" SIMD_FLAGS='';; + NEONv8) + AC_DEFINE([NEONV8],[1],[ARMv8 NEON]) + SIMD_FLAGS='-march=armv8-a';; QPX|BGQ) AC_DEFINE([QPX],[1],[QPX intrinsics for BG/Q]) SIMD_FLAGS='';; diff --git a/lib/qcd/smearing/WilsonFlow.h b/lib/qcd/smearing/WilsonFlow.h index 5e9f2d95..4f5c0d43 100644 --- a/lib/qcd/smearing/WilsonFlow.h +++ b/lib/qcd/smearing/WilsonFlow.h @@ -108,7 +108,7 @@ void WilsonFlow::evolve_step_adaptive(typename Gimpl::GaugeField &U, Real if (maxTau - taus < epsilon){ epsilon = maxTau-taus; } - std::cout << GridLogMessage << "Integration epsilon : " << epsilon << std::endl; + //std::cout << GridLogMessage << "Integration epsilon : " << epsilon << std::endl; GaugeField Z(U._grid); GaugeField Zprime(U._grid); GaugeField tmp(U._grid), Uprime(U._grid); @@ -138,10 +138,10 @@ void WilsonFlow::evolve_step_adaptive(typename Gimpl::GaugeField &U, Real // adjust integration step taus += epsilon; - std::cout << GridLogMessage << "Adjusting integration step with distance: " << diff << std::endl; + //std::cout << GridLogMessage << "Adjusting integration step with distance: " << diff << std::endl; epsilon = epsilon*0.95*std::pow(1e-4/diff,1./3.); - std::cout << GridLogMessage << "New epsilon : " << epsilon << std::endl; + //std::cout << GridLogMessage << "New epsilon : " << epsilon << std::endl; } @@ -166,7 +166,6 @@ void WilsonFlow::smear(GaugeField& out, const GaugeField& in) const { out = in; for (unsigned int step = 1; step <= Nstep; step++) { auto start = std::chrono::high_resolution_clock::now(); - std::cout << GridLogMessage << "Evolution time :"<< tau(step) << std::endl; evolve_step(out); auto end = std::chrono::high_resolution_clock::now(); std::chrono::duration diff = end - start; @@ -191,7 +190,7 @@ void WilsonFlow::smear_adaptive(GaugeField& out, const GaugeField& in, Re unsigned int step = 0; do{ step++; - std::cout << GridLogMessage << "Evolution time :"<< taus << std::endl; + //std::cout << GridLogMessage << "Evolution time :"<< taus << std::endl; evolve_step_adaptive(out, maxTau); std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : " << step << " " diff --git a/lib/simd/Grid_neon.h b/lib/simd/Grid_neon.h index 7c1ad443..d6eb9c5a 100644 --- a/lib/simd/Grid_neon.h +++ b/lib/simd/Grid_neon.h @@ -1,13 +1,14 @@ - /************************************************************************************* +/************************************************************************************* - Grid physics library, www.github.com/paboyle/Grid + Grid physics library, www.github.com/paboyle/Grid Source file: ./lib/simd/Grid_neon.h Copyright (C) 2015 -Author: Peter Boyle -Author: neo + Author: Nils Meyer + Author: Peter Boyle + Author: neo This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -26,19 +27,25 @@ Author: neo See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ /* END LEGAL */ -//---------------------------------------------------------------------- -/*! @file Grid_sse4.h - @brief Optimization libraries for NEON (ARM) instructions set ARMv8 - Experimental - Using intrinsics - DEVELOPING! +/* + + ARMv8 NEON intrinsics layer by + + Nils Meyer , + University of Regensburg, Germany + SFB/TRR55 + */ -// Time-stamp: <2015-07-10 17:45:09 neo> -//---------------------------------------------------------------------- +#ifndef GEN_SIMD_WIDTH +#define GEN_SIMD_WIDTH 16u +#endif + +#include "Grid_generic_types.h" #include -// ARMv8 supports double precision - +namespace Grid { namespace Optimization { template @@ -46,16 +53,20 @@ namespace Optimization { float32x4_t f; vtype v; }; - union u128f { float32x4_t v; float f[4]; }; union u128d { float64x2_t v; - double f[4]; + double f[2]; }; - + // half precision + union u128h { + float16x8_t v; + uint16_t f[8]; + }; + struct Vsplat{ //Complex float inline float32x4_t operator()(float a, float b){ @@ -64,31 +75,31 @@ namespace Optimization { } // Real float inline float32x4_t operator()(float a){ - return vld1q_dup_f32(&a); + return vdupq_n_f32(a); } //Complex double - inline float32x4_t operator()(double a, double b){ - float tmp[4]={(float)a,(float)b,(float)a,(float)b}; - return vld1q_f32(tmp); + inline float64x2_t operator()(double a, double b){ + double tmp[2]={a,b}; + return vld1q_f64(tmp); } - //Real double - inline float32x4_t operator()(double a){ - return vld1q_dup_f32(&a); + //Real double // N:tbc + inline float64x2_t operator()(double a){ + return vdupq_n_f64(a); } - //Integer + //Integer // N:tbc inline uint32x4_t operator()(Integer a){ - return vld1q_dup_u32(&a); + return vdupq_n_u32(a); } }; struct Vstore{ - //Float + //Float inline void operator()(float32x4_t a, float* F){ vst1q_f32(F, a); } //Double - inline void operator()(float32x4_t a, double* D){ - vst1q_f32((float*)D, a); + inline void operator()(float64x2_t a, double* D){ + vst1q_f64(D, a); } //Integer inline void operator()(uint32x4_t a, Integer* I){ @@ -97,54 +108,54 @@ namespace Optimization { }; - struct Vstream{ - //Float + struct Vstream{ // N:equivalents to _mm_stream_p* in NEON? + //Float // N:generic inline void operator()(float * a, float32x4_t b){ - + memcpy(a,&b,4*sizeof(float)); } - //Double - inline void operator()(double * a, float32x4_t b){ - + //Double // N:generic + inline void operator()(double * a, float64x2_t b){ + memcpy(a,&b,2*sizeof(double)); } }; + // Nils: Vset untested; not used currently in Grid at all; + // git commit 4a8c4ccfba1d05159348d21a9698028ea847e77b struct Vset{ - // Complex float + // Complex float // N:ok inline float32x4_t operator()(Grid::ComplexF *a){ - float32x4_t foo; - return foo; + float tmp[4]={a[1].imag(),a[1].real(),a[0].imag(),a[0].real()}; + return vld1q_f32(tmp); } - // Complex double - inline float32x4_t operator()(Grid::ComplexD *a){ - float32x4_t foo; - return foo; + // Complex double // N:ok + inline float64x2_t operator()(Grid::ComplexD *a){ + double tmp[2]={a[0].imag(),a[0].real()}; + return vld1q_f64(tmp); } - // Real float + // Real float // N:ok inline float32x4_t operator()(float *a){ - float32x4_t foo; - return foo; + float tmp[4]={a[3],a[2],a[1],a[0]}; + return vld1q_f32(tmp); } - // Real double - inline float32x4_t operator()(double *a){ - float32x4_t foo; - return foo; + // Real double // N:ok + inline float64x2_t operator()(double *a){ + double tmp[2]={a[1],a[0]}; + return vld1q_f64(tmp); } - // Integer + // Integer // N:ok inline uint32x4_t operator()(Integer *a){ - uint32x4_t foo; - return foo; + return vld1q_dup_u32(a); } - - }; + // N:leaving as is template struct Reduce{ //Need templated class to overload output type //General form must generate error if compiled - inline Out_type operator()(In_type in){ + inline Out_type operator()(In_type in){ printf("Error, using wrong Reduce function\n"); exit(1); return 0; @@ -184,26 +195,98 @@ namespace Optimization { } }; + struct MultRealPart{ + inline float32x4_t operator()(float32x4_t a, float32x4_t b){ + float32x4_t re = vtrn1q_f32(a, a); + return vmulq_f32(re, b); + } + inline float64x2_t operator()(float64x2_t a, float64x2_t b){ + float64x2_t re = vzip1q_f64(a, a); + return vmulq_f64(re, b); + } + }; + + struct MaddRealPart{ + inline float32x4_t operator()(float32x4_t a, float32x4_t b, float32x4_t c){ + float32x4_t re = vtrn1q_f32(a, a); + return vfmaq_f32(c, re, b); + } + inline float64x2_t operator()(float64x2_t a, float64x2_t b, float64x2_t c){ + float64x2_t re = vzip1q_f64(a, a); + return vfmaq_f64(c, re, b); + } + }; + + struct Div{ + // Real float + inline float32x4_t operator()(float32x4_t a, float32x4_t b){ + return vdivq_f32(a, b); + } + // Real double + inline float64x2_t operator()(float64x2_t a, float64x2_t b){ + return vdivq_f64(a, b); + } + }; + struct MultComplex{ // Complex float inline float32x4_t operator()(float32x4_t a, float32x4_t b){ - float32x4_t foo; - return foo; + + float32x4_t r0, r1, r2, r3, r4; + + // a = ar ai Ar Ai + // b = br bi Br Bi + // collect real/imag part, negate bi and Bi + r0 = vtrn1q_f32(b, b); // br br Br Br + r1 = vnegq_f32(b); // -br -bi -Br -Bi + r2 = vtrn2q_f32(b, r1); // bi -bi Bi -Bi + + // the fun part + r3 = vmulq_f32(r2, a); // bi*ar -bi*ai ... + r4 = vrev64q_f32(r3); // -bi*ai bi*ar ... + + // fma(a,b,c) = a+b*c + return vfmaq_f32(r4, r0, a); // ar*br-ai*bi ai*br+ar*bi ... + + // no fma, use mul and add + //float32x4_t r5; + //r5 = vmulq_f32(r0, a); + //return vaddq_f32(r4, r5); } // Complex double inline float64x2_t operator()(float64x2_t a, float64x2_t b){ - float32x4_t foo; - return foo; + + float64x2_t r0, r1, r2, r3, r4; + + // b = br bi + // collect real/imag part, negate bi + r0 = vtrn1q_f64(b, b); // br br + r1 = vnegq_f64(b); // -br -bi + r2 = vtrn2q_f64(b, r1); // bi -bi + + // the fun part + r3 = vmulq_f64(r2, a); // bi*ar -bi*ai + r4 = vextq_f64(r3,r3,1); // -bi*ai bi*ar + + // fma(a,b,c) = a+b*c + return vfmaq_f64(r4, r0, a); // ar*br-ai*bi ai*br+ar*bi + + // no fma, use mul and add + //float64x2_t r5; + //r5 = vmulq_f64(r0, a); + //return vaddq_f64(r4, r5); } }; struct Mult{ // Real float inline float32x4_t mac(float32x4_t a, float32x4_t b, float32x4_t c){ - return vaddq_f32(vmulq_f32(b,c),a); + //return vaddq_f32(vmulq_f32(b,c),a); + return vfmaq_f32(a, b, c); } inline float64x2_t mac(float64x2_t a, float64x2_t b, float64x2_t c){ - return vaddq_f64(vmulq_f64(b,c),a); + //return vaddq_f64(vmulq_f64(b,c),a); + return vfmaq_f64(a, b, c); } inline float32x4_t operator()(float32x4_t a, float32x4_t b){ return vmulq_f32(a,b); @@ -221,89 +304,275 @@ namespace Optimization { struct Conj{ // Complex single inline float32x4_t operator()(float32x4_t in){ - return in; + // ar ai br bi -> ar -ai br -bi + float32x4_t r0, r1; + r0 = vnegq_f32(in); // -ar -ai -br -bi + r1 = vrev64q_f32(r0); // -ai -ar -bi -br + return vtrn1q_f32(in, r1); // ar -ai br -bi } // Complex double - //inline float32x4_t operator()(float32x4_t in){ - // return 0; - //} + inline float64x2_t operator()(float64x2_t in){ + + float64x2_t r0, r1; + r0 = vextq_f64(in, in, 1); // ai ar + r1 = vnegq_f64(r0); // -ai -ar + return vextq_f64(r0, r1, 1); // ar -ai + } // do not define for integer input }; struct TimesMinusI{ //Complex single inline float32x4_t operator()(float32x4_t in, float32x4_t ret){ - return in; + // ar ai br bi -> ai -ar ai -br + float32x4_t r0, r1; + r0 = vnegq_f32(in); // -ar -ai -br -bi + r1 = vrev64q_f32(in); // ai ar bi br + return vtrn1q_f32(r1, r0); // ar -ai br -bi } //Complex double - //inline float32x4_t operator()(float32x4_t in, float32x4_t ret){ - // return in; - //} - - + inline float64x2_t operator()(float64x2_t in, float64x2_t ret){ + // a ib -> b -ia + float64x2_t tmp; + tmp = vnegq_f64(in); + return vextq_f64(in, tmp, 1); + } }; struct TimesI{ //Complex single inline float32x4_t operator()(float32x4_t in, float32x4_t ret){ - //need shuffle - return in; + // ar ai br bi -> -ai ar -bi br + float32x4_t r0, r1; + r0 = vnegq_f32(in); // -ar -ai -br -bi + r1 = vrev64q_f32(r0); // -ai -ar -bi -br + return vtrn1q_f32(r1, in); // -ai ar -bi br } //Complex double - //inline float32x4_t operator()(float32x4_t in, float32x4_t ret){ - // return 0; - //} + inline float64x2_t operator()(float64x2_t in, float64x2_t ret){ + // a ib -> -b ia + float64x2_t tmp; + tmp = vnegq_f64(in); + return vextq_f64(tmp, in, 1); + } + }; + + struct Permute{ + + static inline float32x4_t Permute0(float32x4_t in){ // N:ok + // AB CD -> CD AB + return vextq_f32(in, in, 2); + }; + static inline float32x4_t Permute1(float32x4_t in){ // N:ok + // AB CD -> BA DC + return vrev64q_f32(in); + }; + static inline float32x4_t Permute2(float32x4_t in){ // N:not used by Boyle + return in; + }; + static inline float32x4_t Permute3(float32x4_t in){ // N:not used by Boyle + return in; + }; + + static inline float64x2_t Permute0(float64x2_t in){ // N:ok + // AB -> BA + return vextq_f64(in, in, 1); + }; + static inline float64x2_t Permute1(float64x2_t in){ // N:not used by Boyle + return in; + }; + static inline float64x2_t Permute2(float64x2_t in){ // N:not used by Boyle + return in; + }; + static inline float64x2_t Permute3(float64x2_t in){ // N:not used by Boyle + return in; + }; + + }; + + struct Rotate{ + + static inline float32x4_t rotate(float32x4_t in,int n){ // N:ok + switch(n){ + case 0: // AB CD -> AB CD + return tRotate<0>(in); + break; + case 1: // AB CD -> BC DA + return tRotate<1>(in); + break; + case 2: // AB CD -> CD AB + return tRotate<2>(in); + break; + case 3: // AB CD -> DA BC + return tRotate<3>(in); + break; + default: assert(0); + } + } + static inline float64x2_t rotate(float64x2_t in,int n){ // N:ok + switch(n){ + case 0: // AB -> AB + return tRotate<0>(in); + break; + case 1: // AB -> BA + return tRotate<1>(in); + break; + default: assert(0); + } + } + +// working, but no restriction on n +// template static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n); }; +// template static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n); }; + +// restriction on n + template static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n%4); }; + template static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n%2); }; + + }; + + struct PrecisionChange { + + static inline float16x8_t StoH (const float32x4_t &a,const float32x4_t &b) { + float16x4_t h = vcvt_f16_f32(a); + return vcvt_high_f16_f32(h, b); + } + static inline void HtoS (float16x8_t h,float32x4_t &sa,float32x4_t &sb) { + sb = vcvt_high_f32_f16(h); + // there is no direct conversion from lower float32x4_t to float64x2_t + // vextq_f16 not supported by clang 3.8 / 4.0 / arm clang + //float16x8_t h1 = vextq_f16(h, h, 4); // correct, but not supported by clang + // workaround for clang + uint32x4_t h1u = reinterpret_cast(h); + float16x8_t h1 = reinterpret_cast(vextq_u32(h1u, h1u, 2)); + sa = vcvt_high_f32_f16(h1); + } + static inline float32x4_t DtoS (float64x2_t a,float64x2_t b) { + float32x2_t s = vcvt_f32_f64(a); + return vcvt_high_f32_f64(s, b); + + } + static inline void StoD (float32x4_t s,float64x2_t &a,float64x2_t &b) { + b = vcvt_high_f64_f32(s); + // there is no direct conversion from lower float32x4_t to float64x2_t + float32x4_t s1 = vextq_f32(s, s, 2); + a = vcvt_high_f64_f32(s1); + + } + static inline float16x8_t DtoH (float64x2_t a,float64x2_t b,float64x2_t c,float64x2_t d) { + float32x4_t s1 = DtoS(a, b); + float32x4_t s2 = DtoS(c, d); + return StoH(s1, s2); + } + static inline void HtoD (float16x8_t h,float64x2_t &a,float64x2_t &b,float64x2_t &c,float64x2_t &d) { + float32x4_t s1, s2; + HtoS(h, s1, s2); + StoD(s1, a, b); + StoD(s2, c, d); + } + }; + + ////////////////////////////////////////////// + // Exchange support + + struct Exchange{ + static inline void Exchange0(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){ + // in1: ABCD -> out1: ABEF + // in2: EFGH -> out2: CDGH + + // z: CDAB + float32x4_t z = vextq_f32(in1, in1, 2); + // out1: ABEF + out1 = vextq_f32(z, in2, 2); + + // z: GHEF + z = vextq_f32(in2, in2, 2); + // out2: CDGH + out2 = vextq_f32(in1, z, 2); + }; + + static inline void Exchange1(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){ + // in1: ABCD -> out1: AECG + // in2: EFGH -> out2: BFDH + out1 = vtrn1q_f32(in1, in2); + out2 = vtrn2q_f32(in1, in2); + }; + static inline void Exchange2(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){ + assert(0); + return; + }; + static inline void Exchange3(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){ + assert(0); + return; + }; + // double precision + static inline void Exchange0(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){ + // in1: AB -> out1: AC + // in2: CD -> out2: BD + out1 = vzip1q_f64(in1, in2); + out2 = vzip2q_f64(in1, in2); + }; + static inline void Exchange1(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){ + assert(0); + return; + }; + static inline void Exchange2(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){ + assert(0); + return; + }; + static inline void Exchange3(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){ + assert(0); + return; + }; }; ////////////////////////////////////////////// // Some Template specialization - template < typename vtype > - void permute(vtype &a, vtype b, int perm) { - }; //Complex float Reduce template<> inline Grid::ComplexF Reduce::operator()(float32x4_t in){ - return 0; + float32x4_t v1; // two complex + v1 = Optimization::Permute::Permute0(in); + v1 = vaddq_f32(v1,in); + u128f conv; conv.v=v1; + return Grid::ComplexF(conv.f[0],conv.f[1]); } //Real float Reduce template<> inline Grid::RealF Reduce::operator()(float32x4_t in){ - float32x2_t high = vget_high_f32(in); - float32x2_t low = vget_low_f32(in); - float32x2_t tmp = vadd_f32(low, high); - float32x2_t sum = vpadd_f32(tmp, tmp); - return vget_lane_f32(sum,0); + return vaddvq_f32(in); } - - + + //Complex double Reduce - template<> + template<> // N:by Boyle inline Grid::ComplexD Reduce::operator()(float64x2_t in){ - return 0; + u128d conv; conv.v = in; + return Grid::ComplexD(conv.f[0],conv.f[1]); } - + //Real double Reduce template<> inline Grid::RealD Reduce::operator()(float64x2_t in){ - float64x2_t sum = vpaddq_f64(in, in); - return vgetq_lane_f64(sum,0); + return vaddvq_f64(in); } //Integer Reduce template<> inline Integer Reduce::operator()(uint32x4_t in){ // FIXME unimplemented - printf("Reduce : Missing integer implementation -> FIX\n"); + printf("Reduce : Missing integer implementation -> FIX\n"); assert(0); } } ////////////////////////////////////////////////////////////////////////////////////// -// Here assign types -namespace Grid { +// Here assign types +// typedef Optimization::vech SIMD_Htype; // Reduced precision type + typedef float16x8_t SIMD_Htype; // Half precision type typedef float32x4_t SIMD_Ftype; // Single precision type typedef float64x2_t SIMD_Dtype; // Double precision type typedef uint32x4_t SIMD_Itype; // Integer type @@ -312,13 +581,6 @@ namespace Grid { inline void prefetch_HINT_T0(const char *ptr){}; - // Gpermute function - template < typename VectorSIMD > - inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) { - Optimization::permute(y.v,b.v,perm); - } - - // Function name aliases typedef Optimization::Vsplat VsplatSIMD; typedef Optimization::Vstore VstoreSIMD; @@ -326,16 +588,19 @@ namespace Grid { typedef Optimization::Vstream VstreamSIMD; template using ReduceSIMD = Optimization::Reduce; - + // Arithmetic operations typedef Optimization::Sum SumSIMD; typedef Optimization::Sub SubSIMD; + typedef Optimization::Div DivSIMD; typedef Optimization::Mult MultSIMD; typedef Optimization::MultComplex MultComplexSIMD; + typedef Optimization::MultRealPart MultRealPartSIMD; + typedef Optimization::MaddRealPart MaddRealPartSIMD; typedef Optimization::Conj ConjSIMD; typedef Optimization::TimesMinusI TimesMinusISIMD; typedef Optimization::TimesI TimesISIMD; -} +} \ No newline at end of file diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h index e05fecc4..27585547 100644 --- a/lib/simd/Grid_vector_types.h +++ b/lib/simd/Grid_vector_types.h @@ -53,7 +53,7 @@ directory #if defined IMCI #include "Grid_imci.h" #endif -#ifdef NEONv8 +#ifdef NEONV8 #include "Grid_neon.h" #endif #if defined QPX