mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-10-25 10:09:34 +01:00 
			
		
		
		
	Merge branch 'develop' of https://github.com/paboyle/Grid into develop
This commit is contained in:
		
							
								
								
									
										14
									
								
								README.md
									
									
									
									
									
								
							
							
						
						
									
										14
									
								
								README.md
									
									
									
									
									
								
							| @@ -18,7 +18,7 @@ | |||||||
|  |  | ||||||
| License: GPL v2. | License: GPL v2. | ||||||
|  |  | ||||||
| Last update Nov 2016. | Last update June 2017. | ||||||
|  |  | ||||||
| _Please do not send pull requests to the `master` branch which is reserved for releases._ | _Please do not send pull requests to the `master` branch which is reserved for releases._ | ||||||
|  |  | ||||||
| @@ -78,13 +78,16 @@ optimally use MPI, OpenMP and SIMD parallelism under the hood. This is a signifi | |||||||
| for most programmers. | for most programmers. | ||||||
|  |  | ||||||
| The layout transformations are parametrised by the SIMD vector length. This adapts according to the architecture. | The layout transformations are parametrised by the SIMD vector length. This adapts according to the architecture. | ||||||
| Presently SSE4 (128 bit) AVX, AVX2, QPX (256 bit), IMCI, and AVX512 (512 bit) targets are supported (ARM NEON on the way). | Presently SSE4, ARM NEON (128 bits) AVX, AVX2, QPX (256 bits), IMCI and AVX512 (512 bits) targets are supported. | ||||||
|  |  | ||||||
| These are presented as `vRealF`, `vRealD`, `vComplexF`, and `vComplexD` internal vector data types. These may be useful in themselves for other programmers. | These are presented as `vRealF`, `vRealD`, `vComplexF`, and `vComplexD` internal vector data types.  | ||||||
| The corresponding scalar types are named `RealF`, `RealD`, `ComplexF` and `ComplexD`. | The corresponding scalar types are named `RealF`, `RealD`, `ComplexF` and `ComplexD`. | ||||||
|  |  | ||||||
| MPI, OpenMP, and SIMD parallelism are present in the library. | MPI, OpenMP, and SIMD parallelism are present in the library. | ||||||
| Please see https://arxiv.org/abs/1512.03487 for more detail. | Please see [this paper](https://arxiv.org/abs/1512.03487) for more detail. | ||||||
|  |  | ||||||
|  | ### Required libraries | ||||||
|  | Grid requires [GMP](https://gmplib.org/), [MPFR](http://www.mpfr.org/) and optionally [HDF5](https://support.hdfgroup.org/HDF5/) and [LIME](http://usqcd-software.github.io/c-lime/) (for ILDG file format support) to be installed. | ||||||
|  |  | ||||||
| ### Quick start | ### Quick start | ||||||
| First, start by cloning the repository: | First, start by cloning the repository: | ||||||
| @@ -173,7 +176,8 @@ The following options can be use with the `--enable-simd=` option to target diff | |||||||
| | `AVXFMA4`   | AVX (256 bit) + FMA4                   | | | `AVXFMA4`   | AVX (256 bit) + FMA4                   | | ||||||
| | `AVX2`      | AVX 2 (256 bit)                        | | | `AVX2`      | AVX 2 (256 bit)                        | | ||||||
| | `AVX512`    | AVX 512 bit                            | | | `AVX512`    | AVX 512 bit                            | | ||||||
| | `QPX`       | QPX (256 bit)                          | | | `NEONv8`    | [ARM NEON](http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.den0024a/ch07s03.html) (128 bit)                     | | ||||||
|  | | `QPX`       | IBM QPX (256 bit)                      | | ||||||
|  |  | ||||||
| Alternatively, some CPU codenames can be directly used: | Alternatively, some CPU codenames can be directly used: | ||||||
|  |  | ||||||
|   | |||||||
| @@ -249,6 +249,9 @@ case ${ax_cv_cxx_compiler_vendor} in | |||||||
|                            [generic SIMD vector width (in bytes)]) |                            [generic SIMD vector width (in bytes)]) | ||||||
|         SIMD_GEN_WIDTH_MSG=" (width= $ac_gen_simd_width)" |         SIMD_GEN_WIDTH_MSG=" (width= $ac_gen_simd_width)" | ||||||
|         SIMD_FLAGS='';; |         SIMD_FLAGS='';; | ||||||
|  |       NEONv8) | ||||||
|  |         AC_DEFINE([NEONV8],[1],[ARMv8 NEON]) | ||||||
|  |         SIMD_FLAGS='-march=armv8-a';; | ||||||
|       QPX|BGQ) |       QPX|BGQ) | ||||||
|         AC_DEFINE([QPX],[1],[QPX intrinsics for BG/Q]) |         AC_DEFINE([QPX],[1],[QPX intrinsics for BG/Q]) | ||||||
|         SIMD_FLAGS='';; |         SIMD_FLAGS='';; | ||||||
|   | |||||||
| @@ -108,7 +108,7 @@ void WilsonFlow<Gimpl>::evolve_step_adaptive(typename Gimpl::GaugeField &U, Real | |||||||
|     if (maxTau - taus < epsilon){ |     if (maxTau - taus < epsilon){ | ||||||
|         epsilon = maxTau-taus; |         epsilon = maxTau-taus; | ||||||
|     } |     } | ||||||
|     std::cout << GridLogMessage << "Integration epsilon : " << epsilon << std::endl; |     //std::cout << GridLogMessage << "Integration epsilon : " << epsilon << std::endl; | ||||||
|     GaugeField Z(U._grid); |     GaugeField Z(U._grid); | ||||||
|     GaugeField Zprime(U._grid); |     GaugeField Zprime(U._grid); | ||||||
|     GaugeField tmp(U._grid), Uprime(U._grid); |     GaugeField tmp(U._grid), Uprime(U._grid); | ||||||
| @@ -138,10 +138,10 @@ void WilsonFlow<Gimpl>::evolve_step_adaptive(typename Gimpl::GaugeField &U, Real | |||||||
|     // adjust integration step |     // adjust integration step | ||||||
|      |      | ||||||
|     taus += epsilon; |     taus += epsilon; | ||||||
|     std::cout << GridLogMessage << "Adjusting integration step with distance: " << diff << std::endl; |     //std::cout << GridLogMessage << "Adjusting integration step with distance: " << diff << std::endl; | ||||||
|      |      | ||||||
|     epsilon = epsilon*0.95*std::pow(1e-4/diff,1./3.); |     epsilon = epsilon*0.95*std::pow(1e-4/diff,1./3.); | ||||||
|     std::cout << GridLogMessage << "New epsilon : " << epsilon << std::endl; |     //std::cout << GridLogMessage << "New epsilon : " << epsilon << std::endl; | ||||||
|  |  | ||||||
| } | } | ||||||
|  |  | ||||||
| @@ -166,7 +166,6 @@ void WilsonFlow<Gimpl>::smear(GaugeField& out, const GaugeField& in) const { | |||||||
|     out = in; |     out = in; | ||||||
|     for (unsigned int step = 1; step <= Nstep; step++) { |     for (unsigned int step = 1; step <= Nstep; step++) { | ||||||
|         auto start = std::chrono::high_resolution_clock::now(); |         auto start = std::chrono::high_resolution_clock::now(); | ||||||
|         std::cout << GridLogMessage << "Evolution time :"<< tau(step) << std::endl; |  | ||||||
|         evolve_step(out); |         evolve_step(out); | ||||||
|         auto end = std::chrono::high_resolution_clock::now(); |         auto end = std::chrono::high_resolution_clock::now(); | ||||||
|         std::chrono::duration<double> diff = end - start; |         std::chrono::duration<double> diff = end - start; | ||||||
| @@ -191,7 +190,7 @@ void WilsonFlow<Gimpl>::smear_adaptive(GaugeField& out, const GaugeField& in, Re | |||||||
|     unsigned int step = 0; |     unsigned int step = 0; | ||||||
|     do{ |     do{ | ||||||
|         step++; |         step++; | ||||||
|         std::cout << GridLogMessage << "Evolution time :"<< taus << std::endl; |         //std::cout << GridLogMessage << "Evolution time :"<< taus << std::endl; | ||||||
|         evolve_step_adaptive(out, maxTau); |         evolve_step_adaptive(out, maxTau); | ||||||
|         std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : " |         std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : " | ||||||
|             << step << "  " |             << step << "  " | ||||||
|   | |||||||
| @@ -1,4 +1,4 @@ | |||||||
|     /************************************************************************************* | /************************************************************************************* | ||||||
|  |  | ||||||
|     Grid physics library, www.github.com/paboyle/Grid |     Grid physics library, www.github.com/paboyle/Grid | ||||||
|  |  | ||||||
| @@ -6,8 +6,9 @@ | |||||||
|  |  | ||||||
|     Copyright (C) 2015 |     Copyright (C) 2015 | ||||||
|  |  | ||||||
| Author: Peter Boyle <paboyle@ph.ed.ac.uk> |     Author: Nils Meyer <nils.meyer@ur.de> | ||||||
| Author: neo <cossu@post.kek.jp> |     Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||||
|  |     Author: neo <cossu@post.kek.jp> | ||||||
|  |  | ||||||
|     This program is free software; you can redistribute it and/or modify |     This program is free software; you can redistribute it and/or modify | ||||||
|     it under the terms of the GNU General Public License as published by |     it under the terms of the GNU General Public License as published by | ||||||
| @@ -26,19 +27,25 @@ Author: neo <cossu@post.kek.jp> | |||||||
|     See the full license in the file "LICENSE" in the top level distribution directory |     See the full license in the file "LICENSE" in the top level distribution directory | ||||||
|     *************************************************************************************/ |     *************************************************************************************/ | ||||||
|     /*  END LEGAL */ |     /*  END LEGAL */ | ||||||
| //---------------------------------------------------------------------- |  | ||||||
| /*! @file Grid_sse4.h |  | ||||||
|   @brief Optimization libraries for NEON (ARM) instructions set ARMv8 |  | ||||||
|  |  | ||||||
|   Experimental - Using intrinsics - DEVELOPING!  | /* | ||||||
|  |  | ||||||
|  |   ARMv8 NEON intrinsics layer by | ||||||
|  |  | ||||||
|  |   Nils Meyer <nils.meyer@ur.de>, | ||||||
|  |   University of Regensburg, Germany | ||||||
|  |   SFB/TRR55 | ||||||
|  |  | ||||||
| */ | */ | ||||||
| // Time-stamp: <2015-07-10 17:45:09 neo> |  | ||||||
| //---------------------------------------------------------------------- |  | ||||||
|  |  | ||||||
|  | #ifndef GEN_SIMD_WIDTH | ||||||
|  | #define GEN_SIMD_WIDTH 16u | ||||||
|  | #endif | ||||||
|  |  | ||||||
|  | #include "Grid_generic_types.h" | ||||||
| #include <arm_neon.h> | #include <arm_neon.h> | ||||||
|  |  | ||||||
| // ARMv8 supports double precision | namespace Grid { | ||||||
|  |  | ||||||
| namespace Optimization { | namespace Optimization { | ||||||
|  |  | ||||||
|   template<class vtype> |   template<class vtype> | ||||||
| @@ -46,14 +53,18 @@ namespace Optimization { | |||||||
|     float32x4_t f; |     float32x4_t f; | ||||||
|     vtype v; |     vtype v; | ||||||
|   }; |   }; | ||||||
|  |  | ||||||
|   union u128f { |   union u128f { | ||||||
|     float32x4_t v; |     float32x4_t v; | ||||||
|     float f[4]; |     float f[4]; | ||||||
|   }; |   }; | ||||||
|   union u128d { |   union u128d { | ||||||
|     float64x2_t v; |     float64x2_t v; | ||||||
|     double f[4]; |     double f[2]; | ||||||
|  |   }; | ||||||
|  |   // half precision | ||||||
|  |   union u128h { | ||||||
|  |     float16x8_t v; | ||||||
|  |     uint16_t f[8]; | ||||||
|   }; |   }; | ||||||
|  |  | ||||||
|   struct Vsplat{ |   struct Vsplat{ | ||||||
| @@ -64,20 +75,20 @@ namespace Optimization { | |||||||
|     } |     } | ||||||
|     // Real float |     // Real float | ||||||
|     inline float32x4_t operator()(float a){ |     inline float32x4_t operator()(float a){ | ||||||
|       return vld1q_dup_f32(&a); |       return vdupq_n_f32(a); | ||||||
|     } |     } | ||||||
|     //Complex double |     //Complex double | ||||||
|     inline float32x4_t operator()(double a, double b){ |     inline float64x2_t operator()(double a, double b){ | ||||||
|       float tmp[4]={(float)a,(float)b,(float)a,(float)b}; |       double tmp[2]={a,b}; | ||||||
|       return vld1q_f32(tmp); |       return vld1q_f64(tmp); | ||||||
|     } |     } | ||||||
|     //Real double |     //Real double // N:tbc | ||||||
|     inline float32x4_t operator()(double a){ |     inline float64x2_t operator()(double a){ | ||||||
|       return vld1q_dup_f32(&a); |       return vdupq_n_f64(a); | ||||||
|     } |     } | ||||||
|     //Integer |     //Integer // N:tbc | ||||||
|     inline uint32x4_t operator()(Integer a){ |     inline uint32x4_t operator()(Integer a){ | ||||||
|       return vld1q_dup_u32(&a); |       return vdupq_n_u32(a); | ||||||
|     } |     } | ||||||
|   }; |   }; | ||||||
|  |  | ||||||
| @@ -87,8 +98,8 @@ namespace Optimization { | |||||||
|       vst1q_f32(F, a); |       vst1q_f32(F, a); | ||||||
|     } |     } | ||||||
|     //Double |     //Double | ||||||
|     inline void operator()(float32x4_t a, double* D){ |     inline void operator()(float64x2_t a, double* D){ | ||||||
|       vst1q_f32((float*)D, a); |       vst1q_f64(D, a); | ||||||
|     } |     } | ||||||
|     //Integer |     //Integer | ||||||
|     inline void operator()(uint32x4_t a, Integer* I){ |     inline void operator()(uint32x4_t a, Integer* I){ | ||||||
| @@ -97,49 +108,49 @@ namespace Optimization { | |||||||
|  |  | ||||||
|   }; |   }; | ||||||
|  |  | ||||||
|   struct Vstream{ |   struct Vstream{ // N:equivalents to _mm_stream_p* in NEON? | ||||||
|     //Float |     //Float // N:generic | ||||||
|     inline void operator()(float * a, float32x4_t b){ |     inline void operator()(float * a, float32x4_t b){ | ||||||
|      |       memcpy(a,&b,4*sizeof(float)); | ||||||
|     } |     } | ||||||
|     //Double |     //Double // N:generic | ||||||
|     inline void operator()(double * a, float32x4_t b){ |     inline void operator()(double * a, float64x2_t b){ | ||||||
|    |       memcpy(a,&b,2*sizeof(double)); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |  | ||||||
|   }; |   }; | ||||||
|  |  | ||||||
|  |   // Nils: Vset untested; not used currently in Grid at all; | ||||||
|  |   // git commit 4a8c4ccfba1d05159348d21a9698028ea847e77b | ||||||
|   struct Vset{ |   struct Vset{ | ||||||
|     // Complex float  |     // Complex float // N:ok | ||||||
|     inline float32x4_t operator()(Grid::ComplexF *a){ |     inline float32x4_t operator()(Grid::ComplexF *a){ | ||||||
|       float32x4_t foo; |       float tmp[4]={a[1].imag(),a[1].real(),a[0].imag(),a[0].real()}; | ||||||
|       return foo; |       return vld1q_f32(tmp); | ||||||
|     } |     } | ||||||
|     // Complex double  |     // Complex double // N:ok | ||||||
|     inline float32x4_t operator()(Grid::ComplexD *a){ |     inline float64x2_t operator()(Grid::ComplexD *a){ | ||||||
|       float32x4_t foo; |       double tmp[2]={a[0].imag(),a[0].real()}; | ||||||
|       return foo; |       return vld1q_f64(tmp); | ||||||
|     } |     } | ||||||
|     // Real float  |     // Real float // N:ok | ||||||
|     inline float32x4_t operator()(float *a){ |     inline float32x4_t operator()(float *a){ | ||||||
|       float32x4_t foo; |       float tmp[4]={a[3],a[2],a[1],a[0]}; | ||||||
|       return foo; |       return vld1q_f32(tmp); | ||||||
|     } |     } | ||||||
|     // Real double |     // Real double // N:ok | ||||||
|     inline float32x4_t operator()(double *a){ |     inline float64x2_t operator()(double *a){ | ||||||
|       float32x4_t foo; |       double tmp[2]={a[1],a[0]}; | ||||||
|       return foo; |       return vld1q_f64(tmp); | ||||||
|     } |     } | ||||||
|     // Integer |     // Integer // N:ok | ||||||
|     inline uint32x4_t operator()(Integer *a){ |     inline uint32x4_t operator()(Integer *a){ | ||||||
|       uint32x4_t foo; |       return vld1q_dup_u32(a); | ||||||
|       return foo; |  | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |  | ||||||
|   }; |   }; | ||||||
|  |  | ||||||
|  |   // N:leaving as is | ||||||
|   template <typename Out_type, typename In_type> |   template <typename Out_type, typename In_type> | ||||||
|   struct Reduce{ |   struct Reduce{ | ||||||
|     //Need templated class to overload output type |     //Need templated class to overload output type | ||||||
| @@ -184,26 +195,98 @@ namespace Optimization { | |||||||
|     } |     } | ||||||
|   }; |   }; | ||||||
|  |  | ||||||
|  |   struct MultRealPart{ | ||||||
|  |     inline float32x4_t operator()(float32x4_t a, float32x4_t b){ | ||||||
|  |       float32x4_t re = vtrn1q_f32(a, a); | ||||||
|  |       return vmulq_f32(re, b); | ||||||
|  |     } | ||||||
|  |     inline float64x2_t operator()(float64x2_t a, float64x2_t b){ | ||||||
|  |       float64x2_t re = vzip1q_f64(a, a); | ||||||
|  |       return vmulq_f64(re, b); | ||||||
|  |     } | ||||||
|  |   }; | ||||||
|  |  | ||||||
|  |   struct MaddRealPart{ | ||||||
|  |     inline float32x4_t operator()(float32x4_t a, float32x4_t b, float32x4_t c){ | ||||||
|  |       float32x4_t re = vtrn1q_f32(a, a); | ||||||
|  |       return vfmaq_f32(c, re, b); | ||||||
|  |     } | ||||||
|  |     inline float64x2_t operator()(float64x2_t a, float64x2_t b, float64x2_t c){ | ||||||
|  |       float64x2_t re = vzip1q_f64(a, a); | ||||||
|  |       return vfmaq_f64(c, re, b); | ||||||
|  |     } | ||||||
|  |   }; | ||||||
|  |  | ||||||
|  |   struct Div{ | ||||||
|  |     // Real float | ||||||
|  |     inline float32x4_t operator()(float32x4_t a, float32x4_t b){ | ||||||
|  |       return vdivq_f32(a, b); | ||||||
|  |     } | ||||||
|  |     // Real double | ||||||
|  |     inline float64x2_t operator()(float64x2_t a, float64x2_t b){ | ||||||
|  |       return vdivq_f64(a, b); | ||||||
|  |     } | ||||||
|  |   }; | ||||||
|  |  | ||||||
|   struct MultComplex{ |   struct MultComplex{ | ||||||
|     // Complex float |     // Complex float | ||||||
|     inline float32x4_t operator()(float32x4_t a, float32x4_t b){ |     inline float32x4_t operator()(float32x4_t a, float32x4_t b){ | ||||||
|       float32x4_t foo; |  | ||||||
|       return foo; |       float32x4_t r0, r1, r2, r3, r4; | ||||||
|  |  | ||||||
|  |       // a = ar ai Ar Ai | ||||||
|  |       // b = br bi Br Bi | ||||||
|  |       // collect real/imag part, negate bi and Bi | ||||||
|  |       r0 = vtrn1q_f32(b, b);       //  br  br  Br  Br | ||||||
|  |       r1 = vnegq_f32(b);           // -br -bi -Br -Bi | ||||||
|  |       r2 = vtrn2q_f32(b, r1);      //  bi -bi  Bi -Bi | ||||||
|  |  | ||||||
|  |       // the fun part | ||||||
|  |       r3 = vmulq_f32(r2, a);       //  bi*ar -bi*ai ... | ||||||
|  |       r4 = vrev64q_f32(r3);        // -bi*ai  bi*ar ... | ||||||
|  |  | ||||||
|  |       // fma(a,b,c) = a+b*c | ||||||
|  |       return vfmaq_f32(r4, r0, a); //  ar*br-ai*bi ai*br+ar*bi ... | ||||||
|  |  | ||||||
|  |       // no fma, use mul and add | ||||||
|  |       //float32x4_t r5; | ||||||
|  |       //r5 = vmulq_f32(r0, a); | ||||||
|  |       //return vaddq_f32(r4, r5); | ||||||
|     } |     } | ||||||
|     // Complex double |     // Complex double | ||||||
|     inline float64x2_t operator()(float64x2_t a, float64x2_t b){ |     inline float64x2_t operator()(float64x2_t a, float64x2_t b){ | ||||||
|       float32x4_t foo; |  | ||||||
|       return foo; |       float64x2_t r0, r1, r2, r3, r4; | ||||||
|  |  | ||||||
|  |       // b = br bi | ||||||
|  |       // collect real/imag part, negate bi | ||||||
|  |       r0 = vtrn1q_f64(b, b);       //  br  br | ||||||
|  |       r1 = vnegq_f64(b);           // -br -bi | ||||||
|  |       r2 = vtrn2q_f64(b, r1);      //  bi -bi | ||||||
|  |  | ||||||
|  |       // the fun part | ||||||
|  |       r3 = vmulq_f64(r2, a);       //  bi*ar -bi*ai | ||||||
|  |       r4 = vextq_f64(r3,r3,1);     // -bi*ai  bi*ar | ||||||
|  |  | ||||||
|  |       // fma(a,b,c) = a+b*c | ||||||
|  |       return vfmaq_f64(r4, r0, a); //  ar*br-ai*bi ai*br+ar*bi | ||||||
|  |  | ||||||
|  |       // no fma, use mul and add | ||||||
|  |       //float64x2_t r5; | ||||||
|  |       //r5 = vmulq_f64(r0, a); | ||||||
|  |       //return vaddq_f64(r4, r5); | ||||||
|     } |     } | ||||||
|   }; |   }; | ||||||
|  |  | ||||||
|   struct Mult{ |   struct Mult{ | ||||||
|     // Real float |     // Real float | ||||||
|     inline float32x4_t mac(float32x4_t a, float32x4_t b, float32x4_t c){ |     inline float32x4_t mac(float32x4_t a, float32x4_t b, float32x4_t c){ | ||||||
|       return vaddq_f32(vmulq_f32(b,c),a); |       //return vaddq_f32(vmulq_f32(b,c),a); | ||||||
|  |       return vfmaq_f32(a, b, c); | ||||||
|     } |     } | ||||||
|     inline float64x2_t mac(float64x2_t a, float64x2_t b, float64x2_t c){ |     inline float64x2_t mac(float64x2_t a, float64x2_t b, float64x2_t c){ | ||||||
|       return vaddq_f64(vmulq_f64(b,c),a); |       //return vaddq_f64(vmulq_f64(b,c),a); | ||||||
|  |       return vfmaq_f64(a, b, c); | ||||||
|     } |     } | ||||||
|     inline float32x4_t operator()(float32x4_t a, float32x4_t b){ |     inline float32x4_t operator()(float32x4_t a, float32x4_t b){ | ||||||
|       return vmulq_f32(a,b); |       return vmulq_f32(a,b); | ||||||
| @@ -221,74 +304,259 @@ namespace Optimization { | |||||||
|   struct Conj{ |   struct Conj{ | ||||||
|     // Complex single |     // Complex single | ||||||
|     inline float32x4_t operator()(float32x4_t in){ |     inline float32x4_t operator()(float32x4_t in){ | ||||||
|       return in; |       // ar ai br bi -> ar -ai br -bi | ||||||
|  |       float32x4_t r0, r1; | ||||||
|  |       r0 = vnegq_f32(in);        // -ar -ai -br -bi | ||||||
|  |       r1 = vrev64q_f32(r0);      // -ai -ar -bi -br | ||||||
|  |       return vtrn1q_f32(in, r1); //  ar -ai  br -bi | ||||||
|     } |     } | ||||||
|     // Complex double |     // Complex double | ||||||
|     //inline float32x4_t operator()(float32x4_t in){ |     inline float64x2_t operator()(float64x2_t in){ | ||||||
|     // return 0; |  | ||||||
|     //} |       float64x2_t r0, r1; | ||||||
|  |       r0 = vextq_f64(in, in, 1);    //  ai  ar | ||||||
|  |       r1 = vnegq_f64(r0);           // -ai -ar | ||||||
|  |       return vextq_f64(r0, r1, 1);  //  ar -ai | ||||||
|  |     } | ||||||
|     // do not define for integer input |     // do not define for integer input | ||||||
|   }; |   }; | ||||||
|  |  | ||||||
|   struct TimesMinusI{ |   struct TimesMinusI{ | ||||||
|     //Complex single |     //Complex single | ||||||
|     inline float32x4_t operator()(float32x4_t in, float32x4_t ret){ |     inline float32x4_t operator()(float32x4_t in, float32x4_t ret){ | ||||||
|       return in; |       // ar ai br bi -> ai -ar ai -br | ||||||
|  |       float32x4_t r0, r1; | ||||||
|  |       r0 = vnegq_f32(in);        // -ar -ai -br -bi | ||||||
|  |       r1 = vrev64q_f32(in);      //  ai  ar  bi  br | ||||||
|  |       return vtrn1q_f32(r1, r0); //  ar -ai  br -bi | ||||||
|     } |     } | ||||||
|     //Complex double |     //Complex double | ||||||
|     //inline float32x4_t operator()(float32x4_t in, float32x4_t ret){ |     inline float64x2_t operator()(float64x2_t in, float64x2_t ret){ | ||||||
|     //  return in; |       // a ib -> b -ia | ||||||
|     //} |       float64x2_t tmp; | ||||||
|  |       tmp = vnegq_f64(in); | ||||||
|  |       return vextq_f64(in, tmp, 1); | ||||||
|  |     } | ||||||
|   }; |   }; | ||||||
|  |  | ||||||
|   struct TimesI{ |   struct TimesI{ | ||||||
|     //Complex single |     //Complex single | ||||||
|     inline float32x4_t operator()(float32x4_t in, float32x4_t ret){ |     inline float32x4_t operator()(float32x4_t in, float32x4_t ret){ | ||||||
|       //need shuffle |       // ar ai br bi -> -ai ar -bi br | ||||||
|       return in; |       float32x4_t r0, r1; | ||||||
|  |       r0 = vnegq_f32(in);        // -ar -ai -br -bi | ||||||
|  |       r1 = vrev64q_f32(r0);      // -ai -ar -bi -br | ||||||
|  |       return vtrn1q_f32(r1, in); // -ai  ar -bi  br | ||||||
|     } |     } | ||||||
|     //Complex double |     //Complex double | ||||||
|     //inline float32x4_t operator()(float32x4_t in, float32x4_t ret){ |     inline float64x2_t operator()(float64x2_t in, float64x2_t ret){ | ||||||
|     //  return 0; |       // a ib -> -b ia | ||||||
|     //} |       float64x2_t tmp; | ||||||
|  |       tmp = vnegq_f64(in); | ||||||
|  |       return vextq_f64(tmp, in, 1); | ||||||
|  |     } | ||||||
|  |   }; | ||||||
|  |  | ||||||
|  |   struct Permute{ | ||||||
|  |  | ||||||
|  |     static inline float32x4_t Permute0(float32x4_t in){ // N:ok | ||||||
|  |       // AB CD -> CD AB | ||||||
|  |       return vextq_f32(in, in, 2); | ||||||
|  |     }; | ||||||
|  |     static inline float32x4_t Permute1(float32x4_t in){ // N:ok | ||||||
|  |       // AB CD -> BA DC | ||||||
|  |       return vrev64q_f32(in); | ||||||
|  |     }; | ||||||
|  |     static inline float32x4_t Permute2(float32x4_t in){ // N:not used by Boyle | ||||||
|  |       return in; | ||||||
|  |     }; | ||||||
|  |     static inline float32x4_t Permute3(float32x4_t in){ // N:not used by Boyle | ||||||
|  |       return in; | ||||||
|  |     }; | ||||||
|  |  | ||||||
|  |     static inline float64x2_t Permute0(float64x2_t in){ // N:ok | ||||||
|  |       // AB -> BA | ||||||
|  |       return vextq_f64(in, in, 1); | ||||||
|  |     }; | ||||||
|  |     static inline float64x2_t Permute1(float64x2_t in){ // N:not used by Boyle | ||||||
|  |       return in; | ||||||
|  |     }; | ||||||
|  |     static inline float64x2_t Permute2(float64x2_t in){ // N:not used by Boyle | ||||||
|  |       return in; | ||||||
|  |     }; | ||||||
|  |     static inline float64x2_t Permute3(float64x2_t in){ // N:not used by Boyle | ||||||
|  |       return in; | ||||||
|  |     }; | ||||||
|  |  | ||||||
|  |   }; | ||||||
|  |  | ||||||
|  |   struct Rotate{ | ||||||
|  |  | ||||||
|  |     static inline float32x4_t rotate(float32x4_t in,int n){ // N:ok | ||||||
|  |       switch(n){ | ||||||
|  |       case 0: // AB CD -> AB CD | ||||||
|  |         return tRotate<0>(in); | ||||||
|  |         break; | ||||||
|  |       case 1: // AB CD -> BC DA | ||||||
|  |         return tRotate<1>(in); | ||||||
|  |         break; | ||||||
|  |       case 2: // AB CD -> CD AB | ||||||
|  |         return tRotate<2>(in); | ||||||
|  |         break; | ||||||
|  |       case 3: // AB CD -> DA BC | ||||||
|  |         return tRotate<3>(in); | ||||||
|  |         break; | ||||||
|  |       default: assert(0); | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  |     static inline float64x2_t rotate(float64x2_t in,int n){ // N:ok | ||||||
|  |       switch(n){ | ||||||
|  |       case 0: // AB -> AB | ||||||
|  |         return tRotate<0>(in); | ||||||
|  |         break; | ||||||
|  |       case 1: // AB -> BA | ||||||
|  |         return tRotate<1>(in); | ||||||
|  |         break; | ||||||
|  |       default: assert(0); | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  |  | ||||||
|  | // working, but no restriction on n | ||||||
|  | //    template<int n> static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n); }; | ||||||
|  | //    template<int n> static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n); }; | ||||||
|  |  | ||||||
|  | // restriction on n | ||||||
|  |     template<int n> static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n%4); }; | ||||||
|  |     template<int n> static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n%2); }; | ||||||
|  |  | ||||||
|  |   }; | ||||||
|  |  | ||||||
|  |   struct PrecisionChange { | ||||||
|  |  | ||||||
|  |     static inline float16x8_t StoH (const float32x4_t &a,const float32x4_t &b) { | ||||||
|  |       float16x4_t h = vcvt_f16_f32(a); | ||||||
|  |       return vcvt_high_f16_f32(h, b); | ||||||
|  |     } | ||||||
|  |     static inline void  HtoS (float16x8_t h,float32x4_t &sa,float32x4_t &sb) { | ||||||
|  |       sb = vcvt_high_f32_f16(h); | ||||||
|  |       // there is no direct conversion from lower float32x4_t to float64x2_t | ||||||
|  |       // vextq_f16 not supported by clang 3.8 / 4.0 / arm clang | ||||||
|  |       //float16x8_t h1 = vextq_f16(h, h, 4); // correct, but not supported by clang | ||||||
|  |       // workaround for clang | ||||||
|  |       uint32x4_t h1u = reinterpret_cast<uint32x4_t>(h); | ||||||
|  |       float16x8_t h1 = reinterpret_cast<float16x8_t>(vextq_u32(h1u, h1u, 2)); | ||||||
|  |       sa = vcvt_high_f32_f16(h1); | ||||||
|  |     } | ||||||
|  |     static inline float32x4_t DtoS (float64x2_t a,float64x2_t b) { | ||||||
|  |       float32x2_t s = vcvt_f32_f64(a); | ||||||
|  |       return vcvt_high_f32_f64(s, b); | ||||||
|  |  | ||||||
|  |     } | ||||||
|  |     static inline void StoD (float32x4_t s,float64x2_t &a,float64x2_t &b) { | ||||||
|  |       b = vcvt_high_f64_f32(s); | ||||||
|  |       // there is no direct conversion from lower float32x4_t to float64x2_t | ||||||
|  |       float32x4_t s1 = vextq_f32(s, s, 2); | ||||||
|  |       a = vcvt_high_f64_f32(s1); | ||||||
|  |  | ||||||
|  |     } | ||||||
|  |     static inline float16x8_t DtoH (float64x2_t a,float64x2_t b,float64x2_t c,float64x2_t d) { | ||||||
|  |       float32x4_t s1 = DtoS(a, b); | ||||||
|  |       float32x4_t s2 = DtoS(c, d); | ||||||
|  |       return StoH(s1, s2); | ||||||
|  |     } | ||||||
|  |     static inline void HtoD (float16x8_t h,float64x2_t &a,float64x2_t &b,float64x2_t &c,float64x2_t &d) { | ||||||
|  |       float32x4_t s1, s2; | ||||||
|  |       HtoS(h, s1, s2); | ||||||
|  |       StoD(s1, a, b); | ||||||
|  |       StoD(s2, c, d); | ||||||
|  |     } | ||||||
|  |   }; | ||||||
|  |  | ||||||
|  |   ////////////////////////////////////////////// | ||||||
|  |   // Exchange support | ||||||
|  |  | ||||||
|  |   struct Exchange{ | ||||||
|  |     static inline void Exchange0(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){ | ||||||
|  |       // in1: ABCD -> out1: ABEF | ||||||
|  |       // in2: EFGH -> out2: CDGH | ||||||
|  |  | ||||||
|  |       // z: CDAB | ||||||
|  |       float32x4_t z = vextq_f32(in1, in1, 2); | ||||||
|  |       // out1: ABEF | ||||||
|  |       out1 = vextq_f32(z, in2, 2); | ||||||
|  |  | ||||||
|  |       // z: GHEF | ||||||
|  |       z = vextq_f32(in2, in2, 2); | ||||||
|  |       // out2: CDGH | ||||||
|  |       out2 = vextq_f32(in1, z, 2); | ||||||
|  |     }; | ||||||
|  |  | ||||||
|  |     static inline void Exchange1(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){ | ||||||
|  |       // in1: ABCD -> out1: AECG | ||||||
|  |       // in2: EFGH -> out2: BFDH | ||||||
|  |       out1 = vtrn1q_f32(in1, in2); | ||||||
|  |       out2 = vtrn2q_f32(in1, in2); | ||||||
|  |     }; | ||||||
|  |     static inline void Exchange2(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){ | ||||||
|  |       assert(0); | ||||||
|  |       return; | ||||||
|  |     }; | ||||||
|  |     static inline void Exchange3(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){ | ||||||
|  |       assert(0); | ||||||
|  |       return; | ||||||
|  |     }; | ||||||
|  |     // double precision | ||||||
|  |     static inline void Exchange0(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){ | ||||||
|  |       // in1: AB -> out1: AC | ||||||
|  |       // in2: CD -> out2: BD | ||||||
|  |       out1 = vzip1q_f64(in1, in2); | ||||||
|  |       out2 = vzip2q_f64(in1, in2); | ||||||
|  |     }; | ||||||
|  |     static inline void Exchange1(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){ | ||||||
|  |       assert(0); | ||||||
|  |       return; | ||||||
|  |     }; | ||||||
|  |     static inline void Exchange2(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){ | ||||||
|  |       assert(0); | ||||||
|  |       return; | ||||||
|  |     }; | ||||||
|  |     static inline void Exchange3(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){ | ||||||
|  |       assert(0); | ||||||
|  |       return; | ||||||
|  |     }; | ||||||
|   }; |   }; | ||||||
|  |  | ||||||
|   ////////////////////////////////////////////// |   ////////////////////////////////////////////// | ||||||
|   // Some Template specialization |   // Some Template specialization | ||||||
|   template < typename vtype >  |  | ||||||
|     void permute(vtype &a, vtype b, int perm) { |  | ||||||
|  |  | ||||||
|   };  |  | ||||||
|  |  | ||||||
|   //Complex float Reduce |   //Complex float Reduce | ||||||
|   template<> |   template<> | ||||||
|   inline Grid::ComplexF Reduce<Grid::ComplexF, float32x4_t>::operator()(float32x4_t in){ |   inline Grid::ComplexF Reduce<Grid::ComplexF, float32x4_t>::operator()(float32x4_t in){ | ||||||
|     return 0; |     float32x4_t v1; // two complex | ||||||
|  |     v1 = Optimization::Permute::Permute0(in); | ||||||
|  |     v1 = vaddq_f32(v1,in); | ||||||
|  |     u128f conv;    conv.v=v1; | ||||||
|  |     return Grid::ComplexF(conv.f[0],conv.f[1]); | ||||||
|   } |   } | ||||||
|   //Real float Reduce |   //Real float Reduce | ||||||
|   template<> |   template<> | ||||||
|   inline Grid::RealF Reduce<Grid::RealF, float32x4_t>::operator()(float32x4_t in){ |   inline Grid::RealF Reduce<Grid::RealF, float32x4_t>::operator()(float32x4_t in){ | ||||||
|     float32x2_t high = vget_high_f32(in); |     return vaddvq_f32(in); | ||||||
|     float32x2_t low = vget_low_f32(in); |  | ||||||
|     float32x2_t tmp = vadd_f32(low, high); |  | ||||||
|     float32x2_t sum = vpadd_f32(tmp, tmp); |  | ||||||
|     return vget_lane_f32(sum,0); |  | ||||||
|   } |   } | ||||||
|  |  | ||||||
|  |  | ||||||
|   //Complex double Reduce |   //Complex double Reduce | ||||||
|   template<> |   template<> // N:by Boyle | ||||||
|   inline Grid::ComplexD Reduce<Grid::ComplexD, float64x2_t>::operator()(float64x2_t in){ |   inline Grid::ComplexD Reduce<Grid::ComplexD, float64x2_t>::operator()(float64x2_t in){ | ||||||
|     return 0; |     u128d conv; conv.v = in; | ||||||
|  |     return Grid::ComplexD(conv.f[0],conv.f[1]); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   //Real double Reduce |   //Real double Reduce | ||||||
|   template<> |   template<> | ||||||
|   inline Grid::RealD Reduce<Grid::RealD, float64x2_t>::operator()(float64x2_t in){ |   inline Grid::RealD Reduce<Grid::RealD, float64x2_t>::operator()(float64x2_t in){ | ||||||
|     float64x2_t sum = vpaddq_f64(in, in); |     return vaddvq_f64(in); | ||||||
|     return vgetq_lane_f64(sum,0); |  | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   //Integer Reduce |   //Integer Reduce | ||||||
| @@ -302,8 +570,9 @@ namespace Optimization { | |||||||
|  |  | ||||||
| ////////////////////////////////////////////////////////////////////////////////////// | ////////////////////////////////////////////////////////////////////////////////////// | ||||||
| // Here assign types | // Here assign types | ||||||
| namespace Grid { |  | ||||||
|  |  | ||||||
|  | // typedef Optimization::vech SIMD_Htype; // Reduced precision type | ||||||
|  |   typedef float16x8_t  SIMD_Htype; // Half precision type | ||||||
|   typedef float32x4_t  SIMD_Ftype; // Single precision type |   typedef float32x4_t  SIMD_Ftype; // Single precision type | ||||||
|   typedef float64x2_t  SIMD_Dtype; // Double precision type |   typedef float64x2_t  SIMD_Dtype; // Double precision type | ||||||
|   typedef uint32x4_t   SIMD_Itype; // Integer type |   typedef uint32x4_t   SIMD_Itype; // Integer type | ||||||
| @@ -312,13 +581,6 @@ namespace Grid { | |||||||
|   inline void prefetch_HINT_T0(const char *ptr){}; |   inline void prefetch_HINT_T0(const char *ptr){}; | ||||||
|  |  | ||||||
|  |  | ||||||
|   // Gpermute function |  | ||||||
|   template < typename VectorSIMD >  |  | ||||||
|     inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) { |  | ||||||
|     Optimization::permute(y.v,b.v,perm); |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|  |  | ||||||
|   // Function name aliases |   // Function name aliases | ||||||
|   typedef Optimization::Vsplat   VsplatSIMD; |   typedef Optimization::Vsplat   VsplatSIMD; | ||||||
|   typedef Optimization::Vstore   VstoreSIMD; |   typedef Optimization::Vstore   VstoreSIMD; | ||||||
| @@ -332,8 +594,11 @@ namespace Grid { | |||||||
|   // Arithmetic operations |   // Arithmetic operations | ||||||
|   typedef Optimization::Sum         SumSIMD; |   typedef Optimization::Sum         SumSIMD; | ||||||
|   typedef Optimization::Sub         SubSIMD; |   typedef Optimization::Sub         SubSIMD; | ||||||
|  |   typedef Optimization::Div         DivSIMD; | ||||||
|   typedef Optimization::Mult        MultSIMD; |   typedef Optimization::Mult        MultSIMD; | ||||||
|   typedef Optimization::MultComplex MultComplexSIMD; |   typedef Optimization::MultComplex MultComplexSIMD; | ||||||
|  |   typedef Optimization::MultRealPart MultRealPartSIMD; | ||||||
|  |   typedef Optimization::MaddRealPart MaddRealPartSIMD; | ||||||
|   typedef Optimization::Conj        ConjSIMD; |   typedef Optimization::Conj        ConjSIMD; | ||||||
|   typedef Optimization::TimesMinusI TimesMinusISIMD; |   typedef Optimization::TimesMinusI TimesMinusISIMD; | ||||||
|   typedef Optimization::TimesI      TimesISIMD; |   typedef Optimization::TimesI      TimesISIMD; | ||||||
|   | |||||||
| @@ -53,7 +53,7 @@ directory | |||||||
| #if defined IMCI | #if defined IMCI | ||||||
| #include "Grid_imci.h" | #include "Grid_imci.h" | ||||||
| #endif | #endif | ||||||
| #ifdef NEONv8 | #ifdef NEONV8 | ||||||
| #include "Grid_neon.h" | #include "Grid_neon.h" | ||||||
| #endif | #endif | ||||||
| #if defined QPX | #if defined QPX | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user