From 9adaeb061a39aaea5f0c10545fbcd3207eff4474 Mon Sep 17 00:00:00 2001 From: neo Date: Tue, 21 Jul 2015 11:52:15 +0900 Subject: [PATCH] More NEON functionalities --- configure | 6 +- configure.ac | 8 +- lib/GridConfig.h.in | 4 +- lib/qcd/action/gauge/WilsonGaugeAction.h | 3 +- lib/qcd/utils/WilsonLoops.h | 1 - lib/simd/Grid_neon.h | 93 ++++++++++----------- lib/simd/Grid_vector_types.h | 4 +- scripts/arm_configure.experimental_cortex57 | 3 + tests/Test_hmc_WilsonGauge.cc | 4 +- tests/Test_simd.cc | 49 +++++------ 10 files changed, 88 insertions(+), 87 deletions(-) create mode 100644 scripts/arm_configure.experimental_cortex57 diff --git a/configure b/configure index 935f1018..cc11a210 100755 --- a/configure +++ b/configure @@ -6712,10 +6712,10 @@ $as_echo "#define AVX512 1" >>confdefs.h supported="cross compilation" ;; - NEONv7) - echo Configuring for experimental ARMv7 support + NEONv8) + echo Configuring for experimental ARMv8a support -$as_echo "#define NEONv7 1" >>confdefs.h +$as_echo "#define NEONv8 1" >>confdefs.h supported="cross compilation" ;; diff --git a/configure.ac b/configure.ac index 5ef6fa8d..be01228c 100644 --- a/configure.ac +++ b/configure.ac @@ -3,7 +3,7 @@ # # Project Grid package # -# Time-stamp: <2015-06-09 15:26:39 neo> +# Time-stamp: <2015-07-10 17:46:21 neo> AC_PREREQ([2.63]) AC_INIT([Grid], [1.0], [paboyle@ph.ed.ac.uk]) @@ -106,9 +106,9 @@ case ${ac_SIMD} in AC_DEFINE([AVX512],[1],[AVX512 Intrinsics for Knights Corner] ) supported="cross compilation" ;; - NEONv7) - echo Configuring for experimental ARMv7 support - AC_DEFINE([NEONv7],[1],[NEON ARMv7 Experimental support ] ) + NEONv8) + echo Configuring for experimental ARMv8a support + AC_DEFINE([NEONv8],[1],[NEON ARMv8 Experimental support ] ) supported="cross compilation" ;; DEBUG) diff --git a/lib/GridConfig.h.in b/lib/GridConfig.h.in index 1ec0ecf5..136d7bfb 100644 --- a/lib/GridConfig.h.in +++ b/lib/GridConfig.h.in @@ -119,8 +119,8 @@ /* Define to 1 if you have the header file. */ #undef HAVE_UNISTD_H -/* NEON ARMv7 Experimental support */ -#undef NEONv7 +/* NEON ARMv8 Experimental support */ +#undef NEONv8 /* Name of package */ #undef PACKAGE diff --git a/lib/qcd/action/gauge/WilsonGaugeAction.h b/lib/qcd/action/gauge/WilsonGaugeAction.h index aaedf2f9..ad1aecab 100644 --- a/lib/qcd/action/gauge/WilsonGaugeAction.h +++ b/lib/qcd/action/gauge/WilsonGaugeAction.h @@ -7,7 +7,7 @@ namespace Grid{ //////////////////////////////////////////////////////////////////////// // Wilson Gauge Action .. should I template the Nc etc.. //////////////////////////////////////////////////////////////////////// - template + template class WilsonGaugeAction : public Action { private: RealD beta; @@ -23,7 +23,6 @@ namespace Grid{ return beta*(1.0 -plaq)*(Nd*(Nd-1.0))*vol*0.5; }; virtual void deriv(const GaugeField &U,GaugeField & dSdU) { - //not optimal implementation FIXME //extend Ta to include Lorentz indexes RealD factor = 0.5*beta/RealD(Nc); diff --git a/lib/qcd/utils/WilsonLoops.h b/lib/qcd/utils/WilsonLoops.h index 9b8e8724..4565c8e8 100644 --- a/lib/qcd/utils/WilsonLoops.h +++ b/lib/qcd/utils/WilsonLoops.h @@ -7,7 +7,6 @@ namespace QCD { template class WilsonLoops { public: - ////////////////////////////////////////////////// // directed plaquette oriented in mu,nu plane ////////////////////////////////////////////////// diff --git a/lib/simd/Grid_neon.h b/lib/simd/Grid_neon.h index 73660b40..3d36ea95 100644 --- a/lib/simd/Grid_neon.h +++ b/lib/simd/Grid_neon.h @@ -1,14 +1,16 @@ //---------------------------------------------------------------------- /*! @file Grid_sse4.h - @brief Optimization libraries for NEON (ARM) instructions set ARMv7 + @brief Optimization libraries for NEON (ARM) instructions set ARMv8 Experimental - Using intrinsics - DEVELOPING! */ -// Time-stamp: <2015-06-09 15:25:40 neo> +// Time-stamp: <2015-07-10 17:45:09 neo> //---------------------------------------------------------------------- #include +// ARMv8 supports double precision + namespace Optimization { template @@ -22,50 +24,47 @@ namespace Optimization { float f[4]; }; union u128d { - float32x4_t v; - float f[4]; + float64x2_t v; + double f[4]; }; struct Vsplat{ //Complex float inline float32x4_t operator()(float a, float b){ - float32x4_t foo; - return foo; + float tmp[4]={a,b,a,b}; + return vld1q_f32(tmp); } // Real float inline float32x4_t operator()(float a){ - float32x4_t foo; - return foo; + return vld1q_dup_f32(&a); } //Complex double inline float32x4_t operator()(double a, double b){ - float32x4_t foo; - return foo; + float tmp[4]={(float)a,(float)b,(float)a,(float)b}; + return vld1q_f32(tmp); } //Real double inline float32x4_t operator()(double a){ - float32x4_t foo; - return foo; + return vld1q_dup_f32(&a); } //Integer inline uint32x4_t operator()(Integer a){ - uint32x4_t foo; - return foo; + return vld1q_dup_u32(&a); } }; struct Vstore{ //Float inline void operator()(float32x4_t a, float* F){ - + vst1q_f32(F, a); } //Double inline void operator()(float32x4_t a, double* D){ - + vst1q_f32((float*)D, a); } //Integer inline void operator()(uint32x4_t a, Integer* I){ - + vst1q_u32(I, a); } }; @@ -130,36 +129,30 @@ namespace Optimization { struct Sum{ //Complex/Real float inline float32x4_t operator()(float32x4_t a, float32x4_t b){ - float32x4_t foo; - return foo; + return vaddq_f32(a,b); } //Complex/Real double - //inline float32x4_t operator()(float32x4_t a, float32x4_t b){ - // float32x4_t foo; - // return foo; - //} + inline float64x2_t operator()(float64x2_t a, float64x2_t b){ + return vaddq_f64(a,b); + } //Integer inline uint32x4_t operator()(uint32x4_t a, uint32x4_t b){ - uint32x4_t foo; - return foo; + return vaddq_u32(a,b); } }; struct Sub{ //Complex/Real float inline float32x4_t operator()(float32x4_t a, float32x4_t b){ - float32x4_t foo; - return foo; + return vsubq_f32(a,b); } //Complex/Real double - //inline float32x4_t operator()(float32x4_t a, float32x4_t b){ - // float32x4_t foo; - // return foo; - //} + inline float64x2_t operator()(float64x2_t a, float64x2_t b){ + return vsubq_f64(a,b); + } //Integer inline uint32x4_t operator()(uint32x4_t a, uint32x4_t b){ - uint32x4_t foo; - return foo; + return vsubq_u32(a,b); } }; @@ -170,24 +163,24 @@ namespace Optimization { return foo; } // Complex double - //inline float32x4_t operator()(float32x4_t a, float32x4_t b){ - // float32x4_t foo; - // return foo; - //} + inline float64x2_t operator()(float64x2_t a, float64x2_t b){ + float32x4_t foo; + return foo; + } }; struct Mult{ // Real float inline float32x4_t operator()(float32x4_t a, float32x4_t b){ - return a; + return vmulq_f32(a,b); } // Real double - //inline float32x4_t operator()(float32x4_t a, float32x4_t b){ - // return 0; - //} + inline float64x2_t operator()(float64x2_t a, float64x2_t b){ + return vmulq_f64(a,b); + } // Integer inline uint32x4_t operator()(uint32x4_t a, uint32x4_t b){ - return a; + return vmulq_u32(a,b); } }; @@ -219,6 +212,7 @@ namespace Optimization { struct TimesI{ //Complex single inline float32x4_t operator()(float32x4_t in, float32x4_t ret){ + //need shuffle return in; } //Complex double @@ -242,20 +236,25 @@ namespace Optimization { //Real float Reduce template<> inline Grid::RealF Reduce::operator()(float32x4_t in){ - return 0; + float32x2_t high = vget_high_f32(in); + float32x2_t low = vget_low_f32(in); + float32x2_t tmp = vadd_f32(low, high); + float32x2_t sum = vpadd_f32(tmp, tmp); + return vget_lane_f32(sum,0); } //Complex double Reduce template<> - inline Grid::ComplexD Reduce::operator()(float32x4_t in){ + inline Grid::ComplexD Reduce::operator()(float64x2_t in){ return 0; } //Real double Reduce template<> - inline Grid::RealD Reduce::operator()(float32x4_t in){ - return 0; + inline Grid::RealD Reduce::operator()(float64x2_t in){ + float64x2_t sum = vpaddq_f64(in, in); + return vgetq_lane_f64(sum,0); } //Integer Reduce @@ -272,7 +271,7 @@ namespace Optimization { namespace Grid { typedef float32x4_t SIMD_Ftype; // Single precision type - typedef float32x4_t SIMD_Dtype; // Double precision type - no double on ARMv7 + typedef float64x2_t SIMD_Dtype; // Double precision type typedef uint32x4_t SIMD_Itype; // Integer type inline void v_prefetch0(int size, const char *ptr){}; // prefetch utilities diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h index 034d5314..4a0ea33b 100644 --- a/lib/simd/Grid_vector_types.h +++ b/lib/simd/Grid_vector_types.h @@ -2,7 +2,7 @@ /*! @file Grid_vector_types.h @brief Defines templated class Grid_simd to deal with inner vector types */ -// Time-stamp: <2015-06-09 15:00:47 neo> +// Time-stamp: <2015-07-10 17:45:33 neo> //--------------------------------------------------------------------------- #ifndef GRID_VECTOR_TYPES #define GRID_VECTOR_TYPES @@ -22,7 +22,7 @@ #if defined QPX #include "Grid_qpx.h" #endif -#ifdef NEONv7 +#ifdef NEONv8 #include "Grid_neon.h" #endif diff --git a/scripts/arm_configure.experimental_cortex57 b/scripts/arm_configure.experimental_cortex57 new file mode 100644 index 00000000..d229763e --- /dev/null +++ b/scripts/arm_configure.experimental_cortex57 @@ -0,0 +1,3 @@ +#./configure --host=arm-linux-gnueabihf CXX=clang++-3.5 CXXFLAGS='-std=c++11 -O3 -target arm-linux-gnueabihf -I/usr/arm-linux-gnueabihf/include/ -I/home/neo/Codes/gmp6.0/gmp-arm/include/ -I/usr/lib/llvm-3.5/lib/clang/3.5.0/include/ -L/home/neo/Codes/gmp6.0/gmp-arm/lib/ -I/home/neo/Codes/mpfr3.1.2/mpfr-arm/include/ -L/home/neo/Codes/mpfr3.1.2/mpfr-arm/lib/ -static -mcpu=cortex-a57' --enable-simd=NEONv7 + +./configure --host=aarch64-linux-gnu CXX=clang++-3.5 CXXFLAGS='-std=c++11 -O3 -target aarch64-linux-gnu -static -I/home/neo/Codes/gmp6.0/gmp-armv8/include/ -L/home/neo/Codes/gmp6.0/gmp-armv8/lib/ -I/home/neo/Codes/mpfr3.1.2/mpfr-armv8/include/ -L/home/neo/Codes/mpfr3.1.2/mpfr-armv8/lib/ -I/usr/aarch64-linux-gnu/include/ -I/usr/aarch64-linux-gnu/include/c++/4.8.2/aarch64-linux-gnu/' --enable-simd=NEONv7 diff --git a/tests/Test_hmc_WilsonGauge.cc b/tests/Test_hmc_WilsonGauge.cc index d39ec9b9..7682581d 100644 --- a/tests/Test_hmc_WilsonGauge.cc +++ b/tests/Test_hmc_WilsonGauge.cc @@ -24,13 +24,13 @@ int main (int argc, char ** argv) GridCartesian Fine(latt_size,simd_layout,mpi_layout); GridParallelRNG pRNG(&Fine); pRNG.SeedRandomDevice(); - LatticeLorentzColourMatrix U(&Fine); + LatticeGaugeField U(&Fine); SU3::HotConfiguration(pRNG, U); // simplify template declaration? Strip the lorentz from the second template - WilsonGaugeAction Waction(6.0); + WilsonGaugeAction Waction(6.0); //Collect actions ActionLevel Level1; diff --git a/tests/Test_simd.cc b/tests/Test_simd.cc index 800001be..ee50a312 100644 --- a/tests/Test_simd.cc +++ b/tests/Test_simd.cc @@ -194,6 +194,31 @@ int main (int argc, char ** argv) // Insist that operations on random scalars gives // identical results to on vectors. + std::cout << "==================================="<< std::endl; + std::cout << "Testing vRealF "<(funcPlus()); + Tester(funcMinus()); + Tester(funcTimes()); + Tester(funcAdj()); + Tester(funcConj()); + Tester(funcInnerProduct()); + ReductionTester(funcReduce()); + + std::cout << "==================================="<< std::endl; + std::cout << "Testing vRealD "<(funcPlus()); + Tester(funcMinus()); + Tester(funcTimes()); + Tester(funcAdj()); + Tester(funcConj()); + Tester(funcInnerProduct()); + ReductionTester(funcReduce()); + std::cout << "==================================="<< std::endl; std::cout << "Testing vComplexF "<(funcInnerProduct()); ReductionTester(funcReduce()); - std::cout << "==================================="<< std::endl; - std::cout << "Testing vRealF "<(funcPlus()); - Tester(funcMinus()); - Tester(funcTimes()); - Tester(funcAdj()); - Tester(funcConj()); - Tester(funcInnerProduct()); - ReductionTester(funcReduce()); - - std::cout << "==================================="<< std::endl; - std::cout << "Testing vRealD "<(funcPlus()); - Tester(funcMinus()); - Tester(funcTimes()); - Tester(funcAdj()); - Tester(funcConj()); - Tester(funcInnerProduct()); - ReductionTester(funcReduce()); Grid_finalize(); }