More NEON functionalities

2026-03-16 17:26:09 +00:00 · 2015-07-21 11:52:15 +09:00
parent 97afe4125f
commit 9adaeb061a
10 changed files with 88 additions and 87 deletions
--- a/6
+++ b/6
@@ -6712,10 +6712,10 @@ $as_echo "#define AVX512 1" >>confdefs.h

       supported="cross compilation"
     ;;
-     NEONv7)
-       echo Configuring for experimental ARMv7 support
+     NEONv8)
+       echo Configuring for experimental ARMv8a support

-$as_echo "#define NEONv7 1" >>confdefs.h
+$as_echo "#define NEONv8 1" >>confdefs.h

       supported="cross compilation"
     ;;
--- a/configure.ac
+++ b/configure.ac
@@ -3,7 +3,7 @@
 #
 # Project Grid package  
 # 
-# Time-stamp: <2015-06-09 15:26:39 neo>
+# Time-stamp: <2015-07-10 17:46:21 neo>

 AC_PREREQ([2.63])
 AC_INIT([Grid], [1.0], [paboyle@ph.ed.ac.uk])
@@ -106,9 +106,9 @@ case ${ac_SIMD} in
       AC_DEFINE([AVX512],[1],[AVX512 Intrinsics for Knights Corner] )
       supported="cross compilation"
     ;;
-     NEONv7)
-       echo Configuring for experimental ARMv7 support 
-       AC_DEFINE([NEONv7],[1],[NEON ARMv7 Experimental support ] )
+     NEONv8)
+       echo Configuring for experimental ARMv8a support 
+       AC_DEFINE([NEONv8],[1],[NEON ARMv8 Experimental support ] )
       supported="cross compilation"
     ;;
     DEBUG)
--- a/lib/GridConfig.h.in
+++ b/lib/GridConfig.h.in
@@ -119,8 +119,8 @@
 /* Define to 1 if you have the <unistd.h> header file. */
 #undef HAVE_UNISTD_H

-/* NEON ARMv7 Experimental support */
-#undef NEONv7
+/* NEON ARMv8 Experimental support */
+#undef NEONv8

 /* Name of package */
 #undef PACKAGE
--- a/lib/qcd/action/gauge/WilsonGaugeAction.h
+++ b/lib/qcd/action/gauge/WilsonGaugeAction.h
@@ -7,7 +7,7 @@ namespace Grid{
    ////////////////////////////////////////////////////////////////////////
    // Wilson Gauge Action .. should I template the Nc etc..
    ////////////////////////////////////////////////////////////////////////
-    template<class GaugeField,class MatrixField>
+    template<class GaugeField, class MatrixField>
      class WilsonGaugeAction : public Action<GaugeField> {
    private:
      RealD beta;
@@ -23,7 +23,6 @@ namespace Grid{
 	return beta*(1.0 -plaq)*(Nd*(Nd-1.0))*vol*0.5;
      };
      virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
-
 	//not optimal implementation FIXME
 	//extend Ta to include Lorentz indexes
 	RealD factor = 0.5*beta/RealD(Nc);
--- a/lib/qcd/utils/WilsonLoops.h
+++ b/lib/qcd/utils/WilsonLoops.h
@@ -7,7 +7,6 @@ namespace QCD {
 template<class GaugeMat,class GaugeLorentz>
 class WilsonLoops {
 public:
-
  //////////////////////////////////////////////////
  // directed plaquette oriented in mu,nu plane
  //////////////////////////////////////////////////
--- a/lib/simd/Grid_neon.h
+++ b/lib/simd/Grid_neon.h
@@ -1,14 +1,16 @@
 //----------------------------------------------------------------------
 /*! @file Grid_sse4.h
-  @brief Optimization libraries for NEON (ARM) instructions set ARMv7
+  @brief Optimization libraries for NEON (ARM) instructions set ARMv8

  Experimental - Using intrinsics - DEVELOPING! 
 */
-// Time-stamp: <2015-06-09 15:25:40 neo>
+// Time-stamp: <2015-07-10 17:45:09 neo>
 //----------------------------------------------------------------------

 #include <arm_neon.h>

+// ARMv8 supports double precision
+
 namespace Optimization {

  template<class vtype>
@@ -22,50 +24,47 @@ namespace Optimization {
    float f[4];
  };
  union u128d {
-    float32x4_t v;
-    float f[4];
+    float64x2_t v;
+    double f[4];
  };
  
  struct Vsplat{
    //Complex float
    inline float32x4_t operator()(float a, float b){
-      float32x4_t foo;
-      return foo;
+      float tmp[4]={a,b,a,b};
+      return vld1q_f32(tmp);
    }
    // Real float
    inline float32x4_t operator()(float a){
-      float32x4_t foo;
-      return foo;
+      return vld1q_dup_f32(&a);
    }
    //Complex double
    inline float32x4_t operator()(double a, double b){
-      float32x4_t foo;
-      return foo;
+      float tmp[4]={(float)a,(float)b,(float)a,(float)b};
+      return vld1q_f32(tmp);
    }
    //Real double
    inline float32x4_t operator()(double a){
-      float32x4_t foo;
-      return foo;
+      return vld1q_dup_f32(&a);
    }
    //Integer
    inline uint32x4_t operator()(Integer a){
-      uint32x4_t foo;
-      return foo;
+      return vld1q_dup_u32(&a);
    }
  };

  struct Vstore{
    //Float 
    inline void operator()(float32x4_t a, float* F){
-      
+      vst1q_f32(F, a);
    }
    //Double
    inline void operator()(float32x4_t a, double* D){
-      
+      vst1q_f32((float*)D, a);
    }
    //Integer
    inline void operator()(uint32x4_t a, Integer* I){
-     
+      vst1q_u32(I, a);
    }

  };
@@ -130,36 +129,30 @@ namespace Optimization {
  struct Sum{
    //Complex/Real float
    inline float32x4_t operator()(float32x4_t a, float32x4_t b){
-      float32x4_t foo;
-      return foo;
+      return vaddq_f32(a,b);
    }
    //Complex/Real double
-    //inline float32x4_t operator()(float32x4_t a, float32x4_t b){
-    //  float32x4_t foo;
-    //  return foo;
-    //}
+    inline float64x2_t operator()(float64x2_t a, float64x2_t b){
+      return vaddq_f64(a,b);
+    }
    //Integer
    inline uint32x4_t operator()(uint32x4_t a, uint32x4_t b){
-      uint32x4_t foo;
-      return foo;
+      return vaddq_u32(a,b);
    }
  };

  struct Sub{
    //Complex/Real float
    inline float32x4_t operator()(float32x4_t a, float32x4_t b){
-      float32x4_t foo;
-      return foo;
+      return vsubq_f32(a,b);
    }
    //Complex/Real double
-    //inline float32x4_t operator()(float32x4_t a, float32x4_t b){
-    //  float32x4_t foo;
-    //  return foo;
-    //}
+    inline float64x2_t operator()(float64x2_t a, float64x2_t b){
+      return vsubq_f64(a,b);
+    }
    //Integer
    inline uint32x4_t operator()(uint32x4_t a, uint32x4_t b){
-      uint32x4_t foo;
-      return foo;
+      return vsubq_u32(a,b);
    }
  };

@@ -170,24 +163,24 @@ namespace Optimization {
      return foo;
    }
    // Complex double
-    //inline float32x4_t operator()(float32x4_t a, float32x4_t b){
-    //  float32x4_t foo;
-    //  return foo;
-    //}
+    inline float64x2_t operator()(float64x2_t a, float64x2_t b){
+      float32x4_t foo;
+      return foo;
+    }
  };

  struct Mult{
    // Real float
    inline float32x4_t operator()(float32x4_t a, float32x4_t b){
-      return a;
+      return vmulq_f32(a,b);
    }
    // Real double
-    //inline float32x4_t operator()(float32x4_t a, float32x4_t b){
-    //  return 0;
-    //}
+    inline float64x2_t operator()(float64x2_t a, float64x2_t b){
+      return vmulq_f64(a,b);
+    }
    // Integer
    inline uint32x4_t operator()(uint32x4_t a, uint32x4_t b){
-      return a;
+      return vmulq_u32(a,b);
    }
  };

@@ -219,6 +212,7 @@ namespace Optimization {
  struct TimesI{
    //Complex single
    inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
+      //need shuffle
      return in;
    }
    //Complex double
@@ -242,20 +236,25 @@ namespace Optimization {
  //Real float Reduce
  template<>
  inline Grid::RealF Reduce<Grid::RealF, float32x4_t>::operator()(float32x4_t in){
-    return 0;
+    float32x2_t high = vget_high_f32(in);
+    float32x2_t low = vget_low_f32(in);
+    float32x2_t tmp = vadd_f32(low, high);
+    float32x2_t sum = vpadd_f32(tmp, tmp);
+    return vget_lane_f32(sum,0);
  }
  
  
  //Complex double Reduce
  template<>
-  inline Grid::ComplexD Reduce<Grid::ComplexD, float32x4_t>::operator()(float32x4_t in){
+  inline Grid::ComplexD Reduce<Grid::ComplexD, float64x2_t>::operator()(float64x2_t in){
    return 0;
  }
  
  //Real double Reduce
  template<>
-  inline Grid::RealD Reduce<Grid::RealD, float32x4_t>::operator()(float32x4_t in){
-    return 0;
+  inline Grid::RealD Reduce<Grid::RealD, float64x2_t>::operator()(float64x2_t in){
+    float64x2_t sum = vpaddq_f64(in, in);
+    return vgetq_lane_f64(sum,0);
  }

  //Integer Reduce
@@ -272,7 +271,7 @@ namespace Optimization {
 namespace Grid {

  typedef float32x4_t  SIMD_Ftype; // Single precision type
-  typedef float32x4_t  SIMD_Dtype; // Double precision type - no double on ARMv7
+  typedef float64x2_t  SIMD_Dtype; // Double precision type
  typedef uint32x4_t   SIMD_Itype; // Integer type

  inline void v_prefetch0(int size, const char *ptr){};  // prefetch utilities
--- a/lib/simd/Grid_vector_types.h
+++ b/lib/simd/Grid_vector_types.h
@@ -2,7 +2,7 @@
 /*! @file Grid_vector_types.h
  @brief Defines templated class Grid_simd to deal with inner vector types
 */
-// Time-stamp: <2015-06-09 15:00:47 neo>
+// Time-stamp: <2015-07-10 17:45:33 neo>
 //---------------------------------------------------------------------------
 #ifndef GRID_VECTOR_TYPES
 #define GRID_VECTOR_TYPES
@@ -22,7 +22,7 @@
 #if defined QPX
 #include "Grid_qpx.h"
 #endif
-#ifdef NEONv7
+#ifdef NEONv8
 #include "Grid_neon.h"
 #endif

--- a/scripts/arm_configure.experimental_cortex57
+++ b/scripts/arm_configure.experimental_cortex57
@@ -0,0 +1,3 @@
+#./configure --host=arm-linux-gnueabihf  CXX=clang++-3.5 CXXFLAGS='-std=c++11 -O3 -target arm-linux-gnueabihf -I/usr/arm-linux-gnueabihf/include/ -I/home/neo/Codes/gmp6.0/gmp-arm/include/ -I/usr/lib/llvm-3.5/lib/clang/3.5.0/include/ -L/home/neo/Codes/gmp6.0/gmp-arm/lib/ -I/home/neo/Codes/mpfr3.1.2/mpfr-arm/include/ -L/home/neo/Codes/mpfr3.1.2/mpfr-arm/lib/ -static -mcpu=cortex-a57' --enable-simd=NEONv7
+
+./configure --host=aarch64-linux-gnu  CXX=clang++-3.5 CXXFLAGS='-std=c++11 -O3 -target aarch64-linux-gnu -static -I/home/neo/Codes/gmp6.0/gmp-armv8/include/ -L/home/neo/Codes/gmp6.0/gmp-armv8/lib/ -I/home/neo/Codes/mpfr3.1.2/mpfr-armv8/include/ -L/home/neo/Codes/mpfr3.1.2/mpfr-armv8/lib/ -I/usr/aarch64-linux-gnu/include/ -I/usr/aarch64-linux-gnu/include/c++/4.8.2/aarch64-linux-gnu/' --enable-simd=NEONv7
--- a/tests/Test_hmc_WilsonGauge.cc
+++ b/tests/Test_hmc_WilsonGauge.cc
@@ -24,13 +24,13 @@ int main (int argc, char ** argv)
  GridCartesian           Fine(latt_size,simd_layout,mpi_layout);
  GridParallelRNG  pRNG(&Fine);
  pRNG.SeedRandomDevice();
-  LatticeLorentzColourMatrix     U(&Fine);
+  LatticeGaugeField U(&Fine);

  SU3::HotConfiguration(pRNG, U);
 

  // simplify template declaration? Strip the lorentz from the second template
-  WilsonGaugeAction<LatticeLorentzColourMatrix, LatticeColourMatrix> Waction(6.0);
+  WilsonGaugeAction<LatticeGaugeField, LatticeColourMatrix> Waction(6.0);

  //Collect actions
  ActionLevel Level1;
--- a/tests/Test_simd.cc
+++ b/tests/Test_simd.cc
@@ -194,6 +194,31 @@ int main (int argc, char ** argv)
  // Insist that operations on random scalars gives
  // identical results to on vectors.

+  std::cout << "==================================="<<  std::endl;
+  std::cout << "Testing vRealF "<<std::endl;
+  std::cout << "==================================="<<  std::endl;
+
+
+  Tester<RealF,vRealF>(funcPlus());
+  Tester<RealF,vRealF>(funcMinus());
+  Tester<RealF,vRealF>(funcTimes());
+  Tester<RealF,vRealF>(funcAdj());
+  Tester<RealF,vRealF>(funcConj());
+  Tester<RealF,vRealF>(funcInnerProduct());
+  ReductionTester<RealF,RealF,vRealF>(funcReduce());
+
+  std::cout << "==================================="<<  std::endl;
+  std::cout << "Testing vRealD "<<std::endl;
+  std::cout << "==================================="<<  std::endl;
+
+  Tester<RealD,vRealD>(funcPlus());
+  Tester<RealD,vRealD>(funcMinus());
+  Tester<RealD,vRealD>(funcTimes());
+  Tester<RealD,vRealD>(funcAdj());
+  Tester<RealD,vRealD>(funcConj());
+  Tester<RealD,vRealD>(funcInnerProduct());
+  ReductionTester<RealD,RealD,vRealD>(funcReduce());
+
  std::cout << "==================================="<<  std::endl;
  std::cout << "Testing vComplexF "<<std::endl;
  std::cout << "==================================="<<  std::endl;
@@ -223,30 +248,6 @@ int main (int argc, char ** argv)
  Tester<ComplexD,vComplexD>(funcInnerProduct());
  ReductionTester<ComplexD,ComplexD,vComplexD>(funcReduce());

-  std::cout << "==================================="<<  std::endl;
-  std::cout << "Testing vRealF "<<std::endl;
-  std::cout << "==================================="<<  std::endl;
-
-
-  Tester<RealF,vRealF>(funcPlus());
-  Tester<RealF,vRealF>(funcMinus());
-  Tester<RealF,vRealF>(funcTimes());
-  Tester<RealF,vRealF>(funcAdj());
-  Tester<RealF,vRealF>(funcConj());
-  Tester<RealF,vRealF>(funcInnerProduct());
-  ReductionTester<RealF,RealF,vRealF>(funcReduce());
-
-  std::cout << "==================================="<<  std::endl;
-  std::cout << "Testing vRealD "<<std::endl;
-  std::cout << "==================================="<<  std::endl;
-
-  Tester<RealD,vRealD>(funcPlus());
-  Tester<RealD,vRealD>(funcMinus());
-  Tester<RealD,vRealD>(funcTimes());
-  Tester<RealD,vRealD>(funcAdj());
-  Tester<RealD,vRealD>(funcConj());
-  Tester<RealD,vRealD>(funcInnerProduct());
-  ReductionTester<RealD,RealD,vRealD>(funcReduce());

  Grid_finalize();
 }