From 9adaeb061a39aaea5f0c10545fbcd3207eff4474 Mon Sep 17 00:00:00 2001
From: neo <cossu@post.kek.jp>
Date: Tue, 21 Jul 2015 11:52:15 +0900
Subject: [PATCH] More NEON functionalities

---
 configure                                   |  6 +-
 configure.ac                                |  8 +-
 lib/GridConfig.h.in                         |  4 +-
 lib/qcd/action/gauge/WilsonGaugeAction.h    |  3 +-
 lib/qcd/utils/WilsonLoops.h                 |  1 -
 lib/simd/Grid_neon.h                        | 93 ++++++++++-----------
 lib/simd/Grid_vector_types.h                |  4 +-
 scripts/arm_configure.experimental_cortex57 |  3 +
 tests/Test_hmc_WilsonGauge.cc               |  4 +-
 tests/Test_simd.cc                          | 49 +++++------
 10 files changed, 88 insertions(+), 87 deletions(-)
 create mode 100644 scripts/arm_configure.experimental_cortex57
diff --git a/configure b/configure
index 935f1018..cc11a210 100755
--- a/configure
+++ b/configure
@@ -6712,10 +6712,10 @@ $as_echo "#define AVX512 1" >>confdefs.h
 
        supported="cross compilation"
      ;;
-     NEONv7)
-       echo Configuring for experimental ARMv7 support
+     NEONv8)
+       echo Configuring for experimental ARMv8a support
 
-$as_echo "#define NEONv7 1" >>confdefs.h
+$as_echo "#define NEONv8 1" >>confdefs.h
 
        supported="cross compilation"
      ;;
diff --git a/configure.ac b/configure.ac
index 5ef6fa8d..be01228c 100644
--- a/configure.ac
+++ b/configure.ac
@@ -3,7 +3,7 @@
 #
 # Project Grid package  
 # 
-# Time-stamp: <2015-06-09 15:26:39 neo>
+# Time-stamp: <2015-07-10 17:46:21 neo>
 
 AC_PREREQ([2.63])
 AC_INIT([Grid], [1.0], [paboyle@ph.ed.ac.uk])
@@ -106,9 +106,9 @@ case ${ac_SIMD} in
        AC_DEFINE([AVX512],[1],[AVX512 Intrinsics for Knights Corner] )
        supported="cross compilation"
      ;;
-     NEONv7)
-       echo Configuring for experimental ARMv7 support 
-       AC_DEFINE([NEONv7],[1],[NEON ARMv7 Experimental support ] )
+     NEONv8)
+       echo Configuring for experimental ARMv8a support 
+       AC_DEFINE([NEONv8],[1],[NEON ARMv8 Experimental support ] )
        supported="cross compilation"
      ;;
      DEBUG)
diff --git a/lib/GridConfig.h.in b/lib/GridConfig.h.in
index 1ec0ecf5..136d7bfb 100644
--- a/lib/GridConfig.h.in
+++ b/lib/GridConfig.h.in
@@ -119,8 +119,8 @@
 /* Define to 1 if you have the <unistd.h> header file. */
 #undef HAVE_UNISTD_H
 
-/* NEON ARMv7 Experimental support */
-#undef NEONv7
+/* NEON ARMv8 Experimental support */
+#undef NEONv8
 
 /* Name of package */
 #undef PACKAGE
diff --git a/lib/qcd/action/gauge/WilsonGaugeAction.h b/lib/qcd/action/gauge/WilsonGaugeAction.h
index aaedf2f9..ad1aecab 100644
--- a/lib/qcd/action/gauge/WilsonGaugeAction.h
+++ b/lib/qcd/action/gauge/WilsonGaugeAction.h
@@ -7,7 +7,7 @@ namespace Grid{
     ////////////////////////////////////////////////////////////////////////
     // Wilson Gauge Action .. should I template the Nc etc..
     ////////////////////////////////////////////////////////////////////////
-    template<class GaugeField,class MatrixField>
+    template<class GaugeField, class MatrixField>
       class WilsonGaugeAction : public Action<GaugeField> {
     private:
       RealD beta;
@@ -23,7 +23,6 @@ namespace Grid{
 	return beta*(1.0 -plaq)*(Nd*(Nd-1.0))*vol*0.5;
       };
       virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
-
 	//not optimal implementation FIXME
 	//extend Ta to include Lorentz indexes
 	RealD factor = 0.5*beta/RealD(Nc);
diff --git a/lib/qcd/utils/WilsonLoops.h b/lib/qcd/utils/WilsonLoops.h
index 9b8e8724..4565c8e8 100644
--- a/lib/qcd/utils/WilsonLoops.h
+++ b/lib/qcd/utils/WilsonLoops.h
@@ -7,7 +7,6 @@ namespace QCD {
 template<class GaugeMat,class GaugeLorentz>
 class WilsonLoops {
 public:
-
   //////////////////////////////////////////////////
   // directed plaquette oriented in mu,nu plane
   //////////////////////////////////////////////////
diff --git a/lib/simd/Grid_neon.h b/lib/simd/Grid_neon.h
index 73660b40..3d36ea95 100644
--- a/lib/simd/Grid_neon.h
+++ b/lib/simd/Grid_neon.h
@@ -1,14 +1,16 @@
 //----------------------------------------------------------------------
 /*! @file Grid_sse4.h
-  @brief Optimization libraries for NEON (ARM) instructions set ARMv7
+  @brief Optimization libraries for NEON (ARM) instructions set ARMv8
 
   Experimental - Using intrinsics - DEVELOPING! 
 */
-// Time-stamp: <2015-06-09 15:25:40 neo>
+// Time-stamp: <2015-07-10 17:45:09 neo>
 //----------------------------------------------------------------------
 
 #include <arm_neon.h>
 
+// ARMv8 supports double precision
+
 namespace Optimization {
 
   template<class vtype>
@@ -22,50 +24,47 @@ namespace Optimization {
     float f[4];
   };
   union u128d {
-    float32x4_t v;
-    float f[4];
+    float64x2_t v;
+    double f[4];
   };
   
   struct Vsplat{
     //Complex float
     inline float32x4_t operator()(float a, float b){
-      float32x4_t foo;
-      return foo;
+      float tmp[4]={a,b,a,b};
+      return vld1q_f32(tmp);
     }
     // Real float
     inline float32x4_t operator()(float a){
-      float32x4_t foo;
-      return foo;
+      return vld1q_dup_f32(&a);
     }
     //Complex double
     inline float32x4_t operator()(double a, double b){
-      float32x4_t foo;
-      return foo;
+      float tmp[4]={(float)a,(float)b,(float)a,(float)b};
+      return vld1q_f32(tmp);
     }
     //Real double
     inline float32x4_t operator()(double a){
-      float32x4_t foo;
-      return foo;
+      return vld1q_dup_f32(&a);
     }
     //Integer
     inline uint32x4_t operator()(Integer a){
-      uint32x4_t foo;
-      return foo;
+      return vld1q_dup_u32(&a);
     }
   };
 
   struct Vstore{
     //Float 
     inline void operator()(float32x4_t a, float* F){
-      
+      vst1q_f32(F, a);
     }
     //Double
     inline void operator()(float32x4_t a, double* D){
-      
+      vst1q_f32((float*)D, a);
     }
     //Integer
     inline void operator()(uint32x4_t a, Integer* I){
-     
+      vst1q_u32(I, a);
     }
 
   };
@@ -130,36 +129,30 @@ namespace Optimization {
   struct Sum{
     //Complex/Real float
     inline float32x4_t operator()(float32x4_t a, float32x4_t b){
-      float32x4_t foo;
-      return foo;
+      return vaddq_f32(a,b);
     }
     //Complex/Real double
-    //inline float32x4_t operator()(float32x4_t a, float32x4_t b){
-    //  float32x4_t foo;
-    //  return foo;
-    //}
+    inline float64x2_t operator()(float64x2_t a, float64x2_t b){
+      return vaddq_f64(a,b);
+    }
     //Integer
     inline uint32x4_t operator()(uint32x4_t a, uint32x4_t b){
-      uint32x4_t foo;
-      return foo;
+      return vaddq_u32(a,b);
     }
   };
 
   struct Sub{
     //Complex/Real float
     inline float32x4_t operator()(float32x4_t a, float32x4_t b){
-      float32x4_t foo;
-      return foo;
+      return vsubq_f32(a,b);
     }
     //Complex/Real double
-    //inline float32x4_t operator()(float32x4_t a, float32x4_t b){
-    //  float32x4_t foo;
-    //  return foo;
-    //}
+    inline float64x2_t operator()(float64x2_t a, float64x2_t b){
+      return vsubq_f64(a,b);
+    }
     //Integer
     inline uint32x4_t operator()(uint32x4_t a, uint32x4_t b){
-      uint32x4_t foo;
-      return foo;
+      return vsubq_u32(a,b);
     }
   };
 
@@ -170,24 +163,24 @@ namespace Optimization {
       return foo;
     }
     // Complex double
-    //inline float32x4_t operator()(float32x4_t a, float32x4_t b){
-    //  float32x4_t foo;
-    //  return foo;
-    //}
+    inline float64x2_t operator()(float64x2_t a, float64x2_t b){
+      float32x4_t foo;
+      return foo;
+    }
   };
 
   struct Mult{
     // Real float
     inline float32x4_t operator()(float32x4_t a, float32x4_t b){
-      return a;
+      return vmulq_f32(a,b);
     }
     // Real double
-    //inline float32x4_t operator()(float32x4_t a, float32x4_t b){
-    //  return 0;
-    //}
+    inline float64x2_t operator()(float64x2_t a, float64x2_t b){
+      return vmulq_f64(a,b);
+    }
     // Integer
     inline uint32x4_t operator()(uint32x4_t a, uint32x4_t b){
-      return a;
+      return vmulq_u32(a,b);
     }
   };
 
@@ -219,6 +212,7 @@ namespace Optimization {
   struct TimesI{
     //Complex single
     inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
+      //need shuffle
       return in;
     }
     //Complex double
@@ -242,20 +236,25 @@ namespace Optimization {
   //Real float Reduce
   template<>
   inline Grid::RealF Reduce<Grid::RealF, float32x4_t>::operator()(float32x4_t in){
-    return 0;
+    float32x2_t high = vget_high_f32(in);
+    float32x2_t low = vget_low_f32(in);
+    float32x2_t tmp = vadd_f32(low, high);
+    float32x2_t sum = vpadd_f32(tmp, tmp);
+    return vget_lane_f32(sum,0);
   }
   
   
   //Complex double Reduce
   template<>
-  inline Grid::ComplexD Reduce<Grid::ComplexD, float32x4_t>::operator()(float32x4_t in){
+  inline Grid::ComplexD Reduce<Grid::ComplexD, float64x2_t>::operator()(float64x2_t in){
     return 0;
   }
   
   //Real double Reduce
   template<>
-  inline Grid::RealD Reduce<Grid::RealD, float32x4_t>::operator()(float32x4_t in){
-    return 0;
+  inline Grid::RealD Reduce<Grid::RealD, float64x2_t>::operator()(float64x2_t in){
+    float64x2_t sum = vpaddq_f64(in, in);
+    return vgetq_lane_f64(sum,0);
   }
 
   //Integer Reduce
@@ -272,7 +271,7 @@ namespace Optimization {
 namespace Grid {
 
   typedef float32x4_t  SIMD_Ftype; // Single precision type
-  typedef float32x4_t  SIMD_Dtype; // Double precision type - no double on ARMv7
+  typedef float64x2_t  SIMD_Dtype; // Double precision type
   typedef uint32x4_t   SIMD_Itype; // Integer type
 
   inline void v_prefetch0(int size, const char *ptr){};  // prefetch utilities
diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h
index 034d5314..4a0ea33b 100644
--- a/lib/simd/Grid_vector_types.h
+++ b/lib/simd/Grid_vector_types.h
@@ -2,7 +2,7 @@
 /*! @file Grid_vector_types.h
   @brief Defines templated class Grid_simd to deal with inner vector types
 */
-// Time-stamp: <2015-06-09 15:00:47 neo>
+// Time-stamp: <2015-07-10 17:45:33 neo>
 //---------------------------------------------------------------------------
 #ifndef GRID_VECTOR_TYPES
 #define GRID_VECTOR_TYPES
@@ -22,7 +22,7 @@
 #if defined QPX
 #include "Grid_qpx.h"
 #endif
-#ifdef NEONv7
+#ifdef NEONv8
 #include "Grid_neon.h"
 #endif
 
diff --git a/scripts/arm_configure.experimental_cortex57 b/scripts/arm_configure.experimental_cortex57
new file mode 100644
index 00000000..d229763e
--- /dev/null
+++ b/scripts/arm_configure.experimental_cortex57
@@ -0,0 +1,3 @@
+#./configure --host=arm-linux-gnueabihf  CXX=clang++-3.5 CXXFLAGS='-std=c++11 -O3 -target arm-linux-gnueabihf -I/usr/arm-linux-gnueabihf/include/ -I/home/neo/Codes/gmp6.0/gmp-arm/include/ -I/usr/lib/llvm-3.5/lib/clang/3.5.0/include/ -L/home/neo/Codes/gmp6.0/gmp-arm/lib/ -I/home/neo/Codes/mpfr3.1.2/mpfr-arm/include/ -L/home/neo/Codes/mpfr3.1.2/mpfr-arm/lib/ -static -mcpu=cortex-a57' --enable-simd=NEONv7
+
+./configure --host=aarch64-linux-gnu  CXX=clang++-3.5 CXXFLAGS='-std=c++11 -O3 -target aarch64-linux-gnu -static -I/home/neo/Codes/gmp6.0/gmp-armv8/include/ -L/home/neo/Codes/gmp6.0/gmp-armv8/lib/ -I/home/neo/Codes/mpfr3.1.2/mpfr-armv8/include/ -L/home/neo/Codes/mpfr3.1.2/mpfr-armv8/lib/ -I/usr/aarch64-linux-gnu/include/ -I/usr/aarch64-linux-gnu/include/c++/4.8.2/aarch64-linux-gnu/' --enable-simd=NEONv7
diff --git a/tests/Test_hmc_WilsonGauge.cc b/tests/Test_hmc_WilsonGauge.cc
index d39ec9b9..7682581d 100644
--- a/tests/Test_hmc_WilsonGauge.cc
+++ b/tests/Test_hmc_WilsonGauge.cc
@@ -24,13 +24,13 @@ int main (int argc, char ** argv)
   GridCartesian           Fine(latt_size,simd_layout,mpi_layout);
   GridParallelRNG  pRNG(&Fine);
   pRNG.SeedRandomDevice();
-  LatticeLorentzColourMatrix     U(&Fine);
+  LatticeGaugeField U(&Fine);
 
   SU3::HotConfiguration(pRNG, U);
  
 
   // simplify template declaration? Strip the lorentz from the second template
-  WilsonGaugeAction<LatticeLorentzColourMatrix, LatticeColourMatrix> Waction(6.0);
+  WilsonGaugeAction<LatticeGaugeField, LatticeColourMatrix> Waction(6.0);
 
   //Collect actions
   ActionLevel Level1;
diff --git a/tests/Test_simd.cc b/tests/Test_simd.cc
index 800001be..ee50a312 100644
--- a/tests/Test_simd.cc
+++ b/tests/Test_simd.cc
@@ -194,6 +194,31 @@ int main (int argc, char ** argv)
   // Insist that operations on random scalars gives
   // identical results to on vectors.
 
+  std::cout << "==================================="<<  std::endl;
+  std::cout << "Testing vRealF "<<std::endl;
+  std::cout << "==================================="<<  std::endl;
+
+
+  Tester<RealF,vRealF>(funcPlus());
+  Tester<RealF,vRealF>(funcMinus());
+  Tester<RealF,vRealF>(funcTimes());
+  Tester<RealF,vRealF>(funcAdj());
+  Tester<RealF,vRealF>(funcConj());
+  Tester<RealF,vRealF>(funcInnerProduct());
+  ReductionTester<RealF,RealF,vRealF>(funcReduce());
+
+  std::cout << "==================================="<<  std::endl;
+  std::cout << "Testing vRealD "<<std::endl;
+  std::cout << "==================================="<<  std::endl;
+
+  Tester<RealD,vRealD>(funcPlus());
+  Tester<RealD,vRealD>(funcMinus());
+  Tester<RealD,vRealD>(funcTimes());
+  Tester<RealD,vRealD>(funcAdj());
+  Tester<RealD,vRealD>(funcConj());
+  Tester<RealD,vRealD>(funcInnerProduct());
+  ReductionTester<RealD,RealD,vRealD>(funcReduce());
+
   std::cout << "==================================="<<  std::endl;
   std::cout << "Testing vComplexF "<<std::endl;
   std::cout << "==================================="<<  std::endl;
@@ -223,30 +248,6 @@ int main (int argc, char ** argv)
   Tester<ComplexD,vComplexD>(funcInnerProduct());
   ReductionTester<ComplexD,ComplexD,vComplexD>(funcReduce());
 
-  std::cout << "==================================="<<  std::endl;
-  std::cout << "Testing vRealF "<<std::endl;
-  std::cout << "==================================="<<  std::endl;
-
-
-  Tester<RealF,vRealF>(funcPlus());
-  Tester<RealF,vRealF>(funcMinus());
-  Tester<RealF,vRealF>(funcTimes());
-  Tester<RealF,vRealF>(funcAdj());
-  Tester<RealF,vRealF>(funcConj());
-  Tester<RealF,vRealF>(funcInnerProduct());
-  ReductionTester<RealF,RealF,vRealF>(funcReduce());
-
-  std::cout << "==================================="<<  std::endl;
-  std::cout << "Testing vRealD "<<std::endl;
-  std::cout << "==================================="<<  std::endl;
-
-  Tester<RealD,vRealD>(funcPlus());
-  Tester<RealD,vRealD>(funcMinus());
-  Tester<RealD,vRealD>(funcTimes());
-  Tester<RealD,vRealD>(funcAdj());
-  Tester<RealD,vRealD>(funcConj());
-  Tester<RealD,vRealD>(funcInnerProduct());
-  ReductionTester<RealD,RealD,vRealD>(funcReduce());
 
   Grid_finalize();
 }