mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-11-03 21:44:33 +00:00 
			
		
		
		
	More NEON functionalities
This commit is contained in:
		
							
								
								
									
										6
									
								
								configure
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										6
									
								
								configure
									
									
									
									
										vendored
									
									
								
							@@ -6712,10 +6712,10 @@ $as_echo "#define AVX512 1" >>confdefs.h
 | 
			
		||||
 | 
			
		||||
       supported="cross compilation"
 | 
			
		||||
     ;;
 | 
			
		||||
     NEONv7)
 | 
			
		||||
       echo Configuring for experimental ARMv7 support
 | 
			
		||||
     NEONv8)
 | 
			
		||||
       echo Configuring for experimental ARMv8a support
 | 
			
		||||
 | 
			
		||||
$as_echo "#define NEONv7 1" >>confdefs.h
 | 
			
		||||
$as_echo "#define NEONv8 1" >>confdefs.h
 | 
			
		||||
 | 
			
		||||
       supported="cross compilation"
 | 
			
		||||
     ;;
 | 
			
		||||
 
 | 
			
		||||
@@ -3,7 +3,7 @@
 | 
			
		||||
#
 | 
			
		||||
# Project Grid package  
 | 
			
		||||
# 
 | 
			
		||||
# Time-stamp: <2015-06-09 15:26:39 neo>
 | 
			
		||||
# Time-stamp: <2015-07-10 17:46:21 neo>
 | 
			
		||||
 | 
			
		||||
AC_PREREQ([2.63])
 | 
			
		||||
AC_INIT([Grid], [1.0], [paboyle@ph.ed.ac.uk])
 | 
			
		||||
@@ -106,9 +106,9 @@ case ${ac_SIMD} in
 | 
			
		||||
       AC_DEFINE([AVX512],[1],[AVX512 Intrinsics for Knights Corner] )
 | 
			
		||||
       supported="cross compilation"
 | 
			
		||||
     ;;
 | 
			
		||||
     NEONv7)
 | 
			
		||||
       echo Configuring for experimental ARMv7 support 
 | 
			
		||||
       AC_DEFINE([NEONv7],[1],[NEON ARMv7 Experimental support ] )
 | 
			
		||||
     NEONv8)
 | 
			
		||||
       echo Configuring for experimental ARMv8a support 
 | 
			
		||||
       AC_DEFINE([NEONv8],[1],[NEON ARMv8 Experimental support ] )
 | 
			
		||||
       supported="cross compilation"
 | 
			
		||||
     ;;
 | 
			
		||||
     DEBUG)
 | 
			
		||||
 
 | 
			
		||||
@@ -119,8 +119,8 @@
 | 
			
		||||
/* Define to 1 if you have the <unistd.h> header file. */
 | 
			
		||||
#undef HAVE_UNISTD_H
 | 
			
		||||
 | 
			
		||||
/* NEON ARMv7 Experimental support */
 | 
			
		||||
#undef NEONv7
 | 
			
		||||
/* NEON ARMv8 Experimental support */
 | 
			
		||||
#undef NEONv8
 | 
			
		||||
 | 
			
		||||
/* Name of package */
 | 
			
		||||
#undef PACKAGE
 | 
			
		||||
 
 | 
			
		||||
@@ -7,7 +7,7 @@ namespace Grid{
 | 
			
		||||
    ////////////////////////////////////////////////////////////////////////
 | 
			
		||||
    // Wilson Gauge Action .. should I template the Nc etc..
 | 
			
		||||
    ////////////////////////////////////////////////////////////////////////
 | 
			
		||||
    template<class GaugeField,class MatrixField>
 | 
			
		||||
    template<class GaugeField, class MatrixField>
 | 
			
		||||
      class WilsonGaugeAction : public Action<GaugeField> {
 | 
			
		||||
    private:
 | 
			
		||||
      RealD beta;
 | 
			
		||||
@@ -23,7 +23,6 @@ namespace Grid{
 | 
			
		||||
	return beta*(1.0 -plaq)*(Nd*(Nd-1.0))*vol*0.5;
 | 
			
		||||
      };
 | 
			
		||||
      virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
 | 
			
		||||
 | 
			
		||||
	//not optimal implementation FIXME
 | 
			
		||||
	//extend Ta to include Lorentz indexes
 | 
			
		||||
	RealD factor = 0.5*beta/RealD(Nc);
 | 
			
		||||
 
 | 
			
		||||
@@ -7,7 +7,6 @@ namespace QCD {
 | 
			
		||||
template<class GaugeMat,class GaugeLorentz>
 | 
			
		||||
class WilsonLoops {
 | 
			
		||||
public:
 | 
			
		||||
 | 
			
		||||
  //////////////////////////////////////////////////
 | 
			
		||||
  // directed plaquette oriented in mu,nu plane
 | 
			
		||||
  //////////////////////////////////////////////////
 | 
			
		||||
 
 | 
			
		||||
@@ -1,14 +1,16 @@
 | 
			
		||||
//----------------------------------------------------------------------
 | 
			
		||||
/*! @file Grid_sse4.h
 | 
			
		||||
  @brief Optimization libraries for NEON (ARM) instructions set ARMv7
 | 
			
		||||
  @brief Optimization libraries for NEON (ARM) instructions set ARMv8
 | 
			
		||||
 | 
			
		||||
  Experimental - Using intrinsics - DEVELOPING! 
 | 
			
		||||
*/
 | 
			
		||||
// Time-stamp: <2015-06-09 15:25:40 neo>
 | 
			
		||||
// Time-stamp: <2015-07-10 17:45:09 neo>
 | 
			
		||||
//----------------------------------------------------------------------
 | 
			
		||||
 | 
			
		||||
#include <arm_neon.h>
 | 
			
		||||
 | 
			
		||||
// ARMv8 supports double precision
 | 
			
		||||
 | 
			
		||||
namespace Optimization {
 | 
			
		||||
 | 
			
		||||
  template<class vtype>
 | 
			
		||||
@@ -22,50 +24,47 @@ namespace Optimization {
 | 
			
		||||
    float f[4];
 | 
			
		||||
  };
 | 
			
		||||
  union u128d {
 | 
			
		||||
    float32x4_t v;
 | 
			
		||||
    float f[4];
 | 
			
		||||
    float64x2_t v;
 | 
			
		||||
    double f[4];
 | 
			
		||||
  };
 | 
			
		||||
  
 | 
			
		||||
  struct Vsplat{
 | 
			
		||||
    //Complex float
 | 
			
		||||
    inline float32x4_t operator()(float a, float b){
 | 
			
		||||
      float32x4_t foo;
 | 
			
		||||
      return foo;
 | 
			
		||||
      float tmp[4]={a,b,a,b};
 | 
			
		||||
      return vld1q_f32(tmp);
 | 
			
		||||
    }
 | 
			
		||||
    // Real float
 | 
			
		||||
    inline float32x4_t operator()(float a){
 | 
			
		||||
      float32x4_t foo;
 | 
			
		||||
      return foo;
 | 
			
		||||
      return vld1q_dup_f32(&a);
 | 
			
		||||
    }
 | 
			
		||||
    //Complex double
 | 
			
		||||
    inline float32x4_t operator()(double a, double b){
 | 
			
		||||
      float32x4_t foo;
 | 
			
		||||
      return foo;
 | 
			
		||||
      float tmp[4]={(float)a,(float)b,(float)a,(float)b};
 | 
			
		||||
      return vld1q_f32(tmp);
 | 
			
		||||
    }
 | 
			
		||||
    //Real double
 | 
			
		||||
    inline float32x4_t operator()(double a){
 | 
			
		||||
      float32x4_t foo;
 | 
			
		||||
      return foo;
 | 
			
		||||
      return vld1q_dup_f32(&a);
 | 
			
		||||
    }
 | 
			
		||||
    //Integer
 | 
			
		||||
    inline uint32x4_t operator()(Integer a){
 | 
			
		||||
      uint32x4_t foo;
 | 
			
		||||
      return foo;
 | 
			
		||||
      return vld1q_dup_u32(&a);
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  struct Vstore{
 | 
			
		||||
    //Float 
 | 
			
		||||
    inline void operator()(float32x4_t a, float* F){
 | 
			
		||||
      
 | 
			
		||||
      vst1q_f32(F, a);
 | 
			
		||||
    }
 | 
			
		||||
    //Double
 | 
			
		||||
    inline void operator()(float32x4_t a, double* D){
 | 
			
		||||
      
 | 
			
		||||
      vst1q_f32((float*)D, a);
 | 
			
		||||
    }
 | 
			
		||||
    //Integer
 | 
			
		||||
    inline void operator()(uint32x4_t a, Integer* I){
 | 
			
		||||
     
 | 
			
		||||
      vst1q_u32(I, a);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
  };
 | 
			
		||||
@@ -130,36 +129,30 @@ namespace Optimization {
 | 
			
		||||
  struct Sum{
 | 
			
		||||
    //Complex/Real float
 | 
			
		||||
    inline float32x4_t operator()(float32x4_t a, float32x4_t b){
 | 
			
		||||
      float32x4_t foo;
 | 
			
		||||
      return foo;
 | 
			
		||||
      return vaddq_f32(a,b);
 | 
			
		||||
    }
 | 
			
		||||
    //Complex/Real double
 | 
			
		||||
    //inline float32x4_t operator()(float32x4_t a, float32x4_t b){
 | 
			
		||||
    //  float32x4_t foo;
 | 
			
		||||
    //  return foo;
 | 
			
		||||
    //}
 | 
			
		||||
    inline float64x2_t operator()(float64x2_t a, float64x2_t b){
 | 
			
		||||
      return vaddq_f64(a,b);
 | 
			
		||||
    }
 | 
			
		||||
    //Integer
 | 
			
		||||
    inline uint32x4_t operator()(uint32x4_t a, uint32x4_t b){
 | 
			
		||||
      uint32x4_t foo;
 | 
			
		||||
      return foo;
 | 
			
		||||
      return vaddq_u32(a,b);
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  struct Sub{
 | 
			
		||||
    //Complex/Real float
 | 
			
		||||
    inline float32x4_t operator()(float32x4_t a, float32x4_t b){
 | 
			
		||||
      float32x4_t foo;
 | 
			
		||||
      return foo;
 | 
			
		||||
      return vsubq_f32(a,b);
 | 
			
		||||
    }
 | 
			
		||||
    //Complex/Real double
 | 
			
		||||
    //inline float32x4_t operator()(float32x4_t a, float32x4_t b){
 | 
			
		||||
    //  float32x4_t foo;
 | 
			
		||||
    //  return foo;
 | 
			
		||||
    //}
 | 
			
		||||
    inline float64x2_t operator()(float64x2_t a, float64x2_t b){
 | 
			
		||||
      return vsubq_f64(a,b);
 | 
			
		||||
    }
 | 
			
		||||
    //Integer
 | 
			
		||||
    inline uint32x4_t operator()(uint32x4_t a, uint32x4_t b){
 | 
			
		||||
      uint32x4_t foo;
 | 
			
		||||
      return foo;
 | 
			
		||||
      return vsubq_u32(a,b);
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
@@ -170,24 +163,24 @@ namespace Optimization {
 | 
			
		||||
      return foo;
 | 
			
		||||
    }
 | 
			
		||||
    // Complex double
 | 
			
		||||
    //inline float32x4_t operator()(float32x4_t a, float32x4_t b){
 | 
			
		||||
    //  float32x4_t foo;
 | 
			
		||||
    //  return foo;
 | 
			
		||||
    //}
 | 
			
		||||
    inline float64x2_t operator()(float64x2_t a, float64x2_t b){
 | 
			
		||||
      float32x4_t foo;
 | 
			
		||||
      return foo;
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  struct Mult{
 | 
			
		||||
    // Real float
 | 
			
		||||
    inline float32x4_t operator()(float32x4_t a, float32x4_t b){
 | 
			
		||||
      return a;
 | 
			
		||||
      return vmulq_f32(a,b);
 | 
			
		||||
    }
 | 
			
		||||
    // Real double
 | 
			
		||||
    //inline float32x4_t operator()(float32x4_t a, float32x4_t b){
 | 
			
		||||
    //  return 0;
 | 
			
		||||
    //}
 | 
			
		||||
    inline float64x2_t operator()(float64x2_t a, float64x2_t b){
 | 
			
		||||
      return vmulq_f64(a,b);
 | 
			
		||||
    }
 | 
			
		||||
    // Integer
 | 
			
		||||
    inline uint32x4_t operator()(uint32x4_t a, uint32x4_t b){
 | 
			
		||||
      return a;
 | 
			
		||||
      return vmulq_u32(a,b);
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
@@ -219,6 +212,7 @@ namespace Optimization {
 | 
			
		||||
  struct TimesI{
 | 
			
		||||
    //Complex single
 | 
			
		||||
    inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
 | 
			
		||||
      //need shuffle
 | 
			
		||||
      return in;
 | 
			
		||||
    }
 | 
			
		||||
    //Complex double
 | 
			
		||||
@@ -242,20 +236,25 @@ namespace Optimization {
 | 
			
		||||
  //Real float Reduce
 | 
			
		||||
  template<>
 | 
			
		||||
  inline Grid::RealF Reduce<Grid::RealF, float32x4_t>::operator()(float32x4_t in){
 | 
			
		||||
    return 0;
 | 
			
		||||
    float32x2_t high = vget_high_f32(in);
 | 
			
		||||
    float32x2_t low = vget_low_f32(in);
 | 
			
		||||
    float32x2_t tmp = vadd_f32(low, high);
 | 
			
		||||
    float32x2_t sum = vpadd_f32(tmp, tmp);
 | 
			
		||||
    return vget_lane_f32(sum,0);
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  
 | 
			
		||||
  //Complex double Reduce
 | 
			
		||||
  template<>
 | 
			
		||||
  inline Grid::ComplexD Reduce<Grid::ComplexD, float32x4_t>::operator()(float32x4_t in){
 | 
			
		||||
  inline Grid::ComplexD Reduce<Grid::ComplexD, float64x2_t>::operator()(float64x2_t in){
 | 
			
		||||
    return 0;
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  //Real double Reduce
 | 
			
		||||
  template<>
 | 
			
		||||
  inline Grid::RealD Reduce<Grid::RealD, float32x4_t>::operator()(float32x4_t in){
 | 
			
		||||
    return 0;
 | 
			
		||||
  inline Grid::RealD Reduce<Grid::RealD, float64x2_t>::operator()(float64x2_t in){
 | 
			
		||||
    float64x2_t sum = vpaddq_f64(in, in);
 | 
			
		||||
    return vgetq_lane_f64(sum,0);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  //Integer Reduce
 | 
			
		||||
@@ -272,7 +271,7 @@ namespace Optimization {
 | 
			
		||||
namespace Grid {
 | 
			
		||||
 | 
			
		||||
  typedef float32x4_t  SIMD_Ftype; // Single precision type
 | 
			
		||||
  typedef float32x4_t  SIMD_Dtype; // Double precision type - no double on ARMv7
 | 
			
		||||
  typedef float64x2_t  SIMD_Dtype; // Double precision type
 | 
			
		||||
  typedef uint32x4_t   SIMD_Itype; // Integer type
 | 
			
		||||
 | 
			
		||||
  inline void v_prefetch0(int size, const char *ptr){};  // prefetch utilities
 | 
			
		||||
 
 | 
			
		||||
@@ -2,7 +2,7 @@
 | 
			
		||||
/*! @file Grid_vector_types.h
 | 
			
		||||
  @brief Defines templated class Grid_simd to deal with inner vector types
 | 
			
		||||
*/
 | 
			
		||||
// Time-stamp: <2015-06-09 15:00:47 neo>
 | 
			
		||||
// Time-stamp: <2015-07-10 17:45:33 neo>
 | 
			
		||||
//---------------------------------------------------------------------------
 | 
			
		||||
#ifndef GRID_VECTOR_TYPES
 | 
			
		||||
#define GRID_VECTOR_TYPES
 | 
			
		||||
@@ -22,7 +22,7 @@
 | 
			
		||||
#if defined QPX
 | 
			
		||||
#include "Grid_qpx.h"
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef NEONv7
 | 
			
		||||
#ifdef NEONv8
 | 
			
		||||
#include "Grid_neon.h"
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										3
									
								
								scripts/arm_configure.experimental_cortex57
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										3
									
								
								scripts/arm_configure.experimental_cortex57
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,3 @@
 | 
			
		||||
#./configure --host=arm-linux-gnueabihf  CXX=clang++-3.5 CXXFLAGS='-std=c++11 -O3 -target arm-linux-gnueabihf -I/usr/arm-linux-gnueabihf/include/ -I/home/neo/Codes/gmp6.0/gmp-arm/include/ -I/usr/lib/llvm-3.5/lib/clang/3.5.0/include/ -L/home/neo/Codes/gmp6.0/gmp-arm/lib/ -I/home/neo/Codes/mpfr3.1.2/mpfr-arm/include/ -L/home/neo/Codes/mpfr3.1.2/mpfr-arm/lib/ -static -mcpu=cortex-a57' --enable-simd=NEONv7
 | 
			
		||||
 | 
			
		||||
./configure --host=aarch64-linux-gnu  CXX=clang++-3.5 CXXFLAGS='-std=c++11 -O3 -target aarch64-linux-gnu -static -I/home/neo/Codes/gmp6.0/gmp-armv8/include/ -L/home/neo/Codes/gmp6.0/gmp-armv8/lib/ -I/home/neo/Codes/mpfr3.1.2/mpfr-armv8/include/ -L/home/neo/Codes/mpfr3.1.2/mpfr-armv8/lib/ -I/usr/aarch64-linux-gnu/include/ -I/usr/aarch64-linux-gnu/include/c++/4.8.2/aarch64-linux-gnu/' --enable-simd=NEONv7
 | 
			
		||||
@@ -24,13 +24,13 @@ int main (int argc, char ** argv)
 | 
			
		||||
  GridCartesian           Fine(latt_size,simd_layout,mpi_layout);
 | 
			
		||||
  GridParallelRNG  pRNG(&Fine);
 | 
			
		||||
  pRNG.SeedRandomDevice();
 | 
			
		||||
  LatticeLorentzColourMatrix     U(&Fine);
 | 
			
		||||
  LatticeGaugeField U(&Fine);
 | 
			
		||||
 | 
			
		||||
  SU3::HotConfiguration(pRNG, U);
 | 
			
		||||
 
 | 
			
		||||
 | 
			
		||||
  // simplify template declaration? Strip the lorentz from the second template
 | 
			
		||||
  WilsonGaugeAction<LatticeLorentzColourMatrix, LatticeColourMatrix> Waction(6.0);
 | 
			
		||||
  WilsonGaugeAction<LatticeGaugeField, LatticeColourMatrix> Waction(6.0);
 | 
			
		||||
 | 
			
		||||
  //Collect actions
 | 
			
		||||
  ActionLevel Level1;
 | 
			
		||||
 
 | 
			
		||||
@@ -194,6 +194,31 @@ int main (int argc, char ** argv)
 | 
			
		||||
  // Insist that operations on random scalars gives
 | 
			
		||||
  // identical results to on vectors.
 | 
			
		||||
 | 
			
		||||
  std::cout << "==================================="<<  std::endl;
 | 
			
		||||
  std::cout << "Testing vRealF "<<std::endl;
 | 
			
		||||
  std::cout << "==================================="<<  std::endl;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  Tester<RealF,vRealF>(funcPlus());
 | 
			
		||||
  Tester<RealF,vRealF>(funcMinus());
 | 
			
		||||
  Tester<RealF,vRealF>(funcTimes());
 | 
			
		||||
  Tester<RealF,vRealF>(funcAdj());
 | 
			
		||||
  Tester<RealF,vRealF>(funcConj());
 | 
			
		||||
  Tester<RealF,vRealF>(funcInnerProduct());
 | 
			
		||||
  ReductionTester<RealF,RealF,vRealF>(funcReduce());
 | 
			
		||||
 | 
			
		||||
  std::cout << "==================================="<<  std::endl;
 | 
			
		||||
  std::cout << "Testing vRealD "<<std::endl;
 | 
			
		||||
  std::cout << "==================================="<<  std::endl;
 | 
			
		||||
 | 
			
		||||
  Tester<RealD,vRealD>(funcPlus());
 | 
			
		||||
  Tester<RealD,vRealD>(funcMinus());
 | 
			
		||||
  Tester<RealD,vRealD>(funcTimes());
 | 
			
		||||
  Tester<RealD,vRealD>(funcAdj());
 | 
			
		||||
  Tester<RealD,vRealD>(funcConj());
 | 
			
		||||
  Tester<RealD,vRealD>(funcInnerProduct());
 | 
			
		||||
  ReductionTester<RealD,RealD,vRealD>(funcReduce());
 | 
			
		||||
 | 
			
		||||
  std::cout << "==================================="<<  std::endl;
 | 
			
		||||
  std::cout << "Testing vComplexF "<<std::endl;
 | 
			
		||||
  std::cout << "==================================="<<  std::endl;
 | 
			
		||||
@@ -223,30 +248,6 @@ int main (int argc, char ** argv)
 | 
			
		||||
  Tester<ComplexD,vComplexD>(funcInnerProduct());
 | 
			
		||||
  ReductionTester<ComplexD,ComplexD,vComplexD>(funcReduce());
 | 
			
		||||
 | 
			
		||||
  std::cout << "==================================="<<  std::endl;
 | 
			
		||||
  std::cout << "Testing vRealF "<<std::endl;
 | 
			
		||||
  std::cout << "==================================="<<  std::endl;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  Tester<RealF,vRealF>(funcPlus());
 | 
			
		||||
  Tester<RealF,vRealF>(funcMinus());
 | 
			
		||||
  Tester<RealF,vRealF>(funcTimes());
 | 
			
		||||
  Tester<RealF,vRealF>(funcAdj());
 | 
			
		||||
  Tester<RealF,vRealF>(funcConj());
 | 
			
		||||
  Tester<RealF,vRealF>(funcInnerProduct());
 | 
			
		||||
  ReductionTester<RealF,RealF,vRealF>(funcReduce());
 | 
			
		||||
 | 
			
		||||
  std::cout << "==================================="<<  std::endl;
 | 
			
		||||
  std::cout << "Testing vRealD "<<std::endl;
 | 
			
		||||
  std::cout << "==================================="<<  std::endl;
 | 
			
		||||
 | 
			
		||||
  Tester<RealD,vRealD>(funcPlus());
 | 
			
		||||
  Tester<RealD,vRealD>(funcMinus());
 | 
			
		||||
  Tester<RealD,vRealD>(funcTimes());
 | 
			
		||||
  Tester<RealD,vRealD>(funcAdj());
 | 
			
		||||
  Tester<RealD,vRealD>(funcConj());
 | 
			
		||||
  Tester<RealD,vRealD>(funcInnerProduct());
 | 
			
		||||
  ReductionTester<RealD,RealD,vRealD>(funcReduce());
 | 
			
		||||
 | 
			
		||||
  Grid_finalize();
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user