mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-10 07:55:35 +00:00
More NEON functionalities
This commit is contained in:
parent
c431816393
commit
ab916d80fd
6
configure
vendored
6
configure
vendored
@ -6712,10 +6712,10 @@ $as_echo "#define AVX512 1" >>confdefs.h
|
||||
|
||||
supported="cross compilation"
|
||||
;;
|
||||
NEONv7)
|
||||
echo Configuring for experimental ARMv7 support
|
||||
NEONv8)
|
||||
echo Configuring for experimental ARMv8a support
|
||||
|
||||
$as_echo "#define NEONv7 1" >>confdefs.h
|
||||
$as_echo "#define NEONv8 1" >>confdefs.h
|
||||
|
||||
supported="cross compilation"
|
||||
;;
|
||||
|
@ -3,7 +3,7 @@
|
||||
#
|
||||
# Project Grid package
|
||||
#
|
||||
# Time-stamp: <2015-06-09 15:26:39 neo>
|
||||
# Time-stamp: <2015-07-10 17:46:21 neo>
|
||||
|
||||
AC_PREREQ([2.63])
|
||||
AC_INIT([Grid], [1.0], [paboyle@ph.ed.ac.uk])
|
||||
@ -106,9 +106,9 @@ case ${ac_SIMD} in
|
||||
AC_DEFINE([AVX512],[1],[AVX512 Intrinsics for Knights Corner] )
|
||||
supported="cross compilation"
|
||||
;;
|
||||
NEONv7)
|
||||
echo Configuring for experimental ARMv7 support
|
||||
AC_DEFINE([NEONv7],[1],[NEON ARMv7 Experimental support ] )
|
||||
NEONv8)
|
||||
echo Configuring for experimental ARMv8a support
|
||||
AC_DEFINE([NEONv8],[1],[NEON ARMv8 Experimental support ] )
|
||||
supported="cross compilation"
|
||||
;;
|
||||
DEBUG)
|
||||
|
@ -119,8 +119,8 @@
|
||||
/* Define to 1 if you have the <unistd.h> header file. */
|
||||
#undef HAVE_UNISTD_H
|
||||
|
||||
/* NEON ARMv7 Experimental support */
|
||||
#undef NEONv7
|
||||
/* NEON ARMv8 Experimental support */
|
||||
#undef NEONv8
|
||||
|
||||
/* Name of package */
|
||||
#undef PACKAGE
|
||||
|
@ -7,7 +7,7 @@ namespace Grid{
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// Wilson Gauge Action .. should I template the Nc etc..
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
template<class GaugeField,class MatrixField>
|
||||
template<class GaugeField, class MatrixField>
|
||||
class WilsonGaugeAction : public Action<GaugeField> {
|
||||
private:
|
||||
RealD beta;
|
||||
@ -23,7 +23,6 @@ namespace Grid{
|
||||
return beta*(1.0 -plaq)*(Nd*(Nd-1.0))*vol*0.5;
|
||||
};
|
||||
virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
|
||||
|
||||
//not optimal implementation FIXME
|
||||
//extend Ta to include Lorentz indexes
|
||||
RealD factor = 0.5*beta/RealD(Nc);
|
||||
|
@ -7,7 +7,6 @@ namespace QCD {
|
||||
template<class GaugeMat,class GaugeLorentz>
|
||||
class WilsonLoops {
|
||||
public:
|
||||
|
||||
//////////////////////////////////////////////////
|
||||
// directed plaquette oriented in mu,nu plane
|
||||
//////////////////////////////////////////////////
|
||||
|
@ -1,14 +1,16 @@
|
||||
//----------------------------------------------------------------------
|
||||
/*! @file Grid_sse4.h
|
||||
@brief Optimization libraries for NEON (ARM) instructions set ARMv7
|
||||
@brief Optimization libraries for NEON (ARM) instructions set ARMv8
|
||||
|
||||
Experimental - Using intrinsics - DEVELOPING!
|
||||
*/
|
||||
// Time-stamp: <2015-06-09 15:25:40 neo>
|
||||
// Time-stamp: <2015-07-10 17:45:09 neo>
|
||||
//----------------------------------------------------------------------
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
// ARMv8 supports double precision
|
||||
|
||||
namespace Optimization {
|
||||
|
||||
template<class vtype>
|
||||
@ -22,50 +24,47 @@ namespace Optimization {
|
||||
float f[4];
|
||||
};
|
||||
union u128d {
|
||||
float32x4_t v;
|
||||
float f[4];
|
||||
float64x2_t v;
|
||||
double f[4];
|
||||
};
|
||||
|
||||
struct Vsplat{
|
||||
//Complex float
|
||||
inline float32x4_t operator()(float a, float b){
|
||||
float32x4_t foo;
|
||||
return foo;
|
||||
float tmp[4]={a,b,a,b};
|
||||
return vld1q_f32(tmp);
|
||||
}
|
||||
// Real float
|
||||
inline float32x4_t operator()(float a){
|
||||
float32x4_t foo;
|
||||
return foo;
|
||||
return vld1q_dup_f32(&a);
|
||||
}
|
||||
//Complex double
|
||||
inline float32x4_t operator()(double a, double b){
|
||||
float32x4_t foo;
|
||||
return foo;
|
||||
float tmp[4]={(float)a,(float)b,(float)a,(float)b};
|
||||
return vld1q_f32(tmp);
|
||||
}
|
||||
//Real double
|
||||
inline float32x4_t operator()(double a){
|
||||
float32x4_t foo;
|
||||
return foo;
|
||||
return vld1q_dup_f32(&a);
|
||||
}
|
||||
//Integer
|
||||
inline uint32x4_t operator()(Integer a){
|
||||
uint32x4_t foo;
|
||||
return foo;
|
||||
return vld1q_dup_u32(&a);
|
||||
}
|
||||
};
|
||||
|
||||
struct Vstore{
|
||||
//Float
|
||||
inline void operator()(float32x4_t a, float* F){
|
||||
|
||||
vst1q_f32(F, a);
|
||||
}
|
||||
//Double
|
||||
inline void operator()(float32x4_t a, double* D){
|
||||
|
||||
vst1q_f32((float*)D, a);
|
||||
}
|
||||
//Integer
|
||||
inline void operator()(uint32x4_t a, Integer* I){
|
||||
|
||||
vst1q_u32(I, a);
|
||||
}
|
||||
|
||||
};
|
||||
@ -130,36 +129,30 @@ namespace Optimization {
|
||||
struct Sum{
|
||||
//Complex/Real float
|
||||
inline float32x4_t operator()(float32x4_t a, float32x4_t b){
|
||||
float32x4_t foo;
|
||||
return foo;
|
||||
return vaddq_f32(a,b);
|
||||
}
|
||||
//Complex/Real double
|
||||
//inline float32x4_t operator()(float32x4_t a, float32x4_t b){
|
||||
// float32x4_t foo;
|
||||
// return foo;
|
||||
//}
|
||||
inline float64x2_t operator()(float64x2_t a, float64x2_t b){
|
||||
return vaddq_f64(a,b);
|
||||
}
|
||||
//Integer
|
||||
inline uint32x4_t operator()(uint32x4_t a, uint32x4_t b){
|
||||
uint32x4_t foo;
|
||||
return foo;
|
||||
return vaddq_u32(a,b);
|
||||
}
|
||||
};
|
||||
|
||||
struct Sub{
|
||||
//Complex/Real float
|
||||
inline float32x4_t operator()(float32x4_t a, float32x4_t b){
|
||||
float32x4_t foo;
|
||||
return foo;
|
||||
return vsubq_f32(a,b);
|
||||
}
|
||||
//Complex/Real double
|
||||
//inline float32x4_t operator()(float32x4_t a, float32x4_t b){
|
||||
// float32x4_t foo;
|
||||
// return foo;
|
||||
//}
|
||||
inline float64x2_t operator()(float64x2_t a, float64x2_t b){
|
||||
return vsubq_f64(a,b);
|
||||
}
|
||||
//Integer
|
||||
inline uint32x4_t operator()(uint32x4_t a, uint32x4_t b){
|
||||
uint32x4_t foo;
|
||||
return foo;
|
||||
return vsubq_u32(a,b);
|
||||
}
|
||||
};
|
||||
|
||||
@ -170,24 +163,24 @@ namespace Optimization {
|
||||
return foo;
|
||||
}
|
||||
// Complex double
|
||||
//inline float32x4_t operator()(float32x4_t a, float32x4_t b){
|
||||
// float32x4_t foo;
|
||||
// return foo;
|
||||
//}
|
||||
inline float64x2_t operator()(float64x2_t a, float64x2_t b){
|
||||
float32x4_t foo;
|
||||
return foo;
|
||||
}
|
||||
};
|
||||
|
||||
struct Mult{
|
||||
// Real float
|
||||
inline float32x4_t operator()(float32x4_t a, float32x4_t b){
|
||||
return a;
|
||||
return vmulq_f32(a,b);
|
||||
}
|
||||
// Real double
|
||||
//inline float32x4_t operator()(float32x4_t a, float32x4_t b){
|
||||
// return 0;
|
||||
//}
|
||||
inline float64x2_t operator()(float64x2_t a, float64x2_t b){
|
||||
return vmulq_f64(a,b);
|
||||
}
|
||||
// Integer
|
||||
inline uint32x4_t operator()(uint32x4_t a, uint32x4_t b){
|
||||
return a;
|
||||
return vmulq_u32(a,b);
|
||||
}
|
||||
};
|
||||
|
||||
@ -219,6 +212,7 @@ namespace Optimization {
|
||||
struct TimesI{
|
||||
//Complex single
|
||||
inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
|
||||
//need shuffle
|
||||
return in;
|
||||
}
|
||||
//Complex double
|
||||
@ -242,20 +236,25 @@ namespace Optimization {
|
||||
//Real float Reduce
|
||||
template<>
|
||||
inline Grid::RealF Reduce<Grid::RealF, float32x4_t>::operator()(float32x4_t in){
|
||||
return 0;
|
||||
float32x2_t high = vget_high_f32(in);
|
||||
float32x2_t low = vget_low_f32(in);
|
||||
float32x2_t tmp = vadd_f32(low, high);
|
||||
float32x2_t sum = vpadd_f32(tmp, tmp);
|
||||
return vget_lane_f32(sum,0);
|
||||
}
|
||||
|
||||
|
||||
//Complex double Reduce
|
||||
template<>
|
||||
inline Grid::ComplexD Reduce<Grid::ComplexD, float32x4_t>::operator()(float32x4_t in){
|
||||
inline Grid::ComplexD Reduce<Grid::ComplexD, float64x2_t>::operator()(float64x2_t in){
|
||||
return 0;
|
||||
}
|
||||
|
||||
//Real double Reduce
|
||||
template<>
|
||||
inline Grid::RealD Reduce<Grid::RealD, float32x4_t>::operator()(float32x4_t in){
|
||||
return 0;
|
||||
inline Grid::RealD Reduce<Grid::RealD, float64x2_t>::operator()(float64x2_t in){
|
||||
float64x2_t sum = vpaddq_f64(in, in);
|
||||
return vgetq_lane_f64(sum,0);
|
||||
}
|
||||
|
||||
//Integer Reduce
|
||||
@ -272,7 +271,7 @@ namespace Optimization {
|
||||
namespace Grid {
|
||||
|
||||
typedef float32x4_t SIMD_Ftype; // Single precision type
|
||||
typedef float32x4_t SIMD_Dtype; // Double precision type - no double on ARMv7
|
||||
typedef float64x2_t SIMD_Dtype; // Double precision type
|
||||
typedef uint32x4_t SIMD_Itype; // Integer type
|
||||
|
||||
inline void v_prefetch0(int size, const char *ptr){}; // prefetch utilities
|
||||
|
@ -2,7 +2,7 @@
|
||||
/*! @file Grid_vector_types.h
|
||||
@brief Defines templated class Grid_simd to deal with inner vector types
|
||||
*/
|
||||
// Time-stamp: <2015-06-09 15:00:47 neo>
|
||||
// Time-stamp: <2015-07-10 17:45:33 neo>
|
||||
//---------------------------------------------------------------------------
|
||||
#ifndef GRID_VECTOR_TYPES
|
||||
#define GRID_VECTOR_TYPES
|
||||
@ -22,7 +22,7 @@
|
||||
#if defined QPX
|
||||
#include "Grid_qpx.h"
|
||||
#endif
|
||||
#ifdef NEONv7
|
||||
#ifdef NEONv8
|
||||
#include "Grid_neon.h"
|
||||
#endif
|
||||
|
||||
|
3
scripts/arm_configure.experimental_cortex57
Normal file
3
scripts/arm_configure.experimental_cortex57
Normal file
@ -0,0 +1,3 @@
|
||||
#./configure --host=arm-linux-gnueabihf CXX=clang++-3.5 CXXFLAGS='-std=c++11 -O3 -target arm-linux-gnueabihf -I/usr/arm-linux-gnueabihf/include/ -I/home/neo/Codes/gmp6.0/gmp-arm/include/ -I/usr/lib/llvm-3.5/lib/clang/3.5.0/include/ -L/home/neo/Codes/gmp6.0/gmp-arm/lib/ -I/home/neo/Codes/mpfr3.1.2/mpfr-arm/include/ -L/home/neo/Codes/mpfr3.1.2/mpfr-arm/lib/ -static -mcpu=cortex-a57' --enable-simd=NEONv7
|
||||
|
||||
./configure --host=aarch64-linux-gnu CXX=clang++-3.5 CXXFLAGS='-std=c++11 -O3 -target aarch64-linux-gnu -static -I/home/neo/Codes/gmp6.0/gmp-armv8/include/ -L/home/neo/Codes/gmp6.0/gmp-armv8/lib/ -I/home/neo/Codes/mpfr3.1.2/mpfr-armv8/include/ -L/home/neo/Codes/mpfr3.1.2/mpfr-armv8/lib/ -I/usr/aarch64-linux-gnu/include/ -I/usr/aarch64-linux-gnu/include/c++/4.8.2/aarch64-linux-gnu/' --enable-simd=NEONv7
|
@ -24,13 +24,13 @@ int main (int argc, char ** argv)
|
||||
GridCartesian Fine(latt_size,simd_layout,mpi_layout);
|
||||
GridParallelRNG pRNG(&Fine);
|
||||
pRNG.SeedRandomDevice();
|
||||
LatticeLorentzColourMatrix U(&Fine);
|
||||
LatticeGaugeField U(&Fine);
|
||||
|
||||
SU3::HotConfiguration(pRNG, U);
|
||||
|
||||
|
||||
// simplify template declaration? Strip the lorentz from the second template
|
||||
WilsonGaugeAction<LatticeLorentzColourMatrix, LatticeColourMatrix> Waction(6.0);
|
||||
WilsonGaugeAction<LatticeGaugeField, LatticeColourMatrix> Waction(6.0);
|
||||
|
||||
//Collect actions
|
||||
ActionLevel Level1;
|
||||
|
@ -194,6 +194,31 @@ int main (int argc, char ** argv)
|
||||
// Insist that operations on random scalars gives
|
||||
// identical results to on vectors.
|
||||
|
||||
std::cout << "==================================="<< std::endl;
|
||||
std::cout << "Testing vRealF "<<std::endl;
|
||||
std::cout << "==================================="<< std::endl;
|
||||
|
||||
|
||||
Tester<RealF,vRealF>(funcPlus());
|
||||
Tester<RealF,vRealF>(funcMinus());
|
||||
Tester<RealF,vRealF>(funcTimes());
|
||||
Tester<RealF,vRealF>(funcAdj());
|
||||
Tester<RealF,vRealF>(funcConj());
|
||||
Tester<RealF,vRealF>(funcInnerProduct());
|
||||
ReductionTester<RealF,RealF,vRealF>(funcReduce());
|
||||
|
||||
std::cout << "==================================="<< std::endl;
|
||||
std::cout << "Testing vRealD "<<std::endl;
|
||||
std::cout << "==================================="<< std::endl;
|
||||
|
||||
Tester<RealD,vRealD>(funcPlus());
|
||||
Tester<RealD,vRealD>(funcMinus());
|
||||
Tester<RealD,vRealD>(funcTimes());
|
||||
Tester<RealD,vRealD>(funcAdj());
|
||||
Tester<RealD,vRealD>(funcConj());
|
||||
Tester<RealD,vRealD>(funcInnerProduct());
|
||||
ReductionTester<RealD,RealD,vRealD>(funcReduce());
|
||||
|
||||
std::cout << "==================================="<< std::endl;
|
||||
std::cout << "Testing vComplexF "<<std::endl;
|
||||
std::cout << "==================================="<< std::endl;
|
||||
@ -223,30 +248,6 @@ int main (int argc, char ** argv)
|
||||
Tester<ComplexD,vComplexD>(funcInnerProduct());
|
||||
ReductionTester<ComplexD,ComplexD,vComplexD>(funcReduce());
|
||||
|
||||
std::cout << "==================================="<< std::endl;
|
||||
std::cout << "Testing vRealF "<<std::endl;
|
||||
std::cout << "==================================="<< std::endl;
|
||||
|
||||
|
||||
Tester<RealF,vRealF>(funcPlus());
|
||||
Tester<RealF,vRealF>(funcMinus());
|
||||
Tester<RealF,vRealF>(funcTimes());
|
||||
Tester<RealF,vRealF>(funcAdj());
|
||||
Tester<RealF,vRealF>(funcConj());
|
||||
Tester<RealF,vRealF>(funcInnerProduct());
|
||||
ReductionTester<RealF,RealF,vRealF>(funcReduce());
|
||||
|
||||
std::cout << "==================================="<< std::endl;
|
||||
std::cout << "Testing vRealD "<<std::endl;
|
||||
std::cout << "==================================="<< std::endl;
|
||||
|
||||
Tester<RealD,vRealD>(funcPlus());
|
||||
Tester<RealD,vRealD>(funcMinus());
|
||||
Tester<RealD,vRealD>(funcTimes());
|
||||
Tester<RealD,vRealD>(funcAdj());
|
||||
Tester<RealD,vRealD>(funcConj());
|
||||
Tester<RealD,vRealD>(funcInnerProduct());
|
||||
ReductionTester<RealD,RealD,vRealD>(funcReduce());
|
||||
|
||||
Grid_finalize();
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user