mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-10 07:55:35 +00:00
Updating to modify non-inlining permute routines and hopefully get better reg use and
enhance performance.
This commit is contained in:
parent
5ef42add2d
commit
64d64d1ab6
@ -18,8 +18,8 @@
|
||||
#include <algorithms/iterative/ConjugateGradientMultiShift.h>
|
||||
|
||||
// Lanczos support
|
||||
#include <algorithms/iterative/MatrixUtils.h>
|
||||
#include <algorithms/iterative/ImplicitlyRestartedLanczos.h>
|
||||
//#include <algorithms/iterative/MatrixUtils.h>
|
||||
//#include <algorithms/iterative/ImplicitlyRestartedLanczos.h>
|
||||
|
||||
#include <algorithms/CoarsenedMatrix.h>
|
||||
|
||||
|
@ -178,6 +178,7 @@ void Grid_init(int *argc,char ***argv)
|
||||
}
|
||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-opt") ){
|
||||
QCD::WilsonFermionStatic::HandOptDslash=1;
|
||||
QCD::WilsonFermion5DStatic::HandOptDslash=1;
|
||||
}
|
||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
|
||||
LebesgueOrder::UseLebesgueOrder=1;
|
||||
|
@ -13,6 +13,11 @@
|
||||
|
||||
typedef uint32_t Integer;
|
||||
|
||||
#define _MM_SELECT_FOUR_FOUR(A,B,C,D) ((A<<6)|(B<<4)|(C<<2)|(D))
|
||||
#define _MM_SELECT_EIGHT_TWO(A,B,C,D,E,F,G,H) ((A<<7)|(B<<6)|(C<<5)|(D<<4)|(E<<3)|(F<<2)|(G<<4)|(H))
|
||||
#define _MM_SELECT_FOUR_TWO (A,B,C,D) _MM_SELECT_EIGHT_TWO(0,0,0,0,A,B,C,D)
|
||||
#define _MM_SELECT_TWO_TWO (A,B) _MM_SELECT_FOUR_TWO(0,0,A,B)
|
||||
|
||||
namespace Grid {
|
||||
|
||||
typedef float RealF;
|
||||
|
@ -56,13 +56,13 @@
|
||||
UChi_02+= U_20*Chi_02;\
|
||||
UChi_12+= U_20*Chi_12;
|
||||
|
||||
#define PERMUTE\
|
||||
permute(Chi_00,Chi_00,ptype);\
|
||||
permute(Chi_01,Chi_01,ptype);\
|
||||
permute(Chi_02,Chi_02,ptype);\
|
||||
permute(Chi_10,Chi_10,ptype);\
|
||||
permute(Chi_11,Chi_11,ptype);\
|
||||
permute(Chi_12,Chi_12,ptype);
|
||||
#define PERMUTE_DIR(dir) \
|
||||
permute##dir(Chi_00,Chi_00);\
|
||||
permute##dir(Chi_01,Chi_01);\
|
||||
permute##dir(Chi_02,Chi_02);\
|
||||
permute##dir(Chi_10,Chi_10);\
|
||||
permute##dir(Chi_11,Chi_11);\
|
||||
permute##dir(Chi_12,Chi_12);
|
||||
|
||||
// hspin(0)=fspin(0)+timesI(fspin(3));
|
||||
// hspin(1)=fspin(1)+timesI(fspin(2));
|
||||
@ -286,6 +286,10 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaug
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int ss,int sU,const FermionField &in, FermionField &out)
|
||||
{
|
||||
// std::cout << "Hand op Dhop "<<std::endl;
|
||||
typedef typename Simd::scalar_type S;
|
||||
typedef typename Simd::vector_type V;
|
||||
|
||||
REGISTER Simd result_00; // 12 regs on knc
|
||||
REGISTER Simd result_01;
|
||||
REGISTER Simd result_02;
|
||||
@ -352,7 +356,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaug
|
||||
LOAD_CHIMU;
|
||||
XP_PROJ;
|
||||
if ( perm) {
|
||||
PERMUTE;
|
||||
PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
} else {
|
||||
LOAD_CHI;
|
||||
@ -373,7 +377,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaug
|
||||
LOAD_CHIMU;
|
||||
YP_PROJ;
|
||||
if ( perm) {
|
||||
PERMUTE;
|
||||
PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
} else {
|
||||
LOAD_CHI;
|
||||
@ -394,7 +398,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaug
|
||||
LOAD_CHIMU;
|
||||
ZP_PROJ;
|
||||
if ( perm) {
|
||||
PERMUTE;
|
||||
PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
} else {
|
||||
LOAD_CHI;
|
||||
@ -414,7 +418,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaug
|
||||
LOAD_CHIMU;
|
||||
TP_PROJ;
|
||||
if ( perm) {
|
||||
PERMUTE;
|
||||
PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
} else {
|
||||
LOAD_CHI;
|
||||
@ -434,7 +438,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaug
|
||||
LOAD_CHIMU;
|
||||
XM_PROJ;
|
||||
if ( perm) {
|
||||
PERMUTE;
|
||||
PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
} else {
|
||||
LOAD_CHI;
|
||||
@ -454,7 +458,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaug
|
||||
LOAD_CHIMU;
|
||||
YM_PROJ;
|
||||
if ( perm) {
|
||||
PERMUTE;
|
||||
PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
} else {
|
||||
LOAD_CHI;
|
||||
@ -474,7 +478,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaug
|
||||
LOAD_CHIMU;
|
||||
ZM_PROJ;
|
||||
if ( perm) {
|
||||
PERMUTE;
|
||||
PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
} else {
|
||||
LOAD_CHI;
|
||||
@ -494,7 +498,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaug
|
||||
LOAD_CHIMU;
|
||||
TM_PROJ;
|
||||
if ( perm) {
|
||||
PERMUTE;
|
||||
PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
} else {
|
||||
LOAD_CHI;
|
||||
@ -526,6 +530,9 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledG
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int ss,int sU,const FermionField &in, FermionField &out)
|
||||
{
|
||||
typedef typename Simd::scalar_type S;
|
||||
typedef typename Simd::vector_type V;
|
||||
|
||||
REGISTER Simd result_00; // 12 regs on knc
|
||||
REGISTER Simd result_01;
|
||||
REGISTER Simd result_02;
|
||||
@ -592,7 +599,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledG
|
||||
LOAD_CHIMU;
|
||||
XM_PROJ;
|
||||
if ( perm) {
|
||||
PERMUTE;
|
||||
PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
} else {
|
||||
LOAD_CHI;
|
||||
@ -612,7 +619,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledG
|
||||
LOAD_CHIMU;
|
||||
YM_PROJ;
|
||||
if ( perm) {
|
||||
PERMUTE;
|
||||
PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
} else {
|
||||
LOAD_CHI;
|
||||
@ -633,7 +640,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledG
|
||||
LOAD_CHIMU;
|
||||
ZM_PROJ;
|
||||
if ( perm) {
|
||||
PERMUTE;
|
||||
PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
} else {
|
||||
LOAD_CHI;
|
||||
@ -653,7 +660,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledG
|
||||
LOAD_CHIMU;
|
||||
TM_PROJ;
|
||||
if ( perm) {
|
||||
PERMUTE;
|
||||
PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
} else {
|
||||
LOAD_CHI;
|
||||
@ -673,7 +680,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledG
|
||||
LOAD_CHIMU;
|
||||
XP_PROJ;
|
||||
if ( perm) {
|
||||
PERMUTE;
|
||||
PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
} else {
|
||||
LOAD_CHI;
|
||||
@ -694,7 +701,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledG
|
||||
LOAD_CHIMU;
|
||||
YP_PROJ;
|
||||
if ( perm) {
|
||||
PERMUTE;
|
||||
PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
} else {
|
||||
LOAD_CHI;
|
||||
@ -714,7 +721,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledG
|
||||
LOAD_CHIMU;
|
||||
ZP_PROJ;
|
||||
if ( perm) {
|
||||
PERMUTE;
|
||||
PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
} else {
|
||||
LOAD_CHI;
|
||||
@ -734,7 +741,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledG
|
||||
LOAD_CHIMU;
|
||||
TP_PROJ;
|
||||
if ( perm) {
|
||||
PERMUTE;
|
||||
PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
} else {
|
||||
LOAD_CHI;
|
||||
|
@ -183,11 +183,11 @@ namespace Optimization {
|
||||
// Complex float
|
||||
inline __m256 operator()(__m256 a, __m256 b){
|
||||
__m256 ymm0,ymm1,ymm2;
|
||||
ymm0 = _mm256_shuffle_ps(a,a,_MM_SHUFFLE(2,2,0,0)); // ymm0 <- ar ar,
|
||||
ymm0 = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
|
||||
ymm0 = _mm256_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br
|
||||
// FIXME AVX2 could MAC
|
||||
ymm1 = _mm256_shuffle_ps(b,b,_MM_SHUFFLE(2,3,0,1)); // ymm1 <- br,bi
|
||||
ymm2 = _mm256_shuffle_ps(a,a,_MM_SHUFFLE(3,3,1,1)); // ymm2 <- ai,ai
|
||||
ymm1 = _mm256_shuffle_ps(b,b,_MM_SELECT_FOUR_FOUR(2,3,0,1)); // ymm1 <- br,bi
|
||||
ymm2 = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(3,3,1,1)); // ymm2 <- ai,ai
|
||||
ymm1 = _mm256_mul_ps(ymm1,ymm2); // ymm1 <- br ai, ai bi
|
||||
return _mm256_addsub_ps(ymm0,ymm1);
|
||||
}
|
||||
@ -270,7 +270,7 @@ namespace Optimization {
|
||||
//Complex single
|
||||
inline __m256 operator()(__m256 in, __m256 ret){
|
||||
__m256 tmp =_mm256_addsub_ps(_mm256_setzero_ps(),in); // r,-i
|
||||
return _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1)); //-i,r
|
||||
return _mm256_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(2,3,0,1)); //-i,r
|
||||
}
|
||||
//Complex double
|
||||
inline __m256d operator()(__m256d in, __m256d ret){
|
||||
@ -282,7 +282,7 @@ namespace Optimization {
|
||||
struct TimesI{
|
||||
//Complex single
|
||||
inline __m256 operator()(__m256 in, __m256 ret){
|
||||
__m256 tmp =_mm256_shuffle_ps(in,in,_MM_SHUFFLE(2,3,0,1)); // i,r
|
||||
__m256 tmp =_mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); // i,r
|
||||
return _mm256_addsub_ps(_mm256_setzero_ps(),tmp); // i,-r
|
||||
}
|
||||
//Complex double
|
||||
@ -296,27 +296,44 @@ namespace Optimization {
|
||||
// Some Template specialization
|
||||
//////////////////////////////////////////////
|
||||
|
||||
template < typename vtype >
|
||||
void permute(vtype &a,vtype b, int perm) {
|
||||
uconv<vtype> conv;
|
||||
conv.v = b;
|
||||
switch (perm){
|
||||
// 8x32 bits=>3 permutes
|
||||
case 2: conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); break;
|
||||
case 1: conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2)); break;
|
||||
case 0: conv.f = _mm256_permute2f128_ps(conv.f,conv.f,0x01); break;
|
||||
default: assert(0); break;
|
||||
}
|
||||
a = conv.v;
|
||||
}
|
||||
struct Permute{
|
||||
|
||||
static inline __m256 Permute0(__m256 in){
|
||||
return _mm256_permute2f128_ps(in,in,0x01);
|
||||
};
|
||||
static inline __m256 Permute1(__m256 in){
|
||||
return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
|
||||
};
|
||||
static inline __m256 Permute2(__m256 in){
|
||||
return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
|
||||
};
|
||||
static inline __m256 Permute3(__m256 in){
|
||||
return in;
|
||||
};
|
||||
|
||||
static inline __m256d Permute0(__m256d in){
|
||||
return _mm256_permute2f128_pd(in,in,0x01);
|
||||
};
|
||||
static inline __m256d Permute1(__m256d in){
|
||||
return _mm256_shuffle_pd(in,in,0x5);
|
||||
};
|
||||
static inline __m256d Permute2(__m256d in){
|
||||
return in;
|
||||
};
|
||||
static inline __m256d Permute3(__m256d in){
|
||||
return in;
|
||||
};
|
||||
|
||||
};
|
||||
|
||||
|
||||
//Complex float Reduce
|
||||
template<>
|
||||
inline Grid::ComplexF Reduce<Grid::ComplexF, __m256>::operator()(__m256 in){
|
||||
__m256 v1,v2;
|
||||
Optimization::permute(v1,in,0); // avx 256; quad complex single
|
||||
v1 = _mm256_add_ps(v1,in);
|
||||
Optimization::permute(v2,v1,1);
|
||||
v1=Optimization::Permute::Permute0(in); // avx 256; quad complex single
|
||||
v1= _mm256_add_ps(v1,in);
|
||||
v2=Optimization::Permute::Permute1(v1);
|
||||
v1 = _mm256_add_ps(v1,v2);
|
||||
u256f conv; conv.v = v1;
|
||||
return Grid::ComplexF(conv.f[0],conv.f[1]);
|
||||
@ -326,11 +343,11 @@ namespace Optimization {
|
||||
template<>
|
||||
inline Grid::RealF Reduce<Grid::RealF, __m256>::operator()(__m256 in){
|
||||
__m256 v1,v2;
|
||||
Optimization::permute(v1,in,0); // avx 256; octo-double
|
||||
v1 = Optimization::Permute::Permute0(in); // avx 256; octo-double
|
||||
v1 = _mm256_add_ps(v1,in);
|
||||
Optimization::permute(v2,v1,1);
|
||||
v2 = Optimization::Permute::Permute1(v1);
|
||||
v1 = _mm256_add_ps(v1,v2);
|
||||
Optimization::permute(v2,v1,2);
|
||||
v2 = Optimization::Permute::Permute2(v1);
|
||||
v1 = _mm256_add_ps(v1,v2);
|
||||
u256f conv; conv.v=v1;
|
||||
return conv.f[0];
|
||||
@ -341,7 +358,7 @@ namespace Optimization {
|
||||
template<>
|
||||
inline Grid::ComplexD Reduce<Grid::ComplexD, __m256d>::operator()(__m256d in){
|
||||
__m256d v1;
|
||||
Optimization::permute(v1,in,0); // sse 128; paired complex single
|
||||
v1 = Optimization::Permute::Permute0(in); // sse 128; paired complex single
|
||||
v1 = _mm256_add_pd(v1,in);
|
||||
u256d conv; conv.v = v1;
|
||||
return Grid::ComplexD(conv.f[0],conv.f[1]);
|
||||
@ -351,9 +368,9 @@ namespace Optimization {
|
||||
template<>
|
||||
inline Grid::RealD Reduce<Grid::RealD, __m256d>::operator()(__m256d in){
|
||||
__m256d v1,v2;
|
||||
Optimization::permute(v1,in,0); // avx 256; quad double
|
||||
v1 = Optimization::Permute::Permute0(in); // avx 256; quad double
|
||||
v1 = _mm256_add_pd(v1,in);
|
||||
Optimization::permute(v2,v1,1);
|
||||
v2 = Optimization::Permute::Permute1(v1);
|
||||
v1 = _mm256_add_pd(v1,v2);
|
||||
u256d conv; conv.v = v1;
|
||||
return conv.f[0];
|
||||
@ -387,13 +404,6 @@ namespace Grid {
|
||||
_mm_prefetch(ptr,_MM_HINT_T0);
|
||||
}
|
||||
|
||||
|
||||
|
||||
template < typename VectorSIMD >
|
||||
inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) {
|
||||
Optimization::permute(y.v,b.v,perm);
|
||||
};
|
||||
|
||||
// Function name aliases
|
||||
typedef Optimization::Vsplat VsplatSIMD;
|
||||
typedef Optimization::Vstore VstoreSIMD;
|
||||
|
@ -174,7 +174,7 @@ namespace Optimization {
|
||||
inline __m512d operator()(__m512d a, __m512d b){
|
||||
__m512d a_real = _mm512_shuffle_pd( a, a, 0x00 );
|
||||
__m512d a_imag = _mm512_shuffle_pd( a, a, 0xFF );
|
||||
a_imag = _mm512_mul_pd( a_imag, _mm512_permute_pd( b, 0x55 ) );
|
||||
a_imag = _mm512_mul_pd( a_imag, _mm512_permute_pd( b, 0x55 ) );
|
||||
return _mm512_fmaddsub_pd( a_real, b, a_imag );
|
||||
}
|
||||
};
|
||||
@ -211,26 +211,24 @@ namespace Optimization {
|
||||
//Complex single
|
||||
inline __m512 operator()(__m512 in, __m512 ret){
|
||||
__m512 tmp = _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // real -imag
|
||||
return _mm512_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2));
|
||||
return _mm512_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(1,0,3,2)); // 0x4E??
|
||||
}
|
||||
//Complex double
|
||||
inline __m512d operator()(__m512d in, __m512d ret){
|
||||
__m512d tmp = _mm512_mask_sub_pd(in,0xaa,_mm512_setzero_pd(),in); // real -imag
|
||||
return _mm512_shuffle_pd(tmp,tmp,_MM_SHUFFLE(1,0,3,2));
|
||||
}
|
||||
|
||||
|
||||
return _mm512_shuffle_pd(tmp,tmp,0x55);
|
||||
}
|
||||
};
|
||||
|
||||
struct TimesI{
|
||||
//Complex single
|
||||
inline __m512 operator()(__m512 in, __m512 ret){
|
||||
__m512 tmp = _mm512_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2));
|
||||
__m512 tmp = _mm512_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(1,0,3,2));
|
||||
return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp);
|
||||
}
|
||||
//Complex double
|
||||
inline __m512d operator()(__m512d in, __m512d ret){
|
||||
__m512d tmp = _mm512_shuffle_pd(tmp,tmp,_MM_SHUFFLE(1,0,3,2));
|
||||
__m512d tmp = _mm512_shuffle_pd(tmp,tmp,0x55);
|
||||
return _mm512_mask_sub_pd(tmp,0xaa,_mm512_setzero_pd(),tmp);
|
||||
}
|
||||
|
||||
@ -239,6 +237,36 @@ namespace Optimization {
|
||||
|
||||
|
||||
|
||||
// Gpermute utilities consider coalescing into 1 Gpermute
|
||||
struct Permute{
|
||||
|
||||
static inline __m512 Permute0(__m512 in){
|
||||
return _mm512_shuffle_f32x4(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
|
||||
};
|
||||
static inline __m512 Permute1(__m512 in){
|
||||
return _mm512_shuffle_f32x4(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
|
||||
};
|
||||
static inline __m512 Permute2(__m512 in){
|
||||
return _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
|
||||
};
|
||||
static inline __m512 Permute3(__m512 in){
|
||||
return _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
|
||||
};
|
||||
|
||||
static inline __m512d Permute0(__m512d in){
|
||||
return _mm512_shuffle_f64x2(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
|
||||
};
|
||||
static inline __m512d Permute1(__m512d in){
|
||||
return _mm512_shuffle_f64x2(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
|
||||
};
|
||||
static inline __m512d Permute2(__m512d in){
|
||||
return _mm512_shuffle_pd(in,in,0x55);
|
||||
};
|
||||
static inline __m512d Permute3(__m512d in){
|
||||
return in;
|
||||
};
|
||||
|
||||
};
|
||||
|
||||
|
||||
//////////////////////////////////////////////
|
||||
@ -298,25 +326,6 @@ namespace Grid {
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
// Gpermute utilities consider coalescing into 1 Gpermute
|
||||
template < typename VectorSIMD >
|
||||
inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) {
|
||||
union {
|
||||
__m512 f;
|
||||
decltype(VectorSIMD::v) v;
|
||||
} conv;
|
||||
conv.v = b.v;
|
||||
switch(perm){
|
||||
case 3 : conv.f = _mm512_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); break;
|
||||
case 2 : conv.f = _mm512_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2)); break;
|
||||
case 1 : conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(2,3,0,1)); break;
|
||||
case 0 : conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(1,0,3,2)); break;
|
||||
default: assert(0); break;
|
||||
}
|
||||
y.v=conv.v;
|
||||
};
|
||||
|
||||
// Function name aliases
|
||||
typedef Optimization::Vsplat VsplatSIMD;
|
||||
|
@ -255,7 +255,36 @@ namespace Optimization {
|
||||
};
|
||||
|
||||
|
||||
|
||||
struct Permute{
|
||||
|
||||
static inline __m512 Permute0(__m512 in){
|
||||
return _mm512_permute4f128_ps(in,(_MM_PERM_ENUM)_MM_SELECT_FOUR_FOUR(1,0,3,2));
|
||||
};
|
||||
static inline __m512 Permute1(__m512 in){
|
||||
return _mm512_permute4f128_ps(in,(_MM_PERM_ENUM)_MM_SELECT_FOUR_FOUR(2,3,0,1));
|
||||
};
|
||||
static inline __m512 Permute2(__m512 in){
|
||||
return _mm512_swizzle_ps(in,_MM_SWIZ_REG_BADC);
|
||||
};
|
||||
static inline __m512 Permute3(__m512 in){
|
||||
return _mm512_swizzle_ps(in,_MM_SWIZ_REG_CDAB);
|
||||
};
|
||||
|
||||
static inline __m512d Permute0(__m512d in){// Hack no intrinsic for 256 swaps of __m512d
|
||||
return (__m512d)_mm512_permute4f128_ps((__m512)in,(_MM_PERM_ENUM)_MM_SELECT_FOUR_FOUR(1,0,3,2));
|
||||
};
|
||||
static inline __m512d Permute1(__m512d in){
|
||||
return _mm512_swizzle_pd(in,_MM_SWIZ_REG_BADC);
|
||||
};
|
||||
static inline __m512d Permute2(__m512d in){
|
||||
return _mm512_swizzle_pd(in,_MM_SWIZ_REG_CDAB);
|
||||
};
|
||||
static inline __m512d Permute3(__m512d in){
|
||||
return in;
|
||||
};
|
||||
|
||||
};
|
||||
|
||||
|
||||
|
||||
//////////////////////////////////////////////
|
||||
@ -315,25 +344,6 @@ namespace Grid {
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
// Gpermute utilities consider coalescing into 1 Gpermute
|
||||
template < typename VectorSIMD >
|
||||
inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) {
|
||||
union {
|
||||
__m512 f;
|
||||
decltype(VectorSIMD::v) v;
|
||||
} conv;
|
||||
conv.v = b.v;
|
||||
switch(perm){
|
||||
case 3: conv.f = _mm512_swizzle_ps(conv.f,_MM_SWIZ_REG_CDAB); break;
|
||||
case 2: conv.f = _mm512_swizzle_ps(conv.f,_MM_SWIZ_REG_BADC); break;
|
||||
case 1 : conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(2,3,0,1)); break;
|
||||
case 0 : conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(1,0,3,2)); break;
|
||||
default: assert(0); break;
|
||||
}
|
||||
y.v=conv.v;
|
||||
};
|
||||
|
||||
// Function name aliases
|
||||
typedef Optimization::Vsplat VsplatSIMD;
|
||||
|
@ -151,10 +151,10 @@ namespace Optimization {
|
||||
// Complex float
|
||||
inline __m128 operator()(__m128 a, __m128 b){
|
||||
__m128 ymm0,ymm1,ymm2;
|
||||
ymm0 = _mm_shuffle_ps(a,a,_MM_SHUFFLE(2,2,0,0)); // ymm0 <- ar ar,
|
||||
ymm0 = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
|
||||
ymm0 = _mm_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br
|
||||
ymm1 = _mm_shuffle_ps(b,b,_MM_SHUFFLE(2,3,0,1)); // ymm1 <- br,bi
|
||||
ymm2 = _mm_shuffle_ps(a,a,_MM_SHUFFLE(3,3,1,1)); // ymm2 <- ai,ai
|
||||
ymm1 = _mm_shuffle_ps(b,b,_MM_SELECT_FOUR_FOUR(2,3,0,1)); // ymm1 <- br,bi
|
||||
ymm2 = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(3,3,1,1)); // ymm2 <- ai,ai
|
||||
ymm1 = _mm_mul_ps(ymm1,ymm2); // ymm1 <- br ai, ai bi
|
||||
return _mm_addsub_ps(ymm0,ymm1);
|
||||
}
|
||||
@ -201,7 +201,7 @@ namespace Optimization {
|
||||
//Complex single
|
||||
inline __m128 operator()(__m128 in, __m128 ret){
|
||||
__m128 tmp =_mm_addsub_ps(_mm_setzero_ps(),in); // r,-i
|
||||
return _mm_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1));
|
||||
return _mm_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(2,3,0,1));
|
||||
}
|
||||
//Complex double
|
||||
inline __m128d operator()(__m128d in, __m128d ret){
|
||||
@ -215,7 +215,7 @@ namespace Optimization {
|
||||
struct TimesI{
|
||||
//Complex single
|
||||
inline __m128 operator()(__m128 in, __m128 ret){
|
||||
__m128 tmp =_mm_shuffle_ps(in,in,_MM_SHUFFLE(2,3,0,1));
|
||||
__m128 tmp =_mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
|
||||
return _mm_addsub_ps(_mm_setzero_ps(),tmp); // r,-i
|
||||
}
|
||||
//Complex double
|
||||
@ -225,27 +225,45 @@ namespace Optimization {
|
||||
}
|
||||
};
|
||||
|
||||
struct Permute{
|
||||
|
||||
static inline __m128 Permute0(__m128 in){
|
||||
return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
|
||||
};
|
||||
static inline __m128 Permute1(__m128 in){
|
||||
return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
|
||||
};
|
||||
static inline __m128 Permute2(__m128 in){
|
||||
return in;
|
||||
};
|
||||
static inline __m128 Permute3(__m128 in){
|
||||
return in;
|
||||
};
|
||||
|
||||
static inline __m128d Permute0(__m128d in){
|
||||
return _mm_shuffle_pd(in,in,0x1);
|
||||
};
|
||||
static inline __m128d Permute1(__m128d in){
|
||||
return in;
|
||||
};
|
||||
static inline __m128d Permute2(__m128d in){
|
||||
return in;
|
||||
};
|
||||
static inline __m128d Permute3(__m128d in){
|
||||
return in;
|
||||
};
|
||||
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////
|
||||
// Some Template specialization
|
||||
template < typename vtype >
|
||||
void permute(vtype &a, vtype b, int perm) {
|
||||
uconv<vtype> conv;
|
||||
conv.v = b;
|
||||
switch(perm){
|
||||
case 3: break; //empty for SSE4
|
||||
case 2: break; //empty for SSE4
|
||||
case 1: conv.f = _mm_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); break;
|
||||
case 0: conv.f = _mm_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2)); break;
|
||||
default: assert(0); break;
|
||||
}
|
||||
a=conv.v;
|
||||
};
|
||||
|
||||
|
||||
//Complex float Reduce
|
||||
template<>
|
||||
inline Grid::ComplexF Reduce<Grid::ComplexF, __m128>::operator()(__m128 in){
|
||||
__m128 v1; // two complex
|
||||
Optimization::permute(v1,in,0);
|
||||
v1= Optimization::Permute::Permute0(in);
|
||||
v1= _mm_add_ps(v1,in);
|
||||
u128f conv; conv.v=v1;
|
||||
return Grid::ComplexF(conv.f[0],conv.f[1]);
|
||||
@ -254,9 +272,9 @@ namespace Optimization {
|
||||
template<>
|
||||
inline Grid::RealF Reduce<Grid::RealF, __m128>::operator()(__m128 in){
|
||||
__m128 v1,v2; // quad single
|
||||
Optimization::permute(v1,in,0);
|
||||
v1= Optimization::Permute::Permute0(in);
|
||||
v1= _mm_add_ps(v1,in);
|
||||
Optimization::permute(v2,v1,1);
|
||||
v2= Optimization::Permute::Permute1(v1);
|
||||
v1 = _mm_add_ps(v1,v2);
|
||||
u128f conv; conv.v=v1;
|
||||
return conv.f[0];
|
||||
@ -274,7 +292,7 @@ namespace Optimization {
|
||||
template<>
|
||||
inline Grid::RealD Reduce<Grid::RealD, __m128d>::operator()(__m128d in){
|
||||
__m128d v1;
|
||||
Optimization::permute(v1,in,0); // avx 256; quad double
|
||||
v1 = Optimization::Permute::Permute0(in);
|
||||
v1 = _mm_add_pd(v1,in);
|
||||
u128d conv; conv.v = v1;
|
||||
return conv.f[0];
|
||||
@ -302,14 +320,6 @@ namespace Grid {
|
||||
inline void prefetch_HINT_T0(const char *ptr){
|
||||
_mm_prefetch(ptr,_MM_HINT_T0);
|
||||
}
|
||||
|
||||
|
||||
// Gpermute function
|
||||
template < typename VectorSIMD >
|
||||
inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) {
|
||||
Optimization::permute(y.v,b.v,perm);
|
||||
}
|
||||
|
||||
|
||||
// Function name aliases
|
||||
typedef Optimization::Vsplat VsplatSIMD;
|
||||
|
@ -251,15 +251,30 @@ namespace Grid {
|
||||
// all subtypes; may not be a good assumption, but could
|
||||
// add the vector width as a template param for BG/Q for example
|
||||
////////////////////////////////////////////////////////////////////
|
||||
friend inline void permute0(Grid_simd &y,Grid_simd b){
|
||||
y.v = Optimization::Permute::Permute0(b.v);
|
||||
}
|
||||
friend inline void permute1(Grid_simd &y,Grid_simd b){
|
||||
y.v = Optimization::Permute::Permute1(b.v);
|
||||
}
|
||||
friend inline void permute2(Grid_simd &y,Grid_simd b){
|
||||
y.v = Optimization::Permute::Permute2(b.v);
|
||||
}
|
||||
friend inline void permute3(Grid_simd &y,Grid_simd b){
|
||||
y.v = Optimization::Permute::Permute3(b.v);
|
||||
}
|
||||
friend inline void permute(Grid_simd &y,Grid_simd b,int perm)
|
||||
{
|
||||
Gpermute<Grid_simd>(y,b,perm);
|
||||
if (perm==3) permute3(y,b);
|
||||
else if (perm==2) permute2(y,b);
|
||||
else if (perm==1) permute1(y,b);
|
||||
else if (perm==0) permute0(y,b);
|
||||
}
|
||||
|
||||
|
||||
|
||||
};// end of Grid_simd class definition
|
||||
|
||||
|
||||
///////////////////////
|
||||
// Splat
|
||||
///////////////////////
|
||||
|
@ -177,6 +177,7 @@ public:
|
||||
permute(out._internal[i],in._internal[i],permutetype);
|
||||
}
|
||||
}
|
||||
|
||||
// Unary negation
|
||||
friend strong_inline iVector<vtype,N> operator -(const iVector<vtype,N> &r) {
|
||||
iVector<vtype,N> ret;
|
||||
@ -290,12 +291,15 @@ public:
|
||||
vstream(out._internal[i][j],in._internal[i][j]);
|
||||
}}
|
||||
}
|
||||
|
||||
friend strong_inline void permute(iMatrix<vtype,N> &out,const iMatrix<vtype,N> &in,int permutetype){
|
||||
for(int i=0;i<N;i++){
|
||||
for(int j=0;j<N;j++){
|
||||
permute(out._internal[i][j],in._internal[i][j],permutetype);
|
||||
}}
|
||||
}
|
||||
|
||||
|
||||
// Unary negation
|
||||
friend strong_inline iMatrix<vtype,N> operator -(const iMatrix<vtype,N> &r) {
|
||||
iMatrix<vtype,N> ret;
|
||||
|
@ -35,10 +35,10 @@ icpc-avx-openmp-mpi)
|
||||
CXX=icpc ../../configure --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -fopenmp -lgmp -lmpfr" --enable-comms=mpi
|
||||
;;
|
||||
icpc-avx-openmp)
|
||||
CXX=icpc ../../configure --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -std=c++11" LIBS="-fopenmp -lgmp -lmpfr" --enable-comms=mpi
|
||||
CXX=icpc ../../configure --enable-precision=single --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -std=c++11" LIBS="-fopenmp -lgmp -lmpfr" --enable-comms=mpi
|
||||
;;
|
||||
icpc-avx2)
|
||||
CXX=icpc ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
|
||||
CXX=icpc ../../configure --enable-simd=AVX2 CXXFLAGS="-march=core-avx2 -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
|
||||
;;
|
||||
icpc-avx512)
|
||||
CXX=icpc ../../configure --enable-simd=AVX512 CXXFLAGS="-xCOMMON-AVX512 -O3 -std=c++11" --host=none LIBS="-lgmp -lmpfr" --enable-comms=none
|
||||
@ -50,7 +50,7 @@ icpc-mic-avx512)
|
||||
CXX=icpc ../../configure --host=none --enable-simd=IMCI CXXFLAGS="-xCOMMON_AVX512 -O3 -std=c++11" LDFLAGS=-xCOMMON_AVX512 LIBS="-lgmp -lmpfr" --enable-comms=none
|
||||
;;
|
||||
clang-sse)
|
||||
CXX=clang++ ../../configure --enable-simd=SSE4 CXXFLAGS="-msse4 -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
|
||||
CXX=clang++ ../../configure --enable-precision=single --enable-simd=SSE4 CXXFLAGS="-msse4 -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
|
||||
;;
|
||||
clang-avx)
|
||||
CXX=clang++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
|
||||
|
Loading…
Reference in New Issue
Block a user