1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-11-10 07:55:35 +00:00

Updating to modify non-inlining permute routines and hopefully get better reg use and

enhance performance.
This commit is contained in:
Peter Boyle 2015-09-25 08:55:04 -07:00
parent 5ef42add2d
commit 64d64d1ab6
11 changed files with 212 additions and 141 deletions

View File

@ -18,8 +18,8 @@
#include <algorithms/iterative/ConjugateGradientMultiShift.h> #include <algorithms/iterative/ConjugateGradientMultiShift.h>
// Lanczos support // Lanczos support
#include <algorithms/iterative/MatrixUtils.h> //#include <algorithms/iterative/MatrixUtils.h>
#include <algorithms/iterative/ImplicitlyRestartedLanczos.h> //#include <algorithms/iterative/ImplicitlyRestartedLanczos.h>
#include <algorithms/CoarsenedMatrix.h> #include <algorithms/CoarsenedMatrix.h>

View File

@ -178,6 +178,7 @@ void Grid_init(int *argc,char ***argv)
} }
if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-opt") ){ if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-opt") ){
QCD::WilsonFermionStatic::HandOptDslash=1; QCD::WilsonFermionStatic::HandOptDslash=1;
QCD::WilsonFermion5DStatic::HandOptDslash=1;
} }
if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){ if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
LebesgueOrder::UseLebesgueOrder=1; LebesgueOrder::UseLebesgueOrder=1;

View File

@ -13,6 +13,11 @@
typedef uint32_t Integer; typedef uint32_t Integer;
#define _MM_SELECT_FOUR_FOUR(A,B,C,D) ((A<<6)|(B<<4)|(C<<2)|(D))
#define _MM_SELECT_EIGHT_TWO(A,B,C,D,E,F,G,H) ((A<<7)|(B<<6)|(C<<5)|(D<<4)|(E<<3)|(F<<2)|(G<<4)|(H))
#define _MM_SELECT_FOUR_TWO (A,B,C,D) _MM_SELECT_EIGHT_TWO(0,0,0,0,A,B,C,D)
#define _MM_SELECT_TWO_TWO (A,B) _MM_SELECT_FOUR_TWO(0,0,A,B)
namespace Grid { namespace Grid {
typedef float RealF; typedef float RealF;

View File

@ -56,13 +56,13 @@
UChi_02+= U_20*Chi_02;\ UChi_02+= U_20*Chi_02;\
UChi_12+= U_20*Chi_12; UChi_12+= U_20*Chi_12;
#define PERMUTE\ #define PERMUTE_DIR(dir) \
permute(Chi_00,Chi_00,ptype);\ permute##dir(Chi_00,Chi_00);\
permute(Chi_01,Chi_01,ptype);\ permute##dir(Chi_01,Chi_01);\
permute(Chi_02,Chi_02,ptype);\ permute##dir(Chi_02,Chi_02);\
permute(Chi_10,Chi_10,ptype);\ permute##dir(Chi_10,Chi_10);\
permute(Chi_11,Chi_11,ptype);\ permute##dir(Chi_11,Chi_11);\
permute(Chi_12,Chi_12,ptype); permute##dir(Chi_12,Chi_12);
// hspin(0)=fspin(0)+timesI(fspin(3)); // hspin(0)=fspin(0)+timesI(fspin(3));
// hspin(1)=fspin(1)+timesI(fspin(2)); // hspin(1)=fspin(1)+timesI(fspin(2));
@ -286,6 +286,10 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaug
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf, std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out) int ss,int sU,const FermionField &in, FermionField &out)
{ {
// std::cout << "Hand op Dhop "<<std::endl;
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
REGISTER Simd result_00; // 12 regs on knc REGISTER Simd result_00; // 12 regs on knc
REGISTER Simd result_01; REGISTER Simd result_01;
REGISTER Simd result_02; REGISTER Simd result_02;
@ -352,7 +356,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaug
LOAD_CHIMU; LOAD_CHIMU;
XP_PROJ; XP_PROJ;
if ( perm) { if ( perm) {
PERMUTE; PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
} }
} else { } else {
LOAD_CHI; LOAD_CHI;
@ -373,7 +377,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaug
LOAD_CHIMU; LOAD_CHIMU;
YP_PROJ; YP_PROJ;
if ( perm) { if ( perm) {
PERMUTE; PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
} }
} else { } else {
LOAD_CHI; LOAD_CHI;
@ -394,7 +398,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaug
LOAD_CHIMU; LOAD_CHIMU;
ZP_PROJ; ZP_PROJ;
if ( perm) { if ( perm) {
PERMUTE; PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
} }
} else { } else {
LOAD_CHI; LOAD_CHI;
@ -414,7 +418,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaug
LOAD_CHIMU; LOAD_CHIMU;
TP_PROJ; TP_PROJ;
if ( perm) { if ( perm) {
PERMUTE; PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
} }
} else { } else {
LOAD_CHI; LOAD_CHI;
@ -434,7 +438,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaug
LOAD_CHIMU; LOAD_CHIMU;
XM_PROJ; XM_PROJ;
if ( perm) { if ( perm) {
PERMUTE; PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
} }
} else { } else {
LOAD_CHI; LOAD_CHI;
@ -454,7 +458,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaug
LOAD_CHIMU; LOAD_CHIMU;
YM_PROJ; YM_PROJ;
if ( perm) { if ( perm) {
PERMUTE; PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
} }
} else { } else {
LOAD_CHI; LOAD_CHI;
@ -474,7 +478,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaug
LOAD_CHIMU; LOAD_CHIMU;
ZM_PROJ; ZM_PROJ;
if ( perm) { if ( perm) {
PERMUTE; PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
} }
} else { } else {
LOAD_CHI; LOAD_CHI;
@ -494,7 +498,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaug
LOAD_CHIMU; LOAD_CHIMU;
TM_PROJ; TM_PROJ;
if ( perm) { if ( perm) {
PERMUTE; PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
} }
} else { } else {
LOAD_CHI; LOAD_CHI;
@ -526,6 +530,9 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledG
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf, std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out) int ss,int sU,const FermionField &in, FermionField &out)
{ {
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
REGISTER Simd result_00; // 12 regs on knc REGISTER Simd result_00; // 12 regs on knc
REGISTER Simd result_01; REGISTER Simd result_01;
REGISTER Simd result_02; REGISTER Simd result_02;
@ -592,7 +599,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledG
LOAD_CHIMU; LOAD_CHIMU;
XM_PROJ; XM_PROJ;
if ( perm) { if ( perm) {
PERMUTE; PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
} }
} else { } else {
LOAD_CHI; LOAD_CHI;
@ -612,7 +619,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledG
LOAD_CHIMU; LOAD_CHIMU;
YM_PROJ; YM_PROJ;
if ( perm) { if ( perm) {
PERMUTE; PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
} }
} else { } else {
LOAD_CHI; LOAD_CHI;
@ -633,7 +640,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledG
LOAD_CHIMU; LOAD_CHIMU;
ZM_PROJ; ZM_PROJ;
if ( perm) { if ( perm) {
PERMUTE; PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
} }
} else { } else {
LOAD_CHI; LOAD_CHI;
@ -653,7 +660,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledG
LOAD_CHIMU; LOAD_CHIMU;
TM_PROJ; TM_PROJ;
if ( perm) { if ( perm) {
PERMUTE; PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
} }
} else { } else {
LOAD_CHI; LOAD_CHI;
@ -673,7 +680,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledG
LOAD_CHIMU; LOAD_CHIMU;
XP_PROJ; XP_PROJ;
if ( perm) { if ( perm) {
PERMUTE; PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
} }
} else { } else {
LOAD_CHI; LOAD_CHI;
@ -694,7 +701,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledG
LOAD_CHIMU; LOAD_CHIMU;
YP_PROJ; YP_PROJ;
if ( perm) { if ( perm) {
PERMUTE; PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
} }
} else { } else {
LOAD_CHI; LOAD_CHI;
@ -714,7 +721,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledG
LOAD_CHIMU; LOAD_CHIMU;
ZP_PROJ; ZP_PROJ;
if ( perm) { if ( perm) {
PERMUTE; PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
} }
} else { } else {
LOAD_CHI; LOAD_CHI;
@ -734,7 +741,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledG
LOAD_CHIMU; LOAD_CHIMU;
TP_PROJ; TP_PROJ;
if ( perm) { if ( perm) {
PERMUTE; PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
} }
} else { } else {
LOAD_CHI; LOAD_CHI;

View File

@ -183,11 +183,11 @@ namespace Optimization {
// Complex float // Complex float
inline __m256 operator()(__m256 a, __m256 b){ inline __m256 operator()(__m256 a, __m256 b){
__m256 ymm0,ymm1,ymm2; __m256 ymm0,ymm1,ymm2;
ymm0 = _mm256_shuffle_ps(a,a,_MM_SHUFFLE(2,2,0,0)); // ymm0 <- ar ar, ymm0 = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
ymm0 = _mm256_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br ymm0 = _mm256_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br
// FIXME AVX2 could MAC // FIXME AVX2 could MAC
ymm1 = _mm256_shuffle_ps(b,b,_MM_SHUFFLE(2,3,0,1)); // ymm1 <- br,bi ymm1 = _mm256_shuffle_ps(b,b,_MM_SELECT_FOUR_FOUR(2,3,0,1)); // ymm1 <- br,bi
ymm2 = _mm256_shuffle_ps(a,a,_MM_SHUFFLE(3,3,1,1)); // ymm2 <- ai,ai ymm2 = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(3,3,1,1)); // ymm2 <- ai,ai
ymm1 = _mm256_mul_ps(ymm1,ymm2); // ymm1 <- br ai, ai bi ymm1 = _mm256_mul_ps(ymm1,ymm2); // ymm1 <- br ai, ai bi
return _mm256_addsub_ps(ymm0,ymm1); return _mm256_addsub_ps(ymm0,ymm1);
} }
@ -270,7 +270,7 @@ namespace Optimization {
//Complex single //Complex single
inline __m256 operator()(__m256 in, __m256 ret){ inline __m256 operator()(__m256 in, __m256 ret){
__m256 tmp =_mm256_addsub_ps(_mm256_setzero_ps(),in); // r,-i __m256 tmp =_mm256_addsub_ps(_mm256_setzero_ps(),in); // r,-i
return _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1)); //-i,r return _mm256_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(2,3,0,1)); //-i,r
} }
//Complex double //Complex double
inline __m256d operator()(__m256d in, __m256d ret){ inline __m256d operator()(__m256d in, __m256d ret){
@ -282,7 +282,7 @@ namespace Optimization {
struct TimesI{ struct TimesI{
//Complex single //Complex single
inline __m256 operator()(__m256 in, __m256 ret){ inline __m256 operator()(__m256 in, __m256 ret){
__m256 tmp =_mm256_shuffle_ps(in,in,_MM_SHUFFLE(2,3,0,1)); // i,r __m256 tmp =_mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); // i,r
return _mm256_addsub_ps(_mm256_setzero_ps(),tmp); // i,-r return _mm256_addsub_ps(_mm256_setzero_ps(),tmp); // i,-r
} }
//Complex double //Complex double
@ -296,27 +296,44 @@ namespace Optimization {
// Some Template specialization // Some Template specialization
////////////////////////////////////////////// //////////////////////////////////////////////
template < typename vtype > struct Permute{
void permute(vtype &a,vtype b, int perm) {
uconv<vtype> conv; static inline __m256 Permute0(__m256 in){
conv.v = b; return _mm256_permute2f128_ps(in,in,0x01);
switch (perm){ };
// 8x32 bits=>3 permutes static inline __m256 Permute1(__m256 in){
case 2: conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); break; return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
case 1: conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2)); break; };
case 0: conv.f = _mm256_permute2f128_ps(conv.f,conv.f,0x01); break; static inline __m256 Permute2(__m256 in){
default: assert(0); break; return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
} };
a = conv.v; static inline __m256 Permute3(__m256 in){
} return in;
};
static inline __m256d Permute0(__m256d in){
return _mm256_permute2f128_pd(in,in,0x01);
};
static inline __m256d Permute1(__m256d in){
return _mm256_shuffle_pd(in,in,0x5);
};
static inline __m256d Permute2(__m256d in){
return in;
};
static inline __m256d Permute3(__m256d in){
return in;
};
};
//Complex float Reduce //Complex float Reduce
template<> template<>
inline Grid::ComplexF Reduce<Grid::ComplexF, __m256>::operator()(__m256 in){ inline Grid::ComplexF Reduce<Grid::ComplexF, __m256>::operator()(__m256 in){
__m256 v1,v2; __m256 v1,v2;
Optimization::permute(v1,in,0); // avx 256; quad complex single v1=Optimization::Permute::Permute0(in); // avx 256; quad complex single
v1 = _mm256_add_ps(v1,in); v1= _mm256_add_ps(v1,in);
Optimization::permute(v2,v1,1); v2=Optimization::Permute::Permute1(v1);
v1 = _mm256_add_ps(v1,v2); v1 = _mm256_add_ps(v1,v2);
u256f conv; conv.v = v1; u256f conv; conv.v = v1;
return Grid::ComplexF(conv.f[0],conv.f[1]); return Grid::ComplexF(conv.f[0],conv.f[1]);
@ -326,11 +343,11 @@ namespace Optimization {
template<> template<>
inline Grid::RealF Reduce<Grid::RealF, __m256>::operator()(__m256 in){ inline Grid::RealF Reduce<Grid::RealF, __m256>::operator()(__m256 in){
__m256 v1,v2; __m256 v1,v2;
Optimization::permute(v1,in,0); // avx 256; octo-double v1 = Optimization::Permute::Permute0(in); // avx 256; octo-double
v1 = _mm256_add_ps(v1,in); v1 = _mm256_add_ps(v1,in);
Optimization::permute(v2,v1,1); v2 = Optimization::Permute::Permute1(v1);
v1 = _mm256_add_ps(v1,v2); v1 = _mm256_add_ps(v1,v2);
Optimization::permute(v2,v1,2); v2 = Optimization::Permute::Permute2(v1);
v1 = _mm256_add_ps(v1,v2); v1 = _mm256_add_ps(v1,v2);
u256f conv; conv.v=v1; u256f conv; conv.v=v1;
return conv.f[0]; return conv.f[0];
@ -341,7 +358,7 @@ namespace Optimization {
template<> template<>
inline Grid::ComplexD Reduce<Grid::ComplexD, __m256d>::operator()(__m256d in){ inline Grid::ComplexD Reduce<Grid::ComplexD, __m256d>::operator()(__m256d in){
__m256d v1; __m256d v1;
Optimization::permute(v1,in,0); // sse 128; paired complex single v1 = Optimization::Permute::Permute0(in); // sse 128; paired complex single
v1 = _mm256_add_pd(v1,in); v1 = _mm256_add_pd(v1,in);
u256d conv; conv.v = v1; u256d conv; conv.v = v1;
return Grid::ComplexD(conv.f[0],conv.f[1]); return Grid::ComplexD(conv.f[0],conv.f[1]);
@ -351,9 +368,9 @@ namespace Optimization {
template<> template<>
inline Grid::RealD Reduce<Grid::RealD, __m256d>::operator()(__m256d in){ inline Grid::RealD Reduce<Grid::RealD, __m256d>::operator()(__m256d in){
__m256d v1,v2; __m256d v1,v2;
Optimization::permute(v1,in,0); // avx 256; quad double v1 = Optimization::Permute::Permute0(in); // avx 256; quad double
v1 = _mm256_add_pd(v1,in); v1 = _mm256_add_pd(v1,in);
Optimization::permute(v2,v1,1); v2 = Optimization::Permute::Permute1(v1);
v1 = _mm256_add_pd(v1,v2); v1 = _mm256_add_pd(v1,v2);
u256d conv; conv.v = v1; u256d conv; conv.v = v1;
return conv.f[0]; return conv.f[0];
@ -387,13 +404,6 @@ namespace Grid {
_mm_prefetch(ptr,_MM_HINT_T0); _mm_prefetch(ptr,_MM_HINT_T0);
} }
template < typename VectorSIMD >
inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) {
Optimization::permute(y.v,b.v,perm);
};
// Function name aliases // Function name aliases
typedef Optimization::Vsplat VsplatSIMD; typedef Optimization::Vsplat VsplatSIMD;
typedef Optimization::Vstore VstoreSIMD; typedef Optimization::Vstore VstoreSIMD;

View File

@ -211,26 +211,24 @@ namespace Optimization {
//Complex single //Complex single
inline __m512 operator()(__m512 in, __m512 ret){ inline __m512 operator()(__m512 in, __m512 ret){
__m512 tmp = _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // real -imag __m512 tmp = _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // real -imag
return _mm512_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2)); return _mm512_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(1,0,3,2)); // 0x4E??
} }
//Complex double //Complex double
inline __m512d operator()(__m512d in, __m512d ret){ inline __m512d operator()(__m512d in, __m512d ret){
__m512d tmp = _mm512_mask_sub_pd(in,0xaa,_mm512_setzero_pd(),in); // real -imag __m512d tmp = _mm512_mask_sub_pd(in,0xaa,_mm512_setzero_pd(),in); // real -imag
return _mm512_shuffle_pd(tmp,tmp,_MM_SHUFFLE(1,0,3,2)); return _mm512_shuffle_pd(tmp,tmp,0x55);
} }
}; };
struct TimesI{ struct TimesI{
//Complex single //Complex single
inline __m512 operator()(__m512 in, __m512 ret){ inline __m512 operator()(__m512 in, __m512 ret){
__m512 tmp = _mm512_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2)); __m512 tmp = _mm512_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(1,0,3,2));
return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp); return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp);
} }
//Complex double //Complex double
inline __m512d operator()(__m512d in, __m512d ret){ inline __m512d operator()(__m512d in, __m512d ret){
__m512d tmp = _mm512_shuffle_pd(tmp,tmp,_MM_SHUFFLE(1,0,3,2)); __m512d tmp = _mm512_shuffle_pd(tmp,tmp,0x55);
return _mm512_mask_sub_pd(tmp,0xaa,_mm512_setzero_pd(),tmp); return _mm512_mask_sub_pd(tmp,0xaa,_mm512_setzero_pd(),tmp);
} }
@ -239,6 +237,36 @@ namespace Optimization {
// Gpermute utilities consider coalescing into 1 Gpermute
struct Permute{
static inline __m512 Permute0(__m512 in){
return _mm512_shuffle_f32x4(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
};
static inline __m512 Permute1(__m512 in){
return _mm512_shuffle_f32x4(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
};
static inline __m512 Permute2(__m512 in){
return _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
};
static inline __m512 Permute3(__m512 in){
return _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
};
static inline __m512d Permute0(__m512d in){
return _mm512_shuffle_f64x2(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
};
static inline __m512d Permute1(__m512d in){
return _mm512_shuffle_f64x2(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
};
static inline __m512d Permute2(__m512d in){
return _mm512_shuffle_pd(in,in,0x55);
};
static inline __m512d Permute3(__m512d in){
return in;
};
};
////////////////////////////////////////////// //////////////////////////////////////////////
@ -299,25 +327,6 @@ namespace Grid {
// Gpermute utilities consider coalescing into 1 Gpermute
template < typename VectorSIMD >
inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) {
union {
__m512 f;
decltype(VectorSIMD::v) v;
} conv;
conv.v = b.v;
switch(perm){
case 3 : conv.f = _mm512_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); break;
case 2 : conv.f = _mm512_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2)); break;
case 1 : conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(2,3,0,1)); break;
case 0 : conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(1,0,3,2)); break;
default: assert(0); break;
}
y.v=conv.v;
};
// Function name aliases // Function name aliases
typedef Optimization::Vsplat VsplatSIMD; typedef Optimization::Vsplat VsplatSIMD;
typedef Optimization::Vstore VstoreSIMD; typedef Optimization::Vstore VstoreSIMD;

View File

@ -255,6 +255,35 @@ namespace Optimization {
}; };
struct Permute{
static inline __m512 Permute0(__m512 in){
return _mm512_permute4f128_ps(in,(_MM_PERM_ENUM)_MM_SELECT_FOUR_FOUR(1,0,3,2));
};
static inline __m512 Permute1(__m512 in){
return _mm512_permute4f128_ps(in,(_MM_PERM_ENUM)_MM_SELECT_FOUR_FOUR(2,3,0,1));
};
static inline __m512 Permute2(__m512 in){
return _mm512_swizzle_ps(in,_MM_SWIZ_REG_BADC);
};
static inline __m512 Permute3(__m512 in){
return _mm512_swizzle_ps(in,_MM_SWIZ_REG_CDAB);
};
static inline __m512d Permute0(__m512d in){// Hack no intrinsic for 256 swaps of __m512d
return (__m512d)_mm512_permute4f128_ps((__m512)in,(_MM_PERM_ENUM)_MM_SELECT_FOUR_FOUR(1,0,3,2));
};
static inline __m512d Permute1(__m512d in){
return _mm512_swizzle_pd(in,_MM_SWIZ_REG_BADC);
};
static inline __m512d Permute2(__m512d in){
return _mm512_swizzle_pd(in,_MM_SWIZ_REG_CDAB);
};
static inline __m512d Permute3(__m512d in){
return in;
};
};
@ -316,25 +345,6 @@ namespace Grid {
// Gpermute utilities consider coalescing into 1 Gpermute
template < typename VectorSIMD >
inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) {
union {
__m512 f;
decltype(VectorSIMD::v) v;
} conv;
conv.v = b.v;
switch(perm){
case 3: conv.f = _mm512_swizzle_ps(conv.f,_MM_SWIZ_REG_CDAB); break;
case 2: conv.f = _mm512_swizzle_ps(conv.f,_MM_SWIZ_REG_BADC); break;
case 1 : conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(2,3,0,1)); break;
case 0 : conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(1,0,3,2)); break;
default: assert(0); break;
}
y.v=conv.v;
};
// Function name aliases // Function name aliases
typedef Optimization::Vsplat VsplatSIMD; typedef Optimization::Vsplat VsplatSIMD;
typedef Optimization::Vstore VstoreSIMD; typedef Optimization::Vstore VstoreSIMD;

View File

@ -151,10 +151,10 @@ namespace Optimization {
// Complex float // Complex float
inline __m128 operator()(__m128 a, __m128 b){ inline __m128 operator()(__m128 a, __m128 b){
__m128 ymm0,ymm1,ymm2; __m128 ymm0,ymm1,ymm2;
ymm0 = _mm_shuffle_ps(a,a,_MM_SHUFFLE(2,2,0,0)); // ymm0 <- ar ar, ymm0 = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
ymm0 = _mm_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br ymm0 = _mm_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br
ymm1 = _mm_shuffle_ps(b,b,_MM_SHUFFLE(2,3,0,1)); // ymm1 <- br,bi ymm1 = _mm_shuffle_ps(b,b,_MM_SELECT_FOUR_FOUR(2,3,0,1)); // ymm1 <- br,bi
ymm2 = _mm_shuffle_ps(a,a,_MM_SHUFFLE(3,3,1,1)); // ymm2 <- ai,ai ymm2 = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(3,3,1,1)); // ymm2 <- ai,ai
ymm1 = _mm_mul_ps(ymm1,ymm2); // ymm1 <- br ai, ai bi ymm1 = _mm_mul_ps(ymm1,ymm2); // ymm1 <- br ai, ai bi
return _mm_addsub_ps(ymm0,ymm1); return _mm_addsub_ps(ymm0,ymm1);
} }
@ -201,7 +201,7 @@ namespace Optimization {
//Complex single //Complex single
inline __m128 operator()(__m128 in, __m128 ret){ inline __m128 operator()(__m128 in, __m128 ret){
__m128 tmp =_mm_addsub_ps(_mm_setzero_ps(),in); // r,-i __m128 tmp =_mm_addsub_ps(_mm_setzero_ps(),in); // r,-i
return _mm_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1)); return _mm_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(2,3,0,1));
} }
//Complex double //Complex double
inline __m128d operator()(__m128d in, __m128d ret){ inline __m128d operator()(__m128d in, __m128d ret){
@ -215,7 +215,7 @@ namespace Optimization {
struct TimesI{ struct TimesI{
//Complex single //Complex single
inline __m128 operator()(__m128 in, __m128 ret){ inline __m128 operator()(__m128 in, __m128 ret){
__m128 tmp =_mm_shuffle_ps(in,in,_MM_SHUFFLE(2,3,0,1)); __m128 tmp =_mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
return _mm_addsub_ps(_mm_setzero_ps(),tmp); // r,-i return _mm_addsub_ps(_mm_setzero_ps(),tmp); // r,-i
} }
//Complex double //Complex double
@ -225,27 +225,45 @@ namespace Optimization {
} }
}; };
struct Permute{
static inline __m128 Permute0(__m128 in){
return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
};
static inline __m128 Permute1(__m128 in){
return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
};
static inline __m128 Permute2(__m128 in){
return in;
};
static inline __m128 Permute3(__m128 in){
return in;
};
static inline __m128d Permute0(__m128d in){
return _mm_shuffle_pd(in,in,0x1);
};
static inline __m128d Permute1(__m128d in){
return in;
};
static inline __m128d Permute2(__m128d in){
return in;
};
static inline __m128d Permute3(__m128d in){
return in;
};
};
////////////////////////////////////////////// //////////////////////////////////////////////
// Some Template specialization // Some Template specialization
template < typename vtype >
void permute(vtype &a, vtype b, int perm) {
uconv<vtype> conv;
conv.v = b;
switch(perm){
case 3: break; //empty for SSE4
case 2: break; //empty for SSE4
case 1: conv.f = _mm_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); break;
case 0: conv.f = _mm_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2)); break;
default: assert(0); break;
}
a=conv.v;
};
//Complex float Reduce //Complex float Reduce
template<> template<>
inline Grid::ComplexF Reduce<Grid::ComplexF, __m128>::operator()(__m128 in){ inline Grid::ComplexF Reduce<Grid::ComplexF, __m128>::operator()(__m128 in){
__m128 v1; // two complex __m128 v1; // two complex
Optimization::permute(v1,in,0); v1= Optimization::Permute::Permute0(in);
v1= _mm_add_ps(v1,in); v1= _mm_add_ps(v1,in);
u128f conv; conv.v=v1; u128f conv; conv.v=v1;
return Grid::ComplexF(conv.f[0],conv.f[1]); return Grid::ComplexF(conv.f[0],conv.f[1]);
@ -254,9 +272,9 @@ namespace Optimization {
template<> template<>
inline Grid::RealF Reduce<Grid::RealF, __m128>::operator()(__m128 in){ inline Grid::RealF Reduce<Grid::RealF, __m128>::operator()(__m128 in){
__m128 v1,v2; // quad single __m128 v1,v2; // quad single
Optimization::permute(v1,in,0); v1= Optimization::Permute::Permute0(in);
v1= _mm_add_ps(v1,in); v1= _mm_add_ps(v1,in);
Optimization::permute(v2,v1,1); v2= Optimization::Permute::Permute1(v1);
v1 = _mm_add_ps(v1,v2); v1 = _mm_add_ps(v1,v2);
u128f conv; conv.v=v1; u128f conv; conv.v=v1;
return conv.f[0]; return conv.f[0];
@ -274,7 +292,7 @@ namespace Optimization {
template<> template<>
inline Grid::RealD Reduce<Grid::RealD, __m128d>::operator()(__m128d in){ inline Grid::RealD Reduce<Grid::RealD, __m128d>::operator()(__m128d in){
__m128d v1; __m128d v1;
Optimization::permute(v1,in,0); // avx 256; quad double v1 = Optimization::Permute::Permute0(in);
v1 = _mm_add_pd(v1,in); v1 = _mm_add_pd(v1,in);
u128d conv; conv.v = v1; u128d conv; conv.v = v1;
return conv.f[0]; return conv.f[0];
@ -303,14 +321,6 @@ namespace Grid {
_mm_prefetch(ptr,_MM_HINT_T0); _mm_prefetch(ptr,_MM_HINT_T0);
} }
// Gpermute function
template < typename VectorSIMD >
inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) {
Optimization::permute(y.v,b.v,perm);
}
// Function name aliases // Function name aliases
typedef Optimization::Vsplat VsplatSIMD; typedef Optimization::Vsplat VsplatSIMD;
typedef Optimization::Vstore VstoreSIMD; typedef Optimization::Vstore VstoreSIMD;

View File

@ -251,14 +251,29 @@ namespace Grid {
// all subtypes; may not be a good assumption, but could // all subtypes; may not be a good assumption, but could
// add the vector width as a template param for BG/Q for example // add the vector width as a template param for BG/Q for example
//////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////
friend inline void permute0(Grid_simd &y,Grid_simd b){
y.v = Optimization::Permute::Permute0(b.v);
}
friend inline void permute1(Grid_simd &y,Grid_simd b){
y.v = Optimization::Permute::Permute1(b.v);
}
friend inline void permute2(Grid_simd &y,Grid_simd b){
y.v = Optimization::Permute::Permute2(b.v);
}
friend inline void permute3(Grid_simd &y,Grid_simd b){
y.v = Optimization::Permute::Permute3(b.v);
}
friend inline void permute(Grid_simd &y,Grid_simd b,int perm) friend inline void permute(Grid_simd &y,Grid_simd b,int perm)
{ {
Gpermute<Grid_simd>(y,b,perm); if (perm==3) permute3(y,b);
else if (perm==2) permute2(y,b);
else if (perm==1) permute1(y,b);
else if (perm==0) permute0(y,b);
} }
};// end of Grid_simd class definition
};// end of Grid_simd class definition
/////////////////////// ///////////////////////
// Splat // Splat

View File

@ -177,6 +177,7 @@ public:
permute(out._internal[i],in._internal[i],permutetype); permute(out._internal[i],in._internal[i],permutetype);
} }
} }
// Unary negation // Unary negation
friend strong_inline iVector<vtype,N> operator -(const iVector<vtype,N> &r) { friend strong_inline iVector<vtype,N> operator -(const iVector<vtype,N> &r) {
iVector<vtype,N> ret; iVector<vtype,N> ret;
@ -290,12 +291,15 @@ public:
vstream(out._internal[i][j],in._internal[i][j]); vstream(out._internal[i][j],in._internal[i][j]);
}} }}
} }
friend strong_inline void permute(iMatrix<vtype,N> &out,const iMatrix<vtype,N> &in,int permutetype){ friend strong_inline void permute(iMatrix<vtype,N> &out,const iMatrix<vtype,N> &in,int permutetype){
for(int i=0;i<N;i++){ for(int i=0;i<N;i++){
for(int j=0;j<N;j++){ for(int j=0;j<N;j++){
permute(out._internal[i][j],in._internal[i][j],permutetype); permute(out._internal[i][j],in._internal[i][j],permutetype);
}} }}
} }
// Unary negation // Unary negation
friend strong_inline iMatrix<vtype,N> operator -(const iMatrix<vtype,N> &r) { friend strong_inline iMatrix<vtype,N> operator -(const iMatrix<vtype,N> &r) {
iMatrix<vtype,N> ret; iMatrix<vtype,N> ret;

View File

@ -35,10 +35,10 @@ icpc-avx-openmp-mpi)
CXX=icpc ../../configure --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -fopenmp -lgmp -lmpfr" --enable-comms=mpi CXX=icpc ../../configure --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -fopenmp -lgmp -lmpfr" --enable-comms=mpi
;; ;;
icpc-avx-openmp) icpc-avx-openmp)
CXX=icpc ../../configure --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -std=c++11" LIBS="-fopenmp -lgmp -lmpfr" --enable-comms=mpi CXX=icpc ../../configure --enable-precision=single --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -std=c++11" LIBS="-fopenmp -lgmp -lmpfr" --enable-comms=mpi
;; ;;
icpc-avx2) icpc-avx2)
CXX=icpc ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none CXX=icpc ../../configure --enable-simd=AVX2 CXXFLAGS="-march=core-avx2 -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
;; ;;
icpc-avx512) icpc-avx512)
CXX=icpc ../../configure --enable-simd=AVX512 CXXFLAGS="-xCOMMON-AVX512 -O3 -std=c++11" --host=none LIBS="-lgmp -lmpfr" --enable-comms=none CXX=icpc ../../configure --enable-simd=AVX512 CXXFLAGS="-xCOMMON-AVX512 -O3 -std=c++11" --host=none LIBS="-lgmp -lmpfr" --enable-comms=none
@ -50,7 +50,7 @@ icpc-mic-avx512)
CXX=icpc ../../configure --host=none --enable-simd=IMCI CXXFLAGS="-xCOMMON_AVX512 -O3 -std=c++11" LDFLAGS=-xCOMMON_AVX512 LIBS="-lgmp -lmpfr" --enable-comms=none CXX=icpc ../../configure --host=none --enable-simd=IMCI CXXFLAGS="-xCOMMON_AVX512 -O3 -std=c++11" LDFLAGS=-xCOMMON_AVX512 LIBS="-lgmp -lmpfr" --enable-comms=none
;; ;;
clang-sse) clang-sse)
CXX=clang++ ../../configure --enable-simd=SSE4 CXXFLAGS="-msse4 -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none CXX=clang++ ../../configure --enable-precision=single --enable-simd=SSE4 CXXFLAGS="-msse4 -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
;; ;;
clang-avx) clang-avx)
CXX=clang++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none CXX=clang++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none