mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-10 07:55:35 +00:00
Updating to modify non-inlining permute routines and hopefully get better reg use and
enhance performance.
This commit is contained in:
parent
5ef42add2d
commit
64d64d1ab6
@ -18,8 +18,8 @@
|
|||||||
#include <algorithms/iterative/ConjugateGradientMultiShift.h>
|
#include <algorithms/iterative/ConjugateGradientMultiShift.h>
|
||||||
|
|
||||||
// Lanczos support
|
// Lanczos support
|
||||||
#include <algorithms/iterative/MatrixUtils.h>
|
//#include <algorithms/iterative/MatrixUtils.h>
|
||||||
#include <algorithms/iterative/ImplicitlyRestartedLanczos.h>
|
//#include <algorithms/iterative/ImplicitlyRestartedLanczos.h>
|
||||||
|
|
||||||
#include <algorithms/CoarsenedMatrix.h>
|
#include <algorithms/CoarsenedMatrix.h>
|
||||||
|
|
||||||
|
@ -178,6 +178,7 @@ void Grid_init(int *argc,char ***argv)
|
|||||||
}
|
}
|
||||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-opt") ){
|
if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-opt") ){
|
||||||
QCD::WilsonFermionStatic::HandOptDslash=1;
|
QCD::WilsonFermionStatic::HandOptDslash=1;
|
||||||
|
QCD::WilsonFermion5DStatic::HandOptDslash=1;
|
||||||
}
|
}
|
||||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
|
if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
|
||||||
LebesgueOrder::UseLebesgueOrder=1;
|
LebesgueOrder::UseLebesgueOrder=1;
|
||||||
|
@ -13,6 +13,11 @@
|
|||||||
|
|
||||||
typedef uint32_t Integer;
|
typedef uint32_t Integer;
|
||||||
|
|
||||||
|
#define _MM_SELECT_FOUR_FOUR(A,B,C,D) ((A<<6)|(B<<4)|(C<<2)|(D))
|
||||||
|
#define _MM_SELECT_EIGHT_TWO(A,B,C,D,E,F,G,H) ((A<<7)|(B<<6)|(C<<5)|(D<<4)|(E<<3)|(F<<2)|(G<<4)|(H))
|
||||||
|
#define _MM_SELECT_FOUR_TWO (A,B,C,D) _MM_SELECT_EIGHT_TWO(0,0,0,0,A,B,C,D)
|
||||||
|
#define _MM_SELECT_TWO_TWO (A,B) _MM_SELECT_FOUR_TWO(0,0,A,B)
|
||||||
|
|
||||||
namespace Grid {
|
namespace Grid {
|
||||||
|
|
||||||
typedef float RealF;
|
typedef float RealF;
|
||||||
|
@ -56,13 +56,13 @@
|
|||||||
UChi_02+= U_20*Chi_02;\
|
UChi_02+= U_20*Chi_02;\
|
||||||
UChi_12+= U_20*Chi_12;
|
UChi_12+= U_20*Chi_12;
|
||||||
|
|
||||||
#define PERMUTE\
|
#define PERMUTE_DIR(dir) \
|
||||||
permute(Chi_00,Chi_00,ptype);\
|
permute##dir(Chi_00,Chi_00);\
|
||||||
permute(Chi_01,Chi_01,ptype);\
|
permute##dir(Chi_01,Chi_01);\
|
||||||
permute(Chi_02,Chi_02,ptype);\
|
permute##dir(Chi_02,Chi_02);\
|
||||||
permute(Chi_10,Chi_10,ptype);\
|
permute##dir(Chi_10,Chi_10);\
|
||||||
permute(Chi_11,Chi_11,ptype);\
|
permute##dir(Chi_11,Chi_11);\
|
||||||
permute(Chi_12,Chi_12,ptype);
|
permute##dir(Chi_12,Chi_12);
|
||||||
|
|
||||||
// hspin(0)=fspin(0)+timesI(fspin(3));
|
// hspin(0)=fspin(0)+timesI(fspin(3));
|
||||||
// hspin(1)=fspin(1)+timesI(fspin(2));
|
// hspin(1)=fspin(1)+timesI(fspin(2));
|
||||||
@ -286,6 +286,10 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaug
|
|||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out)
|
int ss,int sU,const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
|
// std::cout << "Hand op Dhop "<<std::endl;
|
||||||
|
typedef typename Simd::scalar_type S;
|
||||||
|
typedef typename Simd::vector_type V;
|
||||||
|
|
||||||
REGISTER Simd result_00; // 12 regs on knc
|
REGISTER Simd result_00; // 12 regs on knc
|
||||||
REGISTER Simd result_01;
|
REGISTER Simd result_01;
|
||||||
REGISTER Simd result_02;
|
REGISTER Simd result_02;
|
||||||
@ -352,7 +356,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaug
|
|||||||
LOAD_CHIMU;
|
LOAD_CHIMU;
|
||||||
XP_PROJ;
|
XP_PROJ;
|
||||||
if ( perm) {
|
if ( perm) {
|
||||||
PERMUTE;
|
PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
LOAD_CHI;
|
LOAD_CHI;
|
||||||
@ -373,7 +377,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaug
|
|||||||
LOAD_CHIMU;
|
LOAD_CHIMU;
|
||||||
YP_PROJ;
|
YP_PROJ;
|
||||||
if ( perm) {
|
if ( perm) {
|
||||||
PERMUTE;
|
PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
LOAD_CHI;
|
LOAD_CHI;
|
||||||
@ -394,7 +398,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaug
|
|||||||
LOAD_CHIMU;
|
LOAD_CHIMU;
|
||||||
ZP_PROJ;
|
ZP_PROJ;
|
||||||
if ( perm) {
|
if ( perm) {
|
||||||
PERMUTE;
|
PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
LOAD_CHI;
|
LOAD_CHI;
|
||||||
@ -414,7 +418,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaug
|
|||||||
LOAD_CHIMU;
|
LOAD_CHIMU;
|
||||||
TP_PROJ;
|
TP_PROJ;
|
||||||
if ( perm) {
|
if ( perm) {
|
||||||
PERMUTE;
|
PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
LOAD_CHI;
|
LOAD_CHI;
|
||||||
@ -434,7 +438,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaug
|
|||||||
LOAD_CHIMU;
|
LOAD_CHIMU;
|
||||||
XM_PROJ;
|
XM_PROJ;
|
||||||
if ( perm) {
|
if ( perm) {
|
||||||
PERMUTE;
|
PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
LOAD_CHI;
|
LOAD_CHI;
|
||||||
@ -454,7 +458,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaug
|
|||||||
LOAD_CHIMU;
|
LOAD_CHIMU;
|
||||||
YM_PROJ;
|
YM_PROJ;
|
||||||
if ( perm) {
|
if ( perm) {
|
||||||
PERMUTE;
|
PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
LOAD_CHI;
|
LOAD_CHI;
|
||||||
@ -474,7 +478,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaug
|
|||||||
LOAD_CHIMU;
|
LOAD_CHIMU;
|
||||||
ZM_PROJ;
|
ZM_PROJ;
|
||||||
if ( perm) {
|
if ( perm) {
|
||||||
PERMUTE;
|
PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
LOAD_CHI;
|
LOAD_CHI;
|
||||||
@ -494,7 +498,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaug
|
|||||||
LOAD_CHIMU;
|
LOAD_CHIMU;
|
||||||
TM_PROJ;
|
TM_PROJ;
|
||||||
if ( perm) {
|
if ( perm) {
|
||||||
PERMUTE;
|
PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
LOAD_CHI;
|
LOAD_CHI;
|
||||||
@ -526,6 +530,9 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledG
|
|||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out)
|
int ss,int sU,const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
|
typedef typename Simd::scalar_type S;
|
||||||
|
typedef typename Simd::vector_type V;
|
||||||
|
|
||||||
REGISTER Simd result_00; // 12 regs on knc
|
REGISTER Simd result_00; // 12 regs on knc
|
||||||
REGISTER Simd result_01;
|
REGISTER Simd result_01;
|
||||||
REGISTER Simd result_02;
|
REGISTER Simd result_02;
|
||||||
@ -592,7 +599,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledG
|
|||||||
LOAD_CHIMU;
|
LOAD_CHIMU;
|
||||||
XM_PROJ;
|
XM_PROJ;
|
||||||
if ( perm) {
|
if ( perm) {
|
||||||
PERMUTE;
|
PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
LOAD_CHI;
|
LOAD_CHI;
|
||||||
@ -612,7 +619,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledG
|
|||||||
LOAD_CHIMU;
|
LOAD_CHIMU;
|
||||||
YM_PROJ;
|
YM_PROJ;
|
||||||
if ( perm) {
|
if ( perm) {
|
||||||
PERMUTE;
|
PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
LOAD_CHI;
|
LOAD_CHI;
|
||||||
@ -633,7 +640,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledG
|
|||||||
LOAD_CHIMU;
|
LOAD_CHIMU;
|
||||||
ZM_PROJ;
|
ZM_PROJ;
|
||||||
if ( perm) {
|
if ( perm) {
|
||||||
PERMUTE;
|
PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
LOAD_CHI;
|
LOAD_CHI;
|
||||||
@ -653,7 +660,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledG
|
|||||||
LOAD_CHIMU;
|
LOAD_CHIMU;
|
||||||
TM_PROJ;
|
TM_PROJ;
|
||||||
if ( perm) {
|
if ( perm) {
|
||||||
PERMUTE;
|
PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
LOAD_CHI;
|
LOAD_CHI;
|
||||||
@ -673,7 +680,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledG
|
|||||||
LOAD_CHIMU;
|
LOAD_CHIMU;
|
||||||
XP_PROJ;
|
XP_PROJ;
|
||||||
if ( perm) {
|
if ( perm) {
|
||||||
PERMUTE;
|
PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
LOAD_CHI;
|
LOAD_CHI;
|
||||||
@ -694,7 +701,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledG
|
|||||||
LOAD_CHIMU;
|
LOAD_CHIMU;
|
||||||
YP_PROJ;
|
YP_PROJ;
|
||||||
if ( perm) {
|
if ( perm) {
|
||||||
PERMUTE;
|
PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
LOAD_CHI;
|
LOAD_CHI;
|
||||||
@ -714,7 +721,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledG
|
|||||||
LOAD_CHIMU;
|
LOAD_CHIMU;
|
||||||
ZP_PROJ;
|
ZP_PROJ;
|
||||||
if ( perm) {
|
if ( perm) {
|
||||||
PERMUTE;
|
PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
LOAD_CHI;
|
LOAD_CHI;
|
||||||
@ -734,7 +741,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledG
|
|||||||
LOAD_CHIMU;
|
LOAD_CHIMU;
|
||||||
TP_PROJ;
|
TP_PROJ;
|
||||||
if ( perm) {
|
if ( perm) {
|
||||||
PERMUTE;
|
PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
LOAD_CHI;
|
LOAD_CHI;
|
||||||
|
@ -183,11 +183,11 @@ namespace Optimization {
|
|||||||
// Complex float
|
// Complex float
|
||||||
inline __m256 operator()(__m256 a, __m256 b){
|
inline __m256 operator()(__m256 a, __m256 b){
|
||||||
__m256 ymm0,ymm1,ymm2;
|
__m256 ymm0,ymm1,ymm2;
|
||||||
ymm0 = _mm256_shuffle_ps(a,a,_MM_SHUFFLE(2,2,0,0)); // ymm0 <- ar ar,
|
ymm0 = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
|
||||||
ymm0 = _mm256_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br
|
ymm0 = _mm256_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br
|
||||||
// FIXME AVX2 could MAC
|
// FIXME AVX2 could MAC
|
||||||
ymm1 = _mm256_shuffle_ps(b,b,_MM_SHUFFLE(2,3,0,1)); // ymm1 <- br,bi
|
ymm1 = _mm256_shuffle_ps(b,b,_MM_SELECT_FOUR_FOUR(2,3,0,1)); // ymm1 <- br,bi
|
||||||
ymm2 = _mm256_shuffle_ps(a,a,_MM_SHUFFLE(3,3,1,1)); // ymm2 <- ai,ai
|
ymm2 = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(3,3,1,1)); // ymm2 <- ai,ai
|
||||||
ymm1 = _mm256_mul_ps(ymm1,ymm2); // ymm1 <- br ai, ai bi
|
ymm1 = _mm256_mul_ps(ymm1,ymm2); // ymm1 <- br ai, ai bi
|
||||||
return _mm256_addsub_ps(ymm0,ymm1);
|
return _mm256_addsub_ps(ymm0,ymm1);
|
||||||
}
|
}
|
||||||
@ -270,7 +270,7 @@ namespace Optimization {
|
|||||||
//Complex single
|
//Complex single
|
||||||
inline __m256 operator()(__m256 in, __m256 ret){
|
inline __m256 operator()(__m256 in, __m256 ret){
|
||||||
__m256 tmp =_mm256_addsub_ps(_mm256_setzero_ps(),in); // r,-i
|
__m256 tmp =_mm256_addsub_ps(_mm256_setzero_ps(),in); // r,-i
|
||||||
return _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1)); //-i,r
|
return _mm256_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(2,3,0,1)); //-i,r
|
||||||
}
|
}
|
||||||
//Complex double
|
//Complex double
|
||||||
inline __m256d operator()(__m256d in, __m256d ret){
|
inline __m256d operator()(__m256d in, __m256d ret){
|
||||||
@ -282,7 +282,7 @@ namespace Optimization {
|
|||||||
struct TimesI{
|
struct TimesI{
|
||||||
//Complex single
|
//Complex single
|
||||||
inline __m256 operator()(__m256 in, __m256 ret){
|
inline __m256 operator()(__m256 in, __m256 ret){
|
||||||
__m256 tmp =_mm256_shuffle_ps(in,in,_MM_SHUFFLE(2,3,0,1)); // i,r
|
__m256 tmp =_mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); // i,r
|
||||||
return _mm256_addsub_ps(_mm256_setzero_ps(),tmp); // i,-r
|
return _mm256_addsub_ps(_mm256_setzero_ps(),tmp); // i,-r
|
||||||
}
|
}
|
||||||
//Complex double
|
//Complex double
|
||||||
@ -296,27 +296,44 @@ namespace Optimization {
|
|||||||
// Some Template specialization
|
// Some Template specialization
|
||||||
//////////////////////////////////////////////
|
//////////////////////////////////////////////
|
||||||
|
|
||||||
template < typename vtype >
|
struct Permute{
|
||||||
void permute(vtype &a,vtype b, int perm) {
|
|
||||||
uconv<vtype> conv;
|
static inline __m256 Permute0(__m256 in){
|
||||||
conv.v = b;
|
return _mm256_permute2f128_ps(in,in,0x01);
|
||||||
switch (perm){
|
};
|
||||||
// 8x32 bits=>3 permutes
|
static inline __m256 Permute1(__m256 in){
|
||||||
case 2: conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); break;
|
return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
|
||||||
case 1: conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2)); break;
|
};
|
||||||
case 0: conv.f = _mm256_permute2f128_ps(conv.f,conv.f,0x01); break;
|
static inline __m256 Permute2(__m256 in){
|
||||||
default: assert(0); break;
|
return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
|
||||||
}
|
};
|
||||||
a = conv.v;
|
static inline __m256 Permute3(__m256 in){
|
||||||
}
|
return in;
|
||||||
|
};
|
||||||
|
|
||||||
|
static inline __m256d Permute0(__m256d in){
|
||||||
|
return _mm256_permute2f128_pd(in,in,0x01);
|
||||||
|
};
|
||||||
|
static inline __m256d Permute1(__m256d in){
|
||||||
|
return _mm256_shuffle_pd(in,in,0x5);
|
||||||
|
};
|
||||||
|
static inline __m256d Permute2(__m256d in){
|
||||||
|
return in;
|
||||||
|
};
|
||||||
|
static inline __m256d Permute3(__m256d in){
|
||||||
|
return in;
|
||||||
|
};
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
//Complex float Reduce
|
//Complex float Reduce
|
||||||
template<>
|
template<>
|
||||||
inline Grid::ComplexF Reduce<Grid::ComplexF, __m256>::operator()(__m256 in){
|
inline Grid::ComplexF Reduce<Grid::ComplexF, __m256>::operator()(__m256 in){
|
||||||
__m256 v1,v2;
|
__m256 v1,v2;
|
||||||
Optimization::permute(v1,in,0); // avx 256; quad complex single
|
v1=Optimization::Permute::Permute0(in); // avx 256; quad complex single
|
||||||
v1 = _mm256_add_ps(v1,in);
|
v1= _mm256_add_ps(v1,in);
|
||||||
Optimization::permute(v2,v1,1);
|
v2=Optimization::Permute::Permute1(v1);
|
||||||
v1 = _mm256_add_ps(v1,v2);
|
v1 = _mm256_add_ps(v1,v2);
|
||||||
u256f conv; conv.v = v1;
|
u256f conv; conv.v = v1;
|
||||||
return Grid::ComplexF(conv.f[0],conv.f[1]);
|
return Grid::ComplexF(conv.f[0],conv.f[1]);
|
||||||
@ -326,11 +343,11 @@ namespace Optimization {
|
|||||||
template<>
|
template<>
|
||||||
inline Grid::RealF Reduce<Grid::RealF, __m256>::operator()(__m256 in){
|
inline Grid::RealF Reduce<Grid::RealF, __m256>::operator()(__m256 in){
|
||||||
__m256 v1,v2;
|
__m256 v1,v2;
|
||||||
Optimization::permute(v1,in,0); // avx 256; octo-double
|
v1 = Optimization::Permute::Permute0(in); // avx 256; octo-double
|
||||||
v1 = _mm256_add_ps(v1,in);
|
v1 = _mm256_add_ps(v1,in);
|
||||||
Optimization::permute(v2,v1,1);
|
v2 = Optimization::Permute::Permute1(v1);
|
||||||
v1 = _mm256_add_ps(v1,v2);
|
v1 = _mm256_add_ps(v1,v2);
|
||||||
Optimization::permute(v2,v1,2);
|
v2 = Optimization::Permute::Permute2(v1);
|
||||||
v1 = _mm256_add_ps(v1,v2);
|
v1 = _mm256_add_ps(v1,v2);
|
||||||
u256f conv; conv.v=v1;
|
u256f conv; conv.v=v1;
|
||||||
return conv.f[0];
|
return conv.f[0];
|
||||||
@ -341,7 +358,7 @@ namespace Optimization {
|
|||||||
template<>
|
template<>
|
||||||
inline Grid::ComplexD Reduce<Grid::ComplexD, __m256d>::operator()(__m256d in){
|
inline Grid::ComplexD Reduce<Grid::ComplexD, __m256d>::operator()(__m256d in){
|
||||||
__m256d v1;
|
__m256d v1;
|
||||||
Optimization::permute(v1,in,0); // sse 128; paired complex single
|
v1 = Optimization::Permute::Permute0(in); // sse 128; paired complex single
|
||||||
v1 = _mm256_add_pd(v1,in);
|
v1 = _mm256_add_pd(v1,in);
|
||||||
u256d conv; conv.v = v1;
|
u256d conv; conv.v = v1;
|
||||||
return Grid::ComplexD(conv.f[0],conv.f[1]);
|
return Grid::ComplexD(conv.f[0],conv.f[1]);
|
||||||
@ -351,9 +368,9 @@ namespace Optimization {
|
|||||||
template<>
|
template<>
|
||||||
inline Grid::RealD Reduce<Grid::RealD, __m256d>::operator()(__m256d in){
|
inline Grid::RealD Reduce<Grid::RealD, __m256d>::operator()(__m256d in){
|
||||||
__m256d v1,v2;
|
__m256d v1,v2;
|
||||||
Optimization::permute(v1,in,0); // avx 256; quad double
|
v1 = Optimization::Permute::Permute0(in); // avx 256; quad double
|
||||||
v1 = _mm256_add_pd(v1,in);
|
v1 = _mm256_add_pd(v1,in);
|
||||||
Optimization::permute(v2,v1,1);
|
v2 = Optimization::Permute::Permute1(v1);
|
||||||
v1 = _mm256_add_pd(v1,v2);
|
v1 = _mm256_add_pd(v1,v2);
|
||||||
u256d conv; conv.v = v1;
|
u256d conv; conv.v = v1;
|
||||||
return conv.f[0];
|
return conv.f[0];
|
||||||
@ -387,13 +404,6 @@ namespace Grid {
|
|||||||
_mm_prefetch(ptr,_MM_HINT_T0);
|
_mm_prefetch(ptr,_MM_HINT_T0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
template < typename VectorSIMD >
|
|
||||||
inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) {
|
|
||||||
Optimization::permute(y.v,b.v,perm);
|
|
||||||
};
|
|
||||||
|
|
||||||
// Function name aliases
|
// Function name aliases
|
||||||
typedef Optimization::Vsplat VsplatSIMD;
|
typedef Optimization::Vsplat VsplatSIMD;
|
||||||
typedef Optimization::Vstore VstoreSIMD;
|
typedef Optimization::Vstore VstoreSIMD;
|
||||||
|
@ -211,26 +211,24 @@ namespace Optimization {
|
|||||||
//Complex single
|
//Complex single
|
||||||
inline __m512 operator()(__m512 in, __m512 ret){
|
inline __m512 operator()(__m512 in, __m512 ret){
|
||||||
__m512 tmp = _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // real -imag
|
__m512 tmp = _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // real -imag
|
||||||
return _mm512_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2));
|
return _mm512_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(1,0,3,2)); // 0x4E??
|
||||||
}
|
}
|
||||||
//Complex double
|
//Complex double
|
||||||
inline __m512d operator()(__m512d in, __m512d ret){
|
inline __m512d operator()(__m512d in, __m512d ret){
|
||||||
__m512d tmp = _mm512_mask_sub_pd(in,0xaa,_mm512_setzero_pd(),in); // real -imag
|
__m512d tmp = _mm512_mask_sub_pd(in,0xaa,_mm512_setzero_pd(),in); // real -imag
|
||||||
return _mm512_shuffle_pd(tmp,tmp,_MM_SHUFFLE(1,0,3,2));
|
return _mm512_shuffle_pd(tmp,tmp,0x55);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
struct TimesI{
|
struct TimesI{
|
||||||
//Complex single
|
//Complex single
|
||||||
inline __m512 operator()(__m512 in, __m512 ret){
|
inline __m512 operator()(__m512 in, __m512 ret){
|
||||||
__m512 tmp = _mm512_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2));
|
__m512 tmp = _mm512_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(1,0,3,2));
|
||||||
return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp);
|
return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp);
|
||||||
}
|
}
|
||||||
//Complex double
|
//Complex double
|
||||||
inline __m512d operator()(__m512d in, __m512d ret){
|
inline __m512d operator()(__m512d in, __m512d ret){
|
||||||
__m512d tmp = _mm512_shuffle_pd(tmp,tmp,_MM_SHUFFLE(1,0,3,2));
|
__m512d tmp = _mm512_shuffle_pd(tmp,tmp,0x55);
|
||||||
return _mm512_mask_sub_pd(tmp,0xaa,_mm512_setzero_pd(),tmp);
|
return _mm512_mask_sub_pd(tmp,0xaa,_mm512_setzero_pd(),tmp);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -239,6 +237,36 @@ namespace Optimization {
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// Gpermute utilities consider coalescing into 1 Gpermute
|
||||||
|
struct Permute{
|
||||||
|
|
||||||
|
static inline __m512 Permute0(__m512 in){
|
||||||
|
return _mm512_shuffle_f32x4(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
|
||||||
|
};
|
||||||
|
static inline __m512 Permute1(__m512 in){
|
||||||
|
return _mm512_shuffle_f32x4(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
|
||||||
|
};
|
||||||
|
static inline __m512 Permute2(__m512 in){
|
||||||
|
return _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
|
||||||
|
};
|
||||||
|
static inline __m512 Permute3(__m512 in){
|
||||||
|
return _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
|
||||||
|
};
|
||||||
|
|
||||||
|
static inline __m512d Permute0(__m512d in){
|
||||||
|
return _mm512_shuffle_f64x2(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
|
||||||
|
};
|
||||||
|
static inline __m512d Permute1(__m512d in){
|
||||||
|
return _mm512_shuffle_f64x2(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
|
||||||
|
};
|
||||||
|
static inline __m512d Permute2(__m512d in){
|
||||||
|
return _mm512_shuffle_pd(in,in,0x55);
|
||||||
|
};
|
||||||
|
static inline __m512d Permute3(__m512d in){
|
||||||
|
return in;
|
||||||
|
};
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
//////////////////////////////////////////////
|
//////////////////////////////////////////////
|
||||||
@ -299,25 +327,6 @@ namespace Grid {
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// Gpermute utilities consider coalescing into 1 Gpermute
|
|
||||||
template < typename VectorSIMD >
|
|
||||||
inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) {
|
|
||||||
union {
|
|
||||||
__m512 f;
|
|
||||||
decltype(VectorSIMD::v) v;
|
|
||||||
} conv;
|
|
||||||
conv.v = b.v;
|
|
||||||
switch(perm){
|
|
||||||
case 3 : conv.f = _mm512_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); break;
|
|
||||||
case 2 : conv.f = _mm512_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2)); break;
|
|
||||||
case 1 : conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(2,3,0,1)); break;
|
|
||||||
case 0 : conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(1,0,3,2)); break;
|
|
||||||
default: assert(0); break;
|
|
||||||
}
|
|
||||||
y.v=conv.v;
|
|
||||||
};
|
|
||||||
|
|
||||||
// Function name aliases
|
// Function name aliases
|
||||||
typedef Optimization::Vsplat VsplatSIMD;
|
typedef Optimization::Vsplat VsplatSIMD;
|
||||||
typedef Optimization::Vstore VstoreSIMD;
|
typedef Optimization::Vstore VstoreSIMD;
|
||||||
|
@ -255,6 +255,35 @@ namespace Optimization {
|
|||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
struct Permute{
|
||||||
|
|
||||||
|
static inline __m512 Permute0(__m512 in){
|
||||||
|
return _mm512_permute4f128_ps(in,(_MM_PERM_ENUM)_MM_SELECT_FOUR_FOUR(1,0,3,2));
|
||||||
|
};
|
||||||
|
static inline __m512 Permute1(__m512 in){
|
||||||
|
return _mm512_permute4f128_ps(in,(_MM_PERM_ENUM)_MM_SELECT_FOUR_FOUR(2,3,0,1));
|
||||||
|
};
|
||||||
|
static inline __m512 Permute2(__m512 in){
|
||||||
|
return _mm512_swizzle_ps(in,_MM_SWIZ_REG_BADC);
|
||||||
|
};
|
||||||
|
static inline __m512 Permute3(__m512 in){
|
||||||
|
return _mm512_swizzle_ps(in,_MM_SWIZ_REG_CDAB);
|
||||||
|
};
|
||||||
|
|
||||||
|
static inline __m512d Permute0(__m512d in){// Hack no intrinsic for 256 swaps of __m512d
|
||||||
|
return (__m512d)_mm512_permute4f128_ps((__m512)in,(_MM_PERM_ENUM)_MM_SELECT_FOUR_FOUR(1,0,3,2));
|
||||||
|
};
|
||||||
|
static inline __m512d Permute1(__m512d in){
|
||||||
|
return _mm512_swizzle_pd(in,_MM_SWIZ_REG_BADC);
|
||||||
|
};
|
||||||
|
static inline __m512d Permute2(__m512d in){
|
||||||
|
return _mm512_swizzle_pd(in,_MM_SWIZ_REG_CDAB);
|
||||||
|
};
|
||||||
|
static inline __m512d Permute3(__m512d in){
|
||||||
|
return in;
|
||||||
|
};
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@ -316,25 +345,6 @@ namespace Grid {
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// Gpermute utilities consider coalescing into 1 Gpermute
|
|
||||||
template < typename VectorSIMD >
|
|
||||||
inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) {
|
|
||||||
union {
|
|
||||||
__m512 f;
|
|
||||||
decltype(VectorSIMD::v) v;
|
|
||||||
} conv;
|
|
||||||
conv.v = b.v;
|
|
||||||
switch(perm){
|
|
||||||
case 3: conv.f = _mm512_swizzle_ps(conv.f,_MM_SWIZ_REG_CDAB); break;
|
|
||||||
case 2: conv.f = _mm512_swizzle_ps(conv.f,_MM_SWIZ_REG_BADC); break;
|
|
||||||
case 1 : conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(2,3,0,1)); break;
|
|
||||||
case 0 : conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(1,0,3,2)); break;
|
|
||||||
default: assert(0); break;
|
|
||||||
}
|
|
||||||
y.v=conv.v;
|
|
||||||
};
|
|
||||||
|
|
||||||
// Function name aliases
|
// Function name aliases
|
||||||
typedef Optimization::Vsplat VsplatSIMD;
|
typedef Optimization::Vsplat VsplatSIMD;
|
||||||
typedef Optimization::Vstore VstoreSIMD;
|
typedef Optimization::Vstore VstoreSIMD;
|
||||||
|
@ -151,10 +151,10 @@ namespace Optimization {
|
|||||||
// Complex float
|
// Complex float
|
||||||
inline __m128 operator()(__m128 a, __m128 b){
|
inline __m128 operator()(__m128 a, __m128 b){
|
||||||
__m128 ymm0,ymm1,ymm2;
|
__m128 ymm0,ymm1,ymm2;
|
||||||
ymm0 = _mm_shuffle_ps(a,a,_MM_SHUFFLE(2,2,0,0)); // ymm0 <- ar ar,
|
ymm0 = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
|
||||||
ymm0 = _mm_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br
|
ymm0 = _mm_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br
|
||||||
ymm1 = _mm_shuffle_ps(b,b,_MM_SHUFFLE(2,3,0,1)); // ymm1 <- br,bi
|
ymm1 = _mm_shuffle_ps(b,b,_MM_SELECT_FOUR_FOUR(2,3,0,1)); // ymm1 <- br,bi
|
||||||
ymm2 = _mm_shuffle_ps(a,a,_MM_SHUFFLE(3,3,1,1)); // ymm2 <- ai,ai
|
ymm2 = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(3,3,1,1)); // ymm2 <- ai,ai
|
||||||
ymm1 = _mm_mul_ps(ymm1,ymm2); // ymm1 <- br ai, ai bi
|
ymm1 = _mm_mul_ps(ymm1,ymm2); // ymm1 <- br ai, ai bi
|
||||||
return _mm_addsub_ps(ymm0,ymm1);
|
return _mm_addsub_ps(ymm0,ymm1);
|
||||||
}
|
}
|
||||||
@ -201,7 +201,7 @@ namespace Optimization {
|
|||||||
//Complex single
|
//Complex single
|
||||||
inline __m128 operator()(__m128 in, __m128 ret){
|
inline __m128 operator()(__m128 in, __m128 ret){
|
||||||
__m128 tmp =_mm_addsub_ps(_mm_setzero_ps(),in); // r,-i
|
__m128 tmp =_mm_addsub_ps(_mm_setzero_ps(),in); // r,-i
|
||||||
return _mm_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1));
|
return _mm_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(2,3,0,1));
|
||||||
}
|
}
|
||||||
//Complex double
|
//Complex double
|
||||||
inline __m128d operator()(__m128d in, __m128d ret){
|
inline __m128d operator()(__m128d in, __m128d ret){
|
||||||
@ -215,7 +215,7 @@ namespace Optimization {
|
|||||||
struct TimesI{
|
struct TimesI{
|
||||||
//Complex single
|
//Complex single
|
||||||
inline __m128 operator()(__m128 in, __m128 ret){
|
inline __m128 operator()(__m128 in, __m128 ret){
|
||||||
__m128 tmp =_mm_shuffle_ps(in,in,_MM_SHUFFLE(2,3,0,1));
|
__m128 tmp =_mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
|
||||||
return _mm_addsub_ps(_mm_setzero_ps(),tmp); // r,-i
|
return _mm_addsub_ps(_mm_setzero_ps(),tmp); // r,-i
|
||||||
}
|
}
|
||||||
//Complex double
|
//Complex double
|
||||||
@ -225,27 +225,45 @@ namespace Optimization {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct Permute{
|
||||||
|
|
||||||
|
static inline __m128 Permute0(__m128 in){
|
||||||
|
return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
|
||||||
|
};
|
||||||
|
static inline __m128 Permute1(__m128 in){
|
||||||
|
return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
|
||||||
|
};
|
||||||
|
static inline __m128 Permute2(__m128 in){
|
||||||
|
return in;
|
||||||
|
};
|
||||||
|
static inline __m128 Permute3(__m128 in){
|
||||||
|
return in;
|
||||||
|
};
|
||||||
|
|
||||||
|
static inline __m128d Permute0(__m128d in){
|
||||||
|
return _mm_shuffle_pd(in,in,0x1);
|
||||||
|
};
|
||||||
|
static inline __m128d Permute1(__m128d in){
|
||||||
|
return in;
|
||||||
|
};
|
||||||
|
static inline __m128d Permute2(__m128d in){
|
||||||
|
return in;
|
||||||
|
};
|
||||||
|
static inline __m128d Permute3(__m128d in){
|
||||||
|
return in;
|
||||||
|
};
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
//////////////////////////////////////////////
|
//////////////////////////////////////////////
|
||||||
// Some Template specialization
|
// Some Template specialization
|
||||||
template < typename vtype >
|
|
||||||
void permute(vtype &a, vtype b, int perm) {
|
|
||||||
uconv<vtype> conv;
|
|
||||||
conv.v = b;
|
|
||||||
switch(perm){
|
|
||||||
case 3: break; //empty for SSE4
|
|
||||||
case 2: break; //empty for SSE4
|
|
||||||
case 1: conv.f = _mm_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); break;
|
|
||||||
case 0: conv.f = _mm_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2)); break;
|
|
||||||
default: assert(0); break;
|
|
||||||
}
|
|
||||||
a=conv.v;
|
|
||||||
};
|
|
||||||
|
|
||||||
//Complex float Reduce
|
//Complex float Reduce
|
||||||
template<>
|
template<>
|
||||||
inline Grid::ComplexF Reduce<Grid::ComplexF, __m128>::operator()(__m128 in){
|
inline Grid::ComplexF Reduce<Grid::ComplexF, __m128>::operator()(__m128 in){
|
||||||
__m128 v1; // two complex
|
__m128 v1; // two complex
|
||||||
Optimization::permute(v1,in,0);
|
v1= Optimization::Permute::Permute0(in);
|
||||||
v1= _mm_add_ps(v1,in);
|
v1= _mm_add_ps(v1,in);
|
||||||
u128f conv; conv.v=v1;
|
u128f conv; conv.v=v1;
|
||||||
return Grid::ComplexF(conv.f[0],conv.f[1]);
|
return Grid::ComplexF(conv.f[0],conv.f[1]);
|
||||||
@ -254,9 +272,9 @@ namespace Optimization {
|
|||||||
template<>
|
template<>
|
||||||
inline Grid::RealF Reduce<Grid::RealF, __m128>::operator()(__m128 in){
|
inline Grid::RealF Reduce<Grid::RealF, __m128>::operator()(__m128 in){
|
||||||
__m128 v1,v2; // quad single
|
__m128 v1,v2; // quad single
|
||||||
Optimization::permute(v1,in,0);
|
v1= Optimization::Permute::Permute0(in);
|
||||||
v1= _mm_add_ps(v1,in);
|
v1= _mm_add_ps(v1,in);
|
||||||
Optimization::permute(v2,v1,1);
|
v2= Optimization::Permute::Permute1(v1);
|
||||||
v1 = _mm_add_ps(v1,v2);
|
v1 = _mm_add_ps(v1,v2);
|
||||||
u128f conv; conv.v=v1;
|
u128f conv; conv.v=v1;
|
||||||
return conv.f[0];
|
return conv.f[0];
|
||||||
@ -274,7 +292,7 @@ namespace Optimization {
|
|||||||
template<>
|
template<>
|
||||||
inline Grid::RealD Reduce<Grid::RealD, __m128d>::operator()(__m128d in){
|
inline Grid::RealD Reduce<Grid::RealD, __m128d>::operator()(__m128d in){
|
||||||
__m128d v1;
|
__m128d v1;
|
||||||
Optimization::permute(v1,in,0); // avx 256; quad double
|
v1 = Optimization::Permute::Permute0(in);
|
||||||
v1 = _mm_add_pd(v1,in);
|
v1 = _mm_add_pd(v1,in);
|
||||||
u128d conv; conv.v = v1;
|
u128d conv; conv.v = v1;
|
||||||
return conv.f[0];
|
return conv.f[0];
|
||||||
@ -303,14 +321,6 @@ namespace Grid {
|
|||||||
_mm_prefetch(ptr,_MM_HINT_T0);
|
_mm_prefetch(ptr,_MM_HINT_T0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// Gpermute function
|
|
||||||
template < typename VectorSIMD >
|
|
||||||
inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) {
|
|
||||||
Optimization::permute(y.v,b.v,perm);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// Function name aliases
|
// Function name aliases
|
||||||
typedef Optimization::Vsplat VsplatSIMD;
|
typedef Optimization::Vsplat VsplatSIMD;
|
||||||
typedef Optimization::Vstore VstoreSIMD;
|
typedef Optimization::Vstore VstoreSIMD;
|
||||||
|
@ -251,14 +251,29 @@ namespace Grid {
|
|||||||
// all subtypes; may not be a good assumption, but could
|
// all subtypes; may not be a good assumption, but could
|
||||||
// add the vector width as a template param for BG/Q for example
|
// add the vector width as a template param for BG/Q for example
|
||||||
////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////
|
||||||
|
friend inline void permute0(Grid_simd &y,Grid_simd b){
|
||||||
|
y.v = Optimization::Permute::Permute0(b.v);
|
||||||
|
}
|
||||||
|
friend inline void permute1(Grid_simd &y,Grid_simd b){
|
||||||
|
y.v = Optimization::Permute::Permute1(b.v);
|
||||||
|
}
|
||||||
|
friend inline void permute2(Grid_simd &y,Grid_simd b){
|
||||||
|
y.v = Optimization::Permute::Permute2(b.v);
|
||||||
|
}
|
||||||
|
friend inline void permute3(Grid_simd &y,Grid_simd b){
|
||||||
|
y.v = Optimization::Permute::Permute3(b.v);
|
||||||
|
}
|
||||||
friend inline void permute(Grid_simd &y,Grid_simd b,int perm)
|
friend inline void permute(Grid_simd &y,Grid_simd b,int perm)
|
||||||
{
|
{
|
||||||
Gpermute<Grid_simd>(y,b,perm);
|
if (perm==3) permute3(y,b);
|
||||||
|
else if (perm==2) permute2(y,b);
|
||||||
|
else if (perm==1) permute1(y,b);
|
||||||
|
else if (perm==0) permute0(y,b);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
};// end of Grid_simd class definition
|
|
||||||
|
|
||||||
|
};// end of Grid_simd class definition
|
||||||
|
|
||||||
///////////////////////
|
///////////////////////
|
||||||
// Splat
|
// Splat
|
||||||
|
@ -177,6 +177,7 @@ public:
|
|||||||
permute(out._internal[i],in._internal[i],permutetype);
|
permute(out._internal[i],in._internal[i],permutetype);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Unary negation
|
// Unary negation
|
||||||
friend strong_inline iVector<vtype,N> operator -(const iVector<vtype,N> &r) {
|
friend strong_inline iVector<vtype,N> operator -(const iVector<vtype,N> &r) {
|
||||||
iVector<vtype,N> ret;
|
iVector<vtype,N> ret;
|
||||||
@ -290,12 +291,15 @@ public:
|
|||||||
vstream(out._internal[i][j],in._internal[i][j]);
|
vstream(out._internal[i][j],in._internal[i][j]);
|
||||||
}}
|
}}
|
||||||
}
|
}
|
||||||
|
|
||||||
friend strong_inline void permute(iMatrix<vtype,N> &out,const iMatrix<vtype,N> &in,int permutetype){
|
friend strong_inline void permute(iMatrix<vtype,N> &out,const iMatrix<vtype,N> &in,int permutetype){
|
||||||
for(int i=0;i<N;i++){
|
for(int i=0;i<N;i++){
|
||||||
for(int j=0;j<N;j++){
|
for(int j=0;j<N;j++){
|
||||||
permute(out._internal[i][j],in._internal[i][j],permutetype);
|
permute(out._internal[i][j],in._internal[i][j],permutetype);
|
||||||
}}
|
}}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// Unary negation
|
// Unary negation
|
||||||
friend strong_inline iMatrix<vtype,N> operator -(const iMatrix<vtype,N> &r) {
|
friend strong_inline iMatrix<vtype,N> operator -(const iMatrix<vtype,N> &r) {
|
||||||
iMatrix<vtype,N> ret;
|
iMatrix<vtype,N> ret;
|
||||||
|
@ -35,10 +35,10 @@ icpc-avx-openmp-mpi)
|
|||||||
CXX=icpc ../../configure --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -fopenmp -lgmp -lmpfr" --enable-comms=mpi
|
CXX=icpc ../../configure --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -fopenmp -lgmp -lmpfr" --enable-comms=mpi
|
||||||
;;
|
;;
|
||||||
icpc-avx-openmp)
|
icpc-avx-openmp)
|
||||||
CXX=icpc ../../configure --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -std=c++11" LIBS="-fopenmp -lgmp -lmpfr" --enable-comms=mpi
|
CXX=icpc ../../configure --enable-precision=single --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -std=c++11" LIBS="-fopenmp -lgmp -lmpfr" --enable-comms=mpi
|
||||||
;;
|
;;
|
||||||
icpc-avx2)
|
icpc-avx2)
|
||||||
CXX=icpc ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
|
CXX=icpc ../../configure --enable-simd=AVX2 CXXFLAGS="-march=core-avx2 -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
|
||||||
;;
|
;;
|
||||||
icpc-avx512)
|
icpc-avx512)
|
||||||
CXX=icpc ../../configure --enable-simd=AVX512 CXXFLAGS="-xCOMMON-AVX512 -O3 -std=c++11" --host=none LIBS="-lgmp -lmpfr" --enable-comms=none
|
CXX=icpc ../../configure --enable-simd=AVX512 CXXFLAGS="-xCOMMON-AVX512 -O3 -std=c++11" --host=none LIBS="-lgmp -lmpfr" --enable-comms=none
|
||||||
@ -50,7 +50,7 @@ icpc-mic-avx512)
|
|||||||
CXX=icpc ../../configure --host=none --enable-simd=IMCI CXXFLAGS="-xCOMMON_AVX512 -O3 -std=c++11" LDFLAGS=-xCOMMON_AVX512 LIBS="-lgmp -lmpfr" --enable-comms=none
|
CXX=icpc ../../configure --host=none --enable-simd=IMCI CXXFLAGS="-xCOMMON_AVX512 -O3 -std=c++11" LDFLAGS=-xCOMMON_AVX512 LIBS="-lgmp -lmpfr" --enable-comms=none
|
||||||
;;
|
;;
|
||||||
clang-sse)
|
clang-sse)
|
||||||
CXX=clang++ ../../configure --enable-simd=SSE4 CXXFLAGS="-msse4 -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
|
CXX=clang++ ../../configure --enable-precision=single --enable-simd=SSE4 CXXFLAGS="-msse4 -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
|
||||||
;;
|
;;
|
||||||
clang-avx)
|
clang-avx)
|
||||||
CXX=clang++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
|
CXX=clang++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
|
||||||
|
Loading…
Reference in New Issue
Block a user