mirror of
https://github.com/paboyle/Grid.git
synced 2025-06-13 20:57:06 +01:00
Lots of changes required to compile for MIC under ICPC
This commit is contained in:
@ -2,42 +2,42 @@
|
||||
/* lib/Grid_config.h.in. Generated from configure.ac by autoheader. */
|
||||
|
||||
/* AVX */
|
||||
#define AVX1 1
|
||||
/* #undef AVX1 */
|
||||
|
||||
/* AVX2 */
|
||||
/* #undef AVX2 */
|
||||
|
||||
/* AVX512 */
|
||||
/* #undef AVX512 */
|
||||
#define AVX512 1
|
||||
|
||||
/* GRID_COMMS_MPI */
|
||||
#define GRID_COMMS_MPI 1
|
||||
/* #undef GRID_COMMS_MPI */
|
||||
|
||||
/* GRID_COMMS_NONE */
|
||||
/* #undef GRID_COMMS_NONE */
|
||||
#define GRID_COMMS_NONE 1
|
||||
|
||||
/* Define to 1 if you have the declaration of `be64toh', and to 0 if you
|
||||
don't. */
|
||||
#define HAVE_DECL_BE64TOH 0
|
||||
#define HAVE_DECL_BE64TOH 1
|
||||
|
||||
/* Define to 1 if you have the declaration of `ntohll', and to 0 if you don't.
|
||||
*/
|
||||
#define HAVE_DECL_NTOHLL 1
|
||||
#define HAVE_DECL_NTOHLL 0
|
||||
|
||||
/* Define to 1 if you have the <endian.h> header file. */
|
||||
/* #undef HAVE_ENDIAN_H */
|
||||
#define HAVE_ENDIAN_H 1
|
||||
|
||||
/* Define to 1 if you have the `gettimeofday' function. */
|
||||
#define HAVE_GETTIMEOFDAY 1
|
||||
/* #undef HAVE_GETTIMEOFDAY */
|
||||
|
||||
/* Define to 1 if you have the <inttypes.h> header file. */
|
||||
#define HAVE_INTTYPES_H 1
|
||||
|
||||
/* Define to 1 if you have the <malloc.h> header file. */
|
||||
/* #undef HAVE_MALLOC_H */
|
||||
#define HAVE_MALLOC_H 1
|
||||
|
||||
/* Define to 1 if you have the <malloc/malloc.h> header file. */
|
||||
#define HAVE_MALLOC_MALLOC_H 1
|
||||
/* #undef HAVE_MALLOC_MALLOC_H */
|
||||
|
||||
/* Define to 1 if you have the <memory.h> header file. */
|
||||
#define HAVE_MEMORY_H 1
|
||||
@ -78,9 +78,6 @@
|
||||
/* Define to the one symbol short name of this package. */
|
||||
#define PACKAGE_TARNAME "grid"
|
||||
|
||||
/* Define to the home page for this package. */
|
||||
#define PACKAGE_URL ""
|
||||
|
||||
/* Define to the version of this package. */
|
||||
#define PACKAGE_VERSION "1.0"
|
||||
|
||||
|
@ -77,9 +77,6 @@
|
||||
/* Define to the one symbol short name of this package. */
|
||||
#undef PACKAGE_TARNAME
|
||||
|
||||
/* Define to the home page for this package. */
|
||||
#undef PACKAGE_URL
|
||||
|
||||
/* Define to the version of this package. */
|
||||
#undef PACKAGE_VERSION
|
||||
|
||||
|
@ -176,7 +176,7 @@ public:
|
||||
}; // class Lattice
|
||||
}
|
||||
|
||||
#define GRID_LATTICE_EXPRESSION_TEMPLATES
|
||||
#undef GRID_LATTICE_EXPRESSION_TEMPLATES
|
||||
|
||||
#include <lattice/Grid_lattice_conformable.h>
|
||||
|
||||
|
@ -20,6 +20,7 @@
|
||||
#endif
|
||||
#ifdef AVX512
|
||||
#include <immintrin.h>
|
||||
#include <zmmintrin.h>
|
||||
#endif
|
||||
|
||||
namespace Grid {
|
||||
@ -160,10 +161,10 @@ inline void Gpermute(vsimd &y,const vsimd &b,int perm){
|
||||
// Permute 1 every abcd efgh ijkl mnop -> cdab ghef jkij opmn
|
||||
// Permute 2 every abcd efgh ijkl mnop -> efgh abcd mnop ijkl
|
||||
// Permute 3 every abcd efgh ijkl mnop -> ijkl mnop abcd efgh
|
||||
case 3: y.v = _mm512_swizzle_ps(b.v,_MM_SWIZ_REG_CDAB); break;
|
||||
case 2: y.v = _mm512_swizzle_ps(b.v,_MM_SWIZ_REG_BADC); break;
|
||||
case 1: y.v = _mm512_permute4f128_ps(b.v,(_MM_PERM_ENUM)_MM_SHUFFLE(2,3,0,1)); break;
|
||||
case 0: y.v = _mm512_permute4f128_ps(b.v,(_MM_PERM_ENUM)_MM_SHUFFLE(1,0,3,2)); break;
|
||||
case 3: y.v =(decltype(y.v)) _mm512_swizzle_ps((__m512)b.v,_MM_SWIZ_REG_CDAB); break;
|
||||
case 2: y.v =(decltype(y.v)) _mm512_swizzle_ps((__m512)b.v,_MM_SWIZ_REG_BADC); break;
|
||||
case 1: y.v =(decltype(y.v)) _mm512_permute4f128_ps((__m512)b.v,(_MM_PERM_ENUM)_MM_SHUFFLE(2,3,0,1)); break;
|
||||
case 0: y.v =(decltype(y.v)) _mm512_permute4f128_ps((__m512)b.v,(_MM_PERM_ENUM)_MM_SHUFFLE(1,0,3,2)); break;
|
||||
#endif
|
||||
#ifdef QPX
|
||||
#error not implemented
|
||||
|
@ -26,19 +26,18 @@ Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<
|
||||
}
|
||||
|
||||
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
|
||||
int o = 0; // relative offset to base within plane
|
||||
int bo = 0; // offset in buffer
|
||||
|
||||
#pragma omp parallel for collapse(2)
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||
int o = n*rhs._grid->_slice_stride[dimension];
|
||||
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
|
||||
if ( ocb &cbmask ) {
|
||||
buffer[bo]=compress(rhs._odata[so+o+b]);
|
||||
bo++;
|
||||
}
|
||||
}
|
||||
o +=rhs._grid->_slice_stride[dimension];
|
||||
}
|
||||
}
|
||||
|
||||
@ -56,13 +55,13 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_
|
||||
}
|
||||
|
||||
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
|
||||
int o = 0; // relative offset to base within plane
|
||||
int bo = 0; // offset in buffer
|
||||
|
||||
#pragma omp parallel for collapse(2)
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||
|
||||
int o=n*rhs._grid->_slice_stride[dimension];
|
||||
int offset = b+n*rhs._grid->_slice_block[dimension];
|
||||
|
||||
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
|
||||
@ -71,9 +70,7 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_
|
||||
temp =compress(rhs._odata[so+o+b]);
|
||||
extract<cobj>(temp,pointers,offset);
|
||||
}
|
||||
|
||||
}
|
||||
o +=rhs._grid->_slice_stride[dimension];
|
||||
}
|
||||
}
|
||||
|
||||
@ -107,20 +104,17 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<v
|
||||
}
|
||||
|
||||
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
|
||||
int o = 0; // relative offset to base within plane
|
||||
int bo = 0; // offset in buffer
|
||||
|
||||
#pragma omp parallel for collapse(2)
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||
|
||||
int o=n*rhs._grid->_slice_stride[dimension];
|
||||
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
|
||||
if ( ocb & cbmask ) {
|
||||
rhs._odata[so+o+b]=buffer[bo++];
|
||||
}
|
||||
|
||||
}
|
||||
o +=rhs._grid->_slice_stride[dimension];
|
||||
}
|
||||
}
|
||||
|
||||
@ -136,21 +130,18 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<v
|
||||
}
|
||||
|
||||
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
|
||||
int o = 0; // relative offset to base within plane
|
||||
int bo = 0; // offset in buffer
|
||||
|
||||
#pragma omp parallel for collapse(2)
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||
|
||||
int o = n*rhs._grid->_slice_stride[dimension];
|
||||
int offset = b+n*rhs._grid->_slice_block[dimension];
|
||||
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
|
||||
if ( ocb&cbmask ) {
|
||||
merge(rhs._odata[so+o+b],pointers,offset);
|
||||
}
|
||||
|
||||
}
|
||||
o +=rhs._grid->_slice_stride[dimension];
|
||||
}
|
||||
}
|
||||
|
||||
@ -168,20 +159,18 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,Lattice<vobj> &rhs, int
|
||||
|
||||
int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane
|
||||
int lo = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane
|
||||
int o = 0; // relative offset to base within plane
|
||||
|
||||
#pragma omp parallel for collapse(2)
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||
|
||||
int o =n*rhs._grid->_slice_stride[dimension];
|
||||
int ocb=1<<lhs._grid->CheckerBoardFromOindex(o+b);
|
||||
|
||||
if ( ocb&cbmask ) {
|
||||
lhs._odata[lo+o+b]=rhs._odata[ro+o+b];
|
||||
}
|
||||
|
||||
}
|
||||
o +=rhs._grid->_slice_stride[dimension];
|
||||
}
|
||||
}
|
||||
|
||||
@ -195,19 +184,16 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,Lattice<vobj> &r
|
||||
|
||||
int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane
|
||||
int lo = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane
|
||||
int o = 0; // relative offset to base within plane
|
||||
|
||||
#pragma omp parallel for collapse(2)
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||
|
||||
int o =n*rhs._grid->_slice_stride[dimension];
|
||||
int ocb=1<<lhs._grid->CheckerBoardFromOindex(o+b);
|
||||
if ( ocb&cbmask ) {
|
||||
permute(lhs._odata[lo+o+b],rhs._odata[ro+o+b],permute_type);
|
||||
}
|
||||
|
||||
}
|
||||
o +=rhs._grid->_slice_stride[dimension];
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -3,6 +3,17 @@
|
||||
|
||||
namespace Grid {
|
||||
|
||||
/*
|
||||
depbase=`echo Grid_main.o | sed 's|[^/]*$|.deps/&|;s|\.o$||'`;\
|
||||
icpc -DHAVE_CONFIG_H -I. -I../lib -I../lib -mmic -O3 -std=c++11 -fopenmp -MT Grid_main.o -MD -MP -MF $depbase.Tpo -c -o Grid_main.o Grid_main.cc &&\
|
||||
mv -f $depbase.Tpo $depbase.Po
|
||||
../lib/lattice/Grid_lattice_coordinate.h(25): error: no suitable user-defined conversion from "vector_type" to "const Grid::iScalar<Grid::iScalar<Grid::iScalar<Grid::vInteger>>>" exists
|
||||
l._odata[o]=vI;
|
||||
^
|
||||
detected during instantiation of "void Grid::LatticeCoordinate(Grid::Lattice<iobj> &, int) [with iobj=Grid::QCD::vTInteger]" at line 283 of "Grid_main.cc"
|
||||
|
||||
compilation aborted for Grid_main.cc (code 2)
|
||||
*/
|
||||
template<class iobj> inline void LatticeCoordinate(Lattice<iobj> &l,int mu)
|
||||
{
|
||||
typedef typename iobj::scalar_object scalar_object;
|
||||
@ -21,8 +32,8 @@ namespace Grid {
|
||||
grid->RankIndexToGlobalCoor(grid->ThisRank(),o,i,gcoor);
|
||||
mergebuf[i]=(Integer)gcoor[mu];
|
||||
}
|
||||
AmergeA<vector_type,scalar_type>(vI,mergebuf);
|
||||
l._odata[o]=vI;
|
||||
merge<vector_type,scalar_type>(vI,mergebuf);
|
||||
l._odata[o]._internal._internal._internal=vI;
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -118,3 +118,4 @@ namespace Grid {
|
||||
|
||||
|
||||
}
|
||||
#endif
|
||||
|
@ -45,10 +45,6 @@ public:
|
||||
zeroit(*this);
|
||||
return *this;
|
||||
}
|
||||
iScalar<vtype> & operator= (const scalar_type s){
|
||||
_internal=s;
|
||||
return *this;
|
||||
}
|
||||
friend void vstream(iScalar<vtype> &out,const iScalar<vtype> &in){
|
||||
vstream(out._internal,in._internal);
|
||||
}
|
||||
@ -98,10 +94,10 @@ public:
|
||||
operator ComplexD () const { return(TensorRemove(_internal)); };
|
||||
operator RealD () const { return(real(TensorRemove(_internal))); }
|
||||
|
||||
|
||||
// convert from a something to a scalar
|
||||
template<class T,typename std::enable_if<isGridTensor<T>::notvalue, T>::type* = nullptr > inline auto operator = (T arg) -> iScalar<vtype>
|
||||
{
|
||||
_internal = arg;
|
||||
_internal = vtype(arg);
|
||||
return *this;
|
||||
}
|
||||
|
||||
|
@ -107,12 +107,7 @@ void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out)
|
||||
|
||||
int ss = sss;
|
||||
int ssu= sss;
|
||||
<<<<<<< HEAD
|
||||
//int ss = Stencil._LebesgueReorder[sss];
|
||||
=======
|
||||
//int ss = 0;
|
||||
//int ss = Stencil._LebesgueReorder[sss];
|
||||
>>>>>>> 55ccb8ccf4aa6b9d6e36e71aa19f89db0463254f
|
||||
|
||||
// Xp
|
||||
offset = Stencil._offsets [Xp][ss];
|
||||
@ -130,10 +125,7 @@ void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out)
|
||||
chi=comm_buf[offset];
|
||||
}
|
||||
mult(&Uchi(),&Umu._odata[ssu](Xp),&chi());
|
||||
<<<<<<< HEAD
|
||||
=======
|
||||
prefetch(Umu._odata[ssu](Yp));
|
||||
>>>>>>> 55ccb8ccf4aa6b9d6e36e71aa19f89db0463254f
|
||||
//prefetch(Umu._odata[ssu](Yp));
|
||||
spReconXp(result,Uchi);
|
||||
|
||||
// Yp
|
||||
@ -152,10 +144,7 @@ void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out)
|
||||
chi=comm_buf[offset];
|
||||
}
|
||||
mult(&Uchi(),&Umu._odata[ssu](Yp),&chi());
|
||||
<<<<<<< HEAD
|
||||
=======
|
||||
prefetch(Umu._odata[ssu](Zp));
|
||||
>>>>>>> 55ccb8ccf4aa6b9d6e36e71aa19f89db0463254f
|
||||
// prefetch(Umu._odata[ssu](Zp));
|
||||
accumReconYp(result,Uchi);
|
||||
|
||||
// Zp
|
||||
@ -174,10 +163,7 @@ void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out)
|
||||
chi=comm_buf[offset];
|
||||
}
|
||||
mult(&Uchi(),&Umu._odata[ssu](Zp),&chi());
|
||||
<<<<<<< HEAD
|
||||
=======
|
||||
prefetch(Umu._odata[ssu](Tp));
|
||||
>>>>>>> 55ccb8ccf4aa6b9d6e36e71aa19f89db0463254f
|
||||
// prefetch(Umu._odata[ssu](Tp));
|
||||
accumReconZp(result,Uchi);
|
||||
|
||||
// Tp
|
||||
@ -196,10 +182,7 @@ void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out)
|
||||
chi=comm_buf[offset];
|
||||
}
|
||||
mult(&Uchi(),&Umu._odata[ssu](Tp),&chi());
|
||||
<<<<<<< HEAD
|
||||
=======
|
||||
prefetch(Umu._odata[ssu](Xm));
|
||||
>>>>>>> 55ccb8ccf4aa6b9d6e36e71aa19f89db0463254f
|
||||
// prefetch(Umu._odata[ssu](Xm));
|
||||
accumReconTp(result,Uchi);
|
||||
|
||||
// Xm
|
||||
@ -218,10 +201,6 @@ void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out)
|
||||
chi=comm_buf[offset];
|
||||
}
|
||||
mult(&Uchi(),&Umu._odata[ssu](Xm),&chi());
|
||||
<<<<<<< HEAD
|
||||
=======
|
||||
prefetch(Umu._odata[ssu](Ym));
|
||||
>>>>>>> 55ccb8ccf4aa6b9d6e36e71aa19f89db0463254f
|
||||
accumReconXm(result,Uchi);
|
||||
|
||||
|
||||
@ -241,10 +220,6 @@ void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out)
|
||||
chi=comm_buf[offset];
|
||||
}
|
||||
mult(&Uchi(),&Umu._odata[ssu](Ym),&chi());
|
||||
<<<<<<< HEAD
|
||||
=======
|
||||
prefetch(Umu._odata[ssu](Zm));
|
||||
>>>>>>> 55ccb8ccf4aa6b9d6e36e71aa19f89db0463254f
|
||||
accumReconYm(result,Uchi);
|
||||
|
||||
// Zm
|
||||
@ -263,10 +238,6 @@ void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out)
|
||||
chi=comm_buf[offset];
|
||||
}
|
||||
mult(&Uchi(),&Umu._odata[ssu](Zm),&chi());
|
||||
<<<<<<< HEAD
|
||||
=======
|
||||
prefetch(Umu._odata[ssu](Tm));
|
||||
>>>>>>> 55ccb8ccf4aa6b9d6e36e71aa19f89db0463254f
|
||||
accumReconZm(result,Uchi);
|
||||
|
||||
// Tm
|
||||
|
@ -143,10 +143,10 @@ namespace Grid {
|
||||
* }
|
||||
*/
|
||||
zvec vzero,ymm0,ymm1,real,imag;
|
||||
vzero = _mm512_setzero();
|
||||
vzero =(zvec)_mm512_setzero();
|
||||
ymm0 = _mm512_swizzle_pd(a.v, _MM_SWIZ_REG_CDAB); //
|
||||
real = _mm512_mask_or_epi64(a.v, 0xAAAA,vzero, ymm0);
|
||||
imag = _mm512_mask_sub_pd(a.v, 0x5555,vzero, ymm0);
|
||||
real =(zvec)_mm512_mask_or_epi64((__m512i)a.v, 0xAA,(__m512i)vzero,(__m512i) ymm0);
|
||||
imag = _mm512_mask_sub_pd(a.v, 0x55,vzero, ymm0);
|
||||
ymm1 = _mm512_mul_pd(real, b.v);
|
||||
ymm0 = _mm512_swizzle_pd(b.v, _MM_SWIZ_REG_CDAB); // OK
|
||||
ret.v= _mm512_fmadd_pd(ymm0,imag,ymm1);
|
||||
@ -250,7 +250,8 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){
|
||||
_mm_stream_pd((double *)&out.v,in.v);
|
||||
#endif
|
||||
#ifdef AVX512
|
||||
_mm512_stream_pd((double *)&out.v,in.v);
|
||||
_mm512_storenrngo_pd((double *)&out.v,in.v);
|
||||
// _mm512_stream_pd((double *)&out.v,in.v);
|
||||
//Note v has a3 a2 a1 a0
|
||||
#endif
|
||||
#ifdef QPX
|
||||
@ -277,7 +278,7 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){
|
||||
ret.v = _mm_shuffle_pd(tmp,tmp,0x1);
|
||||
#endif
|
||||
#ifdef AVX512
|
||||
ret.v = _mm512_mask_sub_pd(in.v, 0xaaaa,ret.v, in.v);
|
||||
ret.v = _mm512_mask_sub_pd(in.v, 0xaa,ret.v, in.v);
|
||||
#endif
|
||||
#ifdef QPX
|
||||
assert(0);
|
||||
@ -297,7 +298,7 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){
|
||||
ret.v =_mm_shuffle_pd(tmp.v,tmp.v,0x1);
|
||||
#endif
|
||||
#ifdef AVX512
|
||||
ret.v = _mm512_mask_sub_pd(in.v,0xaaaa,ret.v,in.v); // real -imag
|
||||
ret.v = _mm512_mask_sub_pd(in.v,0xaa,ret.v,in.v); // real -imag
|
||||
ret.v = _mm512_swizzle_pd(ret.v, _MM_SWIZ_REG_CDAB);// OK
|
||||
#endif
|
||||
#ifdef QPX
|
||||
@ -319,7 +320,7 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){
|
||||
#endif
|
||||
#ifdef AVX512
|
||||
tmp.v = _mm512_swizzle_pd(in.v, _MM_SWIZ_REG_CDAB);// OK
|
||||
ret.v = _mm512_mask_sub_pd(tmp.v,0xaaaa,ret.v,tmp.v); // real -imag
|
||||
ret.v = _mm512_mask_sub_pd(tmp.v,0xaa,ret.v,tmp.v); // real -imag
|
||||
#endif
|
||||
#ifdef QPX
|
||||
assert(0);
|
||||
@ -337,7 +338,7 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){
|
||||
return ComplexD(c_[0]+c_[2],c_[1]+c_[3]);
|
||||
#endif
|
||||
#ifdef AVX512
|
||||
return ComplexD(_mm512_mask_reduce_add_pd(0x5555, in.v),_mm512_mask_reduce_add_pd(0xAAAA, in.v));
|
||||
return ComplexD(_mm512_mask_reduce_add_pd(0x55, in.v),_mm512_mask_reduce_add_pd(0xAA, in.v));
|
||||
#endif
|
||||
#ifdef QPX
|
||||
#endif
|
||||
|
@ -133,7 +133,7 @@ namespace Grid {
|
||||
cvec vzero,ymm0,ymm1,real, imag;
|
||||
vzero = _mm512_setzero();
|
||||
ymm0 = _mm512_swizzle_ps(a.v, _MM_SWIZ_REG_CDAB); //
|
||||
real = _mm512_mask_or_epi32(a.v, 0xAAAA,vzero, ymm0);
|
||||
real = (__m512)_mm512_mask_or_epi32((__m512i)a.v, 0xAAAA,(__m512i)vzero,(__m512i)ymm0);
|
||||
imag = _mm512_mask_sub_ps(a.v, 0x5555,vzero, ymm0);
|
||||
ymm1 = _mm512_mul_ps(real, b.v);
|
||||
ymm0 = _mm512_swizzle_ps(b.v, _MM_SWIZ_REG_CDAB); // OK
|
||||
@ -199,7 +199,8 @@ namespace Grid {
|
||||
_mm_stream_ps((float *)&out.v,in.v);
|
||||
#endif
|
||||
#ifdef AVX512
|
||||
_mm512_stream_ps((float *)&out.v,in.v);
|
||||
_mm512_storenrngo_ps((float *)&out.v,in.v);
|
||||
// _mm512_stream_ps((float *)&out.v,in.v);
|
||||
//Note v has a3 a2 a1 a0
|
||||
#endif
|
||||
#ifdef QPX
|
||||
|
@ -3,8 +3,10 @@
|
||||
|
||||
namespace Grid {
|
||||
|
||||
#define _mm256_set_m128i(hi,lo) _mm256_insertf128_si256(_mm256_castsi128_si256(lo),(hi),1)
|
||||
// _mm256_set_m128i(hi,lo); // not defined in all versions of immintrin.h
|
||||
#ifndef _mm256_set_m128i
|
||||
#define _mm256_set_m128i(hi,lo) _mm256_insertf128_si256(_mm256_castsi128_si256(lo),(hi),1)
|
||||
#endif
|
||||
|
||||
typedef uint32_t Integer;
|
||||
|
||||
@ -110,7 +112,8 @@ namespace Grid {
|
||||
ret.v = _mm_mul_epi32(a.v,b.v);
|
||||
#endif
|
||||
#ifdef AVX512
|
||||
ret.v = _mm512_mul_epi32(a.v,b.v);
|
||||
// ret.v = _mm512_mul_epi32(a.v,b.v);
|
||||
ret.v = _mm512_mullo_epi32(a.v,b.v);
|
||||
#endif
|
||||
#ifdef QPX
|
||||
// Implement as array of ints is only option
|
||||
|
@ -184,7 +184,7 @@ namespace Grid {
|
||||
_mm_stream_pd((double *)&out.v,in.v);
|
||||
#endif
|
||||
#ifdef AVX512
|
||||
_mm512_stream_pd((double *)&out.v,in.v);
|
||||
_mm512_storenrngo_pd((double *)&out.v,in.v);
|
||||
//Note v has a3 a2 a1 a0
|
||||
#endif
|
||||
#ifdef QPX
|
||||
|
@ -216,7 +216,8 @@ friend inline void vstore(const vRealF &ret, float *a){
|
||||
_mm_stream_ps((float *)&out.v,in.v);
|
||||
#endif
|
||||
#ifdef AVX512
|
||||
_mm512_stream_ps((float *)&out.v,in.v);
|
||||
_mm512_storenrngo_ps((float *)&out.v,in.v);
|
||||
// _mm512_stream_ps((float *)&out.v,in.v);
|
||||
//Note v has a3 a2 a1 a0
|
||||
#endif
|
||||
#ifdef QPX
|
||||
|
Reference in New Issue
Block a user