1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-11-10 07:55:35 +00:00

Improving the trace support to support any index tracing and simplifying

implmentation in some ways
This commit is contained in:
Peter Boyle 2015-04-16 14:47:28 +01:00
parent fddb904b4c
commit 933c54d9c4
12 changed files with 321 additions and 87 deletions

View File

@ -7,12 +7,11 @@ namespace QCD {
static const int Ns=4;
static const int CbRed =0;
static const int CbBlack=1;
// QCD iMatrix types
template<typename vtype> using iSinglet = iScalar<iScalar<vtype> > ;
template<typename vtype> using iSpinMatrix = iMatrix<iScalar<vtype>, Ns>;
template<typename vtype> using iSpinColourMatrix = iMatrix<iMatrix<vtype, Nc>, Ns>;
template<typename vtype> using iColourMatrix = iScalar<iMatrix<vtype, Nc>> ;
template<typename vtype> using iSpinVector = iVector<iScalar<vtype>, Ns>;
@ -20,10 +19,8 @@ namespace QCD {
template<typename vtype> using iSpinColourVector = iVector<iVector<vtype, Nc>, Ns>;
typedef iSinglet<Complex > TComplex; // This is painful. Tensor singlet complex type.
typedef iSinglet<vComplex > vTComplex;
typedef iSinglet<Real > TReal; // This is painful. Tensor singlet complex type.
typedef iSinglet<vComplex > vTComplex; // what if we don't know the tensor structure
typedef iSinglet<Real > TReal; // Shouldn't need these.
typedef iSinglet<vInteger > vTInteger;
typedef iSpinMatrix<Complex > SpinMatrix;
@ -44,7 +41,6 @@ namespace QCD {
typedef iSpinColourVector<vComplex > vSpinColourVector;
typedef Lattice<vTComplex> LatticeComplex;
typedef Lattice<vInteger> LatticeInteger; // Predicates for "where"
typedef Lattice<vColourMatrix> LatticeColourMatrix;

View File

@ -73,8 +73,8 @@
/* Define to the version of this package. */
#define PACKAGE_VERSION "1.0"
/* SSE2 */
/* #undef SSE2 */
/* SSE4 */
/* #undef SSE4 */
/* Define to 1 if you have the ANSI C header files. */
#define STDC_HEADERS 1

View File

@ -152,6 +152,8 @@ int main (int argc, char ** argv)
cMat= adj(cMat);
cm=adj(cm);
scm=adj(scm);
scm=transpose(scm);
scm=transposeIndex<1>(scm);
// Foo = Foo+scalar; // LatticeColourMatrix+Scalar
// Foo = Foo*scalar; // LatticeColourMatrix*Scalar
@ -162,6 +164,23 @@ int main (int argc, char ** argv)
LatticeComplex trscMat(&Fine);
trscMat = trace(scMat); // Trace
{
TComplex c;
ColourMatrix c_m;
SpinMatrix s_m;
SpinColourMatrix sc_m;
s_m = traceIndex<1>(sc_m);
c_m = traceIndex<2>(sc_m);
c = traceIndex<2>(s_m);
c = traceIndex<1>(c_m);
printf("c. Level %d\n",c_m.TensorLevel);
printf("c. Level %d\n",c_m._internal.TensorLevel);
}
FooBar = Bar;
@ -203,7 +222,7 @@ int main (int argc, char ** argv)
peekSite(bar,Bar,coor);
for(int r=0;r<3;r++){
for(int c=0;c<3;c++){
cout<<"bar "<<coor[0]<<coor[1]<<coor[2]<<coor[3] <<" "<<bar._internal._internal[r][c]<<std::endl;
// cout<<"bar "<<coor[0]<<coor[1]<<coor[2]<<coor[3] <<" "<<bar._internal._internal[r][c]<<std::endl;
}}
}}}}
}

View File

@ -1,8 +1,22 @@
#ifndef GRID_MATH_TYPE_MAPPER_H
#define GRID_MATH_TYPE_MAPPER_H
namespace Grid {
//////////////////////////////////////////////////////////////////////////////////
// Want to recurse: GridTypeMapper<Matrix<vComplexD> >::scalar_type == ComplexD.
// Use of a helper class like this allows us to template specialise and "dress"
// other classes such as RealD == double, ComplexD == std::complex<double> with these
// traits.
//
// It is possible that we could do this more elegantly if I introduced a
// queryable trait in iScalar, iMatrix and iVector and used the query on vtype in
// place of the type mapper?
//
// Not sure how to do this, but probably could be done with a research effort
// to study C++11's type_traits.h file. (std::enable_if<isGridTensorType<vtype> >)
//
//////////////////////////////////////////////////////////////////////////////////
template <class T> class GridTypeMapper {
@ -10,7 +24,7 @@ namespace Grid {
typedef typename T::scalar_type scalar_type;
typedef typename T::vector_type vector_type;
typedef typename T::tensor_reduced tensor_reduced;
// enum { TensorLevel = T::TensorLevel };
enum { TensorLevel = T::TensorLevel };
};
//////////////////////////////////////////////////////////////////////////////////
@ -21,28 +35,28 @@ namespace Grid {
typedef RealF scalar_type;
typedef RealF vector_type;
typedef RealF tensor_reduced ;
// enum { TensorLevel = 0 };
enum { TensorLevel = 0 };
};
template<> class GridTypeMapper<RealD> {
public:
typedef RealD scalar_type;
typedef RealD vector_type;
typedef RealD tensor_reduced;
// enum { TensorLevel = 0 };
enum { TensorLevel = 0 };
};
template<> class GridTypeMapper<ComplexF> {
public:
typedef ComplexF scalar_type;
typedef ComplexF vector_type;
typedef ComplexF tensor_reduced;
// enum { TensorLevel = 0 };
enum { TensorLevel = 0 };
};
template<> class GridTypeMapper<ComplexD> {
public:
typedef ComplexD scalar_type;
typedef ComplexD vector_type;
typedef ComplexD tensor_reduced;
// enum { TensorLevel = 0 };
enum { TensorLevel = 0 };
};
template<> class GridTypeMapper<vRealF> {
@ -50,44 +64,37 @@ namespace Grid {
typedef RealF scalar_type;
typedef vRealF vector_type;
typedef vRealF tensor_reduced;
// enum { TensorLevel = 0 };
enum { TensorLevel = 0 };
};
template<> class GridTypeMapper<vRealD> {
public:
typedef RealD scalar_type;
typedef vRealD vector_type;
typedef vRealD tensor_reduced;
//enum { TensorLevel = 0 };
enum { TensorLevel = 0 };
};
template<> class GridTypeMapper<vComplexF> {
public:
typedef ComplexF scalar_type;
typedef vComplexF vector_type;
typedef vComplexF tensor_reduced;
//enum { TensorLevel = 0 };
enum { TensorLevel = 0 };
};
template<> class GridTypeMapper<vComplexD> {
public:
typedef ComplexD scalar_type;
typedef vComplexD vector_type;
typedef vComplexD tensor_reduced;
//enum { TensorLevel = 0 };
enum { TensorLevel = 0 };
};
template<> class GridTypeMapper<vInteger> {
public:
typedef Integer scalar_type;
typedef vInteger vector_type;
typedef vInteger tensor_reduced;
//enum { TensorLevel = 0 };
enum { TensorLevel = 0 };
};
// Again terminate the recursion.
inline vRealD TensorRemove(vRealD arg){ return arg;}
inline vRealF TensorRemove(vRealF arg){ return arg;}
inline vComplexF TensorRemove(vComplexF arg){ return arg;}
inline vComplexD TensorRemove(vComplexD arg){ return arg;}
inline vInteger TensorRemove(vInteger arg){ return arg;}
}
#endif

View File

@ -2,15 +2,76 @@
#define GRID_MATH_TYPES_H
#include <Grid_math_type_mapper.h>
#include <type_traits>
namespace Grid {
// First some of my own traits
template<typename T> struct isGridTensor {
static const bool value = true;
static const bool notvalue = false;
};
template<> struct isGridTensor<RealD > {
static const bool value = false;
static const bool notvalue = true;
};
template<> struct isGridTensor<RealF > {
static const bool value = false;
static const bool notvalue = true;
};
template<> struct isGridTensor<ComplexD > {
static const bool value = false;
static const bool notvalue = true;
};
template<> struct isGridTensor<ComplexF > {
static const bool value = false;
static const bool notvalue = true;
};
template<> struct isGridTensor<Integer > {
static const bool value = false;
static const bool notvalue = true;
};
template<> struct isGridTensor<vRealD > {
static const bool value = false;
static const bool notvalue = true;
};
template<> struct isGridTensor<vRealF > {
static const bool value = false;
static const bool notvalue = true;
};
template<> struct isGridTensor<vComplexD > {
static const bool value = false;
static const bool notvalue = true;
};
template<> struct isGridTensor<vComplexF > {
static const bool value = false;
static const bool notvalue = true;
};
template<> struct isGridTensor<vInteger > {
static const bool value = false;
static const bool notvalue = true;
};
// Match the index
template<typename T,int Level> struct matchGridTensorIndex {
static const bool value = (Level==T::TensorLevel);
static const bool notvalue = (Level!=T::TensorLevel);
};
///////////////////////////////////////////////////
// Scalar, Vector, Matrix objects.
// These can be composed to form tensor products of internal indices.
///////////////////////////////////////////////////
// Terminates the recursion for temoval of all Grids tensors
inline vRealD TensorRemove(vRealD arg){ return arg;}
inline vRealF TensorRemove(vRealF arg){ return arg;}
inline vComplexF TensorRemove(vComplexF arg){ return arg;}
inline vComplexD TensorRemove(vComplexD arg){ return arg;}
inline vInteger TensorRemove(vInteger arg){ return arg;}
template<class vtype> class iScalar
{
public:
@ -19,9 +80,12 @@ public:
typedef typename GridTypeMapper<vtype>::scalar_type scalar_type;
typedef typename GridTypeMapper<vtype>::vector_type vector_type;
typedef typename GridTypeMapper<vtype>::tensor_reduced tensor_reduced_v;
//enum { TensorLevel = GridTypeMapper<vtype>::TensorLevel + 1};
typedef iScalar<tensor_reduced_v> tensor_reduced;
enum { TensorLevel = GridTypeMapper<vtype>::TensorLevel + 1};
// Scalar no action
// template<int Level> using tensor_reduce_level = typename iScalar<GridTypeMapper<vtype>::tensor_reduce_level<Level> >;
iScalar(){};
@ -79,9 +143,9 @@ public:
typedef typename GridTypeMapper<vtype>::scalar_type scalar_type;
typedef typename GridTypeMapper<vtype>::vector_type vector_type;
typedef typename GridTypeMapper<vtype>::tensor_reduced tensor_reduced_v;
// enum { TensorLevel = GridTypeMapper<vtype>::TensorLevel + 1};
typedef iScalar<tensor_reduced_v> tensor_reduced;
enum { TensorLevel = GridTypeMapper<vtype>::TensorLevel + 1};
typedef iScalar<tensor_reduced_v> tensor_reduced;
iVector(Zero &z){ *this = zero; };
iVector() {};
@ -137,8 +201,10 @@ public:
typedef typename GridTypeMapper<vtype>::scalar_type scalar_type;
typedef typename GridTypeMapper<vtype>::vector_type vector_type;
typedef typename GridTypeMapper<vtype>::tensor_reduced tensor_reduced_v;
// enum { TensorLevel = GridTypeMapper<vtype>::TensorLevel + 1};
enum { TensorLevel = GridTypeMapper<vtype>::TensorLevel + 1};
typedef iScalar<tensor_reduced_v> tensor_reduced;
iMatrix(Zero &z){ *this = zero; };
@ -1018,6 +1084,137 @@ template<class vtype,int N> inline iMatrix<vtype,N> adj(const iMatrix<vtype,N> &
}}
return ret;
}
/////////////////////////////////////////////////////////////////
// Transpose
/////////////////////////////////////////////////////////////////
template<class vtype,int N>
inline typename std::enable_if<isGridTensor<vtype>::value, iMatrix<vtype,N> >::type
transpose(iMatrix<vtype,N> arg)
{
iMatrix<vtype,N> ret;
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
ret._internal[i][j] = transpose(arg._internal[j][i]); // NB recurses
}}
return ret;
}
template<class vtype,int N>
inline typename std::enable_if<isGridTensor<vtype>::notvalue, iMatrix<vtype,N> >::type
transpose(iMatrix<vtype,N> arg)
{
iMatrix<vtype,N> ret;
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
ret._internal[i][j] = arg._internal[j][i]; // Stop recursion if not a tensor type
}}
return ret;
}
template<class vtype,int N>
inline typename std::enable_if<isGridTensor<vtype>::value, iScalar<vtype> >::type
transpose(iScalar<vtype> arg)
{
iScalar<vtype> ret;
ret._internal = transpose(arg._internal); // NB recurses
return ret;
}
template<class vtype,int N>
inline typename std::enable_if<isGridTensor<vtype>::notvalue, iScalar<vtype> >::type
transpose(iScalar<vtype> arg)
{
iScalar<vtype> ret;
ret._internal = arg._internal; // NB recursion stops
return ret;
}
/*
* No need to implement transpose on the primitive types
* Not sure that this idiom is any more elegant that the trace idiom below however!
inline ComplexD transpose(ComplexD &rhs){ return rhs;}
inline ComplexF transpose(ComplexF &rhs){ return rhs;}
inline RealD transpose(RealD &rhs){ return rhs;}
inline RealF transpose(RealF &rhs){ return rhs;}
*/
////////////////////////////////////////////////////////////////////////////////////////////
// Transpose a specific index
////////////////////////////////////////////////////////////////////////////////////////////
template<int Level,class vtype,int N> inline
typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,Level>::value, iMatrix<vtype,N> >::type
transposeIndex (const iMatrix<vtype,N> &arg)
{
iMatrix<vtype,N> ret;
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
ret._internal[i][j] = arg._internal[j][i];
}}
return ret;
}
// or not
template<int Level,class vtype,int N> inline
typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,Level>::notvalue, iMatrix<vtype,N> >::type
transposeIndex (const iMatrix<vtype,N> &arg)
{
iMatrix<vtype,N> ret;
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
ret._internal[i][j] = transposeIndex<Level>(arg._internal[i][j]);
}}
return ret;
}
template<int Level,class vtype> inline
typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,Level>::notvalue, iScalar<vtype> >::type
transposeIndex (const iScalar<vtype> &arg)
{
return transposeIndex<Level>(arg._internal);
}
////////////////////////////////
// Trace a specific index
////////////////////////////////
template<int Level> inline ComplexF traceIndex(const ComplexF arg) { return arg;}
template<int Level> inline ComplexD traceIndex(const ComplexD arg) { return arg;}
template<int Level> inline RealF traceIndex(const RealF arg) { return arg;}
template<int Level> inline RealD traceIndex(const RealD arg) { return arg;}
template<int Level,class vtype> inline
auto traceIndex(const iScalar<vtype> &arg) -> iScalar<decltype(traceIndex<Level>(arg._internal)) >
{
iScalar<decltype(traceIndex<Level>(arg._internal))> ret;
ret._internal = traceIndex<Level>(arg._internal);
return ret;
}
// If we hit the right index, return scalar and trace it with no further recursion
template<int Level,class vtype,int N> inline
auto traceIndex(const iMatrix<vtype,N> &arg) ->
typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,Level>::value, // Index matches
iScalar<vtype> >::type // return scalar
{
iScalar<vtype> ret;
zeroit(ret._internal);
for(int i=0;i<N;i++){
ret._internal = ret._internal + arg._internal[i][i];
}
return ret;
}
// not this level, so recurse
template<int Level,class vtype,int N> inline
auto traceIndex(const iMatrix<vtype,N> &arg) ->
typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,Level>::notvalue,// No index match
iMatrix<decltype(traceIndex<Level>(arg._internal[0][0])),N> >::type // return matrix
{
iMatrix<decltype(traceIndex<Level>(arg._internal[0][0])),N> ret;
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
ret._internal[i][j] = traceIndex<Level>(arg._internal[i][j]);
}}
return ret;
}
/////////////////////////////////////////////////////////////////
// Can only take the real/imag part of scalar objects, since
@ -1075,17 +1272,24 @@ template<class itype,int N> inline auto imag(const iVector<itype,N> &z) -> iVect
// Trace of scalar and matrix
/////////////////////////////////
inline Complex trace( const Complex &arg){
inline ComplexF trace( const ComplexF &arg){
return arg;
}
//inline vComplex trace(const vComplex &arg){
// return arg;
//}
inline ComplexD trace( const ComplexD &arg){
return arg;
}
inline RealF trace( const RealF &arg){
return arg;
}
inline RealD trace( const RealD &arg){
return arg;
}
template<class vtype,int N>
inline auto trace(const iMatrix<vtype,N> &arg) -> iScalar<decltype(trace(arg._internal[0][0]))>
{
iScalar<decltype( trace(arg._internal[0][0] )) > ret;
ZeroIt(ret._internal);
zeroit(ret._internal);
for(int i=0;i<N;i++){
ret._internal=ret._internal+trace(arg._internal[i][i]);
}

View File

@ -12,7 +12,7 @@
////////////////////////////////////////////////////////////////////////
#ifdef SSE2
#ifdef SSE4
#include <pmmintrin.h>
#endif
#if defined(AVX1) || defined (AVX2)
@ -33,6 +33,12 @@ namespace Grid {
inline RealF adj(const RealF & r){ return r; }
inline RealF conj(const RealF & r){ return r; }
inline RealF real(const RealF & r){ return r; }
inline RealD adj(const RealD & r){ return r; }
inline RealD conj(const RealD & r){ return r; }
inline RealD real(const RealD & r){ return r; }
inline ComplexD innerProduct(const ComplexD & l, const ComplexD & r) { return conj(l)*r; }
inline ComplexF innerProduct(const ComplexF & l, const ComplexF & r) { return conj(l)*r; }
inline RealD innerProduct(const RealD & l, const RealD & r) { return l*r; }
@ -60,8 +66,6 @@ namespace Grid {
inline void mult(RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) * (*r);}
inline void sub (RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) - (*r);}
inline void add (RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) + (*r);}
inline RealD adj(const RealD & r){ return r; } // No-op for real
inline RealD conj(const RealD & r){ return r; }
inline void mac (RealF * __restrict__ y,const RealF * __restrict__ a,const RealF *__restrict__ x){ *y = (*a) * (*x)+(*y); }
inline void mult(RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) * (*r); }
@ -72,14 +76,14 @@ namespace Grid {
class Zero{};
static Zero zero;
template<class itype> inline void ZeroIt(itype &arg){ arg=zero;};
template<> inline void ZeroIt(ComplexF &arg){ arg=0; };
template<> inline void ZeroIt(ComplexD &arg){ arg=0; };
template<> inline void ZeroIt(RealF &arg){ arg=0; };
template<> inline void ZeroIt(RealD &arg){ arg=0; };
template<class itype> inline void zeroit(itype &arg){ arg=zero;};
template<> inline void zeroit(ComplexF &arg){ arg=0; };
template<> inline void zeroit(ComplexD &arg){ arg=0; };
template<> inline void zeroit(RealF &arg){ arg=0; };
template<> inline void zeroit(RealD &arg){ arg=0; };
#if defined (SSE2)
#if defined (SSE4)
typedef __m128 fvec;
typedef __m128d dvec;
typedef __m128 cvec;
@ -202,7 +206,7 @@ inline void Gpermute(vsimd &y,const vsimd &b,int perm){
case 1: y.v = _mm256_shuffle_ps(b.v,b.v,_MM_SHUFFLE(1,0,3,2)); break;
case 0: y.v = _mm256_permute2f128_ps(b.v,b.v,0x01); break;
#endif
#ifdef SSE2
#ifdef SSE4
case 1: y.v = _mm_shuffle_ps(b.v,b.v,_MM_SHUFFLE(2,3,0,1)); break;
case 0: y.v = _mm_shuffle_ps(b.v,b.v,_MM_SHUFFLE(1,0,3,2));break;
#endif

View File

@ -49,7 +49,7 @@ namespace Grid {
#if defined (AVX1)|| defined (AVX2)
ret.v = _mm256_add_pd(a.v,b.v);
#endif
#ifdef SSE2
#ifdef SSE4
ret.v = _mm_add_pd(a.v,b.v);
#endif
#ifdef AVX512
@ -67,7 +67,7 @@ namespace Grid {
#if defined (AVX1)|| defined (AVX2)
ret.v = _mm256_sub_pd(a.v,b.v);
#endif
#ifdef SSE2
#ifdef SSE4
ret.v = _mm_sub_pd(a.v,b.v);
#endif
#ifdef AVX512
@ -114,7 +114,7 @@ namespace Grid {
ymm1 = _mm256_mul_pd(ymm1,ymm2); // ymm1 <- br ai, ai bi
ret.v= _mm256_addsub_pd(ymm0,ymm1);
#endif
#ifdef SSE2
#ifdef SSE4
zvec ymm0,ymm1,ymm2;
ymm0 = _mm_shuffle_pd(a.v,a.v,0x0); // ymm0 <- ar ar,
ymm0 = _mm_mul_pd(ymm0,b.v); // ymm0 <- ar bi, ar br
@ -200,14 +200,14 @@ namespace Grid {
#if defined (AVX1)|| defined (AVX2)
ret.v = _mm256_set_pd(ig,rl,ig,rl);
#endif
#ifdef SSE2
ret.v = _mm_set_pd(a,b);
#ifdef SSE4
ret.v = _mm_set_pd(ig,rl);
#endif
#ifdef AVX512
ret.v = _mm512_set_pd(ig,rl,ig,rl,ig,rl,ig,rl);
#endif
#ifdef QPX
ret.v = {a,b,a,b};
ret.v = {ig,rl,ig,rl};
#endif
}
@ -215,7 +215,7 @@ namespace Grid {
#if defined (AVX1)|| defined (AVX2)
ret.v = _mm256_set_pd(a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
#endif
#ifdef SSE2
#ifdef SSE4
ret.v = _mm_set_pd(a[0].imag(),a[0].real());
#endif
#ifdef AVX512
@ -231,7 +231,7 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){
#if defined (AVX1)|| defined (AVX2)
_mm256_store_pd((double *)a,ret.v);
#endif
#ifdef SSE2
#ifdef SSE4
_mm_store_pd((double *)a,ret.v);
#endif
#ifdef AVX512
@ -257,7 +257,7 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){
__m256d tmp = _mm256_addsub_pd(ret.v,_mm256_shuffle_pd(in.v,in.v,0x5));
ret.v=_mm256_shuffle_pd(tmp,tmp,0x5);
#endif
#ifdef SSE2
#ifdef SSE4
ret.v = _mm_addsub_pd(ret.v,in.v);
#endif
#ifdef AVX512

View File

@ -63,7 +63,7 @@ namespace Grid {
#if defined (AVX1)|| defined (AVX2)
ret.v = _mm256_add_ps(a.v,b.v);
#endif
#ifdef SSE2
#ifdef SSE4
ret.v = _mm_add_ps(a.v,b.v);
#endif
#ifdef AVX512
@ -81,7 +81,7 @@ namespace Grid {
#if defined (AVX1)|| defined (AVX2)
ret.v = _mm256_sub_ps(a.v,b.v);
#endif
#ifdef SSE2
#ifdef SSE4
ret.v = _mm_sub_ps(a.v,b.v);
#endif
#ifdef AVX512
@ -120,7 +120,7 @@ namespace Grid {
ymm1 = _mm256_mul_ps(ymm1,ymm2); // ymm1 <- br ai, ai bi
ret.v= _mm256_addsub_ps(ymm0,ymm1);
#endif
#ifdef SSE2
#ifdef SSE4
cvec ymm0,ymm1,ymm2;
ymm0 = _mm_shuffle_ps(a.v,a.v,_MM_SHUFFLE(2,2,0,0)); // ymm0 <- ar ar,
ymm0 = _mm_mul_ps(ymm0,b.v); // ymm0 <- ar bi, ar br
@ -156,8 +156,8 @@ namespace Grid {
#if defined (AVX1)|| defined (AVX2)
ret.v = _mm256_set_ps(a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
#endif
#ifdef SSE2
ret.v = _mm_set_ps(a[1].imag, a[1].real(),a[0].imag(),a[0].real());
#ifdef SSE4
ret.v = _mm_set_ps(a[1].imag(), a[1].real(),a[0].imag(),a[0].real());
#endif
#ifdef AVX512
ret.v = _mm512_set_ps(a[7].imag(),a[7].real(),a[6].imag(),a[6].real(),a[5].imag(),a[5].real(),a[4].imag(),a[4].real(),a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
@ -181,7 +181,7 @@ friend inline void vstore(const vComplexF &ret, ComplexF *a){
#if defined (AVX1)|| defined (AVX2)
_mm256_store_ps((float *)a,ret.v);
#endif
#ifdef SSE2
#ifdef SSE4
_mm_store_ps((float *)a,ret.v);
#endif
#ifdef AVX512
@ -201,7 +201,7 @@ friend inline void vstore(const vComplexF &ret, ComplexF *a){
#if defined (AVX1)|| defined (AVX2)
ret.v = _mm256_set_ps(b,a,b,a,b,a,b,a);
#endif
#ifdef SSE2
#ifdef SSE4
ret.v = _mm_set_ps(a,b,a,b);
#endif
#ifdef AVX512
@ -213,7 +213,11 @@ friend inline void vstore(const vComplexF &ret, ComplexF *a){
}
friend inline ComplexF Reduce(const vComplexF & in)
{
#ifdef SSE4
#error
#endif
#if defined (AVX1) || defined(AVX2)
// FIXME this is inefficient and use
__attribute__ ((aligned(32))) float c_[8];
_mm256_store_ps(c_,in.v);
return ComplexF(c_[0]+c_[2]+c_[4]+c_[6],c_[1]+c_[3]+c_[5]+c_[7]);
@ -305,7 +309,7 @@ friend inline void vstore(const vComplexF &ret, ComplexF *a){
tmp = _mm256_addsub_ps(ret.v,_mm256_shuffle_ps(in.v,in.v,_MM_SHUFFLE(2,3,0,1))); // ymm1 <- br,bi
ret.v=_mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1));
#endif
#ifdef SSE2
#ifdef SSE4
ret.v = _mm_addsub_ps(ret.v,in.v);
#endif
#ifdef AVX512

View File

@ -48,7 +48,7 @@ namespace Grid {
#if defined (AVX2)
ret.v = _mm256_add_epi32(a.v,b.v);
#endif
#ifdef SSE2
#ifdef SSE4
ret.v = _mm_add_epi32(a.v,b.v);
#endif
#ifdef AVX512
@ -78,7 +78,7 @@ namespace Grid {
#if defined (AVX2)
ret.v = _mm256_sub_epi32(a.v,b.v);
#endif
#ifdef SSE2
#ifdef SSE4
ret.v = _mm_sub_epi32(a.v,b.v);
#endif
#ifdef AVX512
@ -108,7 +108,7 @@ namespace Grid {
#if defined (AVX2)
ret.v = _mm256_mul_epi32(a.v,b.v);
#endif
#ifdef SSE2
#ifdef SSE4
ret.v = _mm_mul_epi32(a.v,b.v);
#endif
#ifdef AVX512
@ -147,7 +147,7 @@ namespace Grid {
#if defined (AVX1)|| defined (AVX2)
ret.v = _mm256_set1_epi32(a);
#endif
#ifdef SSE2
#ifdef SSE4
ret.v = _mm_set1_epi32(a);
#endif
#ifdef AVX512
@ -161,7 +161,7 @@ namespace Grid {
#if defined (AVX1)|| defined (AVX2)
ret.v = _mm256_set_epi32(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
#endif
#ifdef SSE2
#ifdef SSE4
ret.v = _mm_set_epi32(a[0],a[1],a[2],a[3]);
#endif
#ifdef AVX512
@ -177,8 +177,8 @@ namespace Grid {
#if defined (AVX1)|| defined (AVX2)
_mm256_store_si256((__m256i*)a,ret.v);
#endif
#ifdef SSE2
_mm_store_si128(a,ret.v);
#ifdef SSE4
_mm_store_si128((__m128i *)a,ret.v);
#endif
#ifdef AVX512
_mm512_store_si512(a,ret.v);

View File

@ -24,7 +24,7 @@ namespace Grid {
friend inline vRealD conj(const vRealD &in){ return in; }
friend inline void mac (vRealD &y,const vRealD a,const vRealD x){
#if defined (AVX1) || defined (SSE2)
#if defined (AVX1) || defined (SSE4)
y = a*x+y;
#endif
#ifdef AVX2 // AVX 2 introduced FMA support. FMA4 eliminates a copy, but AVX only has FMA3
@ -55,7 +55,7 @@ namespace Grid {
#if defined (AVX1)|| defined (AVX2)
ret.v = _mm256_add_pd(a.v,b.v);
#endif
#ifdef SSE2
#ifdef SSE4
ret.v = _mm_add_pd(a.v,b.v);
#endif
#ifdef AVX512
@ -72,7 +72,7 @@ namespace Grid {
#if defined (AVX1)|| defined (AVX2)
ret.v = _mm256_sub_pd(a.v,b.v);
#endif
#ifdef SSE2
#ifdef SSE4
ret.v = _mm_sub_pd(a.v,b.v);
#endif
#ifdef AVX512
@ -90,7 +90,7 @@ namespace Grid {
#if defined (AVX1)|| defined (AVX2)
ret.v = _mm256_mul_pd(a.v,b.v);
#endif
#ifdef SSE2
#ifdef SSE4
ret.v = _mm_mul_pd(a.v,b.v);
#endif
#ifdef AVX512
@ -133,7 +133,7 @@ namespace Grid {
#if defined (AVX1)|| defined (AVX2)
ret.v = _mm256_set_pd(a,a,a,a);
#endif
#ifdef SSE2
#ifdef SSE4
ret.v = _mm_set_pd(a,a);
#endif
#ifdef AVX512
@ -147,7 +147,7 @@ namespace Grid {
#if defined (AVX1)|| defined (AVX2)
ret.v = _mm256_set_pd(a[3],a[2],a[1],a[0]);
#endif
#ifdef SSE2
#ifdef SSE4
ret.v = _mm_set_pd(a[0],a[1]);
#endif
#ifdef AVX512
@ -163,7 +163,7 @@ namespace Grid {
#if defined (AVX1)|| defined (AVX2)
_mm256_store_pd(a,ret.v);
#endif
#ifdef SSE2
#ifdef SSE4
_mm_store_pd(a,ret.v);
#endif
#ifdef AVX512

View File

@ -26,7 +26,7 @@ namespace Grid {
#if defined (AVX1)|| defined (AVX2)
ret.v = _mm256_add_ps(a.v,b.v);
#endif
#ifdef SSE2
#ifdef SSE4
ret.v = _mm_add_ps(a.v,b.v);
#endif
#ifdef AVX512
@ -48,7 +48,7 @@ namespace Grid {
#if defined (AVX1)|| defined (AVX2)
ret.v = _mm256_sub_ps(a.v,b.v);
#endif
#ifdef SSE2
#ifdef SSE4
ret.v = _mm_sub_ps(a.v,b.v);
#endif
#ifdef AVX512
@ -70,7 +70,7 @@ namespace Grid {
#if defined (AVX1)|| defined (AVX2)
ret.v = _mm256_mul_ps(a.v,b.v);
#endif
#ifdef SSE2
#ifdef SSE4
ret.v = _mm_mul_ps(a.v,b.v);
#endif
#ifdef AVX512
@ -96,7 +96,7 @@ namespace Grid {
friend inline vRealF conj(const vRealF &in){ return in; }
friend inline void mac (vRealF &y,const vRealF a,const vRealF x){
#if defined (AVX1) || defined (SSE2)
#if defined (AVX1) || defined (SSE4)
y = a*x+y;
#endif
#ifdef AVX2 // AVX 2 introduced FMA support. FMA4 eliminates a copy, but AVX only has FMA3
@ -158,7 +158,7 @@ namespace Grid {
#if defined (AVX1)|| defined (AVX2)
ret.v = _mm256_set_ps(a,a,a,a,a,a,a,a);
#endif
#ifdef SSE2
#ifdef SSE4
ret.v = _mm_set_ps(a,a,a,a);
#endif
#ifdef AVX512
@ -175,7 +175,7 @@ namespace Grid {
#if defined (AVX1)|| defined (AVX2)
ret.v = _mm256_set_ps(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
#endif
#ifdef SSE2
#ifdef SSE4
ret.v = _mm_set_ps(a[0],a[1],a[2],a[3]);
#endif
#ifdef AVX512
@ -195,7 +195,7 @@ friend inline void vstore(const vRealF &ret, float *a){
#if defined (AVX1)|| defined (AVX2)
_mm256_store_ps(a,ret.v);
#endif
#ifdef SSE2
#ifdef SSE4
_mm_store_ps(a,ret.v);
#endif
#ifdef AVX512

View File

@ -29,14 +29,14 @@ int main (int argc, char ** argv)
#ifdef AVX512
simd_layout[0] = 1;
simd_layout[1] = 1;
simd_layout[1] = 2;
simd_layout[2] = 2;
simd_layout[3] = 2;
#endif
#if defined (AVX1)|| defined (AVX2)
simd_layout[0] = 1;
simd_layout[1] = 1;
simd_layout[2] = 1;
simd_layout[2] = 2;
simd_layout[3] = 2;
#endif
#if defined (SSE2)