Grid/lib/simd/Grid_vRealD.h

#ifndef GRID_VREALD_H
#define GRID_VREALD_H

namespace Grid {
    class vRealD  {
    public:
        dvec v; // dvec is double precision vector

    public:
	typedef dvec  vector_type;
	typedef RealD scalar_type;

        vRealD()=default;
        vRealD(RealD a){
	  vsplat(*this,a);
	};
        vRealD(Zero &zero){
	  zeroit(*this);
	}

        friend inline void mult(vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) * (*r);}
        friend inline void sub (vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) - (*r);}
        friend inline void add (vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) + (*r);}
        friend inline vRealD adj(const vRealD &in) { return in; }
        friend inline vRealD conj(const vRealD &in){ return in; }
        
        friend inline void mac (vRealD &y,const vRealD a,const vRealD x){
#if defined (AVX1) || defined (SSE4)
            y = a*x+y;
#endif
#ifdef AVX2     // AVX 2 introduced FMA support. FMA4 eliminates a copy, but AVX only has FMA3
            // accelerates multiply accumulate, but not general multiply add
            y.v = _mm256_fmadd_pd(a.v,x.v,y.v);
#endif
#ifdef AVX512
            // here precision of vector are still single
            y.v = _mm512_fmadd_pd(a.v,x.v,y.v);
#endif
#ifdef QPX
            y.v = vec_madd(a.v,x.v,y.v);
#endif
        }
        //////////////////////////////////
        // Initialise to 1,0
        //////////////////////////////////
        friend inline void vone (vRealD &ret){ vsplat(ret,1.0);}
        friend inline void vzero(vRealD &ret){ vsplat(ret,0.0);}
        
        
        ////////////////////////////////////
        // Arithmetic operator overloads +,-,*
        ////////////////////////////////////
        friend inline vRealD operator + (vRealD a, vRealD b)
        {
            vRealD ret;
#if defined (AVX1)|| defined (AVX2)
            ret.v = _mm256_add_pd(a.v,b.v);
#endif
#ifdef SSE4
            ret.v = _mm_add_pd(a.v,b.v);
#endif
#ifdef AVX512
            ret.v = _mm512_add_pd(a.v,b.v);
#endif
#ifdef QPX
            ret.v = vec_add(a.v,b.v);
#endif
            return ret;
        };
        friend inline vRealD operator - (vRealD a, vRealD b)
        {
            vRealD ret;
#if defined (AVX1)|| defined (AVX2)
            ret.v = _mm256_sub_pd(a.v,b.v);
#endif
#ifdef SSE4
            ret.v = _mm_sub_pd(a.v,b.v);
#endif
#ifdef AVX512
            ret.v = _mm512_sub_pd(a.v,b.v);
#endif
#ifdef QPX
            ret.v = vec_sub(a.v,b.v);
#endif
            return ret;
        };
        
        friend inline vRealD operator * (vRealD a, vRealD b)
        {
            vRealD ret;
#if defined (AVX1)|| defined (AVX2)
            ret.v = _mm256_mul_pd(a.v,b.v);
#endif
#ifdef SSE4
            ret.v = _mm_mul_pd(a.v,b.v);
#endif
#ifdef AVX512
            ret.v = _mm512_mul_pd(a.v,b.v);
#endif
#ifdef QPX
            ret.v = vec_mul(a.v,b.v);
#endif
            return ret;
        };

	////////////////////////////////////////////////////////////////////
	// General permute; assumes vector length is same across 
	// all subtypes; may not be a good assumption, but could
	// add the vector width as a template param for BG/Q for example
	////////////////////////////////////////////////////////////////////
	/*
	friend inline void permute(vRealD &y,vRealD b,int perm)
	{
	  Gpermute<vRealD>(y,b,perm);
	}
	friend inline void merge(vRealD &y,std::vector<RealD *> &extracted)
	{
	  Gmerge<vRealD,RealD >(y,extracted);
	}
	friend inline void extract(const vRealD &y,std::vector<RealD *> &extracted)
	{
	  Gextract<vRealD,RealD>(y,extracted);
	}
	friend inline void merge(vRealD &y,std::vector<RealD > &extracted)
	{
	  Gmerge<vRealD,RealD >(y,extracted);
	}
	friend inline void extract(const vRealD &y,std::vector<RealD > &extracted)
	{
	  Gextract<vRealD,RealD>(y,extracted);
	}
	*/
        
        friend inline void vsplat(vRealD &ret,double a){
#if defined (AVX1)|| defined (AVX2)
            ret.v = _mm256_set_pd(a,a,a,a);
#endif
#ifdef SSE4
            ret.v = _mm_set_pd(a,a);
#endif
#ifdef AVX512
            ret.v = _mm512_set1_pd(a);
#endif
#ifdef QPX
            ret.v = {a,a,a,a};
#endif
        }
	friend inline void vset(vRealD &ret, double *a){
#if defined (AVX1)|| defined (AVX2)
            ret.v = _mm256_set_pd(a[3],a[2],a[1],a[0]);
#endif
#ifdef SSE4
            ret.v = _mm_set_pd(a[1],a[0]);
#endif
#ifdef AVX512
            ret.v = _mm512_set_pd(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
            // Note v has a0 a1 a2 a3 a4 a5 a6 a7
#endif
#ifdef QPX
            ret.v = {a[0],a[1],a[2],a[3]};
#endif
	}

	friend inline void vstore(const vRealD &ret, double *a){
#if defined (AVX1)|| defined (AVX2)
            _mm256_store_pd(a,ret.v);
#endif
#ifdef SSE4
            _mm_store_pd(a,ret.v);
#endif
#ifdef AVX512
            _mm512_store_pd(a,ret.v);
            // Note v has a7 a6 a5ba4 a3 a2 a1 a0
#endif
#ifdef QPX
	    assert(0);
#endif
	}
        friend inline void vstream(vRealD &out,const vRealD &in){
#if defined (AVX1)|| defined (AVX2)
	  _mm256_stream_pd((double *)&out.v,in.v);
#endif
#ifdef SSE4
	  _mm_stream_pd((double *)&out.v,in.v);
#endif
#ifdef AVX512
	  _mm512_stream_pd((double *)&out.v,in.v);
	  //Note v has a3 a2 a1 a0
#endif
#ifdef QPX
	  assert(0);
#endif
	}
        friend inline void prefetch(const vRealD &v)
        {
            _mm_prefetch((const char*)&v.v,_MM_HINT_T0);
        }
        // Unary negation
        friend inline vRealD operator -(const vRealD &r) {
            vRealD ret;
            vzero(ret);
            ret = ret - r;
            return ret;
        }

       friend inline RealD Reduce(const vRealD & in)
       {
#if defined (SSE4)
	 // FIXME Hack
	 const RealD * ptr =(const RealD *)  &in;
	 RealD ret = 0; 
	 for(int i=0;i<vRealD::Nsimd();i++){
	   ret = ret+ptr[i];
	 }
	 return ret;
#endif
#if defined (AVX1) || defined(AVX2)
	 typedef union  {
	   uint64_t l;
	   double   d;
	 } my_conv_t;
	 my_conv_t converter;
// more reduce_add
/*
            __attribute__ ((aligned(32))) double c_[16];
	    __m256d tmp  = _mm256_permute2f128_pd(in.v,in.v,0x01); // tmp 1032; in= 3210
            __m256d hadd = _mm256_hadd_pd(in.v,tmp);              // hadd = 1+0,3+2,3+2,1+0
  	             tmp = _mm256_permute2f128_pd(hadd,hadd,0x01);// tmp  = 3+2,1+0,1+0,3+2
                    hadd = _mm256_hadd_pd(tmp,tmp);               // tmp  = 3+2+1+0,3+2+1+0,1+0+3+2,1+0+3+2
                    _mm256_store_pd(c_,hadd);<EFBFBD>
             return c[0]
*/
	    __m256d tmp  = _mm256_permute2f128_pd(in.v,in.v,0x01); // tmp 1032; in= 3210
            __m256d hadd = _mm256_hadd_pd(in.v,tmp);              // hadd = 1+0,3+2,3+2,1+0
                    hadd = _mm256_hadd_pd(hadd,hadd);             // hadd = 1+0+3+2...
	    converter.l = _mm256_extract_epi64(hadd,0);
            return converter.d;
#endif
#ifdef AVX512
            return _mm512_reduce_add_pd(in.v);
/*
            __attribute__ ((aligned(32))) double c_[8];
           _mm512_store_pd(c_,in.v);
            return c_[0]+c_[1]+c_[2]+c_[3]+c_[4]+c_[5]+c_[6]+c_[7];
*/
#endif
#ifdef QPX
#endif
        }

        // *=,+=,-= operators
        inline vRealD &operator *=(const vRealD &r) {
            *this = (*this)*r;
            return *this;
        }
        inline vRealD &operator +=(const vRealD &r) {
            *this = *this+r;
            return *this;
        }
        inline vRealD &operator -=(const vRealD &r) {
            *this = *this-r;
            return *this;
        }

    public:
        static int Nsimd(void) { return sizeof(dvec)/sizeof(double);}
    };

    inline vRealD innerProduct(const vRealD & l, const vRealD & r) { return conj(l)*r; }
    inline void zeroit(vRealD &z){ vzero(z);}

    inline vRealD outerProduct(const vRealD &l, const vRealD& r)
    {
        return l*r;
    }
    inline vRealD trace(const vRealD &arg){
        return arg;
    }
    inline vRealD real(const vRealD &arg){
        return arg;
    }


}
#endif
-												Reorganise to keep files smaller

											
										
										
											2015-04-18 18:36:48 +01:00
+								#ifndef GRID_VREALD_H
 								#define GRID_VREALD_H
-												Initial commit of Grid to GitHub

											
										
										
											2015-03-04 03:12:19 +00:00
-												Renamed the namespace to Grid

											
										
										
											2015-04-03 05:29:54 +01:00
+								namespace Grid {
-												Initial commit of Grid to GitHub

											
										
										
											2015-03-04 03:12:19 +00:00
+								    class vRealD  {
-												Major rework of extract/merge/permute processing debugged and working.

											
										
										
											2015-04-06 11:26:24 +01:00
+								    public:
-												Initial commit of Grid to GitHub

											
										
										
											2015-03-04 03:12:19 +00:00
+								        dvec v; // dvec is double precision vector
-												Fixing the Checkerboarding cshift.
Implemented "fake" communications in preparation for the leap to MPI.

											
										
										
											2015-03-29 20:35:37 +01:00
-												Initial commit of Grid to GitHub

											
										
										
											2015-03-04 03:12:19 +00:00
+								    public:
-												Fixing the Checkerboarding cshift.
Implemented "fake" communications in preparation for the leap to MPI.

											
										
										
											2015-03-29 20:35:37 +01:00
+									typedef dvec  vector_type;
 									typedef RealD scalar_type;
-												Comms and memory benchmarks added

											
										
										
											2015-05-03 09:44:47 +01:00
+								        vRealD()=default;
-												"where" and integer comparisons logic implemented for conditional
assignment. LatticeCoordinate helper to get global (reduced) coordinate.

Some more work of similar type perhaps needed, but the bulk of the required
structure for masked array assignment is now in place.

											
										
										
											2015-04-09 07:06:03 +01:00
+								        vRealD(RealD a){
 									  vsplat(*this,a);
 									};
-												Comms and memory benchmarks added

											
										
										
											2015-05-03 09:44:47 +01:00
+								        vRealD(Zero &zero){
 									  zeroit(*this);
 									}
-												Initial commit of Grid to GitHub

											
										
										
											2015-03-04 03:12:19 +00:00
 								        friend inline void mult(vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) * (*r);}
 								        friend inline void sub (vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) - (*r);}
 								        friend inline void add (vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) + (*r);}
 								        friend inline vRealD adj(const vRealD &in) { return in; }
 								        friend inline vRealD conj(const vRealD &in){ return in; }
 								        friend inline void mac (vRealD &y,const vRealD a,const vRealD x){
-												Improving the trace support to support any index tracing and simplifying
implmentation in some ways

											
										
										
											2015-04-16 14:47:28 +01:00
+								#if defined (AVX1) || defined (SSE4)
-												Initial commit of Grid to GitHub

											
										
										
											2015-03-04 03:12:19 +00:00
+								            y = a*x+y;
 								#endif
 								#ifdef AVX2     // AVX 2 introduced FMA support. FMA4 eliminates a copy, but AVX only has FMA3
 								            // accelerates multiply accumulate, but not general multiply add
 								            y.v = _mm256_fmadd_pd(a.v,x.v,y.v);
 								#endif
 								#ifdef AVX512
 								            // here precision of vector are still single
 								            y.v = _mm512_fmadd_pd(a.v,x.v,y.v);
 								#endif
 								#ifdef QPX
 								            y.v = vec_madd(a.v,x.v,y.v);
 								#endif
 								        }
 								        //////////////////////////////////
 								        // Initialise to 1,0
 								        //////////////////////////////////
 								        friend inline void vone (vRealD &ret){ vsplat(ret,1.0);}
 								        friend inline void vzero(vRealD &ret){ vsplat(ret,0.0);}
 								        ////////////////////////////////////
 								        // Arithmetic operator overloads +,-,*
 								        ////////////////////////////////////
 								        friend inline vRealD operator + (vRealD a, vRealD b)
 								        {
 								            vRealD ret;
 								#if defined (AVX1)|| defined (AVX2)
 								            ret.v = _mm256_add_pd(a.v,b.v);
 								#endif
-												Improving the trace support to support any index tracing and simplifying
implmentation in some ways

											
										
										
											2015-04-16 14:47:28 +01:00
+								#ifdef SSE4
-												Initial commit of Grid to GitHub

											
										
										
											2015-03-04 03:12:19 +00:00
+								            ret.v = _mm_add_pd(a.v,b.v);
 								#endif
 								#ifdef AVX512
 								            ret.v = _mm512_add_pd(a.v,b.v);
 								#endif
 								#ifdef QPX
 								            ret.v = vec_add(a.v,b.v);
 								#endif
 								            return ret;
 								        };
 								        friend inline vRealD operator - (vRealD a, vRealD b)
 								        {
 								            vRealD ret;
 								#if defined (AVX1)|| defined (AVX2)
 								            ret.v = _mm256_sub_pd(a.v,b.v);
 								#endif
-												Improving the trace support to support any index tracing and simplifying
implmentation in some ways

											
										
										
											2015-04-16 14:47:28 +01:00
+								#ifdef SSE4
-												Initial commit of Grid to GitHub

											
										
										
											2015-03-04 03:12:19 +00:00
+								            ret.v = _mm_sub_pd(a.v,b.v);
 								#endif
 								#ifdef AVX512
 								            ret.v = _mm512_sub_pd(a.v,b.v);
 								#endif
 								#ifdef QPX
 								            ret.v = vec_sub(a.v,b.v);
 								#endif
 								            return ret;
 								        };
 								        friend inline vRealD operator * (vRealD a, vRealD b)
 								        {
 								            vRealD ret;
 								#if defined (AVX1)|| defined (AVX2)
 								            ret.v = _mm256_mul_pd(a.v,b.v);
 								#endif
-												Improving the trace support to support any index tracing and simplifying
implmentation in some ways

											
										
										
											2015-04-16 14:47:28 +01:00
+								#ifdef SSE4
-												Initial commit of Grid to GitHub

											
										
										
											2015-03-04 03:12:19 +00:00
+								            ret.v = _mm_mul_pd(a.v,b.v);
 								#endif
 								#ifdef AVX512
 								            ret.v = _mm512_mul_pd(a.v,b.v);
 								#endif
 								#ifdef QPX
 								            ret.v = vec_mul(a.v,b.v);
 								#endif
 								            return ret;
 								        };
-												Fixing the Checkerboarding cshift.
Implemented "fake" communications in preparation for the leap to MPI.

											
										
										
											2015-03-29 20:35:37 +01:00
-												Major rework of extract/merge/permute processing debugged and working.

											
										
										
											2015-04-06 11:26:24 +01:00
+									////////////////////////////////////////////////////////////////////
 									// General permute; assumes vector length is same across
 									// all subtypes; may not be a good assumption, but could
 									// add the vector width as a template param for BG/Q for example
 									////////////////////////////////////////////////////////////////////
-												Shaken out stencil to the point where I think wilson dslash is correct.
Need to audit code carefully, consolidate between stencil and cshift,
and then benchmark and optimise.

											
										
										
											2015-04-28 08:11:59 +01:00
+									/*
-												Major rework of extract/merge/permute processing debugged and working.

											
										
										
											2015-04-06 11:26:24 +01:00
+									friend inline void permute(vRealD &y,vRealD b,int perm)
 									{
 									  Gpermute<vRealD>(y,b,perm);
 									}
 									friend inline void merge(vRealD &y,std::vector<RealD *> &extracted)
 									{
 									  Gmerge<vRealD,RealD >(y,extracted);
 									}
-												"where" and integer comparisons logic implemented for conditional
assignment. LatticeCoordinate helper to get global (reduced) coordinate.

Some more work of similar type perhaps needed, but the bulk of the required
structure for masked array assignment is now in place.

											
										
										
											2015-04-09 07:06:03 +01:00
+									friend inline void extract(const vRealD &y,std::vector<RealD *> &extracted)
 									{
 									  Gextract<vRealD,RealD>(y,extracted);
 									}
 									friend inline void merge(vRealD &y,std::vector<RealD > &extracted)
 									{
 									  Gmerge<vRealD,RealD >(y,extracted);
 									}
 									friend inline void extract(const vRealD &y,std::vector<RealD > &extracted)
-												Major rework of extract/merge/permute processing debugged and working.

											
										
										
											2015-04-06 11:26:24 +01:00
+									{
 									  Gextract<vRealD,RealD>(y,extracted);
 									}
-												Shaken out stencil to the point where I think wilson dslash is correct.
Need to audit code carefully, consolidate between stencil and cshift,
and then benchmark and optimise.

											
										
										
											2015-04-28 08:11:59 +01:00
+									*/
-												Initial commit of Grid to GitHub

											
										
										
											2015-03-04 03:12:19 +00:00
 								        friend inline void vsplat(vRealD &ret,double a){
 								#if defined (AVX1)|| defined (AVX2)
 								            ret.v = _mm256_set_pd(a,a,a,a);
 								#endif
-												Improving the trace support to support any index tracing and simplifying
implmentation in some ways

											
										
										
											2015-04-16 14:47:28 +01:00
+								#ifdef SSE4
-												Initial commit of Grid to GitHub

											
										
										
											2015-03-04 03:12:19 +00:00
+								            ret.v = _mm_set_pd(a,a);
 								#endif
 								#ifdef AVX512
 								            ret.v = _mm512_set1_pd(a);
 								#endif
 								#ifdef QPX
 								            ret.v = {a,a,a,a};
 								#endif
 								        }
 									friend inline void vset(vRealD &ret, double *a){
 								#if defined (AVX1)|| defined (AVX2)
 								            ret.v = _mm256_set_pd(a[3],a[2],a[1],a[0]);
 								#endif
-												Improving the trace support to support any index tracing and simplifying
implmentation in some ways

											
										
										
											2015-04-16 14:47:28 +01:00
+								#ifdef SSE4
-												Big updates with progress towards wilson matrix

											
										
										
											2015-04-26 15:51:09 +01:00
+								            ret.v = _mm_set_pd(a[1],a[0]);
-												Initial commit of Grid to GitHub

											
										
										
											2015-03-04 03:12:19 +00:00
+								#endif
 								#ifdef AVX512
 								            ret.v = _mm512_set_pd(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
 								            // Note v has a0 a1 a2 a3 a4 a5 a6 a7
 								#endif
 								#ifdef QPX
 								            ret.v = {a[0],a[1],a[2],a[3]};
 								#endif
 									}
-												"where" and integer comparisons logic implemented for conditional
assignment. LatticeCoordinate helper to get global (reduced) coordinate.

Some more work of similar type perhaps needed, but the bulk of the required
structure for masked array assignment is now in place.

											
										
										
											2015-04-09 07:06:03 +01:00
+									friend inline void vstore(const vRealD &ret, double *a){
-												Initial commit of Grid to GitHub

											
										
										
											2015-03-04 03:12:19 +00:00
+								#if defined (AVX1)|| defined (AVX2)
 								            _mm256_store_pd(a,ret.v);
 								#endif
-												Improving the trace support to support any index tracing and simplifying
implmentation in some ways

											
										
										
											2015-04-16 14:47:28 +01:00
+								#ifdef SSE4
-												Initial commit of Grid to GitHub

											
										
										
											2015-03-04 03:12:19 +00:00
+								            _mm_store_pd(a,ret.v);
 								#endif
 								#ifdef AVX512
 								            _mm512_store_pd(a,ret.v);
 								            // Note v has a7 a6 a5ba4 a3 a2 a1 a0
 								#endif
 								#ifdef QPX
-												Clean up but no major changes

											
										
										
											2015-04-03 22:54:13 +01:00
+									    assert(0);
-												Added streaming stores

											
										
										
											2015-05-05 18:09:28 +01:00
+								#endif
 									}
 								        friend inline void vstream(vRealD &out,const vRealD &in){
 								#if defined (AVX1)|| defined (AVX2)
 									  _mm256_stream_pd((double *)&out.v,in.v);
 								#endif
 								#ifdef SSE4
 									  _mm_stream_pd((double *)&out.v,in.v);
 								#endif
 								#ifdef AVX512
 									  _mm512_stream_pd((double *)&out.v,in.v);
 									  //Note v has a3 a2 a1 a0
 								#endif
 								#ifdef QPX
 									  assert(0);
-												Initial commit of Grid to GitHub

											
										
										
											2015-03-04 03:12:19 +00:00
+								#endif
 									}
-												Wilson perf improvements with Gauge prefetching

											
										
										
											2015-05-06 06:37:21 +01:00
+								        friend inline void prefetch(const vRealD &v)
-												Initial commit of Grid to GitHub

											
										
										
											2015-03-04 03:12:19 +00:00
+								        {
 								            _mm_prefetch((const char*)&v.v,_MM_HINT_T0);
 								        }
 								        // Unary negation
 								        friend inline vRealD operator -(const vRealD &r) {
 								            vRealD ret;
 								            vzero(ret);
 								            ret = ret - r;
 								            return ret;
 								        }
 								       friend inline RealD Reduce(const vRealD & in)
 								       {
-												Big updates with progress towards wilson matrix

											
										
										
											2015-04-26 15:51:09 +01:00
+								#if defined (SSE4)
 									 // FIXME Hack
 									 const RealD * ptr =(const RealD *)  &in;
 									 RealD ret = 0;
 									 for(int i=0;i<vRealD::Nsimd();i++){
 									   ret = ret+ptr[i];
 									 }
 									 return ret;
 								#endif
-												Initial commit of Grid to GitHub

											
										
										
											2015-03-04 03:12:19 +00:00
+								#if defined (AVX1) || defined(AVX2)
 									 typedef union  {
 									   uint64_t l;
 									   double   d;
 									 } my_conv_t;
 									 my_conv_t converter;
 								// more reduce_add
 								/*
 								            __attribute__ ((aligned(32))) double c_[16];
 									    __m256d tmp  = _mm256_permute2f128_pd(in.v,in.v,0x01); // tmp 1032; in= 3210
 								            __m256d hadd = _mm256_hadd_pd(in.v,tmp);              // hadd = 1+0,3+2,3+2,1+0
 								  	             tmp = _mm256_permute2f128_pd(hadd,hadd,0x01);// tmp  = 3+2,1+0,1+0,3+2
 								                    hadd = _mm256_hadd_pd(tmp,tmp);               // tmp  = 3+2+1+0,3+2+1+0,1+0+3+2,1+0+3+2
 								                    _mm256_store_pd(c_,hadd);<EFBFBD>
 								             return c[0]
 								*/
 									    __m256d tmp  = _mm256_permute2f128_pd(in.v,in.v,0x01); // tmp 1032; in= 3210
 								            __m256d hadd = _mm256_hadd_pd(in.v,tmp);              // hadd = 1+0,3+2,3+2,1+0
 								                    hadd = _mm256_hadd_pd(hadd,hadd);             // hadd = 1+0+3+2...
 									    converter.l = _mm256_extract_epi64(hadd,0);
 								            return converter.d;
 								#endif
 								#ifdef AVX512
 								            return _mm512_reduce_add_pd(in.v);
 								/*
 								            __attribute__ ((aligned(32))) double c_[8];
 								           _mm512_store_pd(c_,in.v);
 								            return c_[0]+c_[1]+c_[2]+c_[3]+c_[4]+c_[5]+c_[6]+c_[7];
 								*/
 								#endif
 								#ifdef QPX
 								#endif
 								        }
 								        // *=,+=,-= operators
 								        inline vRealD &operator *=(const vRealD &r) {
 								            *this = (*this)*r;
 								            return *this;
 								        }
 								        inline vRealD &operator +=(const vRealD &r) {
 								            *this = *this+r;
 								            return *this;
 								        }
 								        inline vRealD &operator -=(const vRealD &r) {
 								            *this = *this-r;
 								            return *this;
 								        }
 								    public:
 								        static int Nsimd(void) { return sizeof(dvec)/sizeof(double);}
 								    };
-												peekIndex update

											
										
										
											2015-04-18 14:36:01 +01:00
+								    inline vRealD innerProduct(const vRealD & l, const vRealD & r) { return conj(l)*r; }
-												Initial commit of Grid to GitHub

											
										
										
											2015-03-04 03:12:19 +00:00
+								    inline void zeroit(vRealD &z){ vzero(z);}
 								    inline vRealD outerProduct(const vRealD &l, const vRealD& r)
 								    {
 								        return l*r;
 								    }
-												Some bug fixes

											
										
										
											2015-04-14 23:20:16 +01:00
+								    inline vRealD trace(const vRealD &arg){
 								        return arg;
 								    }
-												peekIndex update

											
										
										
											2015-04-18 14:36:01 +01:00
+								    inline vRealD real(const vRealD &arg){
 								        return arg;
 								    }
-												Initial commit of Grid to GitHub

											
										
										
											2015-03-04 03:12:19 +00:00
 								}
 								#endif