Grid/lib/simd/Old/Grid_vRealD.h

#ifndef GRID_VREALD_H
#define GRID_VREALD_H

namespace Grid {
    /// SIMD vector of double-precision reals.
    ///
    /// Thin wrapper around the architecture-specific register type `dvec`.
    /// The intrinsics used by each operation are chosen at compile time by the
    /// AVX1 / AVX2 / SSE4 / AVX512 / QPX macros. Nsimd() reports how many
    /// doubles fit in one `dvec`.
    class vRealD  {
    public:
        dvec v; // dvec is double precision vector

    public:
        typedef dvec  vector_type;
        typedef RealD scalar_type;

        vRealD()=default;

        /// Broadcasts the scalar `a` into every lane.
        vRealD(RealD a){
            vsplat(*this,a);
        }

        /// Constructs a vector with every lane set to zero.
        /// Uses the in-class friend vzero (as operator=(Zero&) does) so the
        /// constructor does not depend on a namespace-scope helper that is
        /// only declared after this class.
        vRealD(Zero &zero){
            vzero(*this);
        }

        /// Assigning a Zero tag clears every lane.
        vRealD & operator = ( Zero & z){
            vzero(*this);
            return (*this);
        }

        // Pointer-based binary operations: *y = (*l) op (*r).
        friend inline void mult(vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) * (*r);}
        friend inline void sub (vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) - (*r);}
        friend inline void add (vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) + (*r);}

        // For a real type both adjoint and conjugate return the input unchanged.
        friend inline vRealD adj(const vRealD &in) { return in; }
        friend inline vRealD conjugate(const vRealD &in){ return in; }

        /// Multiply-accumulate: y = a*x + y.
        friend inline void mac (vRealD &y,const vRealD a,const vRealD x){
#if defined (AVX1) || defined (SSE4)
            // No FMA instruction on these targets: separate multiply and add.
            y = a*x+y;
#endif
#ifdef AVX2     // AVX 2 introduced FMA support. FMA4 eliminates a copy, but AVX only has FMA3
            // accelerates multiply accumulate, but not general multiply add
            y.v = _mm256_fmadd_pd(a.v,x.v,y.v);
#endif
#ifdef AVX512
            // double-precision lanes (the _pd form of the intrinsic)
            y.v = _mm512_fmadd_pd(a.v,x.v,y.v);
#endif
#ifdef QPX
            y.v = vec_madd(a.v,x.v,y.v);
#endif
        }

        //////////////////////////////////
        // Initialise to 1,0
        //////////////////////////////////
        friend inline void vone (vRealD &ret){ vsplat(ret,1.0);}
        friend inline void vzero(vRealD &ret){ vsplat(ret,0.0);}

        ////////////////////////////////////
        // Arithmetic operator overloads +,-,*
        // Each one is a lane-wise operation on the underlying registers.
        ////////////////////////////////////
        friend inline vRealD operator + (vRealD a, vRealD b)
        {
            vRealD ret;
#if defined (AVX1)|| defined (AVX2)
            ret.v = _mm256_add_pd(a.v,b.v);
#endif
#ifdef SSE4
            ret.v = _mm_add_pd(a.v,b.v);
#endif
#ifdef AVX512
            ret.v = _mm512_add_pd(a.v,b.v);
#endif
#ifdef QPX
            ret.v = vec_add(a.v,b.v);
#endif
            return ret;
        }

        friend inline vRealD operator - (vRealD a, vRealD b)
        {
            vRealD ret;
#if defined (AVX1)|| defined (AVX2)
            ret.v = _mm256_sub_pd(a.v,b.v);
#endif
#ifdef SSE4
            ret.v = _mm_sub_pd(a.v,b.v);
#endif
#ifdef AVX512
            ret.v = _mm512_sub_pd(a.v,b.v);
#endif
#ifdef QPX
            ret.v = vec_sub(a.v,b.v);
#endif
            return ret;
        }

        friend inline vRealD operator * (vRealD a, vRealD b)
        {
            vRealD ret;
#if defined (AVX1)|| defined (AVX2)
            ret.v = _mm256_mul_pd(a.v,b.v);
#endif
#ifdef SSE4
            ret.v = _mm_mul_pd(a.v,b.v);
#endif
#ifdef AVX512
            ret.v = _mm512_mul_pd(a.v,b.v);
#endif
#ifdef QPX
            ret.v = vec_mul(a.v,b.v);
#endif
            return ret;
        }

        ////////////////////////////////////////////////////////////////////
        // General permute; assumes vector length is same across
        // all subtypes; may not be a good assumption, but could
        // add the vector width as a template param for BG/Q for example
        ////////////////////////////////////////////////////////////////////

        /// Writes into y the permutation of b selected by `perm`; the lane
        /// pattern itself is defined by Gpermute, which lives outside this file.
        friend inline void permute(vRealD &y,vRealD b,int perm)
        {
            Gpermute<vRealD>(y,b,perm);
        }
        /*
        friend inline void merge(vRealD &y,std::vector<RealD *> &extracted)
        {
          Gmerge<vRealD,RealD >(y,extracted);
        }
        friend inline void extract(const vRealD &y,std::vector<RealD *> &extracted)
        {
          Gextract<vRealD,RealD>(y,extracted);
        }
        friend inline void merge(vRealD &y,std::vector<RealD > &extracted)
        {
          Gmerge<vRealD,RealD >(y,extracted);
        }
        friend inline void extract(const vRealD &y,std::vector<RealD > &extracted)
        {
          Gextract<vRealD,RealD>(y,extracted);
        }
        */

        /// Broadcasts `a` into every lane of ret.
        friend inline void vsplat(vRealD &ret,double a){
#if defined (AVX1)|| defined (AVX2)
            ret.v = _mm256_set1_pd(a);
#endif
#ifdef SSE4
            ret.v = _mm_set1_pd(a);
#endif
#ifdef AVX512
            ret.v = _mm512_set1_pd(a);
#endif
#ifdef QPX
            ret.v = {a,a,a,a};
#endif
        }

        /// Fills ret from the array `a`, which must hold at least as many
        /// doubles as the register has lanes for the selected architecture.
        friend inline void vset(vRealD &ret, double *a){
#if defined (AVX1)|| defined (AVX2)
            // _mm256_set_pd takes its arguments highest lane first.
            ret.v = _mm256_set_pd(a[3],a[2],a[1],a[0]);
#endif
#ifdef SSE4
            ret.v = _mm_set_pd(a[1],a[0]);
#endif
#ifdef AVX512
            ret.v = _mm512_set_pd(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
            // Note v has a0 a1 a2 a3 a4 a5 a6 a7
#endif
#ifdef QPX
            ret.v = {a[0],a[1],a[2],a[3]};
#endif
        }

        /// Stores the lanes of ret to the array `a`.
        /// The x86 intrinsics used here are the aligned-store variants, so `a`
        /// must be aligned to the register width. Not implemented for QPX.
        friend inline void vstore(const vRealD &ret, double *a){
#if defined (AVX1)|| defined (AVX2)
            _mm256_store_pd(a,ret.v);
#endif
#ifdef SSE4
            _mm_store_pd(a,ret.v);
#endif
#ifdef AVX512
            _mm512_store_pd(a,ret.v);
            // Note v has a7 a6 a5 a4 a3 a2 a1 a0
#endif
#ifdef QPX
            assert(0);
#endif
        }

        /// Copies `in` to `out` using the architecture's streaming
        /// (non-temporal) store intrinsic. Not implemented for QPX.
        friend inline void vstream(vRealD &out,const vRealD &in){
#if defined (AVX1)|| defined (AVX2)
            _mm256_stream_pd((double *)&out.v,in.v);
#endif
#ifdef SSE4
            _mm_stream_pd((double *)&out.v,in.v);
#endif
#ifdef AVX512
            _mm512_storenrngo_pd((double *)&out.v,in.v);
            // Note v has a7 a6 a5 a4 a3 a2 a1 a0
#endif
#ifdef QPX
            assert(0);
#endif
        }

        /// Issues a T0 prefetch hint for the memory holding v.
        // NOTE(review): _mm_prefetch is used on every target, including QPX;
        // confirm a definition is provided for non-x86 builds.
        friend inline void prefetch(const vRealD &v)
        {
            _mm_prefetch((const char*)&v.v,_MM_HINT_T0);
        }

        // Unary negation, computed as (0 - r).
        friend inline vRealD operator -(const vRealD &r) {
            vRealD ret;
            vzero(ret);
            ret = ret - r;
            return ret;
        }

        /// Reduces the vector to a single scalar by repeatedly adding a
        /// permuted copy of the running value to itself and returning lane 0.
        /// Presumably this is the sum of all lanes; that depends on the lane
        /// patterns Gpermute applies for perm = 0,1,2 (defined elsewhere).
        friend inline RealD Reduce(const vRealD & in)
        {
            vRealD v1,v2;
            // The union exposes the register's lanes as plain doubles.
            union {
                dvec v;
                double f[sizeof(dvec)/sizeof(double)];
            } conv;
#ifdef SSE4
            permute(v1,in,0); // sse 128; paired real double
            v1=v1+in;
#endif
#if defined(AVX1) || defined (AVX2)
            permute(v1,in,0); // avx 256; quad double
            v1=v1+in;
            permute(v2,v1,1);
            v1=v1+v2;
#endif
#ifdef AVX512
            permute(v1,in,0); // avx 512; octo-double
            v1=v1+in;
            permute(v2,v1,1);
            v1=v1+v2;
            permute(v2,v1,2);
            v1=v1+v2;
#endif
#ifdef QPX
            // Not implemented: without this guard v1 would be read below while
            // still uninitialised. Fail loudly, as vstore/vstream do on QPX.
            assert(0);
#endif
            conv.v=v1.v;
            return conv.f[0];
        }

        // *=,+=,-= operators, each expressed through the binary operator.
        inline vRealD &operator *=(const vRealD &r) {
            *this = (*this)*r;
            return *this;
        }
        inline vRealD &operator +=(const vRealD &r) {
            *this = *this+r;
            return *this;
        }
        inline vRealD &operator -=(const vRealD &r) {
            *this = *this-r;
            return *this;
        }

    public:
        /// Number of doubles held in one dvec register.
        static int Nsimd(void) { return sizeof(dvec)/sizeof(double);}
    };

    /// Inner product of two real SIMD vectors: conjugate(l) multiplied by r.
    inline vRealD innerProduct(const vRealD & l, const vRealD & r)
    {
        const vRealD lhs = conjugate(l);
        return lhs * r;
    }
    /// Clears z by forwarding to vzero.
    inline void zeroit(vRealD &z)
    {
        vzero(z);
    }

    /// Outer product of two real SIMD vectors, which is simply l * r.
    inline vRealD outerProduct(const vRealD &l, const vRealD& r)
    {
        vRealD product = l * r;
        return product;
    }
    /// Trace of a vRealD: returns a copy of the argument.
    inline vRealD trace(const vRealD &arg)
    {
        vRealD result(arg);
        return result;
    }
    /// Real part of a vRealD: returns a copy of the argument.
    inline vRealD real(const vRealD &arg)
    {
        vRealD result(arg);
        return result;
    }


}
#endif
Reorganise to keep files smaller 2015-04-18 18:36:48 +01:00			`#ifndef GRID_VREALD_H`
			`#define GRID_VREALD_H`
Initial commit of Grid to GitHub 2015-03-04 03:12:19 +00:00
Renamed the namespace to Grid 2015-04-03 05:29:54 +01:00			`namespace Grid {`
Initial commit of Grid to GitHub 2015-03-04 03:12:19 +00:00			`class vRealD {`
Major rework of extract/merge/permute processing debugged and working. 2015-04-06 11:26:24 +01:00			`public:`
Initial commit of Grid to GitHub 2015-03-04 03:12:19 +00:00			`dvec v; // dvec is double precision vector`
Fixing the Checkerboarding cshift. Implemented "fake" communications in preparation for the leap to MPI. 2015-03-29 20:35:37 +01:00
Initial commit of Grid to GitHub 2015-03-04 03:12:19 +00:00			`public:`
Fixing the Checkerboarding cshift. Implemented "fake" communications in preparation for the leap to MPI. 2015-03-29 20:35:37 +01:00			`typedef dvec vector_type;`
			`typedef RealD scalar_type;`

Comms and memory benchmarks added 2015-05-03 09:44:47 +01:00			`vRealD()=default;`
"where" and integer comparisons logic implemented for conditional assignment. LatticeCoordinate helper to get global (reduced) coordinate. Some more work of similar type perhaps needed, but the bulk of the required structure for masked array assignment is now in place. 2015-04-09 07:06:03 +01:00			`vRealD(RealD a){`
			`vsplat(*this,a);`
			`};`
Comms and memory benchmarks added 2015-05-03 09:44:47 +01:00			`vRealD(Zero &zero){`
			`zeroit(*this);`
			`}`
Threading support rework. Placed parallel pragmas as macros; implemented deterministic thread reduction in style of BFM. 2015-05-12 07:51:41 +01:00			`vRealD & operator = ( Zero & z){`
			`vzero(*this);`
			`return (*this);`
			`}`
Initial commit of Grid to GitHub 2015-03-04 03:12:19 +00:00
			`friend inline void mult(vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD __restrict__ r) {y = (l) (*r);}`
			`friend inline void sub (vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD __restrict__ r) {y = (l) - (r);}`
			`friend inline void add (vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD __restrict__ r) {y = (l) + (r);}`
			`friend inline vRealD adj(const vRealD &in) { return in; }`
Got unpreconditioned conjugate gradient to run and converge on a random (uniform random, not even SU(3) for now) gauge field. Convergence history is correctly indepdendent of decomposition on 1,2,4,8,16 mpi tasks. Found a couple of simd bugs which required fixed and enhanced the Grid_simd.cc test suite. Implemented the Mdag, M, MdagM, Meooe Mooee schur type stuff in the wilson dop. 2015-05-19 13:57:35 +01:00			`friend inline vRealD conjugate(const vRealD &in){ return in; }`
Initial commit of Grid to GitHub 2015-03-04 03:12:19 +00:00
			`friend inline void mac (vRealD &y,const vRealD a,const vRealD x){`
Improving the trace support to support any index tracing and simplifying implmentation in some ways 2015-04-16 14:47:28 +01:00			`#if defined (AVX1) \|\| defined (SSE4)`
Initial commit of Grid to GitHub 2015-03-04 03:12:19 +00:00			`y = a*x+y;`
			`#endif`
			`#ifdef AVX2 // AVX 2 introduced FMA support. FMA4 eliminates a copy, but AVX only has FMA3`
			`// accelerates multiply accumulate, but not general multiply add`
			`y.v = _mm256_fmadd_pd(a.v,x.v,y.v);`
			`#endif`
			`#ifdef AVX512`
			`// here precision of vector are still single`
			`y.v = _mm512_fmadd_pd(a.v,x.v,y.v);`
			`#endif`
			`#ifdef QPX`
			`y.v = vec_madd(a.v,x.v,y.v);`
			`#endif`
			`}`
			`//////////////////////////////////`
			`// Initialise to 1,0`
			`//////////////////////////////////`
			`friend inline void vone (vRealD &ret){ vsplat(ret,1.0);}`
			`friend inline void vzero(vRealD &ret){ vsplat(ret,0.0);}`


			`////////////////////////////////////`
			`// Arithmetic operator overloads +,-,*`
			`////////////////////////////////////`
			`friend inline vRealD operator + (vRealD a, vRealD b)`
			`{`
			`vRealD ret;`
			`#if defined (AVX1)\|\| defined (AVX2)`
			`ret.v = _mm256_add_pd(a.v,b.v);`
			`#endif`
Improving the trace support to support any index tracing and simplifying implmentation in some ways 2015-04-16 14:47:28 +01:00			`#ifdef SSE4`
Initial commit of Grid to GitHub 2015-03-04 03:12:19 +00:00			`ret.v = _mm_add_pd(a.v,b.v);`
			`#endif`
			`#ifdef AVX512`
			`ret.v = _mm512_add_pd(a.v,b.v);`
			`#endif`
			`#ifdef QPX`
			`ret.v = vec_add(a.v,b.v);`
			`#endif`
			`return ret;`
			`};`
			`friend inline vRealD operator - (vRealD a, vRealD b)`
			`{`
			`vRealD ret;`
			`#if defined (AVX1)\|\| defined (AVX2)`
			`ret.v = _mm256_sub_pd(a.v,b.v);`
			`#endif`
Improving the trace support to support any index tracing and simplifying implmentation in some ways 2015-04-16 14:47:28 +01:00			`#ifdef SSE4`
Initial commit of Grid to GitHub 2015-03-04 03:12:19 +00:00			`ret.v = _mm_sub_pd(a.v,b.v);`
			`#endif`
			`#ifdef AVX512`
			`ret.v = _mm512_sub_pd(a.v,b.v);`
			`#endif`
			`#ifdef QPX`
			`ret.v = vec_sub(a.v,b.v);`
			`#endif`
			`return ret;`
			`};`

			`friend inline vRealD operator * (vRealD a, vRealD b)`
			`{`
			`vRealD ret;`
			`#if defined (AVX1)\|\| defined (AVX2)`
			`ret.v = _mm256_mul_pd(a.v,b.v);`
			`#endif`
Improving the trace support to support any index tracing and simplifying implmentation in some ways 2015-04-16 14:47:28 +01:00			`#ifdef SSE4`
Initial commit of Grid to GitHub 2015-03-04 03:12:19 +00:00			`ret.v = _mm_mul_pd(a.v,b.v);`
			`#endif`
			`#ifdef AVX512`
			`ret.v = _mm512_mul_pd(a.v,b.v);`
			`#endif`
			`#ifdef QPX`
			`ret.v = vec_mul(a.v,b.v);`
			`#endif`
			`return ret;`
			`};`
Fixing the Checkerboarding cshift. Implemented "fake" communications in preparation for the leap to MPI. 2015-03-29 20:35:37 +01:00
Major rework of extract/merge/permute processing debugged and working. 2015-04-06 11:26:24 +01:00			`////////////////////////////////////////////////////////////////////`
			`// General permute; assumes vector length is same across`
			`// all subtypes; may not be a good assumption, but could`
			`// add the vector width as a template param for BG/Q for example`
			`////////////////////////////////////////////////////////////////////`
Got unpreconditioned conjugate gradient to run and converge on a random (uniform random, not even SU(3) for now) gauge field. Convergence history is correctly indepdendent of decomposition on 1,2,4,8,16 mpi tasks. Found a couple of simd bugs which required fixed and enhanced the Grid_simd.cc test suite. Implemented the Mdag, M, MdagM, Meooe Mooee schur type stuff in the wilson dop. 2015-05-19 13:57:35 +01:00
Major rework of extract/merge/permute processing debugged and working. 2015-04-06 11:26:24 +01:00			`friend inline void permute(vRealD &y,vRealD b,int perm)`
			`{`
			`Gpermute<vRealD>(y,b,perm);`
			`}`
Got unpreconditioned conjugate gradient to run and converge on a random (uniform random, not even SU(3) for now) gauge field. Convergence history is correctly indepdendent of decomposition on 1,2,4,8,16 mpi tasks. Found a couple of simd bugs which required fixed and enhanced the Grid_simd.cc test suite. Implemented the Mdag, M, MdagM, Meooe Mooee schur type stuff in the wilson dop. 2015-05-19 13:57:35 +01:00			`/*`
Major rework of extract/merge/permute processing debugged and working. 2015-04-06 11:26:24 +01:00			`friend inline void merge(vRealD &y,std::vector<RealD *> &extracted)`
			`{`
			`Gmerge<vRealD,RealD >(y,extracted);`
			`}`
"where" and integer comparisons logic implemented for conditional assignment. LatticeCoordinate helper to get global (reduced) coordinate. Some more work of similar type perhaps needed, but the bulk of the required structure for masked array assignment is now in place. 2015-04-09 07:06:03 +01:00			`friend inline void extract(const vRealD &y,std::vector<RealD *> &extracted)`
			`{`
			`Gextract<vRealD,RealD>(y,extracted);`
			`}`
			`friend inline void merge(vRealD &y,std::vector<RealD > &extracted)`
			`{`
			`Gmerge<vRealD,RealD >(y,extracted);`
			`}`
			`friend inline void extract(const vRealD &y,std::vector<RealD > &extracted)`
Major rework of extract/merge/permute processing debugged and working. 2015-04-06 11:26:24 +01:00			`{`
			`Gextract<vRealD,RealD>(y,extracted);`
			`}`
Shaken out stencil to the point where I think wilson dslash is correct. Need to audit code carefully, consolidate between stencil and cshift, and then benchmark and optimise. 2015-04-28 08:11:59 +01:00			`*/`
Initial commit of Grid to GitHub 2015-03-04 03:12:19 +00:00
			`friend inline void vsplat(vRealD &ret,double a){`
			`#if defined (AVX1)\|\| defined (AVX2)`
			`ret.v = _mm256_set_pd(a,a,a,a);`
			`#endif`
Improving the trace support to support any index tracing and simplifying implmentation in some ways 2015-04-16 14:47:28 +01:00			`#ifdef SSE4`
Initial commit of Grid to GitHub 2015-03-04 03:12:19 +00:00			`ret.v = _mm_set_pd(a,a);`
			`#endif`
			`#ifdef AVX512`
			`ret.v = _mm512_set1_pd(a);`
			`#endif`
			`#ifdef QPX`
			`ret.v = {a,a,a,a};`
			`#endif`
			`}`
			`friend inline void vset(vRealD &ret, double *a){`
			`#if defined (AVX1)\|\| defined (AVX2)`
			`ret.v = _mm256_set_pd(a[3],a[2],a[1],a[0]);`
			`#endif`
Improving the trace support to support any index tracing and simplifying implmentation in some ways 2015-04-16 14:47:28 +01:00			`#ifdef SSE4`
Big updates with progress towards wilson matrix 2015-04-26 15:51:09 +01:00			`ret.v = _mm_set_pd(a[1],a[0]);`
Initial commit of Grid to GitHub 2015-03-04 03:12:19 +00:00			`#endif`
			`#ifdef AVX512`
			`ret.v = _mm512_set_pd(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);`
			`// Note v has a0 a1 a2 a3 a4 a5 a6 a7`
			`#endif`
			`#ifdef QPX`
			`ret.v = {a[0],a[1],a[2],a[3]};`
			`#endif`
			`}`

"where" and integer comparisons logic implemented for conditional assignment. LatticeCoordinate helper to get global (reduced) coordinate. Some more work of similar type perhaps needed, but the bulk of the required structure for masked array assignment is now in place. 2015-04-09 07:06:03 +01:00			`friend inline void vstore(const vRealD &ret, double *a){`
Initial commit of Grid to GitHub 2015-03-04 03:12:19 +00:00			`#if defined (AVX1)\|\| defined (AVX2)`
			`_mm256_store_pd(a,ret.v);`
			`#endif`
Improving the trace support to support any index tracing and simplifying implmentation in some ways 2015-04-16 14:47:28 +01:00			`#ifdef SSE4`
Initial commit of Grid to GitHub 2015-03-04 03:12:19 +00:00			`_mm_store_pd(a,ret.v);`
			`#endif`
			`#ifdef AVX512`
			`_mm512_store_pd(a,ret.v);`
			`// Note v has a7 a6 a5ba4 a3 a2 a1 a0`
			`#endif`
			`#ifdef QPX`
Clean up but no major changes 2015-04-03 22:54:13 +01:00			`assert(0);`
Added streaming stores 2015-05-05 18:09:28 +01:00			`#endif`
			`}`
			`friend inline void vstream(vRealD &out,const vRealD &in){`
			`#if defined (AVX1)\|\| defined (AVX2)`
			`_mm256_stream_pd((double *)&out.v,in.v);`
			`#endif`
			`#ifdef SSE4`
			`_mm_stream_pd((double *)&out.v,in.v);`
			`#endif`
			`#ifdef AVX512`
Lots of changes required to compile for MIC under ICPC 2015-05-10 23:29:21 +01:00			`_mm512_storenrngo_pd((double *)&out.v,in.v);`
Added streaming stores 2015-05-05 18:09:28 +01:00			`//Note v has a3 a2 a1 a0`
			`#endif`
			`#ifdef QPX`
			`assert(0);`
Initial commit of Grid to GitHub 2015-03-04 03:12:19 +00:00			`#endif`
			`}`
Wilson perf improvements with Gauge prefetching 2015-05-06 06:37:21 +01:00			`friend inline void prefetch(const vRealD &v)`
Initial commit of Grid to GitHub 2015-03-04 03:12:19 +00:00			`{`
			`_mm_prefetch((const char*)&v.v,_MM_HINT_T0);`
			`}`
			`// Unary negation`
			`friend inline vRealD operator -(const vRealD &r) {`
			`vRealD ret;`
			`vzero(ret);`
			`ret = ret - r;`
			`return ret;`
			`}`

			`friend inline RealD Reduce(const vRealD & in)`
			`{`
Reworking to keep intel compiler happy 2015-05-19 21:29:07 +01:00			`vRealD v1,v2;`
			`union {`
			`dvec v;`
			`double f[sizeof(dvec)/sizeof(double)];`
			`} conv;`
Got unpreconditioned conjugate gradient to run and converge on a random (uniform random, not even SU(3) for now) gauge field. Convergence history is correctly indepdendent of decomposition on 1,2,4,8,16 mpi tasks. Found a couple of simd bugs which required fixed and enhanced the Grid_simd.cc test suite. Implemented the Mdag, M, MdagM, Meooe Mooee schur type stuff in the wilson dop. 2015-05-19 13:57:35 +01:00			`#ifdef SSE4`
			`permute(v1,in,0); // sse 128; paired real double`
			`v1=v1+in;`
			`#endif`
			`#if defined(AVX1) \|\| defined (AVX2)`
			`permute(v1,in,0); // avx 256; quad double`
			`v1=v1+in;`
			`permute(v2,v1,1);`
			`v1=v1+v2;`
Initial commit of Grid to GitHub 2015-03-04 03:12:19 +00:00			`#endif`
			`#ifdef AVX512`
Reworking to keep intel compiler happy 2015-05-19 21:29:07 +01:00			`permute(v1,in,0); // avx 512; octo-double`
			`v1=v1+in;`
			`permute(v2,v1,1);`
			`v1=v1+v2;`
			`permute(v2,v1,2);`
			`v1=v1+v2;`
Initial commit of Grid to GitHub 2015-03-04 03:12:19 +00:00			`#endif`
			`#ifdef QPX`
			`#endif`
Reworking to keep intel compiler happy 2015-05-19 21:29:07 +01:00			`conv.v=v1.v;`
			`return conv.f[0];`
Got unpreconditioned conjugate gradient to run and converge on a random (uniform random, not even SU(3) for now) gauge field. Convergence history is correctly indepdendent of decomposition on 1,2,4,8,16 mpi tasks. Found a couple of simd bugs which required fixed and enhanced the Grid_simd.cc test suite. Implemented the Mdag, M, MdagM, Meooe Mooee schur type stuff in the wilson dop. 2015-05-19 13:57:35 +01:00			`}`
Initial commit of Grid to GitHub 2015-03-04 03:12:19 +00:00
			`// *=,+=,-= operators`
			`inline vRealD &operator *=(const vRealD &r) {`
			`this = (this)*r;`
			`return *this;`
			`}`
			`inline vRealD &operator +=(const vRealD &r) {`
			`this = this+r;`
			`return *this;`
			`}`
			`inline vRealD &operator -=(const vRealD &r) {`
			`this = this-r;`
			`return *this;`
			`}`

			`public:`
			`static int Nsimd(void) { return sizeof(dvec)/sizeof(double);}`
			`};`

Got unpreconditioned conjugate gradient to run and converge on a random (uniform random, not even SU(3) for now) gauge field. Convergence history is correctly indepdendent of decomposition on 1,2,4,8,16 mpi tasks. Found a couple of simd bugs which required fixed and enhanced the Grid_simd.cc test suite. Implemented the Mdag, M, MdagM, Meooe Mooee schur type stuff in the wilson dop. 2015-05-19 13:57:35 +01:00			`inline vRealD innerProduct(const vRealD & l, const vRealD & r) { return conjugate(l)*r; }`
Initial commit of Grid to GitHub 2015-03-04 03:12:19 +00:00			`inline void zeroit(vRealD &z){ vzero(z);}`

			`inline vRealD outerProduct(const vRealD &l, const vRealD& r)`
			`{`
			`return l*r;`
			`}`
Some bug fixes 2015-04-14 23:20:16 +01:00			`inline vRealD trace(const vRealD &arg){`
			`return arg;`
			`}`
peekIndex update 2015-04-18 14:36:01 +01:00			`inline vRealD real(const vRealD &arg){`
			`return arg;`
			`}`
Initial commit of Grid to GitHub 2015-03-04 03:12:19 +00:00

			`}`
			`#endif`