mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-10-30 11:34:32 +00:00 
			
		
		
		
	Merge branch 'develop' into feature/mres_schur
* develop: Hand unrolled to use optimised code paths on GPU for coalesced reads in Wilson case. Other cases to do. This now includes comms code path. Better SIMD usage/coalescence
This commit is contained in:
		| @@ -67,8 +67,13 @@ public: | ||||
|   accelerator_inline const vobj & operator()(size_t i) const { return this->_odata[i]; } | ||||
| #endif | ||||
|  | ||||
| #if 1 | ||||
|   //  accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; }; | ||||
|   accelerator_inline vobj       & operator[](size_t i) const { return this->_odata[i]; }; | ||||
| #else | ||||
|   accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; }; | ||||
|   accelerator_inline vobj       & operator[](size_t i)       { return this->_odata[i]; }; | ||||
| #endif | ||||
|    | ||||
|   accelerator_inline uint64_t begin(void) const { return 0;}; | ||||
|   accelerator_inline uint64_t end(void)   const { return this->_odata_size; }; | ||||
|   | ||||
| @@ -76,6 +76,23 @@ Author: paboyle <paboyle@ph.ed.ac.uk> | ||||
|  | ||||
| #define REGISTER | ||||
|  | ||||
| #ifdef GRID_SIMT | ||||
| #define LOAD_CHIMU(ptype)		\ | ||||
|   {const SiteSpinor & ref (in[offset]);	\ | ||||
|     Chimu_00=coalescedReadPermute<ptype>(ref()(0)(0),perm);	\ | ||||
|     Chimu_01=coalescedReadPermute<ptype>(ref()(0)(1),perm);	\ | ||||
|     Chimu_02=coalescedReadPermute<ptype>(ref()(0)(2),perm);	\ | ||||
|     Chimu_10=coalescedReadPermute<ptype>(ref()(1)(0),perm);	\ | ||||
|     Chimu_11=coalescedReadPermute<ptype>(ref()(1)(1),perm);	\ | ||||
|     Chimu_12=coalescedReadPermute<ptype>(ref()(1)(2),perm);	\ | ||||
|     Chimu_20=coalescedReadPermute<ptype>(ref()(2)(0),perm);	\ | ||||
|     Chimu_21=coalescedReadPermute<ptype>(ref()(2)(1),perm);	\ | ||||
|     Chimu_22=coalescedReadPermute<ptype>(ref()(2)(2),perm);	\ | ||||
|     Chimu_30=coalescedReadPermute<ptype>(ref()(3)(0),perm);	\ | ||||
|     Chimu_31=coalescedReadPermute<ptype>(ref()(3)(1),perm);	\ | ||||
|     Chimu_32=coalescedReadPermute<ptype>(ref()(3)(2),perm);	} | ||||
| #define PERMUTE_DIR(dir) ; | ||||
| #else | ||||
| #define LOAD_CHIMU			\ | ||||
|   {const SiteSpinor & ref (in[offset]);	\ | ||||
|     Chimu_00=ref()(0)(0);\ | ||||
| @@ -91,24 +108,33 @@ Author: paboyle <paboyle@ph.ed.ac.uk> | ||||
|     Chimu_31=ref()(3)(1);\ | ||||
|     Chimu_32=ref()(3)(2);} | ||||
|  | ||||
| #define PERMUTE_DIR(dir)			\ | ||||
|   permute##dir(Chi_00,Chi_00);	\ | ||||
|       permute##dir(Chi_01,Chi_01);\ | ||||
|       permute##dir(Chi_02,Chi_02);\ | ||||
|       permute##dir(Chi_10,Chi_10);	\ | ||||
|       permute##dir(Chi_11,Chi_11);\ | ||||
|       permute##dir(Chi_12,Chi_12); | ||||
|  | ||||
| #endif | ||||
|  | ||||
| #define LOAD_CHI				\ | ||||
|   {const SiteHalfSpinor &ref(buf[offset]);	\ | ||||
|     Chi_00 = ref()(0)(0);\ | ||||
|     Chi_01 = ref()(0)(1);\ | ||||
|     Chi_02 = ref()(0)(2);\ | ||||
|     Chi_10 = ref()(1)(0);\ | ||||
|     Chi_11 = ref()(1)(1);\ | ||||
|     Chi_12 = ref()(1)(2);} | ||||
|     Chi_00 = coalescedRead(ref()(0)(0));	\ | ||||
|     Chi_01 = coalescedRead(ref()(0)(1));	\ | ||||
|     Chi_02 = coalescedRead(ref()(0)(2));	\ | ||||
|     Chi_10 = coalescedRead(ref()(1)(0));	\ | ||||
|     Chi_11 = coalescedRead(ref()(1)(1));	\ | ||||
|     Chi_12 = coalescedRead(ref()(1)(2));} | ||||
|  | ||||
| // To splat or not to splat depends on the implementation | ||||
| #define MULT_2SPIN(A)\ | ||||
|   {auto & ref(U[sU](A));					\ | ||||
|    Impl::loadLinkElement(U_00,ref()(0,0));	\ | ||||
|    Impl::loadLinkElement(U_10,ref()(1,0));	\ | ||||
|    Impl::loadLinkElement(U_20,ref()(2,0));	\ | ||||
|    Impl::loadLinkElement(U_01,ref()(0,1));	\ | ||||
|    Impl::loadLinkElement(U_11,ref()(1,1));	\ | ||||
|    Impl::loadLinkElement(U_21,ref()(2,1));	\ | ||||
|   U_00=coalescedRead(ref()(0,0));				\ | ||||
|   U_10=coalescedRead(ref()(1,0));					\ | ||||
|   U_20=coalescedRead(ref()(2,0));					\ | ||||
|   U_01=coalescedRead(ref()(0,1));					\ | ||||
|   U_11=coalescedRead(ref()(1,1));					\ | ||||
|   U_21=coalescedRead(ref()(2,1));					\ | ||||
|     UChi_00 = U_00*Chi_00;					\ | ||||
|     UChi_10 = U_00*Chi_10;					\ | ||||
|     UChi_01 = U_10*Chi_00;					\ | ||||
| @@ -121,9 +147,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk> | ||||
|     UChi_11+= U_11*Chi_11;					\ | ||||
|     UChi_02+= U_21*Chi_01;					\ | ||||
|     UChi_12+= U_21*Chi_11;					\ | ||||
|     Impl::loadLinkElement(U_00,ref()(0,2));	\ | ||||
|     Impl::loadLinkElement(U_10,ref()(1,2));	\ | ||||
|     Impl::loadLinkElement(U_20,ref()(2,2));	\ | ||||
|     U_00=coalescedRead(ref()(0,2));				\ | ||||
|     U_10=coalescedRead(ref()(1,2));				\ | ||||
|     U_20=coalescedRead(ref()(2,2));				\ | ||||
|     UChi_00+= U_00*Chi_02;					\ | ||||
|     UChi_10+= U_00*Chi_12;					\ | ||||
|     UChi_01+= U_10*Chi_02;					\ | ||||
| @@ -131,15 +157,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk> | ||||
|     UChi_02+= U_20*Chi_02;					\ | ||||
|     UChi_12+= U_20*Chi_12;} | ||||
|  | ||||
|  | ||||
| #define PERMUTE_DIR(dir)			\ | ||||
|       permute##dir(Chi_00,Chi_00);\ | ||||
|       permute##dir(Chi_01,Chi_01);\ | ||||
|       permute##dir(Chi_02,Chi_02);\ | ||||
|       permute##dir(Chi_10,Chi_10);\ | ||||
|       permute##dir(Chi_11,Chi_11);\ | ||||
|       permute##dir(Chi_12,Chi_12); | ||||
|  | ||||
| //      hspin(0)=fspin(0)+timesI(fspin(3)); | ||||
| //      hspin(1)=fspin(1)+timesI(fspin(2)); | ||||
| #define XP_PROJ \ | ||||
| @@ -359,7 +376,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk> | ||||
|   local  = SE->_is_local;			\ | ||||
|   perm   = SE->_permute;			\ | ||||
|   if ( local ) {				\ | ||||
|     LOAD_CHIMU;					\ | ||||
|     LOAD_CHIMU(PERM);				\ | ||||
|     PROJ;					\ | ||||
|     if ( perm) {				\ | ||||
|       PERMUTE_DIR(PERM);			\ | ||||
| @@ -376,7 +393,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk> | ||||
|   local  = SE->_is_local;			\ | ||||
|   perm   = SE->_permute;			\ | ||||
|   if ( local ) {				\ | ||||
|     LOAD_CHIMU;					\ | ||||
|     LOAD_CHIMU(PERM);				\ | ||||
|     PROJ;					\ | ||||
|     if ( perm) {				\ | ||||
|       PERMUTE_DIR(PERM);			\ | ||||
| @@ -402,39 +419,38 @@ Author: paboyle <paboyle@ph.ed.ac.uk> | ||||
| #define HAND_RESULT(ss)				\ | ||||
|   {						\ | ||||
|     SiteSpinor & ref (out[ss]);			\ | ||||
|     vstream(ref()(0)(0),result_00);		\ | ||||
|     vstream(ref()(0)(1),result_01);		\ | ||||
|     vstream(ref()(0)(2),result_02);		\ | ||||
|     vstream(ref()(1)(0),result_10);		\ | ||||
|     vstream(ref()(1)(1),result_11);		\ | ||||
|     vstream(ref()(1)(2),result_12);		\ | ||||
|     vstream(ref()(2)(0),result_20);		\ | ||||
|     vstream(ref()(2)(1),result_21);		\ | ||||
|     vstream(ref()(2)(2),result_22);		\ | ||||
|     vstream(ref()(3)(0),result_30);		\ | ||||
|     vstream(ref()(3)(1),result_31);		\ | ||||
|     vstream(ref()(3)(2),result_32);		\ | ||||
|     coalescedWrite(ref()(0)(0),result_00);		\ | ||||
|     coalescedWrite(ref()(0)(1),result_01);		\ | ||||
|     coalescedWrite(ref()(0)(2),result_02);		\ | ||||
|     coalescedWrite(ref()(1)(0),result_10);		\ | ||||
|     coalescedWrite(ref()(1)(1),result_11);		\ | ||||
|     coalescedWrite(ref()(1)(2),result_12);		\ | ||||
|     coalescedWrite(ref()(2)(0),result_20);		\ | ||||
|     coalescedWrite(ref()(2)(1),result_21);		\ | ||||
|     coalescedWrite(ref()(2)(2),result_22);		\ | ||||
|     coalescedWrite(ref()(3)(0),result_30);		\ | ||||
|     coalescedWrite(ref()(3)(1),result_31);		\ | ||||
|     coalescedWrite(ref()(3)(2),result_32);		\ | ||||
|   } | ||||
|  | ||||
| #define HAND_RESULT_EXT(ss)				\ | ||||
|   if (nmu){					\ | ||||
|   {							\ | ||||
|     SiteSpinor & ref (out[ss]);				\ | ||||
|     ref()(0)(0)+=result_00;		\ | ||||
|     ref()(0)(1)+=result_01;		\ | ||||
|     ref()(0)(2)+=result_02;		\ | ||||
|     ref()(1)(0)+=result_10;		\ | ||||
|     ref()(1)(1)+=result_11;		\ | ||||
|     ref()(1)(2)+=result_12;		\ | ||||
|     ref()(2)(0)+=result_20;		\ | ||||
|     ref()(2)(1)+=result_21;		\ | ||||
|     ref()(2)(2)+=result_22;		\ | ||||
|     ref()(3)(0)+=result_30;		\ | ||||
|     ref()(3)(1)+=result_31;		\ | ||||
|     ref()(3)(2)+=result_32;		\ | ||||
|     coalescedWrite(ref()(0)(0),coalescedRead(ref()(0)(0))+result_00);	\ | ||||
|     coalescedWrite(ref()(0)(1),coalescedRead(ref()(0)(1))+result_01);	\ | ||||
|     coalescedWrite(ref()(0)(2),coalescedRead(ref()(0)(2))+result_02);	\ | ||||
|     coalescedWrite(ref()(1)(0),coalescedRead(ref()(1)(0))+result_10);	\ | ||||
|     coalescedWrite(ref()(1)(1),coalescedRead(ref()(1)(1))+result_11);	\ | ||||
|     coalescedWrite(ref()(1)(2),coalescedRead(ref()(1)(2))+result_12);	\ | ||||
|     coalescedWrite(ref()(2)(0),coalescedRead(ref()(2)(0))+result_20);	\ | ||||
|     coalescedWrite(ref()(2)(1),coalescedRead(ref()(2)(1))+result_21);	\ | ||||
|     coalescedWrite(ref()(2)(2),coalescedRead(ref()(2)(2))+result_22);	\ | ||||
|     coalescedWrite(ref()(3)(0),coalescedRead(ref()(3)(0))+result_30);	\ | ||||
|     coalescedWrite(ref()(3)(1),coalescedRead(ref()(3)(1))+result_31);	\ | ||||
|     coalescedWrite(ref()(3)(2),coalescedRead(ref()(3)(2))+result_32);	\ | ||||
|   } | ||||
|  | ||||
|  | ||||
| #define HAND_DECLARATIONS(a)			\ | ||||
| #define HAND_DECLARATIONS(Simd)			\ | ||||
|   Simd result_00;				\ | ||||
|   Simd result_01;				\ | ||||
|   Simd result_02;				\ | ||||
| @@ -467,18 +483,18 @@ Author: paboyle <paboyle@ph.ed.ac.uk> | ||||
|   Simd U_21; | ||||
|  | ||||
| #define ZERO_RESULT				\ | ||||
|   result_00=Zero();				\ | ||||
|   result_01=Zero();				\ | ||||
|   result_02=Zero();				\ | ||||
|   result_10=Zero();				\ | ||||
|   result_11=Zero();				\ | ||||
|   result_12=Zero();				\ | ||||
|   result_20=Zero();				\ | ||||
|   result_21=Zero();				\ | ||||
|   result_22=Zero();				\ | ||||
|   result_30=Zero();				\ | ||||
|   result_31=Zero();				\ | ||||
|   result_32=Zero();			 | ||||
|   result_00=S(0.0,0.0);			\ | ||||
|   result_01=S(0.0,0.0);			\ | ||||
|   result_02=S(0.0,0.0);			\ | ||||
|   result_10=S(0.0,0.0);			\ | ||||
|   result_11=S(0.0,0.0);			\ | ||||
|   result_12=S(0.0,0.0);			\ | ||||
|   result_20=S(0.0,0.0);			\ | ||||
|   result_21=S(0.0,0.0);			\ | ||||
|   result_22=S(0.0,0.0);			\ | ||||
|   result_30=S(0.0,0.0);			\ | ||||
|   result_31=S(0.0,0.0);			\ | ||||
|   result_32=S(0.0,0.0);			 | ||||
|  | ||||
| #define Chimu_00 Chi_00 | ||||
| #define Chimu_01 Chi_01 | ||||
| @@ -502,8 +518,8 @@ WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,Site | ||||
| // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... | ||||
|   typedef typename Simd::scalar_type S; | ||||
|   typedef typename Simd::vector_type V; | ||||
|  | ||||
|   HAND_DECLARATIONS(ignore); | ||||
|   typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt; | ||||
|   HAND_DECLARATIONS(Simt); | ||||
|  | ||||
|   int offset,local,perm, ptype; | ||||
|   StencilEntry *SE; | ||||
| @@ -525,8 +541,8 @@ void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView | ||||
| { | ||||
|   typedef typename Simd::scalar_type S; | ||||
|   typedef typename Simd::vector_type V; | ||||
|  | ||||
|   HAND_DECLARATIONS(ignore); | ||||
|   typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt; | ||||
|   HAND_DECLARATIONS(Simt); | ||||
|  | ||||
|   StencilEntry *SE; | ||||
|   int offset,local,perm, ptype; | ||||
| @@ -549,8 +565,8 @@ WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,Si | ||||
| // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... | ||||
|   typedef typename Simd::scalar_type S; | ||||
|   typedef typename Simd::vector_type V; | ||||
|  | ||||
|   HAND_DECLARATIONS(ignore); | ||||
|   typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt; | ||||
|   HAND_DECLARATIONS(Simt); | ||||
|  | ||||
|   int offset,local,perm, ptype; | ||||
|   StencilEntry *SE; | ||||
| @@ -572,8 +588,8 @@ void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldVi | ||||
| { | ||||
|   typedef typename Simd::scalar_type S; | ||||
|   typedef typename Simd::vector_type V; | ||||
|  | ||||
|   HAND_DECLARATIONS(ignore); | ||||
|   typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt; | ||||
|   HAND_DECLARATIONS(Simt); | ||||
|  | ||||
|   StencilEntry *SE; | ||||
|   int offset,local,perm, ptype; | ||||
| @@ -596,8 +612,8 @@ WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,Si | ||||
| // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... | ||||
|   typedef typename Simd::scalar_type S; | ||||
|   typedef typename Simd::vector_type V; | ||||
|  | ||||
|   HAND_DECLARATIONS(ignore); | ||||
|   typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt; | ||||
|   HAND_DECLARATIONS(Simt); | ||||
|  | ||||
|   int offset, ptype; | ||||
|   StencilEntry *SE; | ||||
| @@ -620,8 +636,8 @@ void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldVi | ||||
| { | ||||
|   typedef typename Simd::scalar_type S; | ||||
|   typedef typename Simd::vector_type V; | ||||
|  | ||||
|   HAND_DECLARATIONS(ignore); | ||||
|   typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt; | ||||
|   HAND_DECLARATIONS(Simt); | ||||
|  | ||||
|   StencilEntry *SE; | ||||
|   int offset, ptype; | ||||
| @@ -682,3 +698,4 @@ NAMESPACE_END(Grid); | ||||
| #undef HAND_RESULT | ||||
| #undef HAND_RESULT_INT | ||||
| #undef HAND_RESULT_EXT | ||||
| #undef HAND_DECLARATIONS | ||||
|   | ||||
| @@ -445,20 +445,20 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField | ||||
|  | ||||
|    if( interior && exterior ) { | ||||
|      if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSite); return;} | ||||
| #ifndef GRID_CUDA | ||||
|      if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite);    return;} | ||||
| #ifndef GRID_CUDA | ||||
|      if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSite);    return;} | ||||
| #endif | ||||
|    } else if( interior ) { | ||||
|      if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALLNB(GenericDhopSiteInt); return;} | ||||
| #ifndef GRID_CUDA | ||||
|      if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteInt);    return;} | ||||
| #ifndef GRID_CUDA | ||||
|      if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteInt);    return;} | ||||
| #endif | ||||
|    } else if( exterior ) { | ||||
|      if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteExt); return;} | ||||
| #ifndef GRID_CUDA | ||||
|      if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt);    return;} | ||||
| #ifndef GRID_CUDA | ||||
|      if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteExt);    return;} | ||||
| #endif | ||||
|    } | ||||
| @@ -476,20 +476,20 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField | ||||
|  | ||||
|    if( interior && exterior ) { | ||||
|      if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteDag); return;} | ||||
| #ifndef GRID_CUDA | ||||
|      if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDag);    return;} | ||||
| #ifndef GRID_CUDA | ||||
|      if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteDag);     return;} | ||||
| #endif | ||||
|    } else if( interior ) { | ||||
|      if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteDagInt); return;} | ||||
| #ifndef GRID_CUDA | ||||
|      if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagInt);    return;} | ||||
| #ifndef GRID_CUDA | ||||
|      if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteDagInt);     return;} | ||||
| #endif | ||||
|    } else if( exterior ) { | ||||
|      if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteDagExt); return;} | ||||
| #ifndef GRID_CUDA | ||||
|      if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagExt);    return;} | ||||
| #ifndef GRID_CUDA | ||||
|      if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteDagExt);     return;} | ||||
| #endif | ||||
|    } | ||||
|   | ||||
| @@ -60,11 +60,25 @@ template<class pair> | ||||
| class GpuComplex { | ||||
| public: | ||||
|   pair z; | ||||
|   typedef decltype(z.x) real; | ||||
|   typedef decltype(z.x) Real; | ||||
| public:  | ||||
|   accelerator_inline GpuComplex() = default; | ||||
|   accelerator_inline GpuComplex(real re,real im) { z.x=re; z.y=im; }; | ||||
|   accelerator_inline GpuComplex(Real re,Real im) { z.x=re; z.y=im; }; | ||||
|   accelerator_inline GpuComplex(const GpuComplex &zz) { z = zz.z;}; | ||||
|   accelerator_inline Real real(void) const { return z.x; }; | ||||
|   accelerator_inline Real imag(void) const { return z.y; }; | ||||
|   accelerator_inline GpuComplex &operator*=(const GpuComplex &r) { | ||||
|     *this = (*this) * r; | ||||
|     return *this; | ||||
|   } | ||||
|   accelerator_inline GpuComplex &operator+=(const GpuComplex &r) { | ||||
|     *this = (*this) + r; | ||||
|     return *this; | ||||
|   } | ||||
|   accelerator_inline GpuComplex &operator-=(const GpuComplex &r) { | ||||
|     *this = (*this) - r; | ||||
|     return *this; | ||||
|   } | ||||
|   friend accelerator_inline  GpuComplex operator+(const GpuComplex &lhs,const GpuComplex &rhs) {  | ||||
|     GpuComplex r ;  | ||||
|     r.z.x = lhs.z.x + rhs.z.x;  | ||||
| @@ -157,6 +171,11 @@ typedef GpuVector<NSIMD_RealD,    double      > GpuVectorRD; | ||||
| typedef GpuVector<NSIMD_ComplexD, GpuComplexD > GpuVectorCD; | ||||
| typedef GpuVector<NSIMD_Integer,  Integer     > GpuVectorI; | ||||
|  | ||||
| accelerator_inline GpuComplexF timesI(const GpuComplexF &r)     { return(GpuComplexF(-r.imag(),r.real()));} | ||||
| accelerator_inline GpuComplexD timesI(const GpuComplexD &r)     { return(GpuComplexD(-r.imag(),r.real()));} | ||||
| accelerator_inline GpuComplexF timesMinusI(const GpuComplexF &r){ return(GpuComplexF(r.imag(),-r.real()));} | ||||
| accelerator_inline GpuComplexD timesMinusI(const GpuComplexD &r){ return(GpuComplexD(r.imag(),-r.real()));} | ||||
|  | ||||
| accelerator_inline float half2float(half h) | ||||
| { | ||||
|   float f; | ||||
|   | ||||
| @@ -148,10 +148,14 @@ accelerator_inline void sub (ComplexF * __restrict__ y,const ComplexF * __restri | ||||
| accelerator_inline void add (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) + (*r); } | ||||
|    | ||||
| //conjugate already supported for complex | ||||
| accelerator_inline ComplexF timesI(const ComplexF &r)     { return(r*ComplexF(0.0,1.0));} | ||||
| accelerator_inline ComplexD timesI(const ComplexD &r)     { return(r*ComplexD(0.0,1.0));} | ||||
| accelerator_inline ComplexF timesMinusI(const ComplexF &r){ return(r*ComplexF(0.0,-1.0));} | ||||
| accelerator_inline ComplexD timesMinusI(const ComplexD &r){ return(r*ComplexD(0.0,-1.0));} | ||||
| accelerator_inline ComplexF timesI(const ComplexF &r)     { return(ComplexF(-r.imag(),r.real()));} | ||||
| accelerator_inline ComplexD timesI(const ComplexD &r)     { return(ComplexD(-r.imag(),r.real()));} | ||||
| accelerator_inline ComplexF timesMinusI(const ComplexF &r){ return(ComplexF(r.imag(),-r.real()));} | ||||
| accelerator_inline ComplexD timesMinusI(const ComplexD &r){ return(ComplexD(r.imag(),-r.real()));} | ||||
| //accelerator_inline ComplexF timesI(const ComplexF &r)     { return(r*ComplexF(0.0,1.0));} | ||||
| //accelerator_inline ComplexD timesI(const ComplexD &r)     { return(r*ComplexD(0.0,1.0));} | ||||
| //accelerator_inline ComplexF timesMinusI(const ComplexF &r){ return(r*ComplexF(0.0,-1.0));} | ||||
| //accelerator_inline ComplexD timesMinusI(const ComplexD &r){ return(r*ComplexD(0.0,-1.0));} | ||||
|  | ||||
| // define projections to real and imaginay parts | ||||
| accelerator_inline ComplexF projReal(const ComplexF &r){return( ComplexF(r.real(), 0.0));} | ||||
|   | ||||
| @@ -64,6 +64,68 @@ void coalescedWriteNonTemporal(vobj & __restrict__ vec,const vobj & __restrict__ | ||||
| } | ||||
| #else | ||||
|  | ||||
|  | ||||
| #ifndef GRID_SYCL | ||||
| // Use the scalar as our own complex on GPU | ||||
| template<class vsimd,IfSimd<vsimd> = 0> accelerator_inline | ||||
| typename vsimd::scalar_type | ||||
| coalescedRead(const vsimd & __restrict__ vec,int lane=acceleratorSIMTlane(vsimd::Nsimd())) | ||||
| { | ||||
|   typedef typename vsimd::scalar_type S; | ||||
|   S * __restrict__ p=(S *)&vec; | ||||
|   return p[lane]; | ||||
| } | ||||
| template<int ptype,class vsimd,IfSimd<vsimd> = 0> accelerator_inline | ||||
| typename vsimd::scalar_type | ||||
| coalescedReadPermute(const vsimd & __restrict__ vec,int doperm,int lane=acceleratorSIMTlane(vsimd::Nsimd())) | ||||
| { | ||||
|   typedef typename vsimd::scalar_type S; | ||||
|  | ||||
|   S * __restrict__ p=(S *)&vec; | ||||
|   int mask = vsimd::Nsimd() >> (ptype + 1); | ||||
|   int plane= doperm ? lane ^ mask : lane; | ||||
|   return p[plane]; | ||||
| } | ||||
| template<class vsimd,IfSimd<vsimd> = 0> accelerator_inline | ||||
| void coalescedWrite(vsimd & __restrict__ vec, | ||||
| 		    const typename vsimd::scalar_type & __restrict__ extracted, | ||||
| 		    int lane=acceleratorSIMTlane(vsimd::Nsimd())) | ||||
| { | ||||
|   typedef typename vsimd::scalar_type S; | ||||
|   S * __restrict__ p=(S *)&vec; | ||||
|   p[lane]=extracted; | ||||
| } | ||||
| #else | ||||
| template<class vsimd,IfSimd<vsimd> = 0> accelerator_inline | ||||
| typename vsimd::vector_type::datum | ||||
| coalescedRead(const vsimd & __restrict__ vec,int lane=acceleratorSIMTlane(vsimd::Nsimd())) | ||||
| { | ||||
|   typedef typename vsimd::vector_type::datum S; | ||||
|   S * __restrict__ p=(S *)&vec; | ||||
|   return p[lane]; | ||||
| } | ||||
| template<int ptype,class vsimd,IfSimd<vsimd> = 0> accelerator_inline | ||||
| typename vsimd::vector_type::datum | ||||
| coalescedReadPermute(const vsimd & __restrict__ vec,int doperm,int lane=acceleratorSIMTlane(vsimd::Nsimd())) | ||||
| { | ||||
|   typedef typename vsimd::vector_type::datum S; | ||||
|  | ||||
|   S * __restrict__ p=(S *)&vec; | ||||
|   int mask = vsimd::Nsimd() >> (ptype + 1); | ||||
|   int plane= doperm ? lane ^ mask : lane; | ||||
|   return p[plane]; | ||||
| } | ||||
| template<class vsimd,IfSimd<vsimd> = 0> accelerator_inline | ||||
| void coalescedWrite(vsimd & __restrict__ vec, | ||||
| 		    const typename vsimd::vector_type::datum & __restrict__ extracted, | ||||
| 		    int lane=acceleratorSIMTlane(vsimd::Nsimd())) | ||||
| { | ||||
|   typedef typename vsimd::vector_type::datum S; | ||||
|   S * __restrict__ p=(S *)&vec; | ||||
|   p[lane]=extracted; | ||||
| } | ||||
| #endif | ||||
|  | ||||
| ////////////////////////////////////////// | ||||
| // Extract and insert slices on the GPU | ||||
| ////////////////////////////////////////// | ||||
|   | ||||
| @@ -104,7 +104,7 @@ extern int acceleratorAbortOnGpuError; | ||||
|  | ||||
| accelerator_inline int acceleratorSIMTlane(int Nsimd) { | ||||
| #ifdef GRID_SIMT | ||||
|   return threadIdx.z;  | ||||
|   return threadIdx.x;  | ||||
| #else | ||||
|   return 0; | ||||
| #endif | ||||
| @@ -112,28 +112,67 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) { | ||||
|  | ||||
| #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... )	\ | ||||
|   {									\ | ||||
|     int nt=acceleratorThreads();					\ | ||||
|     typedef uint64_t Iterator;						\ | ||||
|     auto lambda = [=] accelerator					\ | ||||
|       (Iterator iter1,Iterator iter2,Iterator lane) mutable {		\ | ||||
|       __VA_ARGS__;							\ | ||||
|     };									\ | ||||
|     int nt=acceleratorThreads();					\ | ||||
|     dim3 cu_threads(acceleratorThreads(),1,nsimd);			\ | ||||
|     dim3 cu_threads(nsimd,acceleratorThreads(),1);			\ | ||||
|     dim3 cu_blocks ((num1+nt-1)/nt,num2,1);				\ | ||||
|     LambdaApply<<<cu_blocks,cu_threads>>>(num1,num2,nsimd,lambda);	\ | ||||
|   } | ||||
|  | ||||
| #define accelerator_for6dNB(iter1, num1,				\ | ||||
|                             iter2, num2,				\ | ||||
|                             iter3, num3,				\ | ||||
|                             iter4, num4,				\ | ||||
|                             iter5, num5,				\ | ||||
| 			    iter6, num6, ... )				\ | ||||
|   {									\ | ||||
|     typedef uint64_t Iterator;						\ | ||||
|     auto lambda = [=] accelerator					\ | ||||
|       (Iterator iter1,Iterator iter2,					\ | ||||
|        Iterator iter3,Iterator iter4,					\ | ||||
|        Iterator iter5,Iterator iter6) mutable {				\ | ||||
|       __VA_ARGS__;							\ | ||||
|     };									\ | ||||
|     dim3 cu_blocks (num1,num2,num3);					\ | ||||
|     dim3 cu_threads(num4,num5,num6);					\ | ||||
|     Lambda6Apply<<<cu_blocks,cu_threads>>>(num1,num2,num3,num4,num5,num6,lambda); \ | ||||
|   } | ||||
|  | ||||
| template<typename lambda>  __global__ | ||||
| void LambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda) | ||||
| { | ||||
|   uint64_t x = threadIdx.x + blockDim.x*blockIdx.x; | ||||
|   uint64_t y = threadIdx.y + blockDim.y*blockIdx.y; | ||||
|   uint64_t z = threadIdx.z; | ||||
|   // Weird permute is to make lane coalesce for large blocks | ||||
|   uint64_t x = threadIdx.y + blockDim.y*blockIdx.x; | ||||
|   uint64_t y = threadIdx.z + blockDim.z*blockIdx.y; | ||||
|   uint64_t z = threadIdx.x; | ||||
|   if ( (x < num1) && (y<num2) && (z<num3) ) { | ||||
|     Lambda(x,y,z); | ||||
|   } | ||||
| } | ||||
|  | ||||
| template<typename lambda>  __global__ | ||||
| void Lambda6Apply(uint64_t num1, uint64_t num2, uint64_t num3, | ||||
| 		  uint64_t num4, uint64_t num5, uint64_t num6, | ||||
| 		  lambda Lambda) | ||||
| { | ||||
|   uint64_t iter1 = blockIdx.x; | ||||
|   uint64_t iter2 = blockIdx.y; | ||||
|   uint64_t iter3 = blockIdx.z; | ||||
|   uint64_t iter4 = threadIdx.x; | ||||
|   uint64_t iter5 = threadIdx.y; | ||||
|   uint64_t iter6 = threadIdx.z; | ||||
|  | ||||
|   if ( (iter1 < num1) && (iter2<num2) && (iter3<num3) | ||||
|     && (iter4 < num4) && (iter5<num5) && (iter6<num6) ) | ||||
|   { | ||||
|     Lambda(iter1,iter2,iter3,iter4,iter5,iter6); | ||||
|   } | ||||
| } | ||||
|  | ||||
| #define accelerator_barrier(dummy)					\ | ||||
|   {									\ | ||||
|     cudaDeviceSynchronize();						\ | ||||
| @@ -221,7 +260,7 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) { | ||||
|       cl::sycl::range<3> global{unum1,unum2,nsimd};			\ | ||||
|       cgh.parallel_for<class dslash>(					\ | ||||
|       cl::sycl::nd_range<3>(global,local), \ | ||||
|       [=] (cl::sycl::nd_item<3> item) mutable {       \ | ||||
|       [=] (cl::sycl::nd_item<3> item) /*mutable*/ {   \ | ||||
|       auto iter1    = item.get_global_id(0);	      \ | ||||
|       auto iter2    = item.get_global_id(1);	      \ | ||||
|       auto lane     = item.get_global_id(2);	      \ | ||||
|   | ||||
		Reference in New Issue
	
	Block a user