mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-10-30 19:44:32 +00:00 
			
		
		
		
	clean up; Exch1 sp integrate, tested, working
This commit is contained in:
		| @@ -60,36 +60,20 @@ typedef __SVUint64_t  lutd __attribute__((arm_sve_vector_bits(512))); // LUTs fo | |||||||
| #pragma error("Oops. Illegal SVE vector size!?") | #pragma error("Oops. Illegal SVE vector size!?") | ||||||
| #endif /* __ARM_FEATURE_SVE_BITS */ | #endif /* __ARM_FEATURE_SVE_BITS */ | ||||||
|  |  | ||||||
| // safety definition, not sure if it's necessary |  | ||||||
| //#define GEN_SIMD_WIDTH 64u |  | ||||||
|  |  | ||||||
| // low-level API | // low-level API | ||||||
| NAMESPACE_BEGIN(Grid); | NAMESPACE_BEGIN(Grid); | ||||||
| NAMESPACE_BEGIN(Optimization); | NAMESPACE_BEGIN(Optimization); | ||||||
|  |  | ||||||
| // convenience union types for tables eliminate loads | // convenience union types for tables eliminating loads | ||||||
| union ulutf { | union ulutf { | ||||||
|   lutf v; |   lutf v; | ||||||
|   uint32_t s[16]; |   uint32_t s[16]; | ||||||
| }; | }; | ||||||
|  |  | ||||||
| union ulutd { | union ulutd { | ||||||
|   lutd v; |   lutd v; | ||||||
|   uint64_t s[8]; |   uint64_t s[8]; | ||||||
| }; | }; | ||||||
|  |  | ||||||
| // FIXME convenience union types for Exchange1 |  | ||||||
| union uvecf { |  | ||||||
|   vecf v; |  | ||||||
|   float32_t s[16]; |  | ||||||
| }; |  | ||||||
|  |  | ||||||
| union uvecd { |  | ||||||
|   vecd v; |  | ||||||
|   float64_t s[8]; |  | ||||||
| }; |  | ||||||
|  |  | ||||||
|  |  | ||||||
| template <typename T> | template <typename T> | ||||||
| struct acle{}; | struct acle{}; | ||||||
|  |  | ||||||
| @@ -144,6 +128,18 @@ struct acle<float>{ | |||||||
|     const ulutf t = { .s = {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13} }; |     const ulutf t = { .s = {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13} }; | ||||||
|     return t.v; |     return t.v; | ||||||
|   } |   } | ||||||
|  |   static inline lutf tbl_exch1a(){ // Exchange1 | ||||||
|  |     const ulutf t = { .s = {0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 } }; | ||||||
|  |     return t.v; | ||||||
|  |   } | ||||||
|  |   static inline lutf tbl_exch1b(){ // Exchange1 | ||||||
|  |     const ulutf t = { .s = {4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11 } }; | ||||||
|  |     return t.v; | ||||||
|  |   } | ||||||
|  |   static inline lutf tbl_exch1c(){ // Exchange1 | ||||||
|  |     const ulutf t = { .s = {8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7} }; | ||||||
|  |     return t.v; | ||||||
|  |   } | ||||||
|   static inline pred pg1(){return svptrue_b32();} |   static inline pred pg1(){return svptrue_b32();} | ||||||
|   static inline pred pg_even(){return svzip1_b32(svptrue_b32(), svpfalse_b());} |   static inline pred pg_even(){return svzip1_b32(svptrue_b32(), svpfalse_b());} | ||||||
|   static inline pred pg_odd() {return svzip1_b32(svpfalse_b(), svptrue_b32());} |   static inline pred pg_odd() {return svzip1_b32(svpfalse_b(), svptrue_b32());} | ||||||
| @@ -191,7 +187,6 @@ struct Vsplat{ | |||||||
|   } |   } | ||||||
|   // Integer |   // Integer | ||||||
|   inline veci operator()(Integer a){ |   inline veci operator()(Integer a){ | ||||||
|     // Add check whether Integer is really a uint32_t??? |  | ||||||
|     return svdup_u32(a); |     return svdup_u32(a); | ||||||
|   } |   } | ||||||
| }; | }; | ||||||
| @@ -538,9 +533,6 @@ struct PrecisionChange { | |||||||
|   } |   } | ||||||
| }; | }; | ||||||
|  |  | ||||||
| #define VECTOR_FOR(i, w, inc)                   \ |  | ||||||
| for (unsigned int i = 0; i < w; i += inc) |  | ||||||
|  |  | ||||||
| struct Exchange{ | struct Exchange{ | ||||||
|   // float |   // float | ||||||
|   static inline void Exchange0(vecf &out1, vecf &out2, vecf in1, vecf in2){ |   static inline void Exchange0(vecf &out1, vecf &out2, vecf in1, vecf in2){ | ||||||
| @@ -550,25 +542,18 @@ struct Exchange{ | |||||||
|     out2 = svext(in1, r2_v, (uint64_t)8u); |     out2 = svext(in1, r2_v, (uint64_t)8u); | ||||||
|   } |   } | ||||||
|   static inline void Exchange1(vecf &out1, vecf &out2, vecf in1, vecf in2){ |   static inline void Exchange1(vecf &out1, vecf &out2, vecf in1, vecf in2){ | ||||||
|     // FIXME |     // this one is tricky; svtrn2q* from SVE2 fits best, but it is not available in SVE1 | ||||||
|     uvecf v1 = { .v = in1 }; |     // alternative: use 4-el structure; expect translation into 4x ldp + 4x stp -> SFI | ||||||
|     uvecf v2 = { .v = in2 }; |     lutf tbl_exch1a = acle<float>::tbl_exch1a(); | ||||||
|     uvecf o1, o2; |     lutf tbl_exch1b = acle<float>::tbl_exch1b(); | ||||||
|  |     lutf tbl_exch1c = acle<float>::tbl_exch1c(); | ||||||
|  |  | ||||||
|     const int n = 1; |     vecf a1_v = svtbl(in1, tbl_exch1a); | ||||||
|     const int w = 16; // w = W<T>::r |     vecf a2_v = svtbl(in2, tbl_exch1b); | ||||||
|     unsigned int mask = w >> (n + 1); |     vecf b1_v  = svext(a2_v, a1_v, (uint64_t)8u); | ||||||
|     //      std::cout << " Exchange "<<n<<" nsimd "<<w<<" mask 0x" <<std::hex<<mask<<std::dec<<std::endl; |     vecf b2_v  = svext(a1_v, a2_v, (uint64_t)8u); | ||||||
|     VECTOR_FOR(i, w, 1) { |     out1 = svtbl(b1_v, tbl_exch1c); | ||||||
|       int j1 = i&(~mask); |     out2 = svtbl(b2_v, tbl_exch1a); | ||||||
|       if  ( (i&mask) == 0 ) { o1.s[i]=v1.s[j1];} |  | ||||||
|       else                  { o1.s[i]=v2.s[j1];} |  | ||||||
|       int j2 = i|mask; |  | ||||||
|       if  ( (i&mask) == 0 ) { o2.s[i]=v1.s[j2];} |  | ||||||
|       else                  { o2.s[i]=v2.s[j2];} |  | ||||||
|     } |  | ||||||
|     out1 = o1.v; |  | ||||||
|     out2 = o2.v; |  | ||||||
|   } |   } | ||||||
|   static inline void Exchange2(vecf &out1, vecf &out2, vecf in1, vecf in2){ |   static inline void Exchange2(vecf &out1, vecf &out2, vecf in1, vecf in2){ | ||||||
|     out1 = (vecf)svtrn1((vecd)in1, (vecd)in2); |     out1 = (vecf)svtrn1((vecd)in1, (vecd)in2); | ||||||
| @@ -588,6 +573,7 @@ struct Exchange{ | |||||||
|   } |   } | ||||||
|   static inline void Exchange1(vecd &out1, vecd &out2, vecd in1, vecd in2){ |   static inline void Exchange1(vecd &out1, vecd &out2, vecd in1, vecd in2){ | ||||||
|     // this one is tricky; svtrn2q* from SVE2 fits best, but it is not available in SVE1 |     // this one is tricky; svtrn2q* from SVE2 fits best, but it is not available in SVE1 | ||||||
|  |     // alternative: use 4-el structure; expect translation into 4x ldp + 4x stp -> SFI | ||||||
|     lutd tbl_exch1a = acle<double>::tbl_exch1a(); |     lutd tbl_exch1a = acle<double>::tbl_exch1a(); | ||||||
|     lutd tbl_exch1b = acle<double>::tbl_exch1b(); |     lutd tbl_exch1b = acle<double>::tbl_exch1b(); | ||||||
|     lutd tbl_exch1c = acle<double>::tbl_exch1c(); |     lutd tbl_exch1c = acle<double>::tbl_exch1c(); | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user