From 69ae817d1c59b94c9e1dfc0860831982bc49b337 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 8 Dec 2016 16:43:28 +0000 Subject: [PATCH 01/18] Updates for supporting Mobius better --- lib/simd/Grid_avx.h | 27 ++++++++++++++++++++++++++- lib/simd/Grid_avx512.h | 25 +++++++++++++++++++++++++ lib/simd/Grid_generic.h | 16 ++++++++++++++++ lib/simd/Grid_qpx.h | 9 +++++++++ lib/simd/Grid_sse4.h | 25 +++++++++++++++++++++++++ lib/simd/Grid_vector_types.h | 33 +++++++++++++++++++++++++++++++-- lib/simd/Intel512avx.h | 16 ++++++++++++---- lib/simd/Intel512common.h | 11 ++++++----- lib/simd/Intel512double.h | 2 ++ lib/simd/Intel512single.h | 2 ++ 10 files changed, 154 insertions(+), 12 deletions(-) diff --git a/lib/simd/Grid_avx.h b/lib/simd/Grid_avx.h index 36360102..e2729187 100644 --- a/lib/simd/Grid_avx.h +++ b/lib/simd/Grid_avx.h @@ -213,6 +213,29 @@ namespace Optimization { } }; + struct MultRealPart{ + inline __m256 operator()(__m256 a, __m256 b){ + __m256 ymm0; + ymm0 = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar, + return _mm256_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br + } + inline __m256d operator()(__m256d a, __m256d b){ + __m256d ymm0; + ymm0 = _mm256_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, ar,ar b'00,00 + return _mm256_mul_pd(ymm0,b); // ymm0 <- ar bi, ar br + } + }; + struct MaddRealPart{ + inline __m256 operator()(__m256 a, __m256 b, __m256 c){ + __m256 ymm0 = _mm256_moveldup_ps(a); // ymm0 <- ar ar, + _mm256_add_ps(_mm256_mul_ps( ymm0, b),c); + } + inline __m256d operator()(__m256d a, __m256d b, __m256d c){ + __m256d ymm0 = _mm256_shuffle_pd( a, a, 0x0 ); + return _mm256_add_pd(_mm256_mul_pd( ymm0, b),c); + } + }; + struct MultComplex{ // Complex float inline __m256 operator()(__m256 a, __m256 b){ @@ -627,7 +650,9 @@ namespace Optimization { typedef Optimization::Sub SubSIMD; typedef Optimization::Div DivSIMD; typedef Optimization::Mult MultSIMD; - typedef Optimization::MultComplex MultComplexSIMD; + typedef Optimization::MultComplex MultComplexSIMD; + typedef Optimization::MultRealPart MultRealPartSIMD; + typedef Optimization::MaddRealPart MaddRealPartSIMD; typedef Optimization::Conj ConjSIMD; typedef Optimization::TimesMinusI TimesMinusISIMD; typedef Optimization::TimesI TimesISIMD; diff --git a/lib/simd/Grid_avx512.h b/lib/simd/Grid_avx512.h index d6531d57..ebf99e16 100644 --- a/lib/simd/Grid_avx512.h +++ b/lib/simd/Grid_avx512.h @@ -189,6 +189,29 @@ namespace Optimization { // 2mul,4 mac +add+sub = 8 flop type insns // 3shuf + 2 (+shuf) = 5/6 simd perm and 1/2 the load. + struct MultRealPart{ + inline __m512 operator()(__m512 a, __m512 b){ + __m512 ymm0; + ymm0 = _mm512_moveldup_ps(a); // ymm0 <- ar ar, + return _mm512_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br + } + inline __m512d operator()(__m512d a, __m512d b){ + __m512d ymm0; + ymm0 = _mm512_shuffle_pd(a,a,0x00); // ymm0 <- ar ar, ar,ar b'00,00 + return _mm512_mul_pd(ymm0,b); // ymm0 <- ar bi, ar br + } + }; + struct MaddRealPart{ + inline __m512 operator()(__m512 a, __m512 b, __m512 c){ + __m512 ymm0 = _mm512_moveldup_ps(a); // ymm0 <- ar ar, + return _mm512_fmadd_ps( ymm0, b, c); + } + inline __m512d operator()(__m512d a, __m512d b, __m512d c){ + __m512d ymm0 = _mm512_shuffle_pd( a, a, 0x00 ); + return _mm512_fmadd_pd( ymm0, b, c); + } + }; + struct MultComplex{ // Complex float inline __m512 operator()(__m512 a, __m512 b){ @@ -501,6 +524,8 @@ namespace Optimization { typedef Optimization::Mult MultSIMD; typedef Optimization::Div DivSIMD; typedef Optimization::MultComplex MultComplexSIMD; + typedef Optimization::MultRealPart MultRealPartSIMD; + typedef Optimization::MaddRealPart MaddRealPartSIMD; typedef Optimization::Conj ConjSIMD; typedef Optimization::TimesMinusI TimesMinusISIMD; typedef Optimization::TimesI TimesISIMD; diff --git a/lib/simd/Grid_generic.h b/lib/simd/Grid_generic.h index 62c78afb..91e9cda2 100644 --- a/lib/simd/Grid_generic.h +++ b/lib/simd/Grid_generic.h @@ -224,6 +224,21 @@ namespace Optimization { #define cmul(a, b, c, i)\ c[i] = a[i]*b[i] - a[i+1]*b[i+1];\ c[i+1] = a[i]*b[i+1] + a[i+1]*b[i]; + + struct MultRealPart{ + template + inline vec operator()(vec a, vec b){ + vec out; + + VECTOR_FOR(i, W::c, 1) + { + out.v[2*i] = a[2*i]*b[2*i]; + out.v[2*i+1] = a[2*i]*b[2*i+1]; + } + return out; + }; + }; + struct MultComplex{ // Complex @@ -456,6 +471,7 @@ namespace Optimization { typedef Optimization::Div DivSIMD; typedef Optimization::Mult MultSIMD; typedef Optimization::MultComplex MultComplexSIMD; + typedef Optimization::MultRealPart MultRealPartSIMD; typedef Optimization::Conj ConjSIMD; typedef Optimization::TimesMinusI TimesMinusISIMD; typedef Optimization::TimesI TimesISIMD; diff --git a/lib/simd/Grid_qpx.h b/lib/simd/Grid_qpx.h index bc86291d..99a9ea68 100644 --- a/lib/simd/Grid_qpx.h +++ b/lib/simd/Grid_qpx.h @@ -220,6 +220,14 @@ namespace Optimization { } }; + struct MultRealPart{ + // Complex double + inline vector4double operator()(vector4double a, vector4double b){ + // return vec_xmul(b, a); + return vec_xmul(a, b); + } + FLOAT_WRAP_2(operator(), inline) + }; struct MultComplex{ // Complex double inline vector4double operator()(vector4double a, vector4double b){ @@ -430,6 +438,7 @@ typedef Optimization::Sub SubSIMD; typedef Optimization::Mult MultSIMD; typedef Optimization::Div DivSIMD; typedef Optimization::MultComplex MultComplexSIMD; +typedef Optimization::MultRealPart MultRealPartSIMD; typedef Optimization::Conj ConjSIMD; typedef Optimization::TimesMinusI TimesMinusISIMD; typedef Optimization::TimesI TimesISIMD; diff --git a/lib/simd/Grid_sse4.h b/lib/simd/Grid_sse4.h index 560eda11..abd688ab 100644 --- a/lib/simd/Grid_sse4.h +++ b/lib/simd/Grid_sse4.h @@ -177,6 +177,29 @@ namespace Optimization { } }; + struct MultRealPart{ + inline __m128 operator()(__m128 a, __m128 b){ + __m128 ymm0; + ymm0 = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar, + return _mm_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br + } + inline __m128d operator()(__m128d a, __m128d b){ + __m128d ymm0; + ymm0 = _mm_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, ar,ar b'00,00 + return _mm_mul_pd(ymm0,b); // ymm0 <- ar bi, ar br + } + }; + struct MaddRealPart{ + inline __m128 operator()(__m128 a, __m128 b, __m128 c){ + __m128 ymm0 = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar, + _mm_add_ps(_mm_mul_ps( ymm0, b),c); + } + inline __m128d operator()(__m128d a, __m128d b, __m128 c){ + __m128d ymm0 = _mm_shuffle_pd( a, a, 0x0 ); + return _mm_add_pd(_mm_mul_pd( ymm0, b),c); + } + }; + struct MultComplex{ // Complex float inline __m128 operator()(__m128 a, __m128 b){ @@ -415,6 +438,8 @@ namespace Optimization { typedef Optimization::Div DivSIMD; typedef Optimization::Mult MultSIMD; typedef Optimization::MultComplex MultComplexSIMD; + typedef Optimization::MultRealPart MultRealPartSIMD; + typedef Optimization::MaddRealPart MaddRealPartSIMD; typedef Optimization::Conj ConjSIMD; typedef Optimization::TimesMinusI TimesMinusISIMD; typedef Optimization::TimesI TimesISIMD; diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h index 42f28b34..8a6ab2e7 100644 --- a/lib/simd/Grid_vector_types.h +++ b/lib/simd/Grid_vector_types.h @@ -101,6 +101,11 @@ template using IfNotInteger = Invoke +Out trinary(Input1 src_1, Input2 src_2, Input3 src_3, Operation op) { + return op(src_1, src_2, src_3); +} + template Out binary(Input1 src_1, Input2 src_2, Operation op) { return op(src_1, src_2); @@ -178,6 +183,7 @@ class Grid_simd { const Grid_simd *__restrict__ r) { *y = (*l) * (*r); } + friend inline void sub(Grid_simd *__restrict__ y, const Grid_simd *__restrict__ l, const Grid_simd *__restrict__ r) { @@ -188,7 +194,6 @@ class Grid_simd { const Grid_simd *__restrict__ r) { *y = (*l) + (*r); } - friend inline void mac(Grid_simd *__restrict__ y, const Scalar_type *__restrict__ a, const Grid_simd *__restrict__ x) { @@ -260,7 +265,7 @@ class Grid_simd { } //////////////////////////// - // opreator scalar * simd + // operator scalar * simd //////////////////////////// friend inline Grid_simd operator*(const Scalar_type &a, Grid_simd b) { Grid_simd va; @@ -433,6 +438,11 @@ inline void vbroadcast(Grid_simd &ret,const Grid_simd &src,int lane){ S* typepun =(S*) &src; vsplat(ret,typepun[lane]); } +template =0> +inline void rbroadcast(Grid_simd &ret,const Grid_simd &src,int lane){ + S* typepun =(S*) &src; + ret.v = unary(real(typepun[lane]), VsplatSIMD()); +} /////////////////////// // Splat @@ -449,6 +459,10 @@ template inline void vsplat(Grid_simd &ret, EnableIf, S> c) { vsplat(ret, real(c), imag(c)); } +template +inline void rsplat(Grid_simd &ret, EnableIf, S> c) { + vsplat(ret, real(c), real(c)); +} // if real fill with a, if complex fill with a in the real part (first function // above) @@ -550,6 +564,21 @@ inline Grid_simd operator-(Grid_simd a, Grid_simd b) { return ret; }; +// Distinguish between complex types and others +template = 0> +inline Grid_simd real_mult(Grid_simd a, Grid_simd b) { + Grid_simd ret; + ret.v = binary(a.v, b.v, MultRealPartSIMD()); + return ret; +}; +template = 0> +inline Grid_simd real_madd(Grid_simd a, Grid_simd b, Grid_simd c) { + Grid_simd ret; + ret.v = trinary(a.v, b.v, c.v, MaddRealPartSIMD()); + return ret; +}; + + // Distinguish between complex types and others template = 0> inline Grid_simd operator*(Grid_simd a, Grid_simd b) { diff --git a/lib/simd/Intel512avx.h b/lib/simd/Intel512avx.h index 19157db4..7b5964ad 100644 --- a/lib/simd/Intel512avx.h +++ b/lib/simd/Intel512avx.h @@ -95,10 +95,14 @@ Author: paboyle #define VIDUPd(SRC,DEST) "vpshufd $0xee," #SRC"," #DEST ";\n" // 32 bit level: 3,2,3,2 #define VIDUPf(SRC,DEST) "vmovshdup " #SRC ", " #DEST ";\n" -#define VBCASTRDUPd(OFF,A,DEST) "vbroadcastsd (" #OFF "*16+0)(" #A ")," #DEST ";\n" -#define VBCASTIDUPd(OFF,A,DEST) "vbroadcastsd (" #OFF "*16+8)(" #A ")," #DEST ";\n" -#define VBCASTRDUPf(OFF,PTR,DEST) "vbroadcastss (" #OFF "*8 +0)(" #PTR "), " #DEST ";\n" -#define VBCASTIDUPf(OFF,PTR,DEST) "vbroadcastss (" #OFF "*8 +4)(" #PTR "), " #DEST ";\n" +#define VBCASTRDUPd(OFF,A,DEST) "vbroadcastsd (" #OFF "*16+0)(" #A ")," #DEST ";\n" +#define VBCASTIDUPd(OFF,A,DEST) "vbroadcastsd (" #OFF "*16+8)(" #A ")," #DEST ";\n" +#define VBCASTRDUPf(OFF,PTR,DEST) "vbroadcastss (" #OFF "*8 +0)(" #PTR "), " #DEST ";\n" +#define VBCASTIDUPf(OFF,PTR,DEST) "vbroadcastss (" #OFF "*8 +4)(" #PTR "), " #DEST ";\n" +#define VBCASTCDUPf(OFF,A,DEST) "vbroadcastsd (" #OFF "*64 )(" #A ")," #DEST ";\n" +#define VBCASTZDUPf(OFF,A,DEST) "vbroadcastf32x4 (" #OFF "*64 )(" #A ")," #DEST ";\n" +#define VBCASTCDUP(OFF,A,DEST) VBCASTCDUPf(OFF,A,DEST) +#define VBCASTZDUP(OFF,A,DEST) VBCASTZDUPf(OFF,A,DEST) #define VMADDSUBf(A,B,accum) "vfmaddsub231ps " #A "," #B "," #accum ";\n" #define VMADDSUBd(A,B,accum) "vfmaddsub231pd " #A "," #B "," #accum ";\n" @@ -106,11 +110,15 @@ Author: paboyle #define VMADDSUBMEMd(O,P,B,accum) "vfmaddsub231pd " #O"*64("#P "),"#B "," #accum ";\n" +#define VMADDRDUPf(O,P,B,accum) "vfmadd231ps (" #O"*8+0)("#P "){1to16},"#B "," #accum ";\n" +#define VMADDIDUPf(O,P,B,accum) "vfmadd231ps (" #O"*8+4)("#P "){1to16},"#B "," #accum ";\n" #define VMADDSUBRDUPf(O,P,B,accum) "vfmaddsub231ps (" #O"*8+0)("#P "){1to16},"#B "," #accum ";\n" #define VMADDSUBIDUPf(O,P,B,accum) "vfmaddsub231ps (" #O"*8+4)("#P "){1to16},"#B "," #accum ";\n" #define VMULRDUPf(O,P,B,accum) "vmulps (" #O"*8+0)("#P "){1to16},"#B "," #accum ";\n" #define VMULIDUPf(O,P,B,accum) "vmulps (" #O"*8+4)("#P "){1to16},"#B "," #accum ";\n" +#define VMADDRDUPd(O,P,B,accum) "vfmadd231pd (" #O"*16+0)("#P "){1to8},"#B "," #accum ";\n" +#define VMADDIDUPd(O,P,B,accum) "vfmadd231pd (" #O"*16+8)("#P "){1to8},"#B "," #accum ";\n" #define VMADDSUBRDUPd(O,P,B,accum) "vfmaddsub231pd (" #O"*16+0)("#P "){1to8},"#B "," #accum ";\n" #define VMADDSUBIDUPd(O,P,B,accum) "vfmaddsub231pd (" #O"*16+8)("#P "){1to8},"#B "," #accum ";\n" #define VMULRDUPd(O,P,B,accum) "vmulpd (" #O"*16+0)("#P "){1to8},"#B "," #accum ";\n" diff --git a/lib/simd/Intel512common.h b/lib/simd/Intel512common.h index cfa20c26..e69e541c 100644 --- a/lib/simd/Intel512common.h +++ b/lib/simd/Intel512common.h @@ -87,7 +87,8 @@ Author: paboyle VACCTIMESMINUSI1d(A,ACC,tmp) \ VACCTIMESMINUSI2d(A,ACC,tmp) -#define LOAD64i(A,ptr) __asm__ ( "movq %0, %" #A : : "r"(ptr) : #A ); +#define LOAD64a(A,ptr) "movq %0, %" #A : : "r"(ptr) : #A +#define LOAD64i(A,ptr) __asm__ ( LOAD64a(A,ptr)); #define LOAD64(A,ptr) LOAD64i(A,ptr) #define VMOVf(A,DEST) "vmovaps " #A ", " #DEST ";\n" @@ -108,8 +109,8 @@ Author: paboyle //"vprefetche0 "#O"*64("#A");\n" "vprefetche1 ("#O"+12)*64("#A");\n" // "clevict0 "#O"*64("#A");\n" -#define VLOADf(OFF,PTR,DEST) "vmovaps " #OFF "*64(" #PTR "), " #DEST ";\n" -#define VLOADd(OFF,PTR,DEST) "vmovapd " #OFF "*64(" #PTR "), " #DEST ";\n" +#define VLOADf(OFF,PTR,DEST) "vmovups " #OFF "*64(" #PTR "), " #DEST ";\n" +#define VLOADd(OFF,PTR,DEST) "vmovupd " #OFF "*64(" #PTR "), " #DEST ";\n" #define VADDf(A,B,DEST) "vaddps " #A "," #B "," #DEST ";\n" #define VADDd(A,B,DEST) "vaddpd " #A "," #B "," #DEST ";\n" @@ -143,8 +144,8 @@ Author: paboyle #define VSTOREf(OFF,PTR,SRC) "vmovntps " #SRC "," #OFF "*64(" #PTR ")" ";\n" #define VSTOREd(OFF,PTR,SRC) "vmovntpd " #SRC "," #OFF "*64(" #PTR ")" ";\n" #else -#define VSTOREf(OFF,PTR,SRC) "vmovaps " #SRC "," #OFF "*64(" #PTR ")" ";\n" -#define VSTOREd(OFF,PTR,SRC) "vmovapd " #SRC "," #OFF "*64(" #PTR ")" ";\n" +#define VSTOREf(OFF,PTR,SRC) "vmovups " #SRC "," #OFF "*64(" #PTR ")" ";\n" +#define VSTOREd(OFF,PTR,SRC) "vmovupd " #SRC "," #OFF "*64(" #PTR ")" ";\n" #endif // Swaps Re/Im ; could unify this with IMCI diff --git a/lib/simd/Intel512double.h b/lib/simd/Intel512double.h index 224c593d..632b5639 100644 --- a/lib/simd/Intel512double.h +++ b/lib/simd/Intel512double.h @@ -144,10 +144,12 @@ Author: paboyle #define VMADDSUBMEM(O,P,B,accum) VMADDSUBMEMd(O,P,B,accum) #define VMADDMEM(O,P,B,accum) VMADDMEMd(O,P,B,accum) #define VMULMEM(O,P,B,accum) VMULMEMd(O,P,B,accum) +#undef VMADDRDUP #undef VMADDSUBRDUP #undef VMADDSUBIDUP #undef VMULRDUP #undef VMULIDUP +#define VMADDRDUP(O,P,B,accum) VMADDRDUPd(O,P,B,accum) #define VMADDSUBRDUP(O,P,B,accum) VMADDSUBRDUPd(O,P,B,accum) #define VMADDSUBIDUP(O,P,B,accum) VMADDSUBIDUPd(O,P,B,accum) #define VMULRDUP(O,P,B,accum) VMULRDUPd(O,P,B,accum) diff --git a/lib/simd/Intel512single.h b/lib/simd/Intel512single.h index 3fa47668..ed135651 100644 --- a/lib/simd/Intel512single.h +++ b/lib/simd/Intel512single.h @@ -144,10 +144,12 @@ Author: paboyle #define VMADDMEM(O,P,B,accum) VMADDMEMf(O,P,B,accum) #define VMULMEM(O,P,B,accum) VMULMEMf(O,P,B,accum) +#undef VMADDRDUP #undef VMADDSUBRDUP #undef VMADDSUBIDUP #undef VMULRDUP #undef VMULIDUP +#define VMADDRDUP(O,P,B,accum) VMADDRDUPf(O,P,B,accum) #define VMADDSUBRDUP(O,P,B,accum) VMADDSUBRDUPf(O,P,B,accum) #define VMADDSUBIDUP(O,P,B,accum) VMADDSUBIDUPf(O,P,B,accum) #define VMULRDUP(O,P,B,accum) VMULRDUPf(O,P,B,accum) From 7a61feb6d3c49637ab3afb4182a9915734ba6e48 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 8 Dec 2016 16:58:01 +0000 Subject: [PATCH 02/18] Allocator added with caching for Linux VM subsystem optimisation --- lib/AlignedAllocator.cc | 65 +++++++++++++++++++++++++++++++++++++++++ lib/AlignedAllocator.h | 49 ++++++++++++++++++++++--------- 2 files changed, 101 insertions(+), 13 deletions(-) create mode 100644 lib/AlignedAllocator.cc diff --git a/lib/AlignedAllocator.cc b/lib/AlignedAllocator.cc new file mode 100644 index 00000000..9df4ec1c --- /dev/null +++ b/lib/AlignedAllocator.cc @@ -0,0 +1,65 @@ + + + +#include + +namespace Grid { + +int PointerCache::victim; + + PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::Ncache]; + +void *PointerCache::Insert(void *ptr,size_t bytes) { + + if (bytes < 4096 ) return NULL; + +#ifdef _OPENMP + assert(omp_in_parallel()==0); +#endif + void * ret = NULL; + int v = -1; + + for(int e=0;e namespace Grid { + class PointerCache { + private: + + static const int Ncache=8; + static int victim; + + typedef struct { + void *address; + size_t bytes; + int valid; + } PointerCacheEntry; + + static PointerCacheEntry Entries[Ncache]; + + public: + + + static void *Insert(void *ptr,size_t bytes) ; + static void *Lookup(size_t bytes) ; + + }; + //////////////////////////////////////////////////////////////////// // A lattice of something, but assume the something is SIMDized. //////////////////////////////////////////////////////////////////// + template class alignedAllocator { public: @@ -66,27 +89,27 @@ public: pointer allocate(size_type __n, const void* _p= 0) { + size_type bytes = __n*sizeof(_Tp); + + _Tp *ptr = (_Tp *) PointerCache::Lookup(bytes); + #ifdef HAVE_MM_MALLOC_H - _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),128); + if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,128); #else - _Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp)); + if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(128,bytes); #endif - _Tp tmp; -#ifdef GRID_NUMA -#pragma omp parallel for schedule(static) - for(int i=0;i<__n;i++){ - ptr[i]=tmp; - } -#endif return ptr; } - void deallocate(pointer __p, size_type) { + void deallocate(pointer __p, size_type __n) { + size_type bytes = __n * sizeof(_Tp); + pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes); + #ifdef HAVE_MM_MALLOC_H - _mm_free((void *)__p); + if ( __freeme ) _mm_free((void *)__freeme); #else - free((void *)__p); + if ( __freeme ) free((void *)__freeme); #endif } void construct(pointer __p, const _Tp& __val) { }; From 83fa038bdfb9fbaf9ecbeca0d7150a7dfc903edb Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 8 Dec 2016 16:58:42 +0000 Subject: [PATCH 03/18] Streaming stores --- lib/Stencil.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Stencil.h b/lib/Stencil.h index 5c3a5ef9..89533b82 100644 --- a/lib/Stencil.h +++ b/lib/Stencil.h @@ -113,7 +113,7 @@ Gather_plane_simple_table (std::vector >& table,const Lattice { PARALLEL_FOR_LOOP for(int i=0;i Date: Thu, 8 Dec 2016 17:00:32 +0000 Subject: [PATCH 04/18] Ready for sim --- benchmarks/Benchmark_mooee.cc | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/benchmarks/Benchmark_mooee.cc b/benchmarks/Benchmark_mooee.cc index dfaea627..e8f0d16b 100644 --- a/benchmarks/Benchmark_mooee.cc +++ b/benchmarks/Benchmark_mooee.cc @@ -113,6 +113,22 @@ int main (int argc, char ** argv) std::cout<Barrier(); \ + t0=usecond(); \ + for(int i=0;iBarrier(); \ + Dw.CayleyReport(); \ + std::cout< Date: Thu, 8 Dec 2016 17:28:28 +0000 Subject: [PATCH 05/18] Lots of debug on performance Mobius --- lib/qcd/action/fermion/CayleyFermion5D.cc | 34 +- lib/qcd/action/fermion/CayleyFermion5D.h | 5 + lib/qcd/action/fermion/CayleyFermion5Dvec.cc | 337 +++++++++++++++---- lib/qcd/action/fermion/FermionOperator.h | 2 + lib/qcd/action/fermion/WilsonFermion.cc | 4 +- lib/qcd/action/fermion/WilsonFermion.h | 3 + lib/qcd/action/fermion/WilsonFermion5D.cc | 3 +- lib/qcd/action/fermion/WilsonFermion5D.h | 3 + 8 files changed, 304 insertions(+), 87 deletions(-) diff --git a/lib/qcd/action/fermion/CayleyFermion5D.cc b/lib/qcd/action/fermion/CayleyFermion5D.cc index b8e98dce..d8978890 100644 --- a/lib/qcd/action/fermion/CayleyFermion5D.cc +++ b/lib/qcd/action/fermion/CayleyFermion5D.cc @@ -54,12 +54,11 @@ template void CayleyFermion5D::Dminus(const FermionField &psi, FermionField &chi) { int Ls=this->Ls; - FermionField tmp(psi._grid); - this->DW(psi,tmp,DaggerNo); + this->DW(psi,this->tmp(),DaggerNo); for(int s=0;stmp(),s,s);// chi = (1-c[s] D_W) psi } } @@ -87,8 +86,8 @@ template void CayleyFermion5D::CayleyReport(void) std::cout << GridLogMessage << "CayleyFermion5D Number of MooeeInv Calls : " << MooeeInvCalls << std::endl; std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls : " << MooeeInvTime / MooeeInvCalls << " us" << std::endl; - // Flops = 9*12*Ls*vol/2 - RealD mflops = 9.0*12*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting + // Flops = MADD * Ls *Ls *4dvol * spin/colour/complex + RealD mflops = 2.0*24*this->Ls*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl; std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl; } @@ -110,12 +109,11 @@ template void CayleyFermion5D::DminusDag(const FermionField &psi, FermionField &chi) { int Ls=this->Ls; - FermionField tmp(psi._grid); - this->DW(psi,tmp,DaggerYes); + this->DW(psi,this->tmp(),DaggerYes); for(int s=0;stmp(),s,s);// chi = (1-c[s] D_W) psi } } template @@ -138,6 +136,7 @@ void CayleyFermion5D::Meooe5D (const FermionField &psi, FermionField &D lower[0] =-mass*lower[0]; M5D(psi,psi,Din,lower,diag,upper); } +// FIXME Redunant with the above routine; check this and eliminate template void CayleyFermion5D::Meo5D (const FermionField &psi, FermionField &chi) { int Ls=this->Ls; @@ -259,36 +258,33 @@ template void CayleyFermion5D::Meooe (const FermionField &psi, FermionField &chi) { int Ls=this->Ls; - FermionField tmp(psi._grid); - Meooe5D(psi,tmp); + Meooe5D(psi,this->tmp()); if ( psi.checkerboard == Odd ) { - this->DhopEO(tmp,chi,DaggerNo); + this->DhopEO(this->tmp(),chi,DaggerNo); } else { - this->DhopOE(tmp,chi,DaggerNo); + this->DhopOE(this->tmp(),chi,DaggerNo); } } template void CayleyFermion5D::MeooeDag (const FermionField &psi, FermionField &chi) { - FermionField tmp(psi._grid); // Apply 4d dslash if ( psi.checkerboard == Odd ) { - this->DhopEO(psi,tmp,DaggerYes); + this->DhopEO(psi,this->tmp(),DaggerYes); } else { - this->DhopOE(psi,tmp,DaggerYes); + this->DhopOE(psi,this->tmp(),DaggerYes); } - MeooeDag5D(tmp,chi); + MeooeDag5D(this->tmp(),chi); } template void CayleyFermion5D::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){ - FermionField tmp(psi._grid); - Meo5D(psi,tmp); + Meo5D(psi,this->tmp()); // Apply 4d dslash fragment - this->DhopDir(tmp,chi,dir,disp); + this->DhopDir(this->tmp(),chi,dir,disp); } // force terms; five routines; default to Dhop on diagonal template diff --git a/lib/qcd/action/fermion/CayleyFermion5D.h b/lib/qcd/action/fermion/CayleyFermion5D.h index 6fb58234..0eb68034 100644 --- a/lib/qcd/action/fermion/CayleyFermion5D.h +++ b/lib/qcd/action/fermion/CayleyFermion5D.h @@ -76,6 +76,11 @@ namespace Grid { std::vector &diag, std::vector &upper); void MooeeInternal(const FermionField &in, FermionField &out,int dag,int inv); + void MooeeInternalAsm(const FermionField &in, FermionField &out, + int LLs, int site, + Vector > &Matp, + Vector > &Matm); + virtual void Instantiatable(void)=0; diff --git a/lib/qcd/action/fermion/CayleyFermion5Dvec.cc b/lib/qcd/action/fermion/CayleyFermion5Dvec.cc index 35a10de2..29f10b0a 100644 --- a/lib/qcd/action/fermion/CayleyFermion5Dvec.cc +++ b/lib/qcd/action/fermion/CayleyFermion5Dvec.cc @@ -34,8 +34,7 @@ Author: paboyle namespace Grid { -namespace QCD { - /* +namespace QCD { /* * Dense matrix versions of routines */ template @@ -126,7 +125,6 @@ PARALLEL_FOR_LOOP for(int v=0;v(hp_00.v); hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v); @@ -165,42 +160,20 @@ PARALLEL_FOR_LOOP hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v); } - /* - if ( ss==0) std::cout << " dphi_00 " <::M5Ddag(const FermionField &psi, M5Dtime-=usecond(); PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss+=LLs){ // adds LLs - +#if 0 alignas(64) SiteHalfSpinor hp; alignas(64) SiteHalfSpinor hm; alignas(64) SiteSpinor fp; @@ -287,9 +260,231 @@ PARALLEL_FOR_LOOP chi[ss+v] = chi[ss+v] +l[v]*fm; } +#else + for(int v=0;v(hp_00.v); + hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v); + hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v); + hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v); + hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v); + hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v); + } + if ( vm>=v ) { + hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v); + hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v); + hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v); + hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v); + hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v); + hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v); + } + + Simd p_00 = real_mult(d[v]()()(), phi[ss+v]()(0)(0)) + real_mult(u[v]()()(),hp_00); + Simd p_01 = real_mult(d[v]()()(), phi[ss+v]()(0)(1)) + real_mult(u[v]()()(),hp_01); + Simd p_02 = real_mult(d[v]()()(), phi[ss+v]()(0)(2)) + real_mult(u[v]()()(),hp_02); + Simd p_10 = real_mult(d[v]()()(), phi[ss+v]()(1)(0)) + real_mult(u[v]()()(),hp_10); + Simd p_11 = real_mult(d[v]()()(), phi[ss+v]()(1)(1)) + real_mult(u[v]()()(),hp_11); + Simd p_12 = real_mult(d[v]()()(), phi[ss+v]()(1)(2)) + real_mult(u[v]()()(),hp_12); + + Simd p_20 = real_mult(d[v]()()(), phi[ss+v]()(2)(0)) + real_mult(l[v]()()(),hm_00); + Simd p_21 = real_mult(d[v]()()(), phi[ss+v]()(2)(1)) + real_mult(l[v]()()(),hm_01); + Simd p_22 = real_mult(d[v]()()(), phi[ss+v]()(2)(2)) + real_mult(l[v]()()(),hm_02); + Simd p_30 = real_mult(d[v]()()(), phi[ss+v]()(3)(0)) + real_mult(l[v]()()(),hm_10); + Simd p_31 = real_mult(d[v]()()(), phi[ss+v]()(3)(1)) + real_mult(l[v]()()(),hm_11); + Simd p_32 = real_mult(d[v]()()(), phi[ss+v]()(3)(2)) + real_mult(l[v]()()(),hm_12); + + vstream(chi[ss+v]()(0)(0),p_00); + vstream(chi[ss+v]()(0)(1),p_01); + vstream(chi[ss+v]()(0)(2),p_02); + vstream(chi[ss+v]()(1)(0),p_10); + vstream(chi[ss+v]()(1)(1),p_11); + vstream(chi[ss+v]()(1)(2),p_12); + vstream(chi[ss+v]()(2)(0),p_20); + vstream(chi[ss+v]()(2)(1),p_21); + vstream(chi[ss+v]()(2)(2),p_22); + vstream(chi[ss+v]()(3)(0),p_30); + vstream(chi[ss+v]()(3)(1),p_31); + vstream(chi[ss+v]()(3)(2),p_32); + } +#endif } M5Dtime+=usecond(); } + + +#include +#include +#include + + +template +void CayleyFermion5D::MooeeInternalAsm(const FermionField &psi, FermionField &chi, + int LLs, int site, + Vector > &Matp, + Vector > &Matm) +{ +#if 0 + { + SiteHalfSpinor BcastP; + SiteHalfSpinor BcastM; + SiteHalfSpinor SiteChiP; + SiteHalfSpinor SiteChiM; + + // Ls*Ls * 2 * 12 * vol flops + for(int s1=0;s1); + for(int s1=0;s1 void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv) { @@ -342,37 +537,38 @@ void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField for(int s1=0;s1 SitePplus(LLs); - Vector SitePminus(LLs); - Vector SiteChiP(LLs); - Vector SiteChiM(LLs); - Vector SiteChi(LLs); - - SiteHalfSpinor BcastP; - SiteHalfSpinor BcastM; + std::vector SitePplus(LLs); + std::vector SitePminus(LLs); + std::vector SiteChiP(LLs); + std::vector SiteChiM(LLs); + std::vector SiteChi(LLs); #pragma omp for for(auto site=0;site::MooeeInternal(const FermionField &psi, FermionField SiteChiP[s1]=SiteChiP[s1]+Matp[LLs*s+s1]*BcastP; SiteChiM[s1]=SiteChiM[s1]+Matm[LLs*s+s1]*BcastM; } - s++; - }} + s++; + } + } for(int s=0;s::MooeeInternal(const FermionField &psi, FermionField accumRecon5m(SiteChi[s],SiteChiM[s]); chi[lex] = SiteChi[s]*0.5; } + }} +#else + PARALLEL_FOR_LOOP + for(auto site=0;site::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid, LebesgueEvenOdd(_cbgrid), Umu(&Fgrid), UmuEven(&Hgrid), - UmuOdd(&Hgrid) { + UmuOdd(&Hgrid), + _tmp(&Hgrid) +{ // Allocate the required comms buffer ImportGauge(_Umu); } diff --git a/lib/qcd/action/fermion/WilsonFermion.h b/lib/qcd/action/fermion/WilsonFermion.h index 40fbd1bf..933be732 100644 --- a/lib/qcd/action/fermion/WilsonFermion.h +++ b/lib/qcd/action/fermion/WilsonFermion.h @@ -58,6 +58,9 @@ class WilsonFermion : public WilsonKernels, public WilsonFermionStatic { GridBase *FermionGrid(void) { return _grid; } GridBase *FermionRedBlackGrid(void) { return _cbgrid; } + FermionField _tmp; + FermionField &tmp(void) { return _tmp; } + ////////////////////////////////////////////////////////////////// // override multiply; cut number routines if pass dagger argument // and also make interface more uniformly consistent diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc index d2ac96e3..d70c98c3 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.cc +++ b/lib/qcd/action/fermion/WilsonFermion5D.cc @@ -60,7 +60,8 @@ WilsonFermion5D::WilsonFermion5D(GaugeField &_Umu, UmuEven(_FourDimRedBlackGrid), UmuOdd (_FourDimRedBlackGrid), Lebesgue(_FourDimGrid), - LebesgueEvenOdd(_FourDimRedBlackGrid) + LebesgueEvenOdd(_FourDimRedBlackGrid), + _tmp(&FiveDimRedBlackGrid) { if (Impl::LsVectorised) { diff --git a/lib/qcd/action/fermion/WilsonFermion5D.h b/lib/qcd/action/fermion/WilsonFermion5D.h index ffb5c58e..fb4fa925 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.h +++ b/lib/qcd/action/fermion/WilsonFermion5D.h @@ -74,6 +74,9 @@ namespace QCD { typedef WilsonKernels Kernels; PmuStat stat; + FermionField _tmp; + FermionField &tmp(void) { return _tmp; } + void Report(void); void ZeroCounters(void); double DhopCalls; From 0091b50f499c17b17208febe24b466aa9cf8ed97 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 9 Dec 2016 22:51:32 +0000 Subject: [PATCH 06/18] Zmobius working -- not asm yet --- lib/qcd/action/fermion/CayleyFermion5D.cc | 86 +++++- lib/qcd/action/fermion/CayleyFermion5D.h | 18 ++ lib/qcd/action/fermion/CayleyFermion5Dvec.cc | 305 ++++++++++++------- 3 files changed, 303 insertions(+), 106 deletions(-) diff --git a/lib/qcd/action/fermion/CayleyFermion5D.cc b/lib/qcd/action/fermion/CayleyFermion5D.cc index d8978890..781380e5 100644 --- a/lib/qcd/action/fermion/CayleyFermion5D.cc +++ b/lib/qcd/action/fermion/CayleyFermion5D.cc @@ -29,6 +29,7 @@ Author: paboyle *************************************************************************************/ /* END LEGAL */ +#include #include @@ -48,7 +49,8 @@ namespace QCD { FourDimGrid, FourDimRedBlackGrid,_M5,p), mass(_mass) - { } + { + } template void CayleyFermion5D::Dminus(const FermionField &psi, FermionField &chi) @@ -455,9 +457,91 @@ void CayleyFermion5D::SetCoefficientsInternal(RealD zolo_hi,std::vectorMooeeInternalCompute(0,inv,MatpInv,MatmInv); + this->MooeeInternalCompute(1,inv,MatpInvDag,MatmInvDag); + } +template +void CayleyFermion5D::MooeeInternalCompute(int dag, int inv, + Vector > & Matp, + Vector > & Matm) +{ + int Ls=this->Ls; + + GridBase *grid = this->FermionRedBlackGrid(); + int LLs = grid->_rdimensions[0]; + + if ( LLs == Ls ) return; // Not vectorised in 5th direction + + Eigen::MatrixXcd Pplus = Eigen::MatrixXcd::Zero(Ls,Ls); + Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls); + + for(int s=0;s::iscomplex() ) { + sp[l] = PplusMat (l*istride+s1*ostride,s2); + sm[l] = PminusMat(l*istride+s1*ostride,s2); + } else { + // if real + scalar_type tmp; + tmp = PplusMat (l*istride+s1*ostride,s2); + sp[l] = scalar_type(tmp.real(),tmp.real()); + tmp = PminusMat(l*istride+s1*ostride,s2); + sm[l] = scalar_type(tmp.real(),tmp.real()); + } + } + Matp[LLs*s2+s1] = Vp; + Matm[LLs*s2+s1] = Vm; + }} +} + FermOpTemplateInstantiate(CayleyFermion5D); GparityFermOpTemplateInstantiate(CayleyFermion5D); diff --git a/lib/qcd/action/fermion/CayleyFermion5D.h b/lib/qcd/action/fermion/CayleyFermion5D.h index 0eb68034..2392fcf0 100644 --- a/lib/qcd/action/fermion/CayleyFermion5D.h +++ b/lib/qcd/action/fermion/CayleyFermion5D.h @@ -33,6 +33,11 @@ namespace Grid { namespace QCD { + template struct switcheroo { static int iscomplex() { return 0; } }; + template<> struct switcheroo { static int iscomplex() { return 1; } }; + template<> struct switcheroo { static int iscomplex() { return 1; } }; + + template class CayleyFermion5D : public WilsonFermion5D { @@ -75,11 +80,18 @@ namespace Grid { std::vector &lower, std::vector &diag, std::vector &upper); + void MooeeInternal(const FermionField &in, FermionField &out,int dag,int inv); + void MooeeInternalCompute(int dag, int inv, Vector > & Matp, Vector > & Matm); + void MooeeInternalAsm(const FermionField &in, FermionField &out, int LLs, int site, Vector > &Matp, Vector > &Matm); + void MooeeInternalZAsm(const FermionField &in, FermionField &out, + int LLs, int site, + Vector > &Matp, + Vector > &Matm); virtual void Instantiatable(void)=0; @@ -117,6 +129,12 @@ namespace Grid { std::vector ueem; std::vector dee; + // Matrices of 5d ee inverse params + Vector > MatpInv; + Vector > MatmInv; + Vector > MatpInvDag; + Vector > MatmInvDag; + // Constructors CayleyFermion5D(GaugeField &_Umu, GridCartesian &FiveDimGrid, diff --git a/lib/qcd/action/fermion/CayleyFermion5Dvec.cc b/lib/qcd/action/fermion/CayleyFermion5Dvec.cc index 29f10b0a..6d07d5de 100644 --- a/lib/qcd/action/fermion/CayleyFermion5Dvec.cc +++ b/lib/qcd/action/fermion/CayleyFermion5Dvec.cc @@ -29,7 +29,7 @@ Author: paboyle *************************************************************************************/ /* END LEGAL */ -#include + #include @@ -343,7 +343,7 @@ void CayleyFermion5D::MooeeInternalAsm(const FermionField &psi, FermionFie Vector > &Matp, Vector > &Matm) { -#if 0 +#ifndef AVX512 { SiteHalfSpinor BcastP; SiteHalfSpinor BcastM; @@ -485,6 +485,177 @@ void CayleyFermion5D::MooeeInternalAsm(const FermionField &psi, FermionFie #endif }; + // Z-mobius version +template +void CayleyFermion5D::MooeeInternalZAsm(const FermionField &psi, FermionField &chi, + int LLs, int site, Vector > &Matp, Vector > &Matm) +{ +#if 1 + { + SiteHalfSpinor BcastP; + SiteHalfSpinor BcastM; + SiteHalfSpinor SiteChiP; + SiteHalfSpinor SiteChiM; + + // Ls*Ls * 2 * 12 * vol flops + for(int s1=0;s1); + for(int s1=0;s1 void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv) { @@ -494,118 +665,41 @@ void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField chi.checkerboard=psi.checkerboard; - Eigen::MatrixXcd Pplus = Eigen::MatrixXcd::Zero(Ls,Ls); - Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls); + Vector > Matp; + Vector > Matm; + Vector > *_Matp; + Vector > *_Matm; - for(int s=0;s > Matp(Ls*LLs); - Vector > Matm(Ls*LLs); + assert(_Matp->size()==Ls*LLs); - for(int s2=0;s2 SitePplus(LLs); - std::vector SitePminus(LLs); - std::vector SiteChiP(LLs); - std::vector SiteChiM(LLs); - std::vector SiteChi(LLs); - -#pragma omp for - for(auto site=0;site::iscomplex() ) { PARALLEL_FOR_LOOP - for(auto site=0;site::MooeeInternal(const Fermion template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); + }} From fe187e9ed3960772417cc49845abb6a18ecdaaf0 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sat, 10 Dec 2016 00:47:48 +0000 Subject: [PATCH 07/18] Compiles and passes under ZMobius with assembler --- lib/qcd/action/fermion/CayleyFermion5Dvec.cc | 194 ++++++++++++------- 1 file changed, 125 insertions(+), 69 deletions(-) diff --git a/lib/qcd/action/fermion/CayleyFermion5Dvec.cc b/lib/qcd/action/fermion/CayleyFermion5Dvec.cc index 6d07d5de..91ab386a 100644 --- a/lib/qcd/action/fermion/CayleyFermion5Dvec.cc +++ b/lib/qcd/action/fermion/CayleyFermion5Dvec.cc @@ -336,7 +336,6 @@ PARALLEL_FOR_LOOP #include #include - template void CayleyFermion5D::MooeeInternalAsm(const FermionField &psi, FermionField &chi, int LLs, int site, @@ -482,6 +481,31 @@ void CayleyFermion5D::MooeeInternalAsm(const FermionField &psi, FermionFie } } } +#undef Chi_00 +#undef Chi_01 +#undef Chi_02 +#undef Chi_10 +#undef Chi_11 +#undef Chi_12 +#undef Chi_20 +#undef Chi_21 +#undef Chi_22 +#undef Chi_30 +#undef Chi_31 +#undef Chi_32 + +#undef BCAST0 +#undef BCAST1 +#undef BCAST2 +#undef BCAST3 +#undef BCAST4 +#undef BCAST5 +#undef BCAST6 +#undef BCAST7 +#undef BCAST8 +#undef BCAST9 +#undef BCAST10 +#undef BCAST11 #endif }; @@ -541,31 +565,31 @@ void CayleyFermion5D::MooeeInternalZAsm(const FermionField &psi, FermionFi { // pointers // MASK_REGS; -#define Chi_00 %%zmm0 -#define Chi_01 %%zmm1 -#define Chi_02 %%zmm2 -#define Chi_10 %%zmm3 -#define Chi_11 %%zmm4 -#define Chi_12 %%zmm5 -#define Chi_20 %%zmm6 -#define Chi_21 %%zmm7 -#define Chi_22 %%zmm8 -#define Chi_30 %%zmm9 -#define Chi_31 %%zmm10 -#define Chi_32 %%zmm11 +#define Chi_00 %zmm0 +#define Chi_01 %zmm1 +#define Chi_02 %zmm2 +#define Chi_10 %zmm3 +#define Chi_11 %zmm4 +#define Chi_12 %zmm5 +#define Chi_20 %zmm6 +#define Chi_21 %zmm7 +#define Chi_22 %zmm8 +#define Chi_30 %zmm9 +#define Chi_31 %zmm10 +#define Chi_32 %zmm11 -#define BCAST0 %%zmm12 -#define BCAST1 %%zmm13 -#define BCAST2 %%zmm14 -#define BCAST3 %%zmm15 -#define BCAST4 %%zmm16 -#define BCAST5 %%zmm17 -#define BCAST6 %%zmm18 -#define BCAST7 %%zmm19 -#define BCAST8 %%zmm20 -#define BCAST9 %%zmm21 -#define BCAST10 %%zmm22 -#define BCAST11 %%zmm23 +#define BCAST0 %zmm12 +#define BCAST1 %zmm13 +#define BCAST2 %zmm14 +#define BCAST3 %zmm15 +#define BCAST4 %zmm16 +#define BCAST5 %zmm17 +#define BCAST6 %zmm18 +#define BCAST7 %zmm19 +#define BCAST8 %zmm20 +#define BCAST9 %zmm21 +#define BCAST10 %zmm22 +#define BCAST11 %zmm23 int incr=LLs*LLs*sizeof(iSinglet); for(int s1=0;s1::MooeeInternalZAsm(const FermionField &psi, FermionFi uint64_t a2 = (uint64_t)&psi[lex]; for(int l=0; l::MooeeInternalZAsm(const FermionField &psi, FermionFi } } } +#undef Chi_00 +#undef Chi_01 +#undef Chi_02 +#undef Chi_10 +#undef Chi_11 +#undef Chi_12 +#undef Chi_20 +#undef Chi_21 +#undef Chi_22 +#undef Chi_30 +#undef Chi_31 +#undef Chi_32 + +#undef BCAST0 +#undef BCAST1 +#undef BCAST2 +#undef BCAST3 +#undef BCAST4 +#undef BCAST5 +#undef BCAST6 +#undef BCAST7 +#undef BCAST8 +#undef BCAST9 +#undef BCAST10 +#undef BCAST11 + #endif }; From 55cb22ad674610d12bff674e6a0d47c0f14be097 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sun, 18 Dec 2016 00:55:37 +0000 Subject: [PATCH 08/18] Z mobius bmark --- benchmarks/Benchmark_mooee.cc | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/benchmarks/Benchmark_mooee.cc b/benchmarks/Benchmark_mooee.cc index e8f0d16b..1e51c9d2 100644 --- a/benchmarks/Benchmark_mooee.cc +++ b/benchmarks/Benchmark_mooee.cc @@ -113,6 +113,20 @@ int main (int argc, char ** argv) std::cout<Barrier(); \ + t0=usecond(); \ + for(int i=0;iBarrier(); \ + zDw.CayleyReport(); \ + std::cout< gamma(Ls,std::complex(1.0,0.0)); + ZMobiusFermionVec5dR zDw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,mass,M5,gamma,b,c); + std::cout<Barrier(); @@ -193,6 +213,9 @@ int main (int argc, char ** argv) BENCH_DW(Mooee ,src_o,r_o); BENCH_DW(MooeeInv,src_o,r_o); + BENCH_ZDW(Mooee ,src_o,r_o); + BENCH_ZDW(MooeeInv,src_o,r_o); + } Grid_finalize(); From fa6acccf556480f1eff84784750c350cc8c3f672 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sun, 18 Dec 2016 00:56:19 +0000 Subject: [PATCH 09/18] Zmobius asm --- lib/qcd/action/fermion/CayleyFermion5Dvec.cc | 193 ++++++++++++------- 1 file changed, 119 insertions(+), 74 deletions(-) diff --git a/lib/qcd/action/fermion/CayleyFermion5Dvec.cc b/lib/qcd/action/fermion/CayleyFermion5Dvec.cc index 91ab386a..38bceafe 100644 --- a/lib/qcd/action/fermion/CayleyFermion5Dvec.cc +++ b/lib/qcd/action/fermion/CayleyFermion5Dvec.cc @@ -514,7 +514,8 @@ template void CayleyFermion5D::MooeeInternalZAsm(const FermionField &psi, FermionField &chi, int LLs, int site, Vector > &Matp, Vector > &Matm) { -#if 1 +#ifndef AVX512 + //#if 0 { SiteHalfSpinor BcastP; SiteHalfSpinor BcastM; @@ -542,12 +543,13 @@ void CayleyFermion5D::MooeeInternalZAsm(const FermionField &psi, FermionFi for(int co=0;co::MooeeInternalZAsm(const FermionField &psi, FermionFi #else { // pointers - // MASK_REGS; + // MASK_REGS; #define Chi_00 %zmm0 #define Chi_01 %zmm1 #define Chi_02 %zmm2 @@ -577,20 +579,37 @@ void CayleyFermion5D::MooeeInternalZAsm(const FermionField &psi, FermionFi #define Chi_30 %zmm9 #define Chi_31 %zmm10 #define Chi_32 %zmm11 +#define pChi_00 %%zmm0 +#define pChi_01 %%zmm1 +#define pChi_02 %%zmm2 +#define pChi_10 %%zmm3 +#define pChi_11 %%zmm4 +#define pChi_12 %%zmm5 +#define pChi_20 %%zmm6 +#define pChi_21 %%zmm7 +#define pChi_22 %%zmm8 +#define pChi_30 %%zmm9 +#define pChi_31 %%zmm10 +#define pChi_32 %%zmm11 -#define BCAST0 %zmm12 -#define BCAST1 %zmm13 -#define BCAST2 %zmm14 -#define BCAST3 %zmm15 -#define BCAST4 %zmm16 -#define BCAST5 %zmm17 -#define BCAST6 %zmm18 -#define BCAST7 %zmm19 -#define BCAST8 %zmm20 -#define BCAST9 %zmm21 -#define BCAST10 %zmm22 -#define BCAST11 %zmm23 +#define BCAST_00 %zmm12 +#define SHUF_00 %zmm13 +#define BCAST_01 %zmm14 +#define SHUF_01 %zmm15 +#define BCAST_02 %zmm16 +#define SHUF_02 %zmm17 +#define BCAST_10 %zmm18 +#define SHUF_10 %zmm19 +#define BCAST_11 %zmm20 +#define SHUF_11 %zmm21 +#define BCAST_12 %zmm22 +#define SHUF_12 %zmm23 +#define Mp %zmm24 +#define Mps %zmm25 +#define Mm %zmm26 +#define Mms %zmm27 +#define N 8 int incr=LLs*LLs*sizeof(iSinglet); for(int s1=0;s1::MooeeInternalZAsm(const FermionField &psi, FermionFi LOAD64(%r9,a1); LOAD64(%r10,a2); asm ( - VPREFETCH1(0,%r10) VPREFETCH1(0,%r9) - VPREFETCH1(12,%r10) VPREFETCH1(13,%r10) - VPREFETCH1(14,%r10) VPREFETCH1(15,%r10) - VBCASTCDUP(0,%r10,BCAST0) VBCASTCDUP(1,%r10,BCAST1) - VBCASTCDUP(2,%r10,BCAST2) VBCASTCDUP(3,%r10,BCAST3) - VBCASTCDUP(4,%r10,BCAST4) VBCASTCDUP(5,%r10,BCAST5) - VBCASTCDUP(6,%r10,BCAST6) VBCASTCDUP(7,%r10,BCAST7) - VBCASTCDUP(8,%r10,BCAST8) VBCASTCDUP(9,%r10,BCAST9) - VBCASTCDUP(10,%r10,BCAST10) VBCASTCDUP(11,%r10,BCAST11) - VMULIDUP (0,%r8,BCAST0,Chi_00) VMULIDUP(0,%r8,BCAST1,Chi_01) // II RI from Mat / Psi - VMULIDUP (0,%r8,BCAST2,Chi_02) VMULIDUP(0,%r8,BCAST3,Chi_10) - VMULIDUP (0,%r8,BCAST4,Chi_11) VMULIDUP(0,%r8,BCAST5,Chi_12) - VMULIDUP (0,%r9,BCAST6,Chi_20) VMULIDUP(0,%r9,BCAST7,Chi_21) - VMULIDUP (0,%r9,BCAST8,Chi_22) VMULIDUP(0,%r9,BCAST9,Chi_30) - VMULIDUP (0,%r9,BCAST10,Chi_31) VMULIDUP(0,%r9,BCAST11,Chi_32) - VSHUF(BCAST0,BCAST0) VSHUF(BCAST1,BCAST1) - VSHUF(BCAST2,BCAST2) VSHUF(BCAST3,BCAST3) - VSHUF(BCAST4,BCAST4) VSHUF(BCAST5,BCAST5) - VSHUF(BCAST6,BCAST6) VSHUF(BCAST7,BCAST7) - VSHUF(BCAST8,BCAST8) VSHUF(BCAST9,BCAST9) - VSHUF(BCAST10,BCAST10) VSHUF(BCAST11,BCAST11) - VMADDSUBRDUP(0,%r8,BCAST0,Chi_00) VMADDSUBRDUP(0,%r8,BCAST1,Chi_01) - VMADDSUBRDUP(0,%r8,BCAST2,Chi_02) VMADDSUBRDUP(0,%r8,BCAST3,Chi_10) - VMADDSUBRDUP(0,%r8,BCAST4,Chi_11) VMADDSUBRDUP(0,%r8,BCAST5,Chi_12) - VMADDSUBRDUP(0,%r9,BCAST6,Chi_20) VMADDSUBRDUP(0,%r9,BCAST7,Chi_21) - VMADDSUBRDUP(0,%r9,BCAST8,Chi_22) VMADDSUBRDUP(0,%r9,BCAST9,Chi_30) - VMADDSUBRDUP(0,%r9,BCAST10,Chi_31) VMADDSUBRDUP(0,%r9,BCAST11,Chi_32) ); + VLOAD(0,%r8,Mp)// i r + VLOAD(0,%r9,Mm) + VSHUF(Mp,Mps) // r i + VSHUF(Mm,Mms) + VPREFETCH1(12,%r10) VPREFETCH1(13,%r10) + VPREFETCH1(14,%r10) VPREFETCH1(15,%r10) + VMULIDUP(0*N,%r10,Mps,Chi_00) + VMULIDUP(1*N,%r10,Mps,Chi_01) + VMULIDUP(2*N,%r10,Mps,Chi_02) + VMULIDUP(3*N,%r10,Mps,Chi_10) + VMULIDUP(4*N,%r10,Mps,Chi_11) + VMULIDUP(5*N,%r10,Mps,Chi_12) + + VMULIDUP(6*N ,%r10,Mms,Chi_20) + VMULIDUP(7*N ,%r10,Mms,Chi_21) + VMULIDUP(8*N ,%r10,Mms,Chi_22) + VMULIDUP(9*N ,%r10,Mms,Chi_30) + VMULIDUP(10*N,%r10,Mms,Chi_31) + VMULIDUP(11*N,%r10,Mms,Chi_32) + + VMADDSUBRDUP(0*N,%r10,Mp,Chi_00) + VMADDSUBRDUP(1*N,%r10,Mp,Chi_01) + VMADDSUBRDUP(2*N,%r10,Mp,Chi_02) + VMADDSUBRDUP(3*N,%r10,Mp,Chi_10) + VMADDSUBRDUP(4*N,%r10,Mp,Chi_11) + VMADDSUBRDUP(5*N,%r10,Mp,Chi_12) + + VMADDSUBRDUP(6*N ,%r10,Mm,Chi_20) + VMADDSUBRDUP(7*N ,%r10,Mm,Chi_21) + VMADDSUBRDUP(8*N ,%r10,Mm,Chi_22) + VMADDSUBRDUP(9*N ,%r10,Mm,Chi_30) + VMADDSUBRDUP(10*N,%r10,Mm,Chi_31) + VMADDSUBRDUP(11*N,%r10,Mm,Chi_32) + ); } else { LOAD64(%r8,a0); LOAD64(%r9,a1); LOAD64(%r10,a2); asm ( - VPREFETCH1(0,%r10) VPREFETCH1(0,%r9) - VPREFETCH1(12,%r10) VPREFETCH1(13,%r10) - VPREFETCH1(14,%r10) VPREFETCH1(15,%r10) - VBCASTCDUP(0,%r10,BCAST0) VBCASTCDUP(1,%r10,BCAST1) - VBCASTCDUP(2,%r10,BCAST2) VBCASTCDUP(3,%r10,BCAST3) - VBCASTCDUP(4,%r10,BCAST4) VBCASTCDUP(5,%r10,BCAST5) - VBCASTCDUP(6,%r10,BCAST6) VBCASTCDUP(7,%r10,BCAST7) - VBCASTCDUP(8,%r10,BCAST8) VBCASTCDUP(9,%r10,BCAST9) - VBCASTCDUP(10,%r10,BCAST10) VBCASTCDUP(11,%r10,BCAST11) - VMADDSUBIDUP (0,%r8,BCAST0,Chi_00) VMADDSUBIDUP(0,%r8,BCAST1,Chi_01) // II RI from Mat / Psi - VMADDSUBIDUP (0,%r8,BCAST2,Chi_02) VMADDSUBIDUP(0,%r8,BCAST3,Chi_10) - VMADDSUBIDUP (0,%r8,BCAST4,Chi_11) VMADDSUBIDUP(0,%r8,BCAST5,Chi_12) - VMADDSUBIDUP (0,%r9,BCAST6,Chi_20) VMADDSUBIDUP(0,%r9,BCAST7,Chi_21) - VMADDSUBIDUP (0,%r9,BCAST8,Chi_22) VMADDSUBIDUP(0,%r9,BCAST9,Chi_30) - VMADDSUBIDUP (0,%r9,BCAST10,Chi_31) VMADDSUBIDUP(0,%r9,BCAST11,Chi_32) - VSHUF(BCAST0,BCAST0) VSHUF(BCAST1,BCAST1) - VSHUF(BCAST2,BCAST2) VSHUF(BCAST3,BCAST3) - VSHUF(BCAST4,BCAST4) VSHUF(BCAST5,BCAST5) - VSHUF(BCAST6,BCAST6) VSHUF(BCAST7,BCAST7) - VSHUF(BCAST8,BCAST8) VSHUF(BCAST9,BCAST9) - VSHUF(BCAST10,BCAST10) VSHUF(BCAST11,BCAST11) - VMADDSUBRDUP(0,%r8,BCAST0,Chi_00) VMADDSUBRDUP(0,%r8,BCAST1,Chi_01) - VMADDSUBRDUP(0,%r8,BCAST2,Chi_02) VMADDSUBRDUP(0,%r8,BCAST3,Chi_10) - VMADDSUBRDUP(0,%r8,BCAST4,Chi_11) VMADDSUBRDUP(0,%r8,BCAST5,Chi_12) - VMADDSUBRDUP(0,%r9,BCAST6,Chi_20) VMADDSUBRDUP(0,%r9,BCAST7,Chi_21) - VMADDSUBRDUP(0,%r9,BCAST8,Chi_22) VMADDSUBRDUP(0,%r9,BCAST9,Chi_30) - VMADDSUBRDUP(0,%r9,BCAST10,Chi_31) VMADDSUBRDUP(0,%r9,BCAST11,Chi_32) - ); + VLOAD(0,%r8,Mp) + VSHUF(Mp,Mps) + + VLOAD(0,%r9,Mm) + VSHUF(Mm,Mms) + + VMADDSUBIDUP(0*N,%r10,Mps,Chi_00) // Mri * Pii +- Cir + VMADDSUBIDUP(1*N,%r10,Mps,Chi_01) + VMADDSUBIDUP(2*N,%r10,Mps,Chi_02) + VMADDSUBIDUP(3*N,%r10,Mps,Chi_10) + VMADDSUBIDUP(4*N,%r10,Mps,Chi_11) + VMADDSUBIDUP(5*N,%r10,Mps,Chi_12) + + VMADDSUBIDUP(6 *N,%r10,Mms,Chi_20) + VMADDSUBIDUP(7 *N,%r10,Mms,Chi_21) + VMADDSUBIDUP(8 *N,%r10,Mms,Chi_22) + VMADDSUBIDUP(9 *N,%r10,Mms,Chi_30) + VMADDSUBIDUP(10*N,%r10,Mms,Chi_31) + VMADDSUBIDUP(11*N,%r10,Mms,Chi_32) + + VMADDSUBRDUP(0*N,%r10,Mp,Chi_00) // Cir = Mir * Prr +- ( Mri * Pii +- Cir) + VMADDSUBRDUP(1*N,%r10,Mp,Chi_01) // Ci = MiPr + Ci + MrPi ; Cr = MrPr - ( MiPi - Cr) + VMADDSUBRDUP(2*N,%r10,Mp,Chi_02) + VMADDSUBRDUP(3*N,%r10,Mp,Chi_10) + VMADDSUBRDUP(4*N,%r10,Mp,Chi_11) + VMADDSUBRDUP(5*N,%r10,Mp,Chi_12) + + VMADDSUBRDUP(6 *N,%r10,Mm,Chi_20) + VMADDSUBRDUP(7 *N,%r10,Mm,Chi_21) + VMADDSUBRDUP(8 *N,%r10,Mm,Chi_22) + VMADDSUBRDUP(9 *N,%r10,Mm,Chi_30) + VMADDSUBRDUP(10*N,%r10,Mm,Chi_31) + VMADDSUBRDUP(11*N,%r10,Mm,Chi_32) + ); } a0 = a0+incr; a1 = a1+incr; @@ -672,13 +704,26 @@ void CayleyFermion5D::MooeeInternalZAsm(const FermionField &psi, FermionFi }} { int lexa = s1+LLs*site; + /* + SiteSpinor tmp; asm ( - VSTORE(0,%0,Chi_00) VSTORE(1 ,%0,Chi_01) VSTORE(2 ,%0,Chi_02) - VSTORE(3,%0,Chi_10) VSTORE(4 ,%0,Chi_11) VSTORE(5 ,%0,Chi_12) - VSTORE(6,%0,Chi_20) VSTORE(7 ,%0,Chi_21) VSTORE(8 ,%0,Chi_22) - VSTORE(9,%0,Chi_30) VSTORE(10,%0,Chi_31) VSTORE(11,%0,Chi_32) + VSTORE(0,%0,pChi_00) VSTORE(1 ,%0,pChi_01) VSTORE(2 ,%0,pChi_02) + VSTORE(3,%0,pChi_10) VSTORE(4 ,%0,pChi_11) VSTORE(5 ,%0,pChi_12) + VSTORE(6,%0,pChi_20) VSTORE(7 ,%0,pChi_21) VSTORE(8 ,%0,pChi_22) + VSTORE(9,%0,pChi_30) VSTORE(10,%0,pChi_31) VSTORE(11,%0,pChi_32) + : : "r" ((uint64_t)&tmp) : "memory" ); + */ + + asm ( + VSTORE(0,%0,pChi_00) VSTORE(1 ,%0,pChi_01) VSTORE(2 ,%0,pChi_02) + VSTORE(3,%0,pChi_10) VSTORE(4 ,%0,pChi_11) VSTORE(5 ,%0,pChi_12) + VSTORE(6,%0,pChi_20) VSTORE(7 ,%0,pChi_21) VSTORE(8 ,%0,pChi_22) + VSTORE(9,%0,pChi_30) VSTORE(10,%0,pChi_31) VSTORE(11,%0,pChi_32) : : "r" ((uint64_t)&chi[lexa]) : "memory" ); + // if ( 1 || (site==0) ) { + // std::cout< Date: Sun, 18 Dec 2016 01:27:34 +0000 Subject: [PATCH 10/18] Bad commit fixed --- lib/simd/Grid_avx.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/simd/Grid_avx.h b/lib/simd/Grid_avx.h index e2729187..724f52bb 100644 --- a/lib/simd/Grid_avx.h +++ b/lib/simd/Grid_avx.h @@ -228,7 +228,7 @@ namespace Optimization { struct MaddRealPart{ inline __m256 operator()(__m256 a, __m256 b, __m256 c){ __m256 ymm0 = _mm256_moveldup_ps(a); // ymm0 <- ar ar, - _mm256_add_ps(_mm256_mul_ps( ymm0, b),c); + return _mm256_add_ps(_mm256_mul_ps( ymm0, b),c); } inline __m256d operator()(__m256d a, __m256d b, __m256d c){ __m256d ymm0 = _mm256_shuffle_pd( a, a, 0x0 ); From 87be03006abe7d6da0ca966bed84e63bbba95a41 Mon Sep 17 00:00:00 2001 From: paboyle Date: Sun, 18 Dec 2016 01:45:09 +0000 Subject: [PATCH 11/18] AVX 512 code broke other compiles; fixing --- lib/qcd/action/fermion/CayleyFermion5Dvec.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/qcd/action/fermion/CayleyFermion5Dvec.cc b/lib/qcd/action/fermion/CayleyFermion5Dvec.cc index 38bceafe..f8c64b91 100644 --- a/lib/qcd/action/fermion/CayleyFermion5Dvec.cc +++ b/lib/qcd/action/fermion/CayleyFermion5Dvec.cc @@ -332,9 +332,11 @@ PARALLEL_FOR_LOOP } +#ifdef AVX512 #include #include #include +#endif template void CayleyFermion5D::MooeeInternalAsm(const FermionField &psi, FermionField &chi, @@ -515,7 +517,6 @@ void CayleyFermion5D::MooeeInternalZAsm(const FermionField &psi, FermionFi int LLs, int site, Vector > &Matp, Vector > &Matm) { #ifndef AVX512 - //#if 0 { SiteHalfSpinor BcastP; SiteHalfSpinor BcastM; From 3e6945cd656e3703eb5042564aec71d8bb9f1b78 Mon Sep 17 00:00:00 2001 From: paboyle Date: Sun, 18 Dec 2016 02:05:11 +0000 Subject: [PATCH 12/18] Fixing AVX Z-mobius --- lib/qcd/action/fermion/CayleyFermion5D.h | 26 ++++++++-- lib/qcd/action/fermion/CayleyFermion5Dvec.cc | 52 ++++++++++---------- 2 files changed, 49 insertions(+), 29 deletions(-) diff --git a/lib/qcd/action/fermion/CayleyFermion5D.h b/lib/qcd/action/fermion/CayleyFermion5D.h index 2392fcf0..86255be6 100644 --- a/lib/qcd/action/fermion/CayleyFermion5D.h +++ b/lib/qcd/action/fermion/CayleyFermion5D.h @@ -33,9 +33,29 @@ namespace Grid { namespace QCD { - template struct switcheroo { static int iscomplex() { return 0; } }; - template<> struct switcheroo { static int iscomplex() { return 1; } }; - template<> struct switcheroo { static int iscomplex() { return 1; } }; + template struct switcheroo { + static inline int iscomplex() { return 0; } + + template + static inline vec mult(vec a, vec b) { + return real_mult(a,b); + } + }; + template<> struct switcheroo { + static inline int iscomplex() { return 1; } + + template + static inline vec mult(vec a, vec b) { + return a*b; + } + }; + template<> struct switcheroo { + static inline int iscomplex() { return 1; } + template + static inline vec mult(vec a, vec b) { + return a*b; + } + }; template diff --git a/lib/qcd/action/fermion/CayleyFermion5Dvec.cc b/lib/qcd/action/fermion/CayleyFermion5Dvec.cc index f8c64b91..ed742ea3 100644 --- a/lib/qcd/action/fermion/CayleyFermion5Dvec.cc +++ b/lib/qcd/action/fermion/CayleyFermion5Dvec.cc @@ -161,18 +161,18 @@ PARALLEL_FOR_LOOP } // Can force these to real arithmetic and save 2x. - Simd p_00 = real_mult(d[v]()()(), phi[ss+v]()(0)(0)) + real_mult(l[v]()()(),hm_00); - Simd p_01 = real_mult(d[v]()()(), phi[ss+v]()(0)(1)) + real_mult(l[v]()()(),hm_01); - Simd p_02 = real_mult(d[v]()()(), phi[ss+v]()(0)(2)) + real_mult(l[v]()()(),hm_02); - Simd p_10 = real_mult(d[v]()()(), phi[ss+v]()(1)(0)) + real_mult(l[v]()()(),hm_10); - Simd p_11 = real_mult(d[v]()()(), phi[ss+v]()(1)(1)) + real_mult(l[v]()()(),hm_11); - Simd p_12 = real_mult(d[v]()()(), phi[ss+v]()(1)(2)) + real_mult(l[v]()()(),hm_12); - Simd p_20 = real_mult(d[v]()()(), phi[ss+v]()(2)(0)) + real_mult(u[v]()()(),hp_00); - Simd p_21 = real_mult(d[v]()()(), phi[ss+v]()(2)(1)) + real_mult(u[v]()()(),hp_01); - Simd p_22 = real_mult(d[v]()()(), phi[ss+v]()(2)(2)) + real_mult(u[v]()()(),hp_02); - Simd p_30 = real_mult(d[v]()()(), phi[ss+v]()(3)(0)) + real_mult(u[v]()()(),hp_10); - Simd p_31 = real_mult(d[v]()()(), phi[ss+v]()(3)(1)) + real_mult(u[v]()()(),hp_11); - Simd p_32 = real_mult(d[v]()()(), phi[ss+v]()(3)(2)) + real_mult(u[v]()()(),hp_12); + Simd p_00 = switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo::mult(l[v]()()(),hm_00); + Simd p_01 = switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo::mult(l[v]()()(),hm_01); + Simd p_02 = switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo::mult(l[v]()()(),hm_02); + Simd p_10 = switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo::mult(l[v]()()(),hm_10); + Simd p_11 = switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo::mult(l[v]()()(),hm_11); + Simd p_12 = switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo::mult(l[v]()()(),hm_12); + Simd p_20 = switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo::mult(u[v]()()(),hp_00); + Simd p_21 = switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo::mult(u[v]()()(),hp_01); + Simd p_22 = switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo::mult(u[v]()()(),hp_02); + Simd p_30 = switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo::mult(u[v]()()(),hp_10); + Simd p_31 = switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo::mult(u[v]()()(),hp_11); + Simd p_32 = switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo::mult(u[v]()()(),hp_12); vstream(chi[ss+v]()(0)(0),p_00); vstream(chi[ss+v]()(0)(1),p_01); @@ -299,19 +299,19 @@ PARALLEL_FOR_LOOP hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v); } - Simd p_00 = real_mult(d[v]()()(), phi[ss+v]()(0)(0)) + real_mult(u[v]()()(),hp_00); - Simd p_01 = real_mult(d[v]()()(), phi[ss+v]()(0)(1)) + real_mult(u[v]()()(),hp_01); - Simd p_02 = real_mult(d[v]()()(), phi[ss+v]()(0)(2)) + real_mult(u[v]()()(),hp_02); - Simd p_10 = real_mult(d[v]()()(), phi[ss+v]()(1)(0)) + real_mult(u[v]()()(),hp_10); - Simd p_11 = real_mult(d[v]()()(), phi[ss+v]()(1)(1)) + real_mult(u[v]()()(),hp_11); - Simd p_12 = real_mult(d[v]()()(), phi[ss+v]()(1)(2)) + real_mult(u[v]()()(),hp_12); + Simd p_00 = switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo::mult(u[v]()()(),hp_00); + Simd p_01 = switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo::mult(u[v]()()(),hp_01); + Simd p_02 = switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo::mult(u[v]()()(),hp_02); + Simd p_10 = switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo::mult(u[v]()()(),hp_10); + Simd p_11 = switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo::mult(u[v]()()(),hp_11); + Simd p_12 = switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo::mult(u[v]()()(),hp_12); - Simd p_20 = real_mult(d[v]()()(), phi[ss+v]()(2)(0)) + real_mult(l[v]()()(),hm_00); - Simd p_21 = real_mult(d[v]()()(), phi[ss+v]()(2)(1)) + real_mult(l[v]()()(),hm_01); - Simd p_22 = real_mult(d[v]()()(), phi[ss+v]()(2)(2)) + real_mult(l[v]()()(),hm_02); - Simd p_30 = real_mult(d[v]()()(), phi[ss+v]()(3)(0)) + real_mult(l[v]()()(),hm_10); - Simd p_31 = real_mult(d[v]()()(), phi[ss+v]()(3)(1)) + real_mult(l[v]()()(),hm_11); - Simd p_32 = real_mult(d[v]()()(), phi[ss+v]()(3)(2)) + real_mult(l[v]()()(),hm_12); + Simd p_20 = switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo::mult(l[v]()()(),hm_00); + Simd p_21 = switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo::mult(l[v]()()(),hm_01); + Simd p_22 = switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo::mult(l[v]()()(),hm_02); + Simd p_30 = switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo::mult(l[v]()()(),hm_10); + Simd p_31 = switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo::mult(l[v]()()(),hm_11); + Simd p_32 = switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo::mult(l[v]()()(),hm_12); vstream(chi[ss+v]()(0)(0),p_00); vstream(chi[ss+v]()(0)(1),p_01); @@ -544,13 +544,13 @@ void CayleyFermion5D::MooeeInternalZAsm(const FermionField &psi, FermionFi for(int co=0;co Date: Sun, 18 Dec 2016 02:07:45 +0000 Subject: [PATCH 13/18] Precision error --- lib/simd/Grid_sse4.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/simd/Grid_sse4.h b/lib/simd/Grid_sse4.h index abd688ab..398a8691 100644 --- a/lib/simd/Grid_sse4.h +++ b/lib/simd/Grid_sse4.h @@ -194,7 +194,7 @@ namespace Optimization { __m128 ymm0 = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar, _mm_add_ps(_mm_mul_ps( ymm0, b),c); } - inline __m128d operator()(__m128d a, __m128d b, __m128 c){ + inline __m128d operator()(__m128d a, __m128d b, __m128d c){ __m128d ymm0 = _mm_shuffle_pd( a, a, 0x0 ); return _mm_add_pd(_mm_mul_pd( ymm0, b),c); } From 629f43e36c14fe60eba53a651cafac7d3862a7fc Mon Sep 17 00:00:00 2001 From: paboyle Date: Sun, 18 Dec 2016 02:09:37 +0000 Subject: [PATCH 14/18] Return statement needed --- lib/simd/Grid_sse4.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/simd/Grid_sse4.h b/lib/simd/Grid_sse4.h index 398a8691..090f0cc9 100644 --- a/lib/simd/Grid_sse4.h +++ b/lib/simd/Grid_sse4.h @@ -192,7 +192,7 @@ namespace Optimization { struct MaddRealPart{ inline __m128 operator()(__m128 a, __m128 b, __m128 c){ __m128 ymm0 = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar, - _mm_add_ps(_mm_mul_ps( ymm0, b),c); + return _mm_add_ps(_mm_mul_ps( ymm0, b),c); } inline __m128d operator()(__m128d a, __m128d b, __m128d c){ __m128d ymm0 = _mm_shuffle_pd( a, a, 0x0 ); From 4b220972ac1d7551c27f0ca8c1d535487530400f Mon Sep 17 00:00:00 2001 From: paboyle Date: Sun, 18 Dec 2016 02:14:17 +0000 Subject: [PATCH 15/18] Warning fix --- lib/simd/Grid_sse4.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/simd/Grid_sse4.h b/lib/simd/Grid_sse4.h index 090f0cc9..943756b2 100644 --- a/lib/simd/Grid_sse4.h +++ b/lib/simd/Grid_sse4.h @@ -348,9 +348,11 @@ namespace Optimization { } } +#ifndef _mm_alignr_epi64 #define _mm_alignr_epi32(a,b,n) _mm_alignr_epi8(a,b,(n*4)%16) #define _mm_alignr_epi64(a,b,n) _mm_alignr_epi8(a,b,(n*8)%16) - +#endif + template static inline __m128 tRotate(__m128 in){ return (__m128)_mm_alignr_epi32((__m128i)in,(__m128i)in,n); }; template static inline __m128d tRotate(__m128d in){ return (__m128d)_mm_alignr_epi64((__m128i)in,(__m128i)in,n); }; From a59f5374d724b40cc08908577acffc3ea6d50a44 Mon Sep 17 00:00:00 2001 From: paboyle Date: Sun, 18 Dec 2016 02:23:55 +0000 Subject: [PATCH 16/18] Evade warning --- lib/PerfCount.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/PerfCount.h b/lib/PerfCount.h index 5ab07c02..749441c5 100644 --- a/lib/PerfCount.h +++ b/lib/PerfCount.h @@ -205,12 +205,13 @@ public: void Stop(void) { count=0; cycles=0; + size_t ign; #ifdef __linux__ if ( fd!= -1) { ::ioctl(fd, PERF_EVENT_IOC_DISABLE, 0); ::ioctl(cyclefd, PERF_EVENT_IOC_DISABLE, 0); - ::read(fd, &count, sizeof(long long)); - ::read(cyclefd, &cycles, sizeof(long long)); + ign=::read(fd, &count, sizeof(long long)); + ign=::read(cyclefd, &cycles, sizeof(long long)); } elapsed = cyclecount() - begin; #else From 8a337f307074a81e429c24ce00403bbe434d302e Mon Sep 17 00:00:00 2001 From: paboyle Date: Sun, 18 Dec 2016 02:35:31 +0000 Subject: [PATCH 17/18] Move cayley into mainstream tests --- tests/{debug => }/Test_cayley_even_odd_vec.cc | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/{debug => }/Test_cayley_even_odd_vec.cc (100%) diff --git a/tests/debug/Test_cayley_even_odd_vec.cc b/tests/Test_cayley_even_odd_vec.cc similarity index 100% rename from tests/debug/Test_cayley_even_odd_vec.cc rename to tests/Test_cayley_even_odd_vec.cc From f8d11ff67315f3030733a74b55cb0ab9a5cf538c Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Tue, 20 Dec 2016 12:31:49 +0100 Subject: [PATCH 18/18] better serialisable enums (can be encapsulated into classes) --- lib/serialisation/BaseIO.h | 39 +--------- lib/serialisation/MacroMagic.h | 130 +++++++++++++++++---------------- 2 files changed, 72 insertions(+), 97 deletions(-) diff --git a/lib/serialisation/BaseIO.h b/lib/serialisation/BaseIO.h index 7761a8e6..1095baf1 100644 --- a/lib/serialisation/BaseIO.h +++ b/lib/serialisation/BaseIO.h @@ -83,12 +83,7 @@ namespace Grid { typename std::enable_if::value, void>::type write(const std::string& s, const U &output); template - typename std::enable_if::value, void>::type - write(const std::string& s, const U &output); - template - typename std::enable_if< - !(std::is_base_of::value or std::is_enum::value), - void>::type + typename std::enable_if::value, void>::type write(const std::string& s, const U &output); private: T *upcast; @@ -107,12 +102,7 @@ namespace Grid { typename std::enable_if::value, void>::type read(const std::string& s, U &output); template - typename std::enable_if::value, void>::type - read(const std::string& s, U &output); - template - typename std::enable_if< - !(std::is_base_of::value or std::is_enum::value), - void>::type + typename std::enable_if::value, void>::type read(const std::string& s, U &output); protected: template @@ -221,17 +211,7 @@ namespace Grid { template template - typename std::enable_if::value, void>::type - Writer::write(const std::string &s, const U &output) - { - EnumIO::write(*this, s, output); - } - - template - template - typename std::enable_if< - !(std::is_base_of::value or std::is_enum::value), - void>::type + typename std::enable_if::value, void>::type Writer::write(const std::string &s, const U &output) { upcast->writeDefault(s, output); @@ -266,17 +246,7 @@ namespace Grid { template template - typename std::enable_if::value, void>::type - Reader::read(const std::string &s, U &output) - { - EnumIO::read(*this, s, output); - } - - template - template - typename std::enable_if< - !(std::is_base_of::value or std::is_enum::value), - void>::type + typename std::enable_if::value, void>::type Reader::read(const std::string &s, U &output) { upcast->readDefault(s, output); @@ -300,7 +270,6 @@ namespace Grid { abort(); } } - } #endif diff --git a/lib/serialisation/MacroMagic.h b/lib/serialisation/MacroMagic.h index c78bba0c..c9137dfe 100644 --- a/lib/serialisation/MacroMagic.h +++ b/lib/serialisation/MacroMagic.h @@ -114,35 +114,33 @@ THE SOFTWARE. #define GRID_MACRO_WRITE_MEMBER(A,B) Grid::write(WR,#B,obj. B); #define GRID_SERIALIZABLE_CLASS_MEMBERS(cname,...) \ - \ - \ - GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_MEMBER,__VA_ARGS__)) \ - \ - \ - template \ - static inline void write(Writer &WR,const std::string &s, const cname &obj){ \ - push(WR,s);\ - GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_WRITE_MEMBER,__VA_ARGS__)) \ - pop(WR);\ - } \ - \ - \ - template \ - static inline void read(Reader &RD,const std::string &s, cname &obj){ \ - push(RD,s);\ - GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_READ_MEMBER,__VA_ARGS__)) \ - pop(RD);\ - } \ - \ - \ - friend inline std::ostream & operator << (std::ostream &os, const cname &obj ) { \ - os<<"class "<<#cname<<" {"<\ +static inline void write(Writer &WR,const std::string &s, const cname &obj){ \ + push(WR,s);\ + GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_WRITE_MEMBER,__VA_ARGS__)) \ + pop(WR);\ +} \ +\ +\ +template \ +static inline void read(Reader &RD,const std::string &s, cname &obj){ \ + push(RD,s);\ + GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_READ_MEMBER,__VA_ARGS__)) \ + pop(RD);\ +} \ +\ +\ +friend inline std::ostream & operator << (std::ostream &os, const cname &obj ) { \ + os<<"class "<<#cname<<" {"<::type #define GRID_MACRO_ENUMVAL(A,B) A = B, @@ -150,44 +148,52 @@ THE SOFTWARE. #define GRID_MACRO_ENUMTEST(A,B) else if (buf == #A) {obj = GRID_ENUM_TYPE(obj)::A;} #define GRID_MACRO_ENUMCASEIO(A,B) case GRID_ENUM_TYPE(obj)::A: os << #A; break; -namespace Grid { - template - class EnumIO {}; -} - #define GRID_SERIALIZABLE_ENUM(name,undefname,...)\ - enum class name {\ - GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMVAL,__VA_ARGS__))\ - undefname = -1\ +class name: public Serializable\ +{\ +public:\ + enum EnumType\ + {\ + GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMVAL,__VA_ARGS__))\ + undefname = -1\ };\ +public:\ + name(void): value_(undefname) {};\ + name(EnumType value): value_(value) {};\ + template \ + static inline void write(Writer &WR,const std::string &s, const name &obj)\ + {\ + switch (obj.value_)\ + {\ + GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMCASE,__VA_ARGS__))\ + default: Grid::write(WR,s,#undefname); break;\ + }\ + }\ \ - template<>\ - class EnumIO {\ - public:\ - template \ - static inline void write(Writer &WR,const std::string &s, const name &obj){ \ - switch (obj) {\ - GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMCASE,__VA_ARGS__))\ - default: Grid::write(WR,s,#undefname); break;\ - }\ - }\ - \ - template \ - static inline void read(Reader &RD,const std::string &s, name &obj){ \ - std::string buf;\ - Grid::read(RD, s, buf);\ - if (buf == #undefname) {obj = name::undefname;}\ - GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMTEST,__VA_ARGS__))\ - else {obj = name::undefname;}\ - }\ - };\ - \ - inline std::ostream & operator << (std::ostream &os, const name &obj ) { \ + template \ + static inline void read(Reader &RD,const std::string &s, name &obj)\ + {\ + std::string buf;\ + Grid::read(RD, s, buf);\ + if (buf == #undefname) {obj = name::undefname;}\ + GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMTEST,__VA_ARGS__))\ + else {obj = name::undefname;}\ + }\ + inline operator EnumType(void) const\ + {\ + return value_;\ + }\ + inline friend std::ostream & operator<<(std::ostream &os, const name &obj)\ + {\ switch (obj) {\ - GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMCASEIO,__VA_ARGS__))\ - default: os << #undefname; break;\ + GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMCASEIO,__VA_ARGS__))\ + default: os << #undefname; break;\ }\ return os;\ - }; + }\ +private:\ + EnumType value_;\ +}; + #endif