1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-11-10 07:55:35 +00:00

Compiles and passes under ZMobius with assembler

This commit is contained in:
Peter Boyle 2016-12-10 00:47:48 +00:00
parent 0091b50f49
commit fe187e9ed3

View File

@ -336,7 +336,6 @@ PARALLEL_FOR_LOOP
#include <simd/Intel512avx.h>
#include <simd/Intel512single.h>
template<class Impl>
void CayleyFermion5D<Impl>::MooeeInternalAsm(const FermionField &psi, FermionField &chi,
int LLs, int site,
@ -482,6 +481,31 @@ void CayleyFermion5D<Impl>::MooeeInternalAsm(const FermionField &psi, FermionFie
}
}
}
#undef Chi_00
#undef Chi_01
#undef Chi_02
#undef Chi_10
#undef Chi_11
#undef Chi_12
#undef Chi_20
#undef Chi_21
#undef Chi_22
#undef Chi_30
#undef Chi_31
#undef Chi_32
#undef BCAST0
#undef BCAST1
#undef BCAST2
#undef BCAST3
#undef BCAST4
#undef BCAST5
#undef BCAST6
#undef BCAST7
#undef BCAST8
#undef BCAST9
#undef BCAST10
#undef BCAST11
#endif
};
@ -541,31 +565,31 @@ void CayleyFermion5D<Impl>::MooeeInternalZAsm(const FermionField &psi, FermionFi
{
// pointers
// MASK_REGS;
#define Chi_00 %%zmm0
#define Chi_01 %%zmm1
#define Chi_02 %%zmm2
#define Chi_10 %%zmm3
#define Chi_11 %%zmm4
#define Chi_12 %%zmm5
#define Chi_20 %%zmm6
#define Chi_21 %%zmm7
#define Chi_22 %%zmm8
#define Chi_30 %%zmm9
#define Chi_31 %%zmm10
#define Chi_32 %%zmm11
#define Chi_00 %zmm0
#define Chi_01 %zmm1
#define Chi_02 %zmm2
#define Chi_10 %zmm3
#define Chi_11 %zmm4
#define Chi_12 %zmm5
#define Chi_20 %zmm6
#define Chi_21 %zmm7
#define Chi_22 %zmm8
#define Chi_30 %zmm9
#define Chi_31 %zmm10
#define Chi_32 %zmm11
#define BCAST0 %%zmm12
#define BCAST1 %%zmm13
#define BCAST2 %%zmm14
#define BCAST3 %%zmm15
#define BCAST4 %%zmm16
#define BCAST5 %%zmm17
#define BCAST6 %%zmm18
#define BCAST7 %%zmm19
#define BCAST8 %%zmm20
#define BCAST9 %%zmm21
#define BCAST10 %%zmm22
#define BCAST11 %%zmm23
#define BCAST0 %zmm12
#define BCAST1 %zmm13
#define BCAST2 %zmm14
#define BCAST3 %zmm15
#define BCAST4 %zmm16
#define BCAST5 %zmm17
#define BCAST6 %zmm18
#define BCAST7 %zmm19
#define BCAST8 %zmm20
#define BCAST9 %zmm21
#define BCAST10 %zmm22
#define BCAST11 %zmm23
int incr=LLs*LLs*sizeof(iSinglet<Simd>);
for(int s1=0;s1<LLs;s1++){
@ -576,65 +600,71 @@ void CayleyFermion5D<Impl>::MooeeInternalZAsm(const FermionField &psi, FermionFi
uint64_t a2 = (uint64_t)&psi[lex];
for(int l=0; l<Simd::Nsimd();l++){ // simd lane
if ( (s2+l)==0 ) {
LOAD64(%r8,a0);
LOAD64(%r9,a1);
LOAD64(%r10,a2);
asm (
VPREFETCH1(0,%2) VPREFETCH1(0,%1)
VPREFETCH1(12,%2) VPREFETCH1(13,%2)
VPREFETCH1(14,%2) VPREFETCH1(15,%2)
VBCASTCDUP(0,%2,BCAST0) VBCASTCDUP(1,%2,BCAST1)
VBCASTCDUP(2,%2,BCAST2) VBCASTCDUP(3,%2,BCAST3)
VBCASTCDUP(4,%2,BCAST4) VBCASTCDUP(5,%2,BCAST5)
VBCASTCDUP(6,%2,BCAST6) VBCASTCDUP(7,%2,BCAST7)
VBCASTCDUP(8,%2,BCAST8) VBCASTCDUP(9,%2,BCAST9)
VBCASTCDUP(10,%2,BCAST10) VBCASTCDUP(11,%2,BCAST11)
VMULIDUP (0,%0,BCAST0,Chi_00) VMULIDUP(0,%0,BCAST1,Chi_01) // II RI from Mat / Psi
VMULIDUP (0,%0,BCAST2,Chi_02) VMULIDUP(0,%0,BCAST3,Chi_10)
VMULIDUP (0,%0,BCAST4,Chi_11) VMULIDUP(0,%0,BCAST5,Chi_12)
VMULIDUP (0,%0,BCAST6,Chi_20) VMULIDUP(0,%0,BCAST7,Chi_21)
VMULIDUP (0,%0,BCAST8,Chi_22) VMULIDUP(0,%0,BCAST9,Chi_30)
VMULIDUP (0,%0,BCAST10,Chi_31) VMULIDUP(0,%0,BCAST11,Chi_32)
VPREFETCH1(0,%r10) VPREFETCH1(0,%r9)
VPREFETCH1(12,%r10) VPREFETCH1(13,%r10)
VPREFETCH1(14,%r10) VPREFETCH1(15,%r10)
VBCASTCDUP(0,%r10,BCAST0) VBCASTCDUP(1,%r10,BCAST1)
VBCASTCDUP(2,%r10,BCAST2) VBCASTCDUP(3,%r10,BCAST3)
VBCASTCDUP(4,%r10,BCAST4) VBCASTCDUP(5,%r10,BCAST5)
VBCASTCDUP(6,%r10,BCAST6) VBCASTCDUP(7,%r10,BCAST7)
VBCASTCDUP(8,%r10,BCAST8) VBCASTCDUP(9,%r10,BCAST9)
VBCASTCDUP(10,%r10,BCAST10) VBCASTCDUP(11,%r10,BCAST11)
VMULIDUP (0,%r8,BCAST0,Chi_00) VMULIDUP(0,%r8,BCAST1,Chi_01) // II RI from Mat / Psi
VMULIDUP (0,%r8,BCAST2,Chi_02) VMULIDUP(0,%r8,BCAST3,Chi_10)
VMULIDUP (0,%r8,BCAST4,Chi_11) VMULIDUP(0,%r8,BCAST5,Chi_12)
VMULIDUP (0,%r9,BCAST6,Chi_20) VMULIDUP(0,%r9,BCAST7,Chi_21)
VMULIDUP (0,%r9,BCAST8,Chi_22) VMULIDUP(0,%r9,BCAST9,Chi_30)
VMULIDUP (0,%r9,BCAST10,Chi_31) VMULIDUP(0,%r9,BCAST11,Chi_32)
VSHUF(BCAST0,BCAST0) VSHUF(BCAST1,BCAST1)
VSHUF(BCAST2,BCAST2) VSHUF(BCAST3,BCAST3)
VSHUF(BCAST4,BCAST4) VSHUF(BCAST5,BCAST5)
VSHUF(BCAST6,BCAST6) VSHUF(BCAST7,BCAST7)
VSHUF(BCAST8,BCAST8) VSHUF(BCAST9,BCAST9)
VSHUF(BCAST10,BCAST10) VSHUF(BCAST11,BCAST11)
VMADDSUBRDUP(0,%0,BCAST0,Chi_00) VMADDSUBRDUP(0,%0,BCAST1,Chi_01)
VMADDSUBRDUP(0,%0,BCAST2,Chi_02) VMADDSUBRDUP(0,%0,BCAST3,Chi_10)
VMADDSUBRDUP(0,%0,BCAST4,Chi_11) VMADDSUBRDUP(0,%0,BCAST5,Chi_12)
VMADDSUBRDUP(0,%0,BCAST6,Chi_20) VMADDSUBRDUP(0,%0,BCAST7,Chi_21)
VMADDSUBRDUP(0,%0,BCAST8,Chi_22) VMADDSUBRDUP(0,%0,BCAST9,Chi_30)
VMADDSUBRDUP(0,%0,BCAST10,Chi_31) VMADDSUBRDUP(0,%0,BCAST11,Chi_32)
: : "r" (a0), "r" (a1), "r" (a2) );
VMADDSUBRDUP(0,%r8,BCAST0,Chi_00) VMADDSUBRDUP(0,%r8,BCAST1,Chi_01)
VMADDSUBRDUP(0,%r8,BCAST2,Chi_02) VMADDSUBRDUP(0,%r8,BCAST3,Chi_10)
VMADDSUBRDUP(0,%r8,BCAST4,Chi_11) VMADDSUBRDUP(0,%r8,BCAST5,Chi_12)
VMADDSUBRDUP(0,%r9,BCAST6,Chi_20) VMADDSUBRDUP(0,%r9,BCAST7,Chi_21)
VMADDSUBRDUP(0,%r9,BCAST8,Chi_22) VMADDSUBRDUP(0,%r9,BCAST9,Chi_30)
VMADDSUBRDUP(0,%r9,BCAST10,Chi_31) VMADDSUBRDUP(0,%r9,BCAST11,Chi_32) );
} else {
LOAD64(%r8,a0);
LOAD64(%r9,a1);
LOAD64(%r10,a2);
asm (
VPREFETCH1(0,%2) VPREFETCH1(0,%1)
VPREFETCH1(12,%2) VPREFETCH1(13,%2)
VPREFETCH1(14,%2) VPREFETCH1(15,%2)
VBCASTCDUP(0,%2,BCAST0) VBCASTCDUP(1,%2,BCAST1)
VBCASTCDUP(2,%2,BCAST2) VBCASTCDUP(3,%2,BCAST3)
VBCASTCDUP(4,%2,BCAST4) VBCASTCDUP(5,%2,BCAST5)
VBCASTCDUP(6,%2,BCAST6) VBCASTCDUP(7,%2,BCAST7)
VBCASTCDUP(8,%2,BCAST8) VBCASTCDUP(9,%2,BCAST9)
VBCASTCDUP(10,%2,BCAST10) VBCASTCDUP(11,%2,BCAST11)
VMADDSUBIDUP (0,%0,BCAST0,Chi_00) VMADDSUBIDUP(0,%0,BCAST1,Chi_01) // II RI from Mat / Psi
VMADDSUBIDUP (0,%0,BCAST2,Chi_02) VMADDSUBIDUP(0,%0,BCAST3,Chi_10)
VMADDSUBIDUP (0,%0,BCAST4,Chi_11) VMADDSUBIDUP(0,%0,BCAST5,Chi_12)
VMADDSUBIDUP (0,%0,BCAST6,Chi_20) VMADDSUBIDUP(0,%0,BCAST7,Chi_21)
VMADDSUBIDUP (0,%0,BCAST8,Chi_22) VMADDSUBIDUP(0,%0,BCAST9,Chi_30)
VMADDSUBIDUP (0,%0,BCAST10,Chi_31) VMADDSUBIDUP(0,%0,BCAST11,Chi_32)
VPREFETCH1(0,%r10) VPREFETCH1(0,%r9)
VPREFETCH1(12,%r10) VPREFETCH1(13,%r10)
VPREFETCH1(14,%r10) VPREFETCH1(15,%r10)
VBCASTCDUP(0,%r10,BCAST0) VBCASTCDUP(1,%r10,BCAST1)
VBCASTCDUP(2,%r10,BCAST2) VBCASTCDUP(3,%r10,BCAST3)
VBCASTCDUP(4,%r10,BCAST4) VBCASTCDUP(5,%r10,BCAST5)
VBCASTCDUP(6,%r10,BCAST6) VBCASTCDUP(7,%r10,BCAST7)
VBCASTCDUP(8,%r10,BCAST8) VBCASTCDUP(9,%r10,BCAST9)
VBCASTCDUP(10,%r10,BCAST10) VBCASTCDUP(11,%r10,BCAST11)
VMADDSUBIDUP (0,%r8,BCAST0,Chi_00) VMADDSUBIDUP(0,%r8,BCAST1,Chi_01) // II RI from Mat / Psi
VMADDSUBIDUP (0,%r8,BCAST2,Chi_02) VMADDSUBIDUP(0,%r8,BCAST3,Chi_10)
VMADDSUBIDUP (0,%r8,BCAST4,Chi_11) VMADDSUBIDUP(0,%r8,BCAST5,Chi_12)
VMADDSUBIDUP (0,%r9,BCAST6,Chi_20) VMADDSUBIDUP(0,%r9,BCAST7,Chi_21)
VMADDSUBIDUP (0,%r9,BCAST8,Chi_22) VMADDSUBIDUP(0,%r9,BCAST9,Chi_30)
VMADDSUBIDUP (0,%r9,BCAST10,Chi_31) VMADDSUBIDUP(0,%r9,BCAST11,Chi_32)
VSHUF(BCAST0,BCAST0) VSHUF(BCAST1,BCAST1)
VSHUF(BCAST2,BCAST2) VSHUF(BCAST3,BCAST3)
VSHUF(BCAST4,BCAST4) VSHUF(BCAST5,BCAST5)
VSHUF(BCAST6,BCAST6) VSHUF(BCAST7,BCAST7)
VSHUF(BCAST8,BCAST8) VSHUF(BCAST9,BCAST9)
VSHUF(BCAST10,BCAST10) VSHUF(BCAST11,BCAST11)
VMADDSUBRDUP(0,%0,BCAST0,Chi_00) VMADDSUBRDUP(0,%0,BCAST1,Chi_01)
VMADDSUBRDUP(0,%0,BCAST2,Chi_02) VMADDSUBRDUP(0,%0,BCAST3,Chi_10)
VMADDSUBRDUP(0,%0,BCAST4,Chi_11) VMADDSUBRDUP(0,%0,BCAST5,Chi_12)
VMADDSUBRDUP(0,%0,BCAST6,Chi_20) VMADDSUBRDUP(0,%0,BCAST7,Chi_21)
VMADDSUBRDUP(0,%0,BCAST8,Chi_22) VMADDSUBRDUP(0,%0,BCAST9,Chi_30)
VMADDSUBRDUP(0,%0,BCAST10,Chi_31) VMADDSUBRDUP(0,%0,BCAST11,Chi_32)
: : "r" (a0), "r" (a1), "r" (a2) );
VMADDSUBRDUP(0,%r8,BCAST0,Chi_00) VMADDSUBRDUP(0,%r8,BCAST1,Chi_01)
VMADDSUBRDUP(0,%r8,BCAST2,Chi_02) VMADDSUBRDUP(0,%r8,BCAST3,Chi_10)
VMADDSUBRDUP(0,%r8,BCAST4,Chi_11) VMADDSUBRDUP(0,%r8,BCAST5,Chi_12)
VMADDSUBRDUP(0,%r9,BCAST6,Chi_20) VMADDSUBRDUP(0,%r9,BCAST7,Chi_21)
VMADDSUBRDUP(0,%r9,BCAST8,Chi_22) VMADDSUBRDUP(0,%r9,BCAST9,Chi_30)
VMADDSUBRDUP(0,%r9,BCAST10,Chi_31) VMADDSUBRDUP(0,%r9,BCAST11,Chi_32)
);
}
a0 = a0+incr;
a1 = a1+incr;
@ -652,6 +682,32 @@ void CayleyFermion5D<Impl>::MooeeInternalZAsm(const FermionField &psi, FermionFi
}
}
}
#undef Chi_00
#undef Chi_01
#undef Chi_02
#undef Chi_10
#undef Chi_11
#undef Chi_12
#undef Chi_20
#undef Chi_21
#undef Chi_22
#undef Chi_30
#undef Chi_31
#undef Chi_32
#undef BCAST0
#undef BCAST1
#undef BCAST2
#undef BCAST3
#undef BCAST4
#undef BCAST5
#undef BCAST6
#undef BCAST7
#undef BCAST8
#undef BCAST9
#undef BCAST10
#undef BCAST11
#endif
};