mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-10 07:55:35 +00:00
Compiles and passes under ZMobius with assembler
This commit is contained in:
parent
0091b50f49
commit
fe187e9ed3
@ -336,7 +336,6 @@ PARALLEL_FOR_LOOP
|
||||
#include <simd/Intel512avx.h>
|
||||
#include <simd/Intel512single.h>
|
||||
|
||||
|
||||
template<class Impl>
|
||||
void CayleyFermion5D<Impl>::MooeeInternalAsm(const FermionField &psi, FermionField &chi,
|
||||
int LLs, int site,
|
||||
@ -482,6 +481,31 @@ void CayleyFermion5D<Impl>::MooeeInternalAsm(const FermionField &psi, FermionFie
|
||||
}
|
||||
}
|
||||
}
|
||||
#undef Chi_00
|
||||
#undef Chi_01
|
||||
#undef Chi_02
|
||||
#undef Chi_10
|
||||
#undef Chi_11
|
||||
#undef Chi_12
|
||||
#undef Chi_20
|
||||
#undef Chi_21
|
||||
#undef Chi_22
|
||||
#undef Chi_30
|
||||
#undef Chi_31
|
||||
#undef Chi_32
|
||||
|
||||
#undef BCAST0
|
||||
#undef BCAST1
|
||||
#undef BCAST2
|
||||
#undef BCAST3
|
||||
#undef BCAST4
|
||||
#undef BCAST5
|
||||
#undef BCAST6
|
||||
#undef BCAST7
|
||||
#undef BCAST8
|
||||
#undef BCAST9
|
||||
#undef BCAST10
|
||||
#undef BCAST11
|
||||
#endif
|
||||
};
|
||||
|
||||
@ -541,31 +565,31 @@ void CayleyFermion5D<Impl>::MooeeInternalZAsm(const FermionField &psi, FermionFi
|
||||
{
|
||||
// pointers
|
||||
// MASK_REGS;
|
||||
#define Chi_00 %%zmm0
|
||||
#define Chi_01 %%zmm1
|
||||
#define Chi_02 %%zmm2
|
||||
#define Chi_10 %%zmm3
|
||||
#define Chi_11 %%zmm4
|
||||
#define Chi_12 %%zmm5
|
||||
#define Chi_20 %%zmm6
|
||||
#define Chi_21 %%zmm7
|
||||
#define Chi_22 %%zmm8
|
||||
#define Chi_30 %%zmm9
|
||||
#define Chi_31 %%zmm10
|
||||
#define Chi_32 %%zmm11
|
||||
#define Chi_00 %zmm0
|
||||
#define Chi_01 %zmm1
|
||||
#define Chi_02 %zmm2
|
||||
#define Chi_10 %zmm3
|
||||
#define Chi_11 %zmm4
|
||||
#define Chi_12 %zmm5
|
||||
#define Chi_20 %zmm6
|
||||
#define Chi_21 %zmm7
|
||||
#define Chi_22 %zmm8
|
||||
#define Chi_30 %zmm9
|
||||
#define Chi_31 %zmm10
|
||||
#define Chi_32 %zmm11
|
||||
|
||||
#define BCAST0 %%zmm12
|
||||
#define BCAST1 %%zmm13
|
||||
#define BCAST2 %%zmm14
|
||||
#define BCAST3 %%zmm15
|
||||
#define BCAST4 %%zmm16
|
||||
#define BCAST5 %%zmm17
|
||||
#define BCAST6 %%zmm18
|
||||
#define BCAST7 %%zmm19
|
||||
#define BCAST8 %%zmm20
|
||||
#define BCAST9 %%zmm21
|
||||
#define BCAST10 %%zmm22
|
||||
#define BCAST11 %%zmm23
|
||||
#define BCAST0 %zmm12
|
||||
#define BCAST1 %zmm13
|
||||
#define BCAST2 %zmm14
|
||||
#define BCAST3 %zmm15
|
||||
#define BCAST4 %zmm16
|
||||
#define BCAST5 %zmm17
|
||||
#define BCAST6 %zmm18
|
||||
#define BCAST7 %zmm19
|
||||
#define BCAST8 %zmm20
|
||||
#define BCAST9 %zmm21
|
||||
#define BCAST10 %zmm22
|
||||
#define BCAST11 %zmm23
|
||||
|
||||
int incr=LLs*LLs*sizeof(iSinglet<Simd>);
|
||||
for(int s1=0;s1<LLs;s1++){
|
||||
@ -576,65 +600,71 @@ void CayleyFermion5D<Impl>::MooeeInternalZAsm(const FermionField &psi, FermionFi
|
||||
uint64_t a2 = (uint64_t)&psi[lex];
|
||||
for(int l=0; l<Simd::Nsimd();l++){ // simd lane
|
||||
if ( (s2+l)==0 ) {
|
||||
LOAD64(%r8,a0);
|
||||
LOAD64(%r9,a1);
|
||||
LOAD64(%r10,a2);
|
||||
asm (
|
||||
VPREFETCH1(0,%2) VPREFETCH1(0,%1)
|
||||
VPREFETCH1(12,%2) VPREFETCH1(13,%2)
|
||||
VPREFETCH1(14,%2) VPREFETCH1(15,%2)
|
||||
VBCASTCDUP(0,%2,BCAST0) VBCASTCDUP(1,%2,BCAST1)
|
||||
VBCASTCDUP(2,%2,BCAST2) VBCASTCDUP(3,%2,BCAST3)
|
||||
VBCASTCDUP(4,%2,BCAST4) VBCASTCDUP(5,%2,BCAST5)
|
||||
VBCASTCDUP(6,%2,BCAST6) VBCASTCDUP(7,%2,BCAST7)
|
||||
VBCASTCDUP(8,%2,BCAST8) VBCASTCDUP(9,%2,BCAST9)
|
||||
VBCASTCDUP(10,%2,BCAST10) VBCASTCDUP(11,%2,BCAST11)
|
||||
VMULIDUP (0,%0,BCAST0,Chi_00) VMULIDUP(0,%0,BCAST1,Chi_01) // II RI from Mat / Psi
|
||||
VMULIDUP (0,%0,BCAST2,Chi_02) VMULIDUP(0,%0,BCAST3,Chi_10)
|
||||
VMULIDUP (0,%0,BCAST4,Chi_11) VMULIDUP(0,%0,BCAST5,Chi_12)
|
||||
VMULIDUP (0,%0,BCAST6,Chi_20) VMULIDUP(0,%0,BCAST7,Chi_21)
|
||||
VMULIDUP (0,%0,BCAST8,Chi_22) VMULIDUP(0,%0,BCAST9,Chi_30)
|
||||
VMULIDUP (0,%0,BCAST10,Chi_31) VMULIDUP(0,%0,BCAST11,Chi_32)
|
||||
VPREFETCH1(0,%r10) VPREFETCH1(0,%r9)
|
||||
VPREFETCH1(12,%r10) VPREFETCH1(13,%r10)
|
||||
VPREFETCH1(14,%r10) VPREFETCH1(15,%r10)
|
||||
VBCASTCDUP(0,%r10,BCAST0) VBCASTCDUP(1,%r10,BCAST1)
|
||||
VBCASTCDUP(2,%r10,BCAST2) VBCASTCDUP(3,%r10,BCAST3)
|
||||
VBCASTCDUP(4,%r10,BCAST4) VBCASTCDUP(5,%r10,BCAST5)
|
||||
VBCASTCDUP(6,%r10,BCAST6) VBCASTCDUP(7,%r10,BCAST7)
|
||||
VBCASTCDUP(8,%r10,BCAST8) VBCASTCDUP(9,%r10,BCAST9)
|
||||
VBCASTCDUP(10,%r10,BCAST10) VBCASTCDUP(11,%r10,BCAST11)
|
||||
VMULIDUP (0,%r8,BCAST0,Chi_00) VMULIDUP(0,%r8,BCAST1,Chi_01) // II RI from Mat / Psi
|
||||
VMULIDUP (0,%r8,BCAST2,Chi_02) VMULIDUP(0,%r8,BCAST3,Chi_10)
|
||||
VMULIDUP (0,%r8,BCAST4,Chi_11) VMULIDUP(0,%r8,BCAST5,Chi_12)
|
||||
VMULIDUP (0,%r9,BCAST6,Chi_20) VMULIDUP(0,%r9,BCAST7,Chi_21)
|
||||
VMULIDUP (0,%r9,BCAST8,Chi_22) VMULIDUP(0,%r9,BCAST9,Chi_30)
|
||||
VMULIDUP (0,%r9,BCAST10,Chi_31) VMULIDUP(0,%r9,BCAST11,Chi_32)
|
||||
VSHUF(BCAST0,BCAST0) VSHUF(BCAST1,BCAST1)
|
||||
VSHUF(BCAST2,BCAST2) VSHUF(BCAST3,BCAST3)
|
||||
VSHUF(BCAST4,BCAST4) VSHUF(BCAST5,BCAST5)
|
||||
VSHUF(BCAST6,BCAST6) VSHUF(BCAST7,BCAST7)
|
||||
VSHUF(BCAST8,BCAST8) VSHUF(BCAST9,BCAST9)
|
||||
VSHUF(BCAST10,BCAST10) VSHUF(BCAST11,BCAST11)
|
||||
VMADDSUBRDUP(0,%0,BCAST0,Chi_00) VMADDSUBRDUP(0,%0,BCAST1,Chi_01)
|
||||
VMADDSUBRDUP(0,%0,BCAST2,Chi_02) VMADDSUBRDUP(0,%0,BCAST3,Chi_10)
|
||||
VMADDSUBRDUP(0,%0,BCAST4,Chi_11) VMADDSUBRDUP(0,%0,BCAST5,Chi_12)
|
||||
VMADDSUBRDUP(0,%0,BCAST6,Chi_20) VMADDSUBRDUP(0,%0,BCAST7,Chi_21)
|
||||
VMADDSUBRDUP(0,%0,BCAST8,Chi_22) VMADDSUBRDUP(0,%0,BCAST9,Chi_30)
|
||||
VMADDSUBRDUP(0,%0,BCAST10,Chi_31) VMADDSUBRDUP(0,%0,BCAST11,Chi_32)
|
||||
: : "r" (a0), "r" (a1), "r" (a2) );
|
||||
VMADDSUBRDUP(0,%r8,BCAST0,Chi_00) VMADDSUBRDUP(0,%r8,BCAST1,Chi_01)
|
||||
VMADDSUBRDUP(0,%r8,BCAST2,Chi_02) VMADDSUBRDUP(0,%r8,BCAST3,Chi_10)
|
||||
VMADDSUBRDUP(0,%r8,BCAST4,Chi_11) VMADDSUBRDUP(0,%r8,BCAST5,Chi_12)
|
||||
VMADDSUBRDUP(0,%r9,BCAST6,Chi_20) VMADDSUBRDUP(0,%r9,BCAST7,Chi_21)
|
||||
VMADDSUBRDUP(0,%r9,BCAST8,Chi_22) VMADDSUBRDUP(0,%r9,BCAST9,Chi_30)
|
||||
VMADDSUBRDUP(0,%r9,BCAST10,Chi_31) VMADDSUBRDUP(0,%r9,BCAST11,Chi_32) );
|
||||
|
||||
} else {
|
||||
LOAD64(%r8,a0);
|
||||
LOAD64(%r9,a1);
|
||||
LOAD64(%r10,a2);
|
||||
asm (
|
||||
VPREFETCH1(0,%2) VPREFETCH1(0,%1)
|
||||
VPREFETCH1(12,%2) VPREFETCH1(13,%2)
|
||||
VPREFETCH1(14,%2) VPREFETCH1(15,%2)
|
||||
VBCASTCDUP(0,%2,BCAST0) VBCASTCDUP(1,%2,BCAST1)
|
||||
VBCASTCDUP(2,%2,BCAST2) VBCASTCDUP(3,%2,BCAST3)
|
||||
VBCASTCDUP(4,%2,BCAST4) VBCASTCDUP(5,%2,BCAST5)
|
||||
VBCASTCDUP(6,%2,BCAST6) VBCASTCDUP(7,%2,BCAST7)
|
||||
VBCASTCDUP(8,%2,BCAST8) VBCASTCDUP(9,%2,BCAST9)
|
||||
VBCASTCDUP(10,%2,BCAST10) VBCASTCDUP(11,%2,BCAST11)
|
||||
VMADDSUBIDUP (0,%0,BCAST0,Chi_00) VMADDSUBIDUP(0,%0,BCAST1,Chi_01) // II RI from Mat / Psi
|
||||
VMADDSUBIDUP (0,%0,BCAST2,Chi_02) VMADDSUBIDUP(0,%0,BCAST3,Chi_10)
|
||||
VMADDSUBIDUP (0,%0,BCAST4,Chi_11) VMADDSUBIDUP(0,%0,BCAST5,Chi_12)
|
||||
VMADDSUBIDUP (0,%0,BCAST6,Chi_20) VMADDSUBIDUP(0,%0,BCAST7,Chi_21)
|
||||
VMADDSUBIDUP (0,%0,BCAST8,Chi_22) VMADDSUBIDUP(0,%0,BCAST9,Chi_30)
|
||||
VMADDSUBIDUP (0,%0,BCAST10,Chi_31) VMADDSUBIDUP(0,%0,BCAST11,Chi_32)
|
||||
VPREFETCH1(0,%r10) VPREFETCH1(0,%r9)
|
||||
VPREFETCH1(12,%r10) VPREFETCH1(13,%r10)
|
||||
VPREFETCH1(14,%r10) VPREFETCH1(15,%r10)
|
||||
VBCASTCDUP(0,%r10,BCAST0) VBCASTCDUP(1,%r10,BCAST1)
|
||||
VBCASTCDUP(2,%r10,BCAST2) VBCASTCDUP(3,%r10,BCAST3)
|
||||
VBCASTCDUP(4,%r10,BCAST4) VBCASTCDUP(5,%r10,BCAST5)
|
||||
VBCASTCDUP(6,%r10,BCAST6) VBCASTCDUP(7,%r10,BCAST7)
|
||||
VBCASTCDUP(8,%r10,BCAST8) VBCASTCDUP(9,%r10,BCAST9)
|
||||
VBCASTCDUP(10,%r10,BCAST10) VBCASTCDUP(11,%r10,BCAST11)
|
||||
VMADDSUBIDUP (0,%r8,BCAST0,Chi_00) VMADDSUBIDUP(0,%r8,BCAST1,Chi_01) // II RI from Mat / Psi
|
||||
VMADDSUBIDUP (0,%r8,BCAST2,Chi_02) VMADDSUBIDUP(0,%r8,BCAST3,Chi_10)
|
||||
VMADDSUBIDUP (0,%r8,BCAST4,Chi_11) VMADDSUBIDUP(0,%r8,BCAST5,Chi_12)
|
||||
VMADDSUBIDUP (0,%r9,BCAST6,Chi_20) VMADDSUBIDUP(0,%r9,BCAST7,Chi_21)
|
||||
VMADDSUBIDUP (0,%r9,BCAST8,Chi_22) VMADDSUBIDUP(0,%r9,BCAST9,Chi_30)
|
||||
VMADDSUBIDUP (0,%r9,BCAST10,Chi_31) VMADDSUBIDUP(0,%r9,BCAST11,Chi_32)
|
||||
VSHUF(BCAST0,BCAST0) VSHUF(BCAST1,BCAST1)
|
||||
VSHUF(BCAST2,BCAST2) VSHUF(BCAST3,BCAST3)
|
||||
VSHUF(BCAST4,BCAST4) VSHUF(BCAST5,BCAST5)
|
||||
VSHUF(BCAST6,BCAST6) VSHUF(BCAST7,BCAST7)
|
||||
VSHUF(BCAST8,BCAST8) VSHUF(BCAST9,BCAST9)
|
||||
VSHUF(BCAST10,BCAST10) VSHUF(BCAST11,BCAST11)
|
||||
VMADDSUBRDUP(0,%0,BCAST0,Chi_00) VMADDSUBRDUP(0,%0,BCAST1,Chi_01)
|
||||
VMADDSUBRDUP(0,%0,BCAST2,Chi_02) VMADDSUBRDUP(0,%0,BCAST3,Chi_10)
|
||||
VMADDSUBRDUP(0,%0,BCAST4,Chi_11) VMADDSUBRDUP(0,%0,BCAST5,Chi_12)
|
||||
VMADDSUBRDUP(0,%0,BCAST6,Chi_20) VMADDSUBRDUP(0,%0,BCAST7,Chi_21)
|
||||
VMADDSUBRDUP(0,%0,BCAST8,Chi_22) VMADDSUBRDUP(0,%0,BCAST9,Chi_30)
|
||||
VMADDSUBRDUP(0,%0,BCAST10,Chi_31) VMADDSUBRDUP(0,%0,BCAST11,Chi_32)
|
||||
: : "r" (a0), "r" (a1), "r" (a2) );
|
||||
VMADDSUBRDUP(0,%r8,BCAST0,Chi_00) VMADDSUBRDUP(0,%r8,BCAST1,Chi_01)
|
||||
VMADDSUBRDUP(0,%r8,BCAST2,Chi_02) VMADDSUBRDUP(0,%r8,BCAST3,Chi_10)
|
||||
VMADDSUBRDUP(0,%r8,BCAST4,Chi_11) VMADDSUBRDUP(0,%r8,BCAST5,Chi_12)
|
||||
VMADDSUBRDUP(0,%r9,BCAST6,Chi_20) VMADDSUBRDUP(0,%r9,BCAST7,Chi_21)
|
||||
VMADDSUBRDUP(0,%r9,BCAST8,Chi_22) VMADDSUBRDUP(0,%r9,BCAST9,Chi_30)
|
||||
VMADDSUBRDUP(0,%r9,BCAST10,Chi_31) VMADDSUBRDUP(0,%r9,BCAST11,Chi_32)
|
||||
);
|
||||
}
|
||||
a0 = a0+incr;
|
||||
a1 = a1+incr;
|
||||
@ -652,6 +682,32 @@ void CayleyFermion5D<Impl>::MooeeInternalZAsm(const FermionField &psi, FermionFi
|
||||
}
|
||||
}
|
||||
}
|
||||
#undef Chi_00
|
||||
#undef Chi_01
|
||||
#undef Chi_02
|
||||
#undef Chi_10
|
||||
#undef Chi_11
|
||||
#undef Chi_12
|
||||
#undef Chi_20
|
||||
#undef Chi_21
|
||||
#undef Chi_22
|
||||
#undef Chi_30
|
||||
#undef Chi_31
|
||||
#undef Chi_32
|
||||
|
||||
#undef BCAST0
|
||||
#undef BCAST1
|
||||
#undef BCAST2
|
||||
#undef BCAST3
|
||||
#undef BCAST4
|
||||
#undef BCAST5
|
||||
#undef BCAST6
|
||||
#undef BCAST7
|
||||
#undef BCAST8
|
||||
#undef BCAST9
|
||||
#undef BCAST10
|
||||
#undef BCAST11
|
||||
|
||||
#endif
|
||||
};
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user