Partial implementation of 4d vectorisation assembler
commit b3e7f600da (parent d4071daf2a)
@@ -29,7 +29,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <Grid.h>
 #include <simd/Intel512common.h>
 #include <simd/Intel512avx.h>
-#include <simd/Intel512single.h>
 
 // Interleave operations from two directions
 // This looks just like a 2 spin multiply and reuse same sequence from the Wilson
@@ -91,6 +90,16 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define T2 %zmm26
 #define T3 %zmm27
 
+#define Z00 %zmm28
+#define Z10 %zmm29
+#define Z1 %zmm30
+#define Z2 %zmm31
+
+#define Z3 Chi_22
+#define Z4 Chi_30
+#define Z5 Chi_31
+#define Z6 Chi_32
+
 #define MULT_ADD_LS(g0,g1,g2,g3) \
 asm ( "movq %0, %%r8 \n\t" \
       "movq %1, %%r9 \n\t" \
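The new names give the two-direction macros below their working set: Z00/Z10 receive the shuffled gauge-link elements for the two interleaved directions, T1/T2 the real-duplicated spinor components, and Z1–Z6 the six imaginary-part partial sums (one pair per colour row). Z3–Z6 alias Chi_22–Chi_32, spinor registers whose contents are no longer live by the time the aliases are written. A per-lane sketch in plain C++ (not Grid code, names illustrative) of the real/imaginary-duplication trick these macros build on:

// Sketch only, not Grid code: per-lane analogue of the VRDUP/VIDUP + multiply-addsub
// pattern used by the macros below.  Names are illustrative.
#include <complex>

static inline std::complex<float>
cmul_accum(std::complex<float> u,    // one gauge-link element
           std::complex<float> chi,  // one colour component of a neighbour spinor
           std::complex<float> acc)  // running accumulator (a UChi component)
{
  float re = chi.real();             // VRDUP: broadcast the real part of chi
  float im = chi.imag();             // VIDUP: broadcast the imaginary part of chi
  // multiply-addsub combine: acc += (re*u_re - im*u_im) + i*(re*u_im + im*u_re) = acc + u*chi
  return acc + std::complex<float>(re * u.real() - im * u.imag(),
                                   re * u.imag() + im * u.real());
}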
@@ -189,37 +198,136 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 VMADDSUBRDUP(5,%r10,Chi_22,UChi_21) VMADDSUBRDUP(5,%r11,Chi_32,UChi_31) \
 VMADDSUBRDUP(8,%r10,Chi_22,UChi_22) VMADDSUBRDUP(8,%r11,Chi_32,UChi_32) );
 
-#define LOAD_CHI(a0,a1,a2,a3) \
+#define MULT_ADD_XYZT(g0,g1) \
+asm ( "movq %0, %%r8 \n\t" \
+"movq %1, %%r9 \n\t" : : "r"(g0), "r"(g1) : "%r8","%r9");\
+__asm__ ( \
+VSHUFMEM(0,%r8,Z00) VSHUFMEM(0,%r9,Z10) \
+VRDUP(Chi_00,T1) VIDUP(Chi_00,Chi_00) \
+VRDUP(Chi_10,T2) VIDUP(Chi_10,Chi_10) \
+VMUL(Z00,Chi_00,Z1) VMUL(Z10,Chi_10,Z2) \
+VSHUFMEM(3,%r8,Z00) VSHUFMEM(3,%r9,Z10) \
+VMUL(Z00,Chi_00,Z3) VMUL(Z10,Chi_10,Z4) \
+VSHUFMEM(6,%r8,Z00) VSHUFMEM(6,%r9,Z10) \
+VMUL(Z00,Chi_00,Z5) VMUL(Z10,Chi_10,Z6) \
+VMADDMEM(0,%r8,T1,UChi_00) VMADDMEM(0,%r8,T2,UChi_10) \
+VMADDMEM(3,%r8,T1,UChi_01) VMADDMEM(3,%r8,T2,UChi_11) \
+VMADDMEM(6,%r8,T1,UChi_02) VMADDMEM(6,%r8,T2,UChi_12) \
+VSHUFMEM(1,%r8,Z00) VSHUFMEM(1,%r9,Z10) \
+VRDUP(Chi_01,T1) VIDUP(Chi_01,Chi_01) \
+VRDUP(Chi_11,T2) VIDUP(Chi_11,Chi_11) \
+VMADD(Z00,Chi_01,Z1) VMADD(Z10,Chi_11,Z2) \
+VSHUFMEM(4,%r8,Z00) VSHUFMEM(4,%r9,Z10) \
+VMADD(Z00,Chi_01,Z3) VMADD(Z10,Chi_11,Z4) \
+VSHUFMEM(7,%r8,Z00) VSHUFMEM(7,%r9,Z10) \
+VMADD(Z00,Chi_01,Z5) VMADD(Z10,Chi_11,Z6) \
+VMADDMEM(1,%r8,T1,UChi_00) VMADDMEM(1,%r8,T2,UChi_10) \
+VMADDMEM(4,%r8,T1,UChi_01) VMADDMEM(4,%r8,T2,UChi_11) \
+VMADDMEM(7,%r8,T1,UChi_02) VMADDMEM(7,%r8,T2,UChi_12) \
+VSHUFMEM(2,%r8,Z00) VSHUFMEM(2,%r9,Z10) \
+VRDUP(Chi_02,T1) VIDUP(Chi_02,Chi_02) \
+VRDUP(Chi_12,T2) VIDUP(Chi_12,Chi_12) \
+VMADD(Z00,Chi_02,Z1) VMADD(Z10,Chi_12,Z2) \
+VSHUFMEM(5,%r8,Z00) VSHUFMEM(5,%r9,Z10) \
+VMADD(Z00,Chi_02,Z3) VMADD(Z10,Chi_12,Z4) \
+VSHUFMEM(8,%r8,Z00) VSHUFMEM(8,%r9,Z10) \
+VMADD(Z00,Chi_02,Z5) VMADD(Z10,Chi_12,Z6) \
+VMADDSUBMEM(2,%r8,T1,Z1) VMADDSUBMEM(2,%r8,T2,Z2) \
+VMADDSUBMEM(5,%r8,T1,Z3) VMADDSUBMEM(5,%r8,T2,Z4) \
+VMADDSUBMEM(8,%r8,T1,Z5) VMADDSUBMEM(8,%r8,T2,Z6) \
+VADD(Z1,UChi_00,UChi_00) VADD(Z2,UChi_10,UChi_10) \
+VADD(Z3,UChi_01,UChi_01) VADD(Z4,UChi_11,UChi_11) \
+VADD(Z5,UChi_02,UChi_02) VADD(Z6,UChi_12,UChi_12) );
+
+
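Taken as a whole, MULT_ADD_XYZT performs two 3×3 complex matrix–vector multiply-accumulates at once, one per interleaved direction: the memory offsets 0–8 walk the nine link elements in row-major order (offset 3*row + col), the T1/T2 products accumulate the real-duplicated contribution directly into UChi_*, and Z1–Z6 collect the imaginary-duplicated contribution, folded in by the closing VMADDSUBMEM/VADD lines. (The direction-1 memory operands of the VMADDMEM/VMADDSUBMEM lines still read through %r8 rather than %r9, presumably part of what the commit message calls a partial implementation.) A scalar sketch of the intended arithmetic, with illustrative names that are not from the Grid sources:

// Sketch only (not Grid code): the per-lane arithmetic one MULT_ADD_XYZT(g0,g1)
// call is intended to perform.  g0 and g1 point at two 3x3 complex gauge links;
// the two neighbour spinors are already resident in Chi_0* and Chi_1*.
#include <complex>
using cplx = std::complex<float>;

void mult_add_xyzt_scalar(const cplx U0[3][3], const cplx U1[3][3],  // links at g0, g1
                          const cplx chi0[3],  const cplx chi1[3],   // Chi_0*, Chi_1*
                          cplx uchi0[3],       cplx uchi1[3])        // UChi_0*, UChi_1*
{
  for (int row = 0; row < 3; ++row)
    for (int col = 0; col < 3; ++col) {      // link element at memory offset 3*row+col
      uchi0[row] += U0[row][col] * chi0[col];
      uchi1[row] += U1[row][col] * chi1[col];
    }
}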
+#define MULT_XYZT(g0,g1) \
+asm ( "movq %0, %%r8 \n\t" \
+"movq %1, %%r9 \n\t" : : "r"(g0), "r"(g1) : "%r8","%r9" ); \
+__asm__ ( \
+VSHUFMEM(0,%r8,Z00) VSHUFMEM(0,%r9,Z10) \
+VRDUP(Chi_00,T1) VIDUP(Chi_00,Chi_00) \
+VRDUP(Chi_10,T2) VIDUP(Chi_10,Chi_10) \
+VMUL(Z00,Chi_00,Z1) VMUL(Z10,Chi_10,Z2) \
+VSHUFMEM(3,%r8,Z00) VSHUFMEM(3,%r9,Z10) \
+VMUL(Z00,Chi_00,Z3) VMUL(Z10,Chi_10,Z4) \
+VSHUFMEM(6,%r8,Z00) VSHUFMEM(6,%r9,Z10) \
+VMUL(Z00,Chi_00,Z5) VMUL(Z10,Chi_10,Z6) \
+VMULMEM(0,%r8,T1,UChi_00) VMULMEM(0,%r8,T2,UChi_10) \
+VMULMEM(3,%r8,T1,UChi_01) VMULMEM(3,%r8,T2,UChi_11) \
+VMULMEM(6,%r8,T1,UChi_02) VMULMEM(6,%r8,T2,UChi_12) \
+VSHUFMEM(1,%r8,Z00) VSHUFMEM(1,%r9,Z10) \
+VRDUP(Chi_01,T1) VIDUP(Chi_01,Chi_01) \
+VRDUP(Chi_11,T2) VIDUP(Chi_11,Chi_11) \
+VMADD(Z00,Chi_01,Z1) VMADD(Z10,Chi_11,Z2) \
+VSHUFMEM(4,%r8,Z00) VSHUFMEM(4,%r9,Z10) \
+VMADD(Z00,Chi_01,Z3) VMADD(Z10,Chi_11,Z4) \
+VSHUFMEM(7,%r8,Z00) VSHUFMEM(7,%r9,Z10) \
+VMADD(Z00,Chi_01,Z5) VMADD(Z10,Chi_11,Z6) \
+VMADDMEM(1,%r8,T1,UChi_00) VMADDMEM(1,%r8,T2,UChi_10) \
+VMADDMEM(4,%r8,T1,UChi_01) VMADDMEM(4,%r8,T2,UChi_11) \
+VMADDMEM(7,%r8,T1,UChi_02) VMADDMEM(7,%r8,T2,UChi_12) \
+VSHUFMEM(2,%r8,Z00) VSHUFMEM(2,%r9,Z10) \
+VRDUP(Chi_02,T1) VIDUP(Chi_02,Chi_02) \
+VRDUP(Chi_12,T2) VIDUP(Chi_12,Chi_12) \
+VMADD(Z00,Chi_02,Z1) VMADD(Z10,Chi_12,Z2) \
+VSHUFMEM(5,%r8,Z00) VSHUFMEM(5,%r9,Z10) \
+VMADD(Z00,Chi_02,Z3) VMADD(Z10,Chi_12,Z4) \
+VSHUFMEM(8,%r8,Z00) VSHUFMEM(8,%r9,Z10) \
+VMADD(Z00,Chi_02,Z5) VMADD(Z10,Chi_12,Z6) \
+VMADDSUBMEM(2,%r8,T1,Z1) VMADDSUBMEM(2,%r8,T2,Z2) \
+VMADDSUBMEM(5,%r8,T1,Z3) VMADDSUBMEM(5,%r8,T2,Z4) \
+VMADDSUBMEM(8,%r8,T1,Z5) VMADDSUBMEM(8,%r8,T2,Z6) \
+VADD(Z1,UChi_00,UChi_00) VADD(Z2,UChi_10,UChi_10) \
+VADD(Z3,UChi_01,UChi_01) VADD(Z4,UChi_11,UChi_11) \
+VADD(Z5,UChi_02,UChi_02) VADD(Z6,UChi_12,UChi_12) );
+
+
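MULT_XYZT is the same instruction sequence as MULT_ADD_XYZT above except that the first contribution to each accumulator is written with VMUL/VMULMEM instead of VMADD/VMADDMEM, so it overwrites UChi_* rather than adding to it; the 4d kernel further down uses it for the first pair of directions and MULT_ADD_XYZT for every later pair. In scalar terms the only difference is:

// uchi[row]  = sum over col of U[row][col] * chi[col];   // MULT_XYZT      (VMUL / VMULMEM)
// uchi[row] += sum over col of U[row][col] * chi[col];   // MULT_ADD_XYZT  (VMADD / VMADDMEM)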
+#define LOAD_CHI(a0,a1,a2,a3) \
 asm ( \
 "movq %0, %%r8 \n\t" \
 VLOAD(0,%%r8,pChi_00) \
 VLOAD(1,%%r8,pChi_01) \
 VLOAD(2,%%r8,pChi_02) \
 : : "r" (a0) : "%r8" ); \
 asm ( \
 "movq %0, %%r8 \n\t" \
 VLOAD(0,%%r8,pChi_10) \
 VLOAD(1,%%r8,pChi_11) \
 VLOAD(2,%%r8,pChi_12) \
 : : "r" (a1) : "%r8" ); \
 asm ( \
 "movq %0, %%r8 \n\t" \
 VLOAD(0,%%r8,pChi_20) \
 VLOAD(1,%%r8,pChi_21) \
 VLOAD(2,%%r8,pChi_22) \
 : : "r" (a2) : "%r8" ); \
 asm ( \
 "movq %0, %%r8 \n\t" \
 VLOAD(0,%%r8,pChi_30) \
 VLOAD(1,%%r8,pChi_31) \
 VLOAD(2,%%r8,pChi_32) \
 : : "r" (a3) : "%r8" );
 
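LOAD_CHI pulls the three colour components of four neighbour spinors, one per direction handled by MULT_LS/MULT_ADD_LS, into the Chi_00–Chi_32 registers (pChi_* being the register names VLOAD takes as strings). An intrinsics sketch of one of the four loads, assuming single precision and that each colour component fills one aligned 64-byte vector, as the unit-stride VLOAD offsets suggest (illustrative code, not from Grid):

// Sketch only: what one quarter of LOAD_CHI(a0,a1,a2,a3) amounts to, written with
// AVX-512 intrinsics.  Assumes 64-byte aligned data and one full vector per colour.
#include <immintrin.h>

struct ChiRegs { __m512 c0, c1, c2; };     // illustrative stand-in for Chi_x0..Chi_x2

static inline ChiRegs load_chi(const void *a)   // a = address of one neighbour spinor
{
  const float *p = static_cast<const float *>(a);
  ChiRegs r;
  r.c0 = _mm512_load_ps(p +  0);   // VLOAD(0,...) : colour component 0
  r.c1 = _mm512_load_ps(p + 16);   // VLOAD(1,...) : colour component 1
  r.c2 = _mm512_load_ps(p + 32);   // VLOAD(2,...) : colour component 2
  return r;
}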
-#define PF_CHI(a0) \
+#define LOAD_CHIa(a0,a1) \
 asm ( \
 "movq %0, %%r8 \n\t" \
-VPREFETCH1(0,%%r8) \
-VPREFETCH1(1,%%r8) \
+VLOAD(0,%%r8,pChi_00) \
+VLOAD(1,%%r8,pChi_01) \
+VLOAD(2,%%r8,pChi_02) \
+: : "r" (a0) : "%r8" ); \
+asm ( \
+"movq %0, %%r8 \n\t" \
+VLOAD(0,%%r8,pChi_10) \
+VLOAD(1,%%r8,pChi_11) \
+VLOAD(2,%%r8,pChi_12) \
+: : "r" (a1) : "%r8" );
+
+#define PF_CHI(a0) \
+asm ( \
+"movq %0, %%r8 \n\t" \
+VPREFETCH1(0,%%r8) \
+VPREFETCH1(1,%%r8) \
 VPREFETCH1(2,%%r8) \
 : : "r" (a0) : "%r8" ); \
 
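The new LOAD_CHIa is the two-address variant of LOAD_CHI used by the 4d kernel below, filling only Chi_0* and Chi_1*. PF_CHI touches the first three cache lines of a neighbour spinor ahead of its load. Assuming VPREFETCH1 issues an L1 prefetch hint over 64-byte lines, the intrinsics equivalent would be roughly:

// Sketch only: an intrinsics analogue of PF_CHI(a0), assuming VPREFETCH1 is an
// L1 prefetch hint and each VLOAD above touches one 64-byte cache line.
#include <xmmintrin.h>

static inline void pf_chi(const void *a0)
{
  const char *p = static_cast<const char *>(a0);
  _mm_prefetch(p +   0, _MM_HINT_T0);   // VPREFETCH1(0,%%r8)
  _mm_prefetch(p +  64, _MM_HINT_T0);   // VPREFETCH1(1,%%r8)
  _mm_prefetch(p + 128, _MM_HINT_T0);   // VPREFETCH1(2,%%r8)
}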
@@ -235,6 +343,17 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 VADD(UChi_00,UChi_30,UChi_00) \
 VADD(UChi_01,UChi_31,UChi_01) \
 VADD(UChi_02,UChi_32,UChi_02) ); \
+asm ( \
+VSTORE(0,%0,pUChi_00) \
+VSTORE(1,%0,pUChi_01) \
+VSTORE(2,%0,pUChi_02) \
+: : "r" (out) : "memory" );
+
+#define REDUCEa(out) \
+asm ( \
+VADD(UChi_00,UChi_10,UChi_00) \
+VADD(UChi_01,UChi_11,UChi_01) \
+VADD(UChi_02,UChi_12,UChi_02) ); \
 asm ( \
 VSTORE(0,%0,pUChi_00) \
 VSTORE(1,%0,pUChi_01) \
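REDUCE (only partially visible in this hunk) folds the four per-direction accumulator sets UChi_0*–UChi_3* together and stores the result; the new REDUCEa does the same for kernels that keep only two accumulator sets, such as the 4d kernel added below. Per colour component the scalar shape is simply (illustrative code, not Grid's):

// Sketch only: the scalar shape of REDUCEa(out), per colour component.
#include <complex>
using cplx = std::complex<float>;

static inline void reduce_a(cplx uchi0[3], const cplx uchi1[3], cplx *out)
{
  for (int c = 0; c < 3; ++c) {
    uchi0[c] += uchi1[c];   // VADD(UChi_0c,UChi_1c,UChi_0c)
    out[c]    = uchi0[c];   // VSTORE(c,%0,pUChi_0c)
  }
}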
@@ -259,27 +378,7 @@ void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
 assert(0);
 
 }
-// This is the single precision 5th direction vectorised kernel
-template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
-DoubledGaugeField &U,
-DoubledGaugeField &UUU,
-SiteSpinor *buf, int sF,
-int sU, const FermionField &in, SiteSpinor &out)
-{
-#ifdef AVX512
-uint64_t gauge0,gauge1,gauge2,gauge3;
-uint64_t addr0,addr1,addr2,addr3;
-
-int o0,o1,o2,o3; // offsets
-int l0,l1,l2,l3; // local
-int p0,p1,p2,p3; // perm
-int ptype;
-StencilEntry *SE0;
-StencilEntry *SE1;
-StencilEntry *SE2;
-StencilEntry *SE3;
-
-// Xp, Yp, Zp, Tp
 #define PREPARE(X,Y,Z,T,skew,UU) \
 SE0=st.GetEntry(ptype,X+skew,sF); \
 o0 = SE0->_offset; \
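PREPARE gathers, for four directions at a time, the stencil entry (its offset, and presumably the local/permute flags that fill l0–l3 and p0–p3), the neighbour addresses addr0–addr3 consumed by LOAD_CHI, and the four gauge-link pointers; skew 0 selects the one-hop links in U and skew 8 the three-hop links in UUU (the Naik term). With the doubled gauge fields already carrying both forward and backward legs, the per-site result the kernels below compute is a plain sum of link times neighbour over sixteen legs, sketched here in scalar form (plain C++, not Grid code):

// Sketch only (not Grid code): the per-site accumulation the assembler kernels implement.
#include <array>
#include <complex>
using cplx = std::complex<float>;
using SU3  = std::array<std::array<cplx,3>,3>;
using Col3 = std::array<cplx,3>;

Col3 dhop_site_scalar(const std::array<SU3,8>  &U,     // one-hop links, both signs
                      const std::array<SU3,8>  &UUU,   // three-hop (Naik) links
                      const std::array<Col3,8> &chi1,  // one-hop neighbour spinors
                      const std::array<Col3,8> &chi3)  // three-hop neighbour spinors
{
  Col3 out{};                                    // zero-initialised accumulator
  for (int leg = 0; leg < 8; ++leg)
    for (int r = 0; r < 3; ++r)
      for (int c = 0; c < 3; ++c) {
        out[r] += U[leg][r][c]   * chi1[leg][c];
        out[r] += UUU[leg][r][c] * chi3[leg][c];
      }
  return out;
}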
@@ -310,6 +409,29 @@ template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl
 gauge2 =(uint64_t)&UU._odata[sU]( Z ); \
 gauge3 =(uint64_t)&UU._odata[sU]( T );
 
+// This is the single precision 5th direction vectorised kernel
+#include <simd/Intel512single.h>
+template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
+DoubledGaugeField &U,
+DoubledGaugeField &UUU,
+SiteSpinor *buf, int sF,
+int sU, const FermionField &in, SiteSpinor &out)
+{
+#ifdef AVX512
+uint64_t gauge0,gauge1,gauge2,gauge3;
+uint64_t addr0,addr1,addr2,addr3;
+
+int o0,o1,o2,o3; // offsets
+int l0,l1,l2,l3; // local
+int p0,p1,p2,p3; // perm
+int ptype;
+StencilEntry *SE0;
+StencilEntry *SE1;
+StencilEntry *SE2;
+StencilEntry *SE3;
+
+// Xp, Yp, Zp, Tp
+
 PREPARE(Xp,Yp,Zp,Tp,0,U);
 LOAD_CHI(addr0,addr1,addr2,addr3);
 MULT_LS(gauge0,gauge1,gauge2,gauge3);
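Only the first group of this kernel's calls falls inside the hunk; the remaining legs are unchanged lines and therefore not shown, but they follow the same sequence that appears in full in the double-precision copy below:

// Call sequence of the Vec5d kernels; only the first group initialises the accumulators.
//   PREPARE(Xp,Yp,Zp,Tp,0,U);   LOAD_CHI(addr0..addr3); MULT_LS(gauge0..gauge3);      // forward one-hop
//   PREPARE(Xm,Ym,Zm,Tm,0,U);   LOAD_CHI(...);          MULT_ADD_LS(...);             // backward one-hop
//   PREPARE(Xp,Yp,Zp,Tp,8,UUU); LOAD_CHI(...);          MULT_ADD_LS(...);             // forward three-hop (Naik)
//   PREPARE(Xm,Ym,Zm,Tm,8,UUU); LOAD_CHI(...);          MULT_ADD_LS(...);             // backward three-hop (Naik)
//   addr0 = (uint64_t)&out;  REDUCE(addr0);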
@@ -333,6 +455,108 @@ template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl
 #endif
 }
 
+// This is the double precision 5th direction vectorised kernel
+#include <simd/Intel512double.h>
+template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
+DoubledGaugeField &U,
+DoubledGaugeField &UUU,
+SiteSpinor *buf, int sF,
+int sU, const FermionField &in, SiteSpinor &out)
+{
+#ifdef AVX512
+uint64_t gauge0,gauge1,gauge2,gauge3;
+uint64_t addr0,addr1,addr2,addr3;
+
+int o0,o1,o2,o3; // offsets
+int l0,l1,l2,l3; // local
+int p0,p1,p2,p3; // perm
+int ptype;
+StencilEntry *SE0;
+StencilEntry *SE1;
+StencilEntry *SE2;
+StencilEntry *SE3;
+
+// Xp, Yp, Zp, Tp
+
+PREPARE(Xp,Yp,Zp,Tp,0,U);
+LOAD_CHI(addr0,addr1,addr2,addr3);
+MULT_LS(gauge0,gauge1,gauge2,gauge3);
+
+PREPARE(Xm,Ym,Zm,Tm,0,U);
+LOAD_CHI(addr0,addr1,addr2,addr3);
+MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
+
+PREPARE(Xp,Yp,Zp,Tp,8,UUU);
+LOAD_CHI(addr0,addr1,addr2,addr3);
+MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
+
+PREPARE(Xm,Ym,Zm,Tm,8,UUU);
+LOAD_CHI(addr0,addr1,addr2,addr3);
+MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
+
+addr0 = (uint64_t) &out;
+REDUCE(addr0);
+#else
+assert(0);
+#endif
+}
+
+
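Each specialisation is now preceded by a re-include of the matching macro header: Intel512single.h before the float kernels, Intel512double.h before the double kernel above. These headers appear to be meant for repeated inclusion, redefining VLOAD, VMUL, VMADD and the other V* macros for the selected word size each time, which would be why the unconditional include of Intel512single.h at the top of the file was dropped in the first hunk. Schematically (not a literal quote from the file):

// Schematic of the precision-switch pattern used in this file:
#include <simd/Intel512single.h>   // V* macros now expand to single precision operations
template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(/*...*/) { /* asm kernel */ }

#include <simd/Intel512double.h>   // same macro names, redefined for double precision
template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(/*...*/) { /* asm kernel */ }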
+// This is the single precision 4d vectorised kernel
+#include <simd/Intel512single.h>
+template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
+DoubledGaugeField &U,
+DoubledGaugeField &UUU,
+SiteSpinor *buf, int sF,
+int sU, const FermionField &in, SiteSpinor &out)
+{
+#ifdef AVX512
+uint64_t gauge0,gauge1,gauge2,gauge3;
+uint64_t addr0,addr1,addr2,addr3;
+
+int o0,o1,o2,o3; // offsets
+int l0,l1,l2,l3; // local
+int p0,p1,p2,p3; // perm
+int ptype;
+StencilEntry *SE0;
+StencilEntry *SE1;
+StencilEntry *SE2;
+StencilEntry *SE3;
+
+// Xp, Yp, Zp, Tp
+
+PREPARE(Xp,Yp,Zp,Tp,0,U);
+LOAD_CHIa(addr0,addr1);
+MULT_XYZT(gauge0,gauge1);
+LOAD_CHIa(addr2,addr3);
+MULT_XYZT(gauge2,gauge3);
+
+PREPARE(Xm,Ym,Zm,Tm,0,U);
+LOAD_CHIa(addr0,addr1);
+MULT_ADD_XYZT(gauge0,gauge1);
+LOAD_CHIa(addr2,addr3);
+MULT_ADD_XYZT(gauge2,gauge3);
+
+PREPARE(Xp,Yp,Zp,Tp,8,UUU);
+LOAD_CHIa(addr0,addr1);
+MULT_ADD_XYZT(gauge0,gauge1);
+LOAD_CHIa(addr2,addr3);
+MULT_ADD_XYZT(gauge2,gauge3);
+
+PREPARE(Xm,Ym,Zm,Tm,8,UUU);
+LOAD_CHIa(addr0,addr1);
+MULT_ADD_XYZT(gauge0,gauge1);
+LOAD_CHIa(addr2,addr3);
+MULT_ADD_XYZT(gauge2,gauge3);
+
+addr0 = (uint64_t) &out;
+REDUCEa(addr0);
+#else
+assert(0);
+#endif
+}
+
+
 
 FermOpStaggeredTemplateInstantiate(StaggeredKernels);
 FermOpStaggeredVec5dTemplateInstantiate(StaggeredKernels);
 