1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-06-17 07:17:06 +01:00

Enable reordering of the loops in the assembler for cache friendly.

This gets in the way of L2 prefetching however. Do next next link in stencil
prefetching.
This commit is contained in:
paboyle
2016-06-19 11:45:58 -07:00
parent d6737e4bd8
commit 6d58cb2a68
15 changed files with 670 additions and 116 deletions

View File

@ -0,0 +1,163 @@
{
int locala,perma, ptypea;
int localb,permb, ptypeb;
uint64_t basea, baseb;
uint64_t basex;
const uint64_t plocal =(uint64_t) & in._odata[0];
// vComplexF isigns[2] = { signs[0], signs[1] };
vComplexF *isigns = &signs[0];
MASK_REGS;
for(int site=0;site<Ns;site++) {
int sU=lo.Reorder(ssU);
for(int s=0;s<Ls;s++) {
ss=sU*Ls+s;
////////////////////////////////
// Xp
////////////////////////////////
int ent=ss*8;// 2*Ndim
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
PREFETCH_CHIMU(basea);
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
basex = basea;
if ( locala ) {
LOAD64(%r10,isigns);
XM_PROJMEM(basea);
MAYBEPERM(PERMUTE_DIR3,perma);
} else {
LOAD_CHI(basea);
}
{
MULT_2SPIN_DIR_PFXP(Xp,baseb);
}
LOAD64(%r10,isigns);
XM_RECON;
////////////////////////////////
// Yp
////////////////////////////////
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
if ( localb ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
YM_PROJMEM(baseb);
MAYBEPERM(PERMUTE_DIR2,permb);
} else {
LOAD_CHI(baseb);
}
{
MULT_2SPIN_DIR_PFYP(Yp,basea);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
YM_RECON_ACCUM;
////////////////////////////////
// Zp
////////////////////////////////
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
if ( locala ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
ZM_PROJMEM(basea);
MAYBEPERM(PERMUTE_DIR1,perma);
} else {
LOAD_CHI(basea);
}
{
MULT_2SPIN_DIR_PFZP(Zp,baseb);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
ZM_RECON_ACCUM;
////////////////////////////////
// Tp
////////////////////////////////
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
if ( localb ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
TM_PROJMEM(baseb);
MAYBEPERM(PERMUTE_DIR0,permb);
} else {
LOAD_CHI(baseb);
}
{
MULT_2SPIN_DIR_PFTP(Tp,basea);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
TM_RECON_ACCUM;
////////////////////////////////
// Xm
////////////////////////////////
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
if ( locala ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
XP_PROJMEM(basea);
MAYBEPERM(PERMUTE_DIR3,perma);
} else {
LOAD_CHI(basea);
}
{
MULT_2SPIN_DIR_PFXM(Xm,baseb);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
XP_RECON_ACCUM;
////////////////////////////////
// Ym
////////////////////////////////
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
if ( localb ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
YP_PROJMEM(baseb);
MAYBEPERM(PERMUTE_DIR2,permb);
} else {
LOAD_CHI(baseb);
}
{
MULT_2SPIN_DIR_PFYM(Ym,basea);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
YP_RECON_ACCUM;
////////////////////////////////
// Zm
////////////////////////////////
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
if ( locala ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
ZP_PROJMEM(basea);
MAYBEPERM(PERMUTE_DIR1,perma);
} else {
LOAD_CHI(basea);
}
{
MULT_2SPIN_DIR_PFZM(Zm,baseb);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
ZP_RECON_ACCUM;
////////////////////////////////
// Tm
////////////////////////////////
basea = (uint64_t)&out._odata[ss];
if ( localb ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
TP_PROJMEM(baseb);
MAYBEPERM(PERMUTE_DIR0,permb);
} else {
LOAD_CHI(baseb);
}
{
MULT_2SPIN_DIR_PFTM(Tm,basea);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
TP_RECON_ACCUM;
SAVE_RESULT(&out._odata[ss]);
}
ssU++;
}
}