1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-06-18 07:47:06 +01:00

Enable reordering of the loops in the assembler for cache friendly.

This gets in the way of L2 prefetching however. Do next next link in stencil
prefetching.
This commit is contained in:
paboyle
2016-06-19 11:45:58 -07:00
parent 87418e7df1
commit 1b7f88dd00
15 changed files with 670 additions and 116 deletions

View File

@ -53,6 +53,8 @@ namespace QCD {
StencilEven(&Hgrid,npoint,Even,directions,displacements), // source is Even
StencilOdd (&Hgrid,npoint,Odd ,directions,displacements), // source is Odd
mass(_mass),
Lebesgue(_grid),
LebesgueEvenOdd(_cbgrid),
Umu(&Fgrid),
UmuEven(&Hgrid),
UmuOdd (&Hgrid)
@ -228,7 +230,7 @@ PARALLEL_FOR_LOOP
out.checkerboard = in.checkerboard;
DhopInternal(Stencil,Umu,in,out,dag);
DhopInternal(Stencil,Lebesgue,Umu,in,out,dag);
}
template<class Impl>
@ -239,7 +241,7 @@ PARALLEL_FOR_LOOP
assert(in.checkerboard==Even);
out.checkerboard = Odd;
DhopInternal(StencilEven,UmuOdd,in,out,dag);
DhopInternal(StencilEven,LebesgueEvenOdd,UmuOdd,in,out,dag);
}
template<class Impl>
@ -250,7 +252,7 @@ PARALLEL_FOR_LOOP
assert(in.checkerboard==Odd);
out.checkerboard = Even;
DhopInternal(StencilOdd,UmuEven,in,out,dag);
DhopInternal(StencilOdd,LebesgueEvenOdd,UmuEven,in,out,dag);
}
template<class Impl>
@ -285,7 +287,7 @@ PARALLEL_FOR_LOOP
};
template<class Impl>
void WilsonFermion<Impl>::DhopInternal(StencilImpl & st,DoubledGaugeField & U,
void WilsonFermion<Impl>::DhopInternal(StencilImpl & st,LebesgueOrder& lo,DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag)
{
assert((dag==DaggerNo) ||(dag==DaggerYes));
@ -296,12 +298,12 @@ PARALLEL_FOR_LOOP
if ( dag == DaggerYes ) {
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,1,1,in,out);
Kernels::DiracOptDhopSiteDag(st,lo,U,st.comm_buf,sss,sss,1,1,in,out);
}
} else {
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,1,1,in,out);
Kernels::DiracOptDhopSite(st,lo,U,st.comm_buf,sss,sss,1,1,in,out);
}
}
};