1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-06-12 20:27:06 +01:00

Improvements to the assembler interface that let us move chunks of the

site and s loops into the kernels. This will save on function call overhead and
guarantee that the L2 prefetching strategy is right, since OMP can't distribute
the sub-chunks of work.
This commit is contained in:
paboyle
2016-06-09 01:12:36 -07:00
parent d9408893b3
commit 55f65b81b5
10 changed files with 77 additions and 87 deletions

View File

@ -631,6 +631,18 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
VMADDSUB(Z1,T2,UChi_11) \
VMADDSUB(Z2,T1,UChi_02) VMOVIDUP(5,%r8,Z1 ) \
VMADDSUB(Z2,T2,UChi_12) VMOVIDUP(8,%r8,Z2 ) \
VPREFETCH2(12,%r9) \
VPREFETCH2(13,%r9) \
VPREFETCH2(14,%r9) \
VPREFETCH2(15,%r9) \
VPREFETCH2(16,%r9) \
VPREFETCH2(17,%r9) \
VPREFETCH2(18,%r9) \
VPREFETCH2(19,%r9) \
VPREFETCH2(20,%r9) \
VPREFETCH2(21,%r9) \
VPREFETCH2(22,%r9) \
VPREFETCH2(23,%r9) \
/*38*/ \
VMADDSUB(Z3,Chi_01,UChi_00) VSHUF(Chi_02,T1) \
VMADDSUB(Z3,Chi_11,UChi_10) \