Improvements to the assembler interface that let us move chunks of the site and s loops into the kernels.

This will save on function-call overhead and guarantee that the L2 prefetching strategy is right, since OMP can't distribute the sub-chunks of work.
2025-07-29 10:47:07 +01:00 · 2016-06-09 01:12:36 -07:00
parent d9408893b3
commit 55f65b81b5
10 changed files with 77 additions and 87 deletions
--- a/lib/simd/Intel512wilson.h
+++ b/lib/simd/Intel512wilson.h
@@ -631,6 +631,18 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
           VMADDSUB(Z1,T2,UChi_11)				\
           VMADDSUB(Z2,T1,UChi_02)        VMOVIDUP(5,%r8,Z1 )	\
           VMADDSUB(Z2,T2,UChi_12)        VMOVIDUP(8,%r8,Z2 )	\
+	   VPREFETCH2(12,%r9)					   \
+	   VPREFETCH2(13,%r9)					   \
+	   VPREFETCH2(14,%r9)					   \
+	   VPREFETCH2(15,%r9)					   \
+	   VPREFETCH2(16,%r9)					   \
+	   VPREFETCH2(17,%r9)					   \
+	   VPREFETCH2(18,%r9)					   \
+	   VPREFETCH2(19,%r9)					   \
+	   VPREFETCH2(20,%r9)					   \
+	   VPREFETCH2(21,%r9)					   \
+	   VPREFETCH2(22,%r9)					   \
+	   VPREFETCH2(23,%r9)					   \
           /*38*/						\
           VMADDSUB(Z3,Chi_01,UChi_00)    VSHUF(Chi_02,T1)	\
           VMADDSUB(Z3,Chi_11,UChi_10)				\