Debugged 2 versions of assembler; Ls vectorised, xyzt vectorised
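
Note (editorial sketch, not part of the commit): the VSHUF / VMOVRDUP / VMOVIDUP / VMADDSUB macros touched below implement the standard AVX-512 idiom for complex multiply-accumulate on interleaved real/imaginary data. A minimal sketch of that idiom in raw intrinsics, assuming a plain interleaved single-precision layout rather than Grid's actual macro definitions:

#include <immintrin.h>

// Sketch only: accumulate acc += u*chi for 8 interleaved single-precision
// complex values per 512-bit register. Mirrors the macro idiom
// (VSHUF / VMOVRDUP / VMOVIDUP / VMADDSUB), not Grid's definitions.
static inline __m512 cmul_madd(__m512 u, __m512 chi, __m512 acc)
{
  __m512 u_re   = _mm512_moveldup_ps(u);        // dup real parts (VMOVRDUP)
  __m512 u_im   = _mm512_movehdup_ps(u);        // dup imag parts (VMOVIDUP)
  __m512 chi_sw = _mm512_permute_ps(chi, 0xB1); // swap re<->im   (VSHUF)
  // fmaddsub subtracts on even (real) lanes and adds on odd (imag) lanes;
  // applied twice, both the cross terms and the accumulator pick up the
  // correct signs (the VMADDSUB pattern used throughout the kernels).
  acc = _mm512_fmaddsub_ps(u_im, chi_sw, acc);
  return _mm512_fmaddsub_ps(u_re, chi, acc);
}
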
@@ -90,13 +90,14 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define T2 %zmm26
 #define T3 %zmm27
 
-#define Z00 %zmm28
-#define Z10 %zmm29
-#define Z1 %zmm30
-#define Z2 %zmm31
+#define Z00 %zmm26
+#define Z10 %zmm27
+#define Z0 Z00
+#define Z1 %zmm28
+#define Z2 %zmm29
 
-#define Z3 Chi_22
-#define Z4 Chi_30
+#define Z3 %zmm30
+#define Z4 %zmm31
 #define Z5 Chi_31
 #define Z6 Chi_32
 
@@ -198,86 +199,269 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
   VMADDSUBRDUP(5,%r10,Chi_22,UChi_21) VMADDSUBRDUP(5,%r11,Chi_32,UChi_31) \
   VMADDSUBRDUP(8,%r10,Chi_22,UChi_22) VMADDSUBRDUP(8,%r11,Chi_32,UChi_32) );
 
+#define MULT_ADD_XYZTa(g0,g1)					\
+  asm ( "movq %0, %%r8 \n\t"					\
+	"movq %1, %%r9 \n\t"  :  : "r"(g0), "r"(g1) : "%r8","%r9"); \
+  __asm__ (							\
+   VSHUF(Chi_00,T0)						\
+   VSHUF(Chi_10,T1)						\
+   VMOVIDUP(0,%r8,Z0 )						\
+   VMOVIDUP(3,%r8,Z1 )						\
+   VMOVIDUP(6,%r8,Z2 )						\
+   VMADDSUB(Z0,T0,UChi_00)					\
+   VMADDSUB(Z1,T0,UChi_01)					\
+   VMADDSUB(Z2,T0,UChi_02)					\
+								\
+   VMOVIDUP(0,%r9,Z0 )						\
+   VMOVIDUP(3,%r9,Z1 )						\
+   VMOVIDUP(6,%r9,Z2 )						\
+   VMADDSUB(Z0,T1,UChi_10)					\
+   VMADDSUB(Z1,T1,UChi_11)					\
+   VMADDSUB(Z2,T1,UChi_12)					\
+								\
+   VMOVRDUP(0,%r8,Z3 )						\
+   VMOVRDUP(3,%r8,Z4 )						\
+   VMOVRDUP(6,%r8,Z5 )						\
+   VMADDSUB(Z3,Chi_00,UChi_00) /*rr * ir = ri rr*/		\
+   VMADDSUB(Z4,Chi_00,UChi_01)					\
+   VMADDSUB(Z5,Chi_00,UChi_02)					\
+								\
+   VMOVRDUP(0,%r9,Z3 )						\
+   VMOVRDUP(3,%r9,Z4 )						\
+   VMOVRDUP(6,%r9,Z5 )						\
+   VMADDSUB(Z3,Chi_10,UChi_10)					\
+   VMADDSUB(Z4,Chi_10,UChi_11)					\
+   VMADDSUB(Z5,Chi_10,UChi_12)					\
+								\
+   VMOVIDUP(1,%r8,Z0 )						\
+   VMOVIDUP(4,%r8,Z1 )						\
+   VMOVIDUP(7,%r8,Z2 )						\
+   VSHUF(Chi_01,T0)						\
+   VMADDSUB(Z0,T0,UChi_00)					\
+   VMADDSUB(Z1,T0,UChi_01)					\
+   VMADDSUB(Z2,T0,UChi_02)					\
+								\
+   VMOVIDUP(1,%r9,Z0 )						\
+   VMOVIDUP(4,%r9,Z1 )						\
+   VMOVIDUP(7,%r9,Z2 )						\
+   VSHUF(Chi_11,T1)						\
+   VMADDSUB(Z0,T1,UChi_10)					\
+   VMADDSUB(Z1,T1,UChi_11)					\
+   VMADDSUB(Z2,T1,UChi_12)					\
+								\
+   VMOVRDUP(1,%r8,Z3 )						\
+   VMOVRDUP(4,%r8,Z4 )						\
+   VMOVRDUP(7,%r8,Z5 )						\
+   VMADDSUB(Z3,Chi_01,UChi_00)					\
+   VMADDSUB(Z4,Chi_01,UChi_01)					\
+   VMADDSUB(Z5,Chi_01,UChi_02)					\
+								\
+   VMOVRDUP(1,%r9,Z3 )						\
+   VMOVRDUP(4,%r9,Z4 )						\
+   VMOVRDUP(7,%r9,Z5 )						\
+   VMADDSUB(Z3,Chi_11,UChi_10)					\
+   VMADDSUB(Z4,Chi_11,UChi_11)					\
+   VMADDSUB(Z5,Chi_11,UChi_12)					\
+								\
+   VSHUF(Chi_02,T0)						\
+   VSHUF(Chi_12,T1)						\
+   VMOVIDUP(2,%r8,Z0 )						\
+   VMOVIDUP(5,%r8,Z1 )						\
+   VMOVIDUP(8,%r8,Z2 )						\
+   VMADDSUB(Z0,T0,UChi_00)					\
+   VMADDSUB(Z1,T0,UChi_01)					\
+   VMADDSUB(Z2,T0,UChi_02)					\
+   VMOVIDUP(2,%r9,Z0 )						\
+   VMOVIDUP(5,%r9,Z1 )						\
+   VMOVIDUP(8,%r9,Z2 )						\
+   VMADDSUB(Z0,T1,UChi_10)					\
+   VMADDSUB(Z1,T1,UChi_11)					\
+   VMADDSUB(Z2,T1,UChi_12)					\
+   /*55*/							\
+   VMOVRDUP(2,%r8,Z3 )						\
+   VMOVRDUP(5,%r8,Z4 )						\
+   VMOVRDUP(8,%r8,Z5 )						\
+   VMADDSUB(Z3,Chi_02,UChi_00)					\
+   VMADDSUB(Z4,Chi_02,UChi_01)					\
+   VMADDSUB(Z5,Chi_02,UChi_02)					\
+   VMOVRDUP(2,%r9,Z3 )						\
+   VMOVRDUP(5,%r9,Z4 )						\
+   VMOVRDUP(8,%r9,Z5 )						\
+   VMADDSUB(Z3,Chi_12,UChi_10)					\
+   VMADDSUB(Z4,Chi_12,UChi_11)					\
+   VMADDSUB(Z5,Chi_12,UChi_12)					\
+   /*61 insns*/							);
+
 #define MULT_ADD_XYZT(g0,g1)					\
   asm ( "movq %0, %%r8 \n\t"					\
 	"movq %1, %%r9 \n\t"  :  : "r"(g0), "r"(g1) : "%r8","%r9"); \
   __asm__ (							  \
    VSHUFMEM(0,%r8,Z00)        VSHUFMEM(0,%r9,Z10)		  \
-   VRDUP(Chi_00,T1)           VIDUP(Chi_00,Chi_00)		  \
-   VRDUP(Chi_10,T2)           VIDUP(Chi_10,Chi_10)		  \
+   VRDUP(Chi_00,T0)           VIDUP(Chi_00,Chi_00)		  \
+   VRDUP(Chi_10,T1)           VIDUP(Chi_10,Chi_10)		  \
    VMUL(Z00,Chi_00,Z1)        VMUL(Z10,Chi_10,Z2)		  \
    VSHUFMEM(3,%r8,Z00)        VSHUFMEM(3,%r9,Z10)		  \
    VMUL(Z00,Chi_00,Z3)        VMUL(Z10,Chi_10,Z4)		  \
    VSHUFMEM(6,%r8,Z00)        VSHUFMEM(6,%r9,Z10)		  \
    VMUL(Z00,Chi_00,Z5)        VMUL(Z10,Chi_10,Z6)		  \
-   VMADDMEM(0,%r8,T1,UChi_00) VMADDMEM(0,%r8,T2,UChi_10)	  \
-   VMADDMEM(3,%r8,T1,UChi_01) VMADDMEM(3,%r8,T2,UChi_11)	  \
-   VMADDMEM(6,%r8,T1,UChi_02) VMADDMEM(6,%r8,T2,UChi_12)	  \
+   VMADDMEM(0,%r8,T0,UChi_00) VMADDMEM(0,%r9,T1,UChi_10)	  \
+   VMADDMEM(3,%r8,T0,UChi_01) VMADDMEM(3,%r9,T1,UChi_11)	  \
+   VMADDMEM(6,%r8,T0,UChi_02) VMADDMEM(6,%r9,T1,UChi_12)	  \
    VSHUFMEM(1,%r8,Z00)        VSHUFMEM(1,%r9,Z10)		  \
-   VRDUP(Chi_01,T1)           VIDUP(Chi_01,Chi_01)		  \
-   VRDUP(Chi_11,T2)           VIDUP(Chi_11,Chi_11)		  \
+   VRDUP(Chi_01,T0)           VIDUP(Chi_01,Chi_01)		  \
+   VRDUP(Chi_11,T1)           VIDUP(Chi_11,Chi_11)		  \
    VMADD(Z00,Chi_01,Z1)       VMADD(Z10,Chi_11,Z2)		  \
    VSHUFMEM(4,%r8,Z00)        VSHUFMEM(4,%r9,Z10)		  \
    VMADD(Z00,Chi_01,Z3)       VMADD(Z10,Chi_11,Z4)		  \
    VSHUFMEM(7,%r8,Z00)        VSHUFMEM(7,%r9,Z10)		  \
    VMADD(Z00,Chi_01,Z5)       VMADD(Z10,Chi_11,Z6)		  \
-   VMADDMEM(1,%r8,T1,UChi_00) VMADDMEM(1,%r8,T2,UChi_10)	  \
-   VMADDMEM(4,%r8,T1,UChi_01) VMADDMEM(4,%r8,T2,UChi_11)	  \
-   VMADDMEM(7,%r8,T1,UChi_02) VMADDMEM(7,%r8,T2,UChi_12)	  \
+   VMADDMEM(1,%r8,T0,UChi_00) VMADDMEM(1,%r9,T1,UChi_10)	  \
+   VMADDMEM(4,%r8,T0,UChi_01) VMADDMEM(4,%r9,T1,UChi_11)	  \
+   VMADDMEM(7,%r8,T0,UChi_02) VMADDMEM(7,%r9,T1,UChi_12)	  \
    VSHUFMEM(2,%r8,Z00)        VSHUFMEM(2,%r9,Z10)		  \
-   VRDUP(Chi_02,T1)           VIDUP(Chi_02,Chi_02)		  \
-   VRDUP(Chi_12,T2)           VIDUP(Chi_12,Chi_12)		  \
+   VRDUP(Chi_02,T0)           VIDUP(Chi_02,Chi_02)		  \
+   VRDUP(Chi_12,T1)           VIDUP(Chi_12,Chi_12)		  \
    VMADD(Z00,Chi_02,Z1)       VMADD(Z10,Chi_12,Z2)		  \
    VSHUFMEM(5,%r8,Z00)        VSHUFMEM(5,%r9,Z10)		  \
    VMADD(Z00,Chi_02,Z3)       VMADD(Z10,Chi_12,Z4)		  \
    VSHUFMEM(8,%r8,Z00)        VSHUFMEM(8,%r9,Z10)		  \
    VMADD(Z00,Chi_02,Z5)       VMADD(Z10,Chi_12,Z6)		  \
-   VMADDSUBMEM(2,%r8,T1,Z1)   VMADDSUBMEM(2,%r8,T2,Z2)		  \
-   VMADDSUBMEM(5,%r8,T1,Z3)   VMADDSUBMEM(5,%r8,T2,Z4)		  \
-   VMADDSUBMEM(8,%r8,T1,Z5)   VMADDSUBMEM(8,%r8,T2,Z6)		  \
+   VMADDSUBMEM(2,%r8,T0,Z1)   VMADDSUBMEM(2,%r9,T1,Z2)		  \
+   VMADDSUBMEM(5,%r8,T0,Z3)   VMADDSUBMEM(5,%r9,T1,Z4)		  \
+   VMADDSUBMEM(8,%r8,T0,Z5)   VMADDSUBMEM(8,%r9,T1,Z6)		  \
    VADD(Z1,UChi_00,UChi_00)   VADD(Z2,UChi_10,UChi_10)		  \
    VADD(Z3,UChi_01,UChi_01)   VADD(Z4,UChi_11,UChi_11)		  \
    VADD(Z5,UChi_02,UChi_02)   VADD(Z6,UChi_12,UChi_12) );
 
 #define MULT_XYZT(g0,g1)					\
+  asm ( "movq %0, %%r8 \n\t"					\
+	"movq %1, %%r9 \n\t" :  : "r"(g0), "r"(g1) : "%r8","%r9" ); \
+  __asm__ (							\
+   VSHUF(Chi_00,T0)						\
+   VSHUF(Chi_10,T1)						\
+   VMOVIDUP(0,%r8,Z0 )						\
+   VMOVIDUP(3,%r8,Z1 )						\
+   VMOVIDUP(6,%r8,Z2 )						\
+   /*6*/							\
+   VMUL(Z0,T0,UChi_00)						\
+   VMUL(Z1,T0,UChi_01)						\
+   VMUL(Z2,T0,UChi_02)						\
+   VMOVIDUP(0,%r9,Z0 )						\
+   VMOVIDUP(3,%r9,Z1 )						\
+   VMOVIDUP(6,%r9,Z2 )						\
+   VMUL(Z0,T1,UChi_10)						\
+   VMUL(Z1,T1,UChi_11)						\
+   VMUL(Z2,T1,UChi_12)						\
+   VMOVRDUP(0,%r8,Z3 )						\
+   VMOVRDUP(3,%r8,Z4 )						\
+   VMOVRDUP(6,%r8,Z5 )						\
+   /*18*/							\
+   VMADDSUB(Z3,Chi_00,UChi_00)					\
+   VMADDSUB(Z4,Chi_00,UChi_01)					\
+   VMADDSUB(Z5,Chi_00,UChi_02)					\
+   VMOVRDUP(0,%r9,Z3 )						\
+   VMOVRDUP(3,%r9,Z4 )						\
+   VMOVRDUP(6,%r9,Z5 )						\
+   VMADDSUB(Z3,Chi_10,UChi_10)					\
+   VMADDSUB(Z4,Chi_10,UChi_11)					\
+   VMADDSUB(Z5,Chi_10,UChi_12)					\
+   VMOVIDUP(1,%r8,Z0 )						\
+   VMOVIDUP(4,%r8,Z1 )						\
+   VMOVIDUP(7,%r8,Z2 )						\
+   /*28*/							\
+   VSHUF(Chi_01,T0)						\
+   VMADDSUB(Z0,T0,UChi_00)					\
+   VMADDSUB(Z1,T0,UChi_01)					\
+   VMADDSUB(Z2,T0,UChi_02)					\
+   VMOVIDUP(1,%r9,Z0 )						\
+   VMOVIDUP(4,%r9,Z1 )						\
+   VMOVIDUP(7,%r9,Z2 )						\
+   VSHUF(Chi_11,T1)						\
+   VMADDSUB(Z0,T1,UChi_10)					\
+   VMADDSUB(Z1,T1,UChi_11)					\
+   VMADDSUB(Z2,T1,UChi_12)					\
+   VMOVRDUP(1,%r8,Z3 )						\
+   VMOVRDUP(4,%r8,Z4 )						\
+   VMOVRDUP(7,%r8,Z5 )						\
+   /*38*/							\
+   VMADDSUB(Z3,Chi_01,UChi_00)					\
+   VMADDSUB(Z4,Chi_01,UChi_01)					\
+   VMADDSUB(Z5,Chi_01,UChi_02)					\
+   VMOVRDUP(1,%r9,Z3 )						\
+   VMOVRDUP(4,%r9,Z4 )						\
+   VMOVRDUP(7,%r9,Z5 )						\
+   VMADDSUB(Z3,Chi_11,UChi_10)					\
+   VMADDSUB(Z4,Chi_11,UChi_11)					\
+   VMADDSUB(Z5,Chi_11,UChi_12)					\
+   /*48*/							\
+   VSHUF(Chi_02,T0)						\
+   VSHUF(Chi_12,T1)						\
+   VMOVIDUP(2,%r8,Z0 )						\
+   VMOVIDUP(5,%r8,Z1 )						\
+   VMOVIDUP(8,%r8,Z2 )						\
+   VMADDSUB(Z0,T0,UChi_00)					\
+   VMADDSUB(Z1,T0,UChi_01)					\
+   VMADDSUB(Z2,T0,UChi_02)					\
+   VMOVIDUP(2,%r9,Z0 )						\
+   VMOVIDUP(5,%r9,Z1 )						\
+   VMOVIDUP(8,%r9,Z2 )						\
+   VMADDSUB(Z0,T1,UChi_10)					\
+   VMADDSUB(Z1,T1,UChi_11)					\
+   VMADDSUB(Z2,T1,UChi_12)					\
+   /*55*/							\
+   VMOVRDUP(2,%r8,Z3 )						\
+   VMOVRDUP(5,%r8,Z4 )						\
+   VMOVRDUP(8,%r8,Z5 )						\
+   VMADDSUB(Z3,Chi_02,UChi_00)					\
+   VMADDSUB(Z4,Chi_02,UChi_01)					\
+   VMADDSUB(Z5,Chi_02,UChi_02)					\
+   VMOVRDUP(2,%r9,Z3 )						\
+   VMOVRDUP(5,%r9,Z4 )						\
+   VMOVRDUP(8,%r9,Z5 )						\
+   VMADDSUB(Z3,Chi_12,UChi_10)					\
+   VMADDSUB(Z4,Chi_12,UChi_11)					\
+   VMADDSUB(Z5,Chi_12,UChi_12)					\
+   /*61 insns*/							);
+
+#define MULT_XYZTa(g0,g1)					\
   asm ( "movq %0, %%r8 \n\t"					\
 	"movq %1, %%r9 \n\t" :  : "r"(g0), "r"(g1) : "%r8","%r9" ); \
   __asm__ (							  \
    VSHUFMEM(0,%r8,Z00)        VSHUFMEM(0,%r9,Z10)		  \
-   VRDUP(Chi_00,T1)           VIDUP(Chi_00,Chi_00)		  \
-   VRDUP(Chi_10,T2)           VIDUP(Chi_10,Chi_10)		  \
+   VRDUP(Chi_00,T0)           VIDUP(Chi_00,Chi_00)		  \
+   VRDUP(Chi_10,T1)           VIDUP(Chi_10,Chi_10)		  \
    VMUL(Z00,Chi_00,Z1)        VMUL(Z10,Chi_10,Z2)		  \
    VSHUFMEM(3,%r8,Z00)        VSHUFMEM(3,%r9,Z10)		  \
    VMUL(Z00,Chi_00,Z3)        VMUL(Z10,Chi_10,Z4)		  \
    VSHUFMEM(6,%r8,Z00)        VSHUFMEM(6,%r9,Z10)		  \
    VMUL(Z00,Chi_00,Z5)        VMUL(Z10,Chi_10,Z6)		  \
-   VMULMEM(0,%r8,T1,UChi_00)  VMULMEM(0,%r8,T2,UChi_10)		  \
-   VMULMEM(3,%r8,T1,UChi_01)  VMULMEM(3,%r8,T2,UChi_11)		  \
-   VMULMEM(6,%r8,T1,UChi_02)  VMULMEM(6,%r8,T2,UChi_12)		  \
+   VMULMEM(0,%r8,T0,UChi_00)  VMULMEM(0,%r9,T1,UChi_10)		  \
+   VMULMEM(3,%r8,T0,UChi_01)  VMULMEM(3,%r9,T1,UChi_11)		  \
+   VMULMEM(6,%r8,T0,UChi_02)  VMULMEM(6,%r9,T1,UChi_12)		  \
    VSHUFMEM(1,%r8,Z00)        VSHUFMEM(1,%r9,Z10)		  \
-   VRDUP(Chi_01,T1)           VIDUP(Chi_01,Chi_01)		  \
-   VRDUP(Chi_11,T2)           VIDUP(Chi_11,Chi_11)		  \
+   VRDUP(Chi_01,T0)           VIDUP(Chi_01,Chi_01)		  \
+   VRDUP(Chi_11,T1)           VIDUP(Chi_11,Chi_11)		  \
    VMADD(Z00,Chi_01,Z1)       VMADD(Z10,Chi_11,Z2)		  \
    VSHUFMEM(4,%r8,Z00)        VSHUFMEM(4,%r9,Z10)		  \
    VMADD(Z00,Chi_01,Z3)       VMADD(Z10,Chi_11,Z4)		  \
    VSHUFMEM(7,%r8,Z00)        VSHUFMEM(7,%r9,Z10)		  \
    VMADD(Z00,Chi_01,Z5)       VMADD(Z10,Chi_11,Z6)		  \
-   VMADDMEM(1,%r8,T1,UChi_00) VMADDMEM(1,%r8,T2,UChi_10)	  \
-   VMADDMEM(4,%r8,T1,UChi_01) VMADDMEM(4,%r8,T2,UChi_11)	  \
-   VMADDMEM(7,%r8,T1,UChi_02) VMADDMEM(7,%r8,T2,UChi_12)	  \
+   VMADDMEM(1,%r8,T0,UChi_00) VMADDMEM(1,%r9,T1,UChi_10)	  \
+   VMADDMEM(4,%r8,T0,UChi_01) VMADDMEM(4,%r9,T1,UChi_11)	  \
+   VMADDMEM(7,%r8,T0,UChi_02) VMADDMEM(7,%r9,T1,UChi_12)	  \
    VSHUFMEM(2,%r8,Z00)        VSHUFMEM(2,%r9,Z10)		  \
-   VRDUP(Chi_02,T1)           VIDUP(Chi_02,Chi_02)		  \
-   VRDUP(Chi_12,T2)           VIDUP(Chi_12,Chi_12)		  \
+   VRDUP(Chi_02,T0)           VIDUP(Chi_02,Chi_02)		  \
+   VRDUP(Chi_12,T1)           VIDUP(Chi_12,Chi_12)		  \
    VMADD(Z00,Chi_02,Z1)       VMADD(Z10,Chi_12,Z2)		  \
    VSHUFMEM(5,%r8,Z00)        VSHUFMEM(5,%r9,Z10)		  \
    VMADD(Z00,Chi_02,Z3)       VMADD(Z10,Chi_12,Z4)		  \
    VSHUFMEM(8,%r8,Z00)        VSHUFMEM(8,%r9,Z10)		  \
    VMADD(Z00,Chi_02,Z5)       VMADD(Z10,Chi_12,Z6)		  \
-   VMADDSUBMEM(2,%r8,T1,Z1)   VMADDSUBMEM(2,%r8,T2,Z2)		  \
-   VMADDSUBMEM(5,%r8,T1,Z3)   VMADDSUBMEM(5,%r8,T2,Z4)		  \
-   VMADDSUBMEM(8,%r8,T1,Z5)   VMADDSUBMEM(8,%r8,T2,Z6)		  \
+   VMADDSUBMEM(2,%r8,T0,Z1)   VMADDSUBMEM(2,%r9,T1,Z2)		  \
+   VMADDSUBMEM(5,%r8,T0,Z3)   VMADDSUBMEM(5,%r9,T1,Z4)		  \
+   VMADDSUBMEM(8,%r8,T0,Z5)   VMADDSUBMEM(8,%r9,T1,Z6)		  \
    VADD(Z1,UChi_00,UChi_00)   VADD(Z2,UChi_10,UChi_10)		  \
    VADD(Z3,UChi_01,UChi_01)   VADD(Z4,UChi_11,UChi_11)		  \
    VADD(Z5,UChi_02,UChi_02)   VADD(Z6,UChi_12,UChi_12) );
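
Note (editorial sketch, not code from the commit): in scalar terms each MULT_ADD_XYZT(g0,g1) applies two 3x3 complex gauge matrices, addressed at offsets 0..8 from %r8 and %r9 in units of one complex number, to two colour vectors and accumulates into the UChi registers. Names below are illustrative:

#include <complex>

using cplx = std::complex<float>;

// Scalar reference for the dataflow of one MULT_ADD_XYZT(g0,g1): the SIMD
// version performs this for every lattice site packed into the zmm lanes.
// Offsets 0,1,2 feed UChi_0; 3,4,5 feed UChi_1; 6,7,8 feed UChi_2.
static void mult_add_xyzt_ref(const cplx g0[9], const cplx g1[9],
                              const cplx chi0[3], const cplx chi1[3],
                              cplx uchi0[3], cplx uchi1[3])
{
  for (int i = 0; i < 3; i++)
    for (int j = 0; j < 3; j++) {
      uchi0[i] += g0[3*i + j] * chi0[j];
      uchi1[i] += g1[3*i + j] * chi1[j];
    }
}
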
@@ -383,24 +567,28 @@ void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
   SE0=st.GetEntry(ptype,X+skew,sF);			\
   o0 = SE0->_offset;					\
   l0 = SE0->_is_local;					\
+  p0 = SE0->_permute;					\
   addr0 = l0 ?  (uint64_t) &in._odata[o0] : (uint64_t) &buf[o0];	\
   PF_CHI(addr0);							\
 								\
   SE1=st.GetEntry(ptype,Y+skew,sF);			\
   o1 = SE1->_offset;					\
   l1 = SE1->_is_local;					\
+  p1 = SE1->_permute;					\
   addr1 = l1 ?  (uint64_t) &in._odata[o1] : (uint64_t) &buf[o1];	\
   PF_CHI(addr1);							\
 								\
   SE2=st.GetEntry(ptype,Z+skew,sF);			\
   o2 = SE2->_offset;					\
   l2 = SE2->_is_local;					\
+  p2 = SE2->_permute;					\
   addr2 = l2 ?  (uint64_t) &in._odata[o2] : (uint64_t) &buf[o2];	\
   PF_CHI(addr2);							\
 								\
   SE3=st.GetEntry(ptype,T+skew,sF);			\
   o3 = SE3->_offset;					\
   l3 = SE3->_is_local;					\
+  p3 = SE3->_permute;					\
   addr3 = l3 ?  (uint64_t) &in._odata[o3] : (uint64_t) &buf[o3];	\
   PF_CHI(addr3);							\
   								\
@@ -501,6 +689,27 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl
 #endif
 }
 
+
+#define PERMUTE_DIR3 __asm__ (	\
+  VPERM3(Chi_00,Chi_00)	\
+  VPERM3(Chi_01,Chi_01)	\
+  VPERM3(Chi_02,Chi_02)	);
+
+#define PERMUTE_DIR2 __asm__ (	\
+  VPERM2(Chi_10,Chi_10)	\
+  VPERM2(Chi_11,Chi_11)	\
+  VPERM2(Chi_12,Chi_12) );
+
+#define PERMUTE_DIR1 __asm__ (	\
+  VPERM1(Chi_00,Chi_00)	\
+  VPERM1(Chi_01,Chi_01)	\
+  VPERM1(Chi_02,Chi_02)	);
+
+#define PERMUTE_DIR0 __asm__ (	\
+  VPERM0(Chi_10,Chi_10)	\
+  VPERM0(Chi_11,Chi_11)	\
+  VPERM0(Chi_12,Chi_12) );
+
   // This is the single precision 5th direction vectorised kernel
 #include <simd/Intel512single.h>
 template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
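
Note (editorial): the VPERM0..VPERM3 macros used by PERMUTE_DIR0..3 come from Grid's simd/Intel512*.h headers, not from this diff. Each rotates SIMD lanes at a different granularity so that a neighbour fetched across the periodic wrap of a vectorised direction lands in the correct sub-lattice slot. A hypothetical illustration, assuming the coarsest swap exchanges the 256-bit halves of a zmm register:

#include <immintrin.h>

// Hypothetical lane permutation in the PERMUTE_DIR3 spirit: exchange the
// two 256-bit halves of a 512-bit vector. Grid's real VPERM0..VPERM3
// differ in granularity per direction; see the simd/Intel512*.h headers.
static inline __m512 permute_halves(__m512 x)
{
  // imm 0x4E selects 128-bit blocks (2,3,0,1): the halves trade places.
  return _mm512_shuffle_f32x4(x, x, 0x4E);
}
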
@@ -523,29 +732,115 @@ template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st,
   StencilEntry *SE3;
 
   // Xp, Yp, Zp, Tp
+
   PREPARE(Xp,Yp,Zp,Tp,0,U);
   LOAD_CHIa(addr0,addr1);
+  if (l0&&p0) {     PERMUTE_DIR3;  }
+  if (l1&&p1) {     PERMUTE_DIR2;  }
   MULT_XYZT(gauge0,gauge1);  
   LOAD_CHIa(addr2,addr3);
-  MULT_XYZT(gauge2,gauge3);  
+  if (l2&&p2) {     PERMUTE_DIR1;  }
+  if (l3&&p3) {     PERMUTE_DIR0;  }
+  MULT_ADD_XYZT(gauge2,gauge3);  
 
   PREPARE(Xm,Ym,Zm,Tm,0,U);
   LOAD_CHIa(addr0,addr1);
+  if (l0&&p0) {     PERMUTE_DIR3;  }
+  if (l1&&p1) {     PERMUTE_DIR2;  }
   MULT_ADD_XYZT(gauge0,gauge1);  
   LOAD_CHIa(addr2,addr3);
+  if (l2&&p2) {     PERMUTE_DIR1;  }
+  if (l3&&p3) {     PERMUTE_DIR0;  }
   MULT_ADD_XYZT(gauge2,gauge3);  
 
   PREPARE(Xp,Yp,Zp,Tp,8,UUU);
   LOAD_CHIa(addr0,addr1);
+  if (l0&&p0) {     PERMUTE_DIR3;  }
+  if (l1&&p1) {     PERMUTE_DIR2;  }
   MULT_ADD_XYZT(gauge0,gauge1);  
   LOAD_CHIa(addr2,addr3);
+  if (l2&&p2) {     PERMUTE_DIR1;  }
+  if (l3&&p3) {     PERMUTE_DIR0;  }
   MULT_ADD_XYZT(gauge2,gauge3);  
 
   PREPARE(Xm,Ym,Zm,Tm,8,UUU);
   LOAD_CHIa(addr0,addr1);
+  if (l0&&p0) {     PERMUTE_DIR3;  }
+  if (l1&&p1) {     PERMUTE_DIR2;  }
   MULT_ADD_XYZT(gauge0,gauge1);  
   LOAD_CHIa(addr2,addr3);
+  if (l2&&p2) {     PERMUTE_DIR1;  }
+  if (l3&&p3) {     PERMUTE_DIR0;  }
+  MULT_ADD_XYZT(gauge2,gauge3);  
+
+  addr0 = (uint64_t) &out;
+  REDUCEa(addr0);
+#else 
+  assert(0);
+#endif
+}
+
+
+  // This is the single precision 5th direction vectorised kernel
+#include <simd/Intel512double.h>
+template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
+							       DoubledGaugeField &U,
+							       DoubledGaugeField &UUU,
+							       SiteSpinor *buf, int sF,
+							       int sU, const FermionField &in, SiteSpinor &out) 
+{
+#ifdef AVX512
+  uint64_t gauge0,gauge1,gauge2,gauge3;
+  uint64_t addr0,addr1,addr2,addr3;
+
+  int o0,o1,o2,o3; // offsets
+  int l0,l1,l2,l3; // local 
+  int p0,p1,p2,p3; // perm
+  int ptype;
+  StencilEntry *SE0;
+  StencilEntry *SE1;
+  StencilEntry *SE2;
+  StencilEntry *SE3;
+
+  // Xp, Yp, Zp, Tp
+
+  PREPARE(Xp,Yp,Zp,Tp,0,U);
+  LOAD_CHIa(addr0,addr1);
+  if (p0) {     PERMUTE_DIR3;  }
+  if (p1) {     PERMUTE_DIR2;  }
+  MULT_XYZT(gauge0,gauge1);  
+  LOAD_CHIa(addr2,addr3);
+  if (p2) {     PERMUTE_DIR1;  }
+  if (p3) {     PERMUTE_DIR0;  }
+  MULT_ADD_XYZT(gauge2,gauge3);  
+
+  PREPARE(Xm,Ym,Zm,Tm,0,U);
+  LOAD_CHIa(addr0,addr1);
+  if (p0) {     PERMUTE_DIR3;  }
+  if (p1) {     PERMUTE_DIR2;  }
+  MULT_ADD_XYZT(gauge0,gauge1);  
+  LOAD_CHIa(addr2,addr3);
+  if (p2) {     PERMUTE_DIR1;  }
+  if (p3) {     PERMUTE_DIR0;  }
+  MULT_ADD_XYZT(gauge2,gauge3);  
+
+  PREPARE(Xp,Yp,Zp,Tp,8,UUU);
+  LOAD_CHIa(addr0,addr1);
+  if (p0) {     PERMUTE_DIR3;  }
+  if (p1) {     PERMUTE_DIR2;  }
+  MULT_ADD_XYZT(gauge0,gauge1);  
+  LOAD_CHIa(addr2,addr3);
+  if (p2) {     PERMUTE_DIR1;  }
+  if (p3) {     PERMUTE_DIR0;  }
+  MULT_ADD_XYZT(gauge2,gauge3);  
+
+  PREPARE(Xm,Ym,Zm,Tm,8,UUU);
+  LOAD_CHIa(addr0,addr1);
+  if (p0) {     PERMUTE_DIR3;  }
+  if (p1) {     PERMUTE_DIR2;  }
+  MULT_ADD_XYZT(gauge0,gauge1);  
+  LOAD_CHIa(addr2,addr3);
+  if (p2) {     PERMUTE_DIR1;  }
+  if (p3) {     PERMUTE_DIR0;  }
   MULT_ADD_XYZT(gauge2,gauge3);  
 
   addr0 = (uint64_t) &out;
 
@@ -68,12 +68,14 @@ int main (int argc, char ** argv)
 
   FermionField src   (FGrid);
 
-  //random(pRNG5,src);
+  random(pRNG5,src);
+  /*
   std::vector<int> site({0,0,0,0,0});
   ColourVector cv = zero;
   cv()()(0)=1.0;
   src = zero;
   pokeSite(cv,src,site);
+  */
 
   FermionField result(FGrid); result=zero;
   FermionField    tmp(FGrid);    tmp=zero;
@@ -81,8 +83,15 @@ int main (int argc, char ** argv)
   FermionField phi   (FGrid); random(pRNG5,phi);
   FermionField chi   (FGrid); random(pRNG5,chi);
 
-  LatticeGaugeField Umu(UGrid); SU3::ColdConfiguration(pRNG4,Umu);
+  LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(pRNG4,Umu);
 
+  /*
+  for(int mu=1;mu<4;mu++){
+    auto tmp = PeekIndex<LorentzIndex>(Umu,mu);
+        tmp = zero;
+    PokeIndex<LorentzIndex>(Umu,tmp,mu);
+  }
+  */
   double volume=Ls;
   for(int mu=0;mu<Nd;mu++){
     volume=volume*latt_size[mu];
@@ -117,7 +126,20 @@ int main (int argc, char ** argv)
 
   std::cout<<GridLogMessage << "Calling vectorised staggered operator"<<std::endl;
 
+  QCD::StaggeredKernelsStatic::Opt=QCD::StaggeredKernelsStatic::OptInlineAsm;
+  t0=usecond();
+  for(int i=0;i<ncall;i++){
+    Ds.Dhop(src,tmp,0);
+  }
+  t1=usecond();
+
+  std::cout<<GridLogMessage << "Called Ds ASM"<<std::endl;
+  std::cout<<GridLogMessage << "norm result "<< norm2(tmp)<<std::endl;
+  std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
+
+  err = tmp-result; 
+  std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
+
   FermionField ssrc  (sFGrid);  localConvert(src,ssrc);
   FermionField sresult(sFGrid); sresult=zero;
 
@@ -133,6 +155,7 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "norm result "<< norm2(sresult)<<std::endl;
   std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
 
+
   QCD::StaggeredKernelsStatic::Opt=QCD::StaggeredKernelsStatic::OptInlineAsm;
 
   err = tmp-result; 