mirror of https://github.com/paboyle/Grid.git synced 2025-06-17 15:27:06 +01:00

Hadrons: moving Hadrons to root directory, build system improvements

2018-08-28 15:00:40 +01:00
parent 5f206df775
commit fb7d021b9d
499 changed files with 429 additions and 846 deletions

796
Grid/simd/BGQQPX.h Normal file

@@ -0,0 +1,796 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/simd/BGQQPX.h
Copyright (C) 2015
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_ASM_BGQ_QPX_H
#define GRID_ASM_BGQ_QPX_H
#include <stddint.h>
/*********************************************************
* Architectural macros
*********************************************************/
#define VLOADf(OFF,PTR,DEST)       "qvlfsux    " #DEST "," #OFF "," #PTR " ;\n"
#define VLOADd(OFF,PTR,DEST)       "qvlfdux    " #DEST "," #OFF "," #PTR " ;\n"
#define VSTOREf(OFF,PTR,SRC)       "qvstfsux   " #SRC "," #OFF "," #PTR " ;\n"
#define VSTOREd(OFF,PTR,SRC)       "qvstfdux   " #SRC "," #OFF "," #PTR " ;\n"
#define VSPLATf(A,B,DEST)          "qvlfcsxa   " #DEST "," #A "," #B ";\n"
#define VSPLATd(A,B,DEST)          "qvlfcdxa   " #DEST "," #A "," #B ";\n"
#define LOAD64(A,ptr)
#define VZERO(DEST) "qvfclr " #DEST "; \n"
#define VONE(DEST)                 "qvfset     " #DEST "; \n"
#define VNEG(SRC,DEST)             "qvfneg     " #DEST "," #SRC "; \n"
#define VMOV(A,DEST)               "qvfmr      " #DEST "," #A ";\n"
#define VADD(A,B,DEST) "qvfadd " #DEST "," #A "," #B ";\n"
#define VSUB(A,B,DEST) "qvfsub " #DEST "," #A "," #B ";\n"
#define VMUL(A,B,DEST) "qvfmul " #DEST "," #A "," #B ";\n"
#define VMUL_RR_RI(A,B,DEST) "qvfxmul " #DEST "," #A "," #B ";\n"
#define VMADD(A,B,C,DEST) "qvfmadd " #DEST "," #A "," #B ","#C ";\n"
#define VMADD_RR_RI(A,B,C,DEST) "qvfxmadd " #DEST "," #A "," #B ","#C ";\n"
#define VMADD_MII_IR(A,B,C,DEST) "qvfxxnpmadd " #DEST "," #A "," #B ","#C ";\n"
#define VMADD_II_MIR(A,B,C,DEST) "qvfmadd " #DEST "," #A "," #B ","#C ";\n"
#define CACHE_LOCK(PTR)   asm (" dcbtls %%r0, %0 \n" : : "r" (PTR) );
#define CACHE_UNLOCK(PTR) asm (" dcblc  %%r0, %0 \n" : : "r" (PTR) );
#define CACHE_FLUSH(PTR)  asm (" dcbf   %%r0, %0 \n" : : "r" (PTR) );
#define CACHE_TOUCH(PTR)  asm (" dcbt   %%r0, %0 \n" : : "r" (PTR) );
// Gauge field locking: 2 x 9 complex = 18 words per link, i.e. 18*8 / 18*16 bytes in single/double
// This is 144/288 bytes per link, i.e. 4.5 / 9 32-byte lines
#define MASK_REGS /*NOOP ON BGQ*/
#define PF_GAUGE(A) /*NOOP ON BGQ*/
#define PREFETCH1_CHIMU(base) /*NOOP ON BGQ*/
#define PREFETCH_CHIMU(base) /*NOOP ON BGQ*/
/*********************************************************
* Register definitions
*********************************************************/
#define psi_00 0
#define psi_01 1
#define psi_02 2
#define psi_10 3
#define psi_11 4
#define psi_12 5
#define psi_20 6
#define psi_21 7
#define psi_22 8
#define psi_30 9
#define psi_31 10
#define psi_32 11
#define Chi_00 12
#define Chi_01 13
#define Chi_02 14
#define Chi_10 15
#define Chi_11 16
#define Chi_12 17
#define UChi_00 18
#define UChi_01 19
#define UChi_02 20
#define UChi_10 21
#define UChi_11 22
#define UChi_12 23
#define U0 24
#define U1 25
#define U2 26
#define one 27
#define REP %%r16
#define IMM %%r17
/*Alias regs*/
#define Chimu_00 Chi_00
#define Chimu_01 Chi_01
#define Chimu_02 Chi_02
#define Chimu_10 Chi_10
#define Chimu_11 Chi_11
#define Chimu_12 Chi_12
#define Chimu_20 UChi_00
#define Chimu_21 UChi_01
#define Chimu_22 UChi_02
#define Chimu_30 UChi_10
#define Chimu_31 UChi_11
#define Chimu_32 UChi_12
/*********************************************************
* Macro sequences encoding QCD
*********************************************************/
#define LOCK_GAUGE(dir) \
{ \
uint8_t *byte_addr = (uint8_t *)&U._odata[sU](dir); \
for(int i=0;i< 18*2*BYTES_PER_WORD*8;i+=32){ \
CACHE_LOCK(&byte_addr[i]); \
} \
}
#define UNLOCK_GAUGE(dir) \
{ \
uint8_t *byte_addr = (uint8_t *)&U._odata[sU](dir); \
for(int i=0;i< 18*2*BYTES_PER_WORD*8;i+=32){ \
CACHE_UNLOCK(&byte_addr[i]); \
} \
}
#define MAYBEPERM(A,B)
#define PERMUTE_DIR3
#define PERMUTE_DIR2
#define PERMUTE_DIR1
#define PERMUTE_DIR0
#define MULT_2SPIN_DIR_PFXP(A,p) MULT_2SPIN(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFYP(A,p) MULT_2SPIN(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFZP(A,p) MULT_2SPIN(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFTP(A,p) MULT_2SPIN(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFXM(A,p) MULT_2SPIN(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFYM(A,p) MULT_2SPIN(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFZM(A,p) MULT_2SPIN(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFTM(A,p) MULT_2SPIN(&U._odata[sU](A),p)
#define MULT_SPIN(ptr,p) { \
  uint64_t ub = ((uint64_t)ptr); \
asm ( \
VLOAD(%0,%3,U0) \
VLOAD(%1,%3,U1) \
VLOAD(%2,%3,U2) \
VMUL_RR_RI(U0,Chi_00,UChi_00) \
VMUL_RR_RI(U1,Chi_00,UChi_01) \
VMUL_RR_RI(U2,Chi_00,UChi_02) \
VMUL_RR_RI(U0,Chi_10,UChi_10) \
VMUL_RR_RI(U1,Chi_10,UChi_11) \
VMUL_RR_RI(U2,Chi_10,UChi_12) \
VMADD_MII_IR(U0,Chi_00,UChi_00,UChi_00) \
VMADD_MII_IR(U1,Chi_00,UChi_01,UChi_01) \
VMADD_MII_IR(U2,Chi_00,UChi_02,UChi_02) \
VMADD_MII_IR(U0,Chi_10,UChi_10,UChi_10) \
VMADD_MII_IR(U1,Chi_10,UChi_11,UChi_11) \
VMADD_MII_IR(U2,Chi_10,UChi_12,UChi_12) \
: : "r" (0), "r" (32*3), "r" (32*6), "r" (ub )); \
asm ( \
VLOAD(%0,%3,U0) \
VLOAD(%1,%3,U1) \
VLOAD(%2,%3,U2) \
VMADD_RR_RI(U0,Chi_01,UChi_00,UChi_00) \
VMADD_RR_RI(U1,Chi_01,UChi_01,UChi_01) \
VMADD_RR_RI(U2,Chi_01,UChi_02,UChi_02) \
VMADD_RR_RI(U0,Chi_11,UChi_10,UChi_10) \
VMADD_RR_RI(U1,Chi_11,UChi_11,UChi_11) \
VMADD_RR_RI(U2,Chi_11,UChi_12,UChi_12) \
VMADD_MII_IR(U0,Chi_01,UChi_00,UChi_00) \
VMADD_MII_IR(U1,Chi_01,UChi_01,UChi_01) \
VMADD_MII_IR(U2,Chi_01,UChi_02,UChi_02) \
VMADD_MII_IR(U0,Chi_11,UChi_10,UChi_10) \
VMADD_MII_IR(U1,Chi_11,UChi_11,UChi_11) \
VMADD_MII_IR(U2,Chi_11,UChi_12,UChi_12) \
: : "r" (32), "r" (32*4), "r" (32*7), "r" (ub )); \
asm ( \
VLOAD(%0,%3,U0) \
VLOAD(%1,%3,U1) \
VLOAD(%2,%3,U2) \
VMADD_RR_RI(U0,Chi_02,UChi_00,UChi_00) \
VMADD_RR_RI(U1,Chi_02,UChi_01,UChi_01) \
VMADD_RR_RI(U2,Chi_02,UChi_02,UChi_02) \
VMADD_RR_RI(U0,Chi_12,UChi_10,UChi_10) \
VMADD_RR_RI(U1,Chi_12,UChi_11,UChi_11) \
VMADD_RR_RI(U2,Chi_12,UChi_12,UChi_12) \
VMADD_MII_IR(U0,Chi_02,UChi_00,UChi_00) \
VMADD_MII_IR(U1,Chi_02,UChi_01,UChi_01) \
VMADD_MII_IR(U2,Chi_02,UChi_02,UChi_02) \
VMADD_MII_IR(U0,Chi_12,UChi_10,UChi_10) \
VMADD_MII_IR(U1,Chi_12,UChi_11,UChi_11) \
VMADD_MII_IR(U2,Chi_12,UChi_12,UChi_12) \
: : "r" (32*2), "r" (32*5), "r" (32*8), "r" (ub )); \
}
#define SAVE_RESULT(base,basep) {\
uint64_t ub = ((uint64_t)base) - 32; \
asm("mr %0,"REP";\n\t" \
"li " IMM ",32;\n\t" \
VSTORE(IMM,REP,psi_00) \
VSTORE(IMM,REP,psi_01) \
VSTORE(IMM,REP,psi_02) \
VSTORE(IMM,REP,psi_10) \
VSTORE(IMM,REP,psi_11) \
VSTORE(IMM,REP,psi_12) \
VSTORE(IMM,REP,psi_20) \
VSTORE(IMM,REP,psi_21) \
VSTORE(IMM,REP,psi_22) \
VSTORE(IMM,REP,psi_30) \
VSTORE(IMM,REP,psi_31) \
VSTORE(IMM,REP,psi_32) \
); \
}
/*
 * Annoying: BG/Q loads have no immediate indexing, and there is a big
 * performance hit when a second miss to an L1 line occurs.
 */
#define LOAD_CHI(base) { \
uint64_t ub = ((uint64_t)base) - 64; \
asm("mr %0,"REP";\n\t" \
"li " IMM ",64;\n\t" \
VLOAD(IMM,REP,Chi_00) \
VLOAD(IMM,REP,Chi_02) \
VLOAD(IMM,REP,Chi_11) : : "r" (ub) ); \
ub = ((uint64_t)base) - 32; \
asm("mr %0,"REP";\n\t" \
"li IMM,64;\n\t" \
VLOAD(IMM,REP,Chimu_01) \
VLOAD(IMM,REP,Chimu_10) \
VLOAD(IMM,REP,Chimu_12) : : "r" (ub) ); \
}
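The qvlfsux/qvlfdux loads used above are update-form: they have no immediate offset, so LOAD_CHI biases the base pointer downwards and lets every load add the stride register back before touching memory. A minimal scalar sketch of the resulting address pattern, assuming 32-byte QPX vectors and the base-64 / base-32 biasing above (the function name is illustrative, not Grid API):

#include <cstdint>
#include <cstdio>

// Sketch only: the effective addresses generated by the two asm blocks in
// LOAD_CHI, assuming each update-form load adds the stride to the base
// register before performing a 32-byte access.
inline void load_chi_address_pattern(uint64_t base) {
  uint64_t rep = base - 64;                  // first asm block
  const uint64_t imm = 64;
  for (int i = 0; i < 3; i++) {              // Chi_00, Chi_02, Chi_11
    rep += imm;                              // base, base+64, base+128
    std::printf("even stream load at 0x%llx\n", (unsigned long long)rep);
  }
  rep = base - 32;                           // second asm block
  for (int i = 0; i < 3; i++) {              // Chi_01, Chi_10, Chi_12
    rep += imm;                              // base+32, base+96, base+160
    std::printf("odd stream load at 0x%llx\n", (unsigned long long)rep);
  }
}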
#define LOAD_CHIMU(base) { \
uint64_t ub = ((uint64_t)base) - 64; \
asm("mr %0,"REP";\n\t" \
"li IMM,64;\n\t" \
VLOAD(IMM,REP,Chimu_00) \
VLOAD(IMM,REP,Chimu_02) \
VLOAD(IMM,REP,Chimu_11) \
VLOAD(IMM,REP,Chimu_20) \
VLOAD(IMM,REP,Chimu_22) \
VLOAD(IMM,REP,Chimu_31) : : "r" (ub) ); \
ub = ((uint64_t)base) - 32; \
asm("mr %0,"REP";\n\t" \
"li IMM,64;\n\t" \
VLOAD(IMM,REP,Chimu_01) \
VLOAD(IMM,REP,Chimu_10) \
VLOAD(IMM,REP,Chimu_12) \
VLOAD(IMM,REP,Chimu_21) \
VLOAD(IMM,REP,Chimu_30) \
VLOAD(IMM,REP,Chimu_32) : : "r" (ub) ); \
}
// hspin(0)=fspin(0)+timesI(fspin(3));
// hspin(1)=fspin(1)+timesI(fspin(2));
#define XP_PROJMEM(base) { \
LOAD_CHIMU(base); \
asm ( \
VONE(one) \
VMADD_MII_IR(one,Chimu_30,Chimu_00,Chi_00) \
VMADD_MII_IR(one,Chimu_31,Chimu_01,Chi_01) \
VMADD_MII_IR(one,Chimu_32,Chimu_02,Chi_02) \
VMADD_MII_IR(one,Chimu_20,Chimu_10,Chi_10) \
VMADD_MII_IR(one,Chimu_21,Chimu_11,Chi_11) \
VMADD_MII_IR(one,Chimu_22,Chimu_12,Chi_12) \
); \
}
#define XM_PROJMEM(base) { \
LOAD_CHIMU(base); \
asm ( \
VONE(one) \
VMADD_II_MIR(one,Chimu_30,Chimu_00,Chi_00) \
VMADD_II_MIR(one,Chimu_31,Chimu_01,Chi_01) \
VMADD_II_MIR(one,Chimu_32,Chimu_02,Chi_02) \
VMADD_II_MIR(one,Chimu_20,Chimu_10,Chi_10) \
VMADD_II_MIR(one,Chimu_21,Chimu_11,Chi_11) \
VMADD_II_MIR(one,Chimu_22,Chimu_12,Chi_12) \
); \
}
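For reference, the spin projection encoded by XP_PROJMEM / XM_PROJMEM (see the hspin/fspin comment above) reduces to the following scalar form; this is an illustrative sketch with made-up names, not part of the kernel:

#include <complex>

// Scalar sketch of the X+/X- projections, per colour component:
//   X+ : hspin(0) = fspin(0) + i*fspin(3),  hspin(1) = fspin(1) + i*fspin(2)
//   X- : the same with -i.
using cplx = std::complex<double>;
inline void x_project(const cplx fspin[4][3], cplx hspin[2][3], int sign) {
  const cplx I(0.0, sign * 1.0);
  for (int c = 0; c < 3; c++) {
    hspin[0][c] = fspin[0][c] + I * fspin[3][c];
    hspin[1][c] = fspin[1][c] + I * fspin[2][c];
  }
}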
// hspin(0)=fspin(0)-fspin(3);
// hspin(1)=fspin(1)+fspin(2);
#define YP_PROJMEM(base) { \
LOAD_CHIMU(base); \
asm ( \
VSUB(Chimu_00,Chimu_00,Chi_30) \
VSUB(Chimu_01,Chimu_01,Chi_31) \
VSUB(Chimu_02,Chimu_02,Chi_32) \
VADD(Chimu_10,Chimu_10,Chi_20) \
VADD(Chimu_11,Chimu_11,Chi_21) \
VADD(Chimu_12,Chimu_12,Chi_22) \
); \
}
#define YM_PROJMEM(base) { \
LOAD_CHIMU(base); \
asm ( \
VADD(Chimu_00,Chimu_00,Chi_30) \
VADD(Chimu_01,Chimu_01,Chi_31) \
VADD(Chimu_02,Chimu_02,Chi_32) \
VSUB(Chimu_10,Chimu_10,Chi_20) \
VSUB(Chimu_11,Chimu_11,Chi_21) \
VSUB(Chimu_12,Chimu_12,Chi_22) \
); \
}
/*Gz
* 0 0 i 0 [0]+-i[2]
* 0 0 0 -i [1]-+i[3]
* -i 0 0 0
* 0 i 0 0
*/
#define ZP_PROJMEM(base) { \
LOAD_CHIMU(base); \
asm ( \
VONE(one) \
VMADD_MII_IR(one,Chimu_20,Chimu_00,Chi_00) \
VMADD_MII_IR(one,Chimu_21,Chimu_01,Chi_01) \
VMADD_MII_IR(one,Chimu_22,Chimu_02,Chi_02) \
VMADD_II_MIR(one,Chimu_30,Chimu_10,Chi_10) \
VMADD_II_MIR(one,Chimu_31,Chimu_11,Chi_11) \
VMADD_II_MIR(one,Chimu_32,Chimu_12,Chi_12) \
); \
}
#define ZM_PROJMEM(base) { \
LOAD_CHIMU(base); \
asm ( \
VONE(one) \
VMADD_II_MIR(one,Chimu_20,Chimu_00,Chi_00) \
VMADD_II_MIR(one,Chimu_21,Chimu_01,Chi_01) \
VMADD_II_MIR(one,Chimu_22,Chimu_02,Chi_02) \
VMADD_MII_IR(one,Chimu_30,Chimu_10,Chi_10) \
VMADD_MII_IR(one,Chimu_31,Chimu_11,Chi_11) \
VMADD_MII_IR(one,Chimu_32,Chimu_12,Chi_12) \
); \
}
/*Gt
* 0 0 1 0 [0]+-[2]
* 0 0 0 1 [1]+-[3]
* 1 0 0 0
* 0 1 0 0
*/
#define TP_PROJMEM(base) { \
LOAD_CHIMU(base); \
asm ( \
VADD(Chimu_00,Chimu_00,Chi_20) \
VADD(Chimu_01,Chimu_01,Chi_21) \
VADD(Chimu_02,Chimu_02,Chi_22) \
VADD(Chimu_10,Chimu_10,Chi_30) \
VADD(Chimu_11,Chimu_11,Chi_31) \
VADD(Chimu_12,Chimu_12,Chi_32) \
); \
}
#define TM_PROJMEM(base) { \
LOAD_CHIMU(base); \
asm ( \
VSUB(Chimu_00,Chimu_00,Chi_20) \
VSUB(Chimu_01,Chimu_01,Chi_21) \
VSUB(Chimu_02,Chimu_02,Chi_22) \
VSUB(Chimu_10,Chimu_10,Chi_30) \
VSUB(Chimu_11,Chimu_11,Chi_31) \
VSUB(Chimu_12,Chimu_12,Chi_32) \
); \
}
/*
fspin(0)=hspin(0);
fspin(1)=hspin(1);
fspin(2)=timesMinusI(hspin(1));
fspin(3)=timesMinusI(hspin(0));
fspin(0)+=hspin(0);
fspin(1)+=hspin(1);
fspin(2)-=timesI(hspin(1));
fspin(3)-=timesI(hspin(0));
*/
#define XP_RECON { \
asm(\
VONE(one)\
VMOV(psi_00,UChi_00) VMOV(psi_01,UChi_01) VMOV(psi_02,UChi_02)\
VMOV(psi_10,UChi_10) VMOV(psi_11,UChi_11) VMOV(psi_12,UChi_12)\
VZERO(psi_20) VZERO(psi_21) VZERO(psi_22) \
VZERO(psi_30) VZERO(psi_31) VZERO(psi_32) \
VMADD_II_MIR(one,UChi_10,psi_20,psi_20) \
VMADD_II_MIR(one,UChi_11,psi_21,psi_21) \
VMADD_II_MIR(one,UChi_12,psi_22,psi_22) \
VMADD_II_MIR(one,UChi_00,psi_30,psi_30) \
VMADD_II_MIR(one,UChi_01,psi_31,psi_31) \
VMADD_II_MIR(one,UChi_02,psi_32,psi_32) \
); \
}
#define XM_RECON { \
asm(\
VONE(one)\
VMOV(psi_00,UChi_00) VMOV(psi_01,UChi_01) VMOV(psi_02,UChi_02)\
VMOV(psi_10,UChi_10) VMOV(psi_11,UChi_11) VMOV(psi_12,UChi_12)\
VZERO(psi_20) VZERO(psi_21) VZERO(psi_22) \
VZERO(psi_30) VZERO(psi_31) VZERO(psi_32) \
VMADD_MII_IR(one,UChi_10,psi_20,psi_20) \
VMADD_MII_IR(one,UChi_11,psi_21,psi_21) \
VMADD_MII_IR(one,UChi_12,psi_22,psi_22) \
VMADD_MII_IR(one,UChi_00,psi_30,psi_30) \
VMADD_MII_IR(one,UChi_01,psi_31,psi_31) \
VMADD_MII_IR(one,UChi_02,psi_32,psi_32) \
); \
}
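The reconstruction macros follow the recipe spelled out in the comment above XP_RECON; a scalar sketch of the X+ case (names illustrative only):

#include <complex>

// Scalar sketch of XP_RECON: copy the half spinor into the upper two spin
// components and set the lower two to -i times the swapped upper components.
using cplxd = std::complex<double>;
inline void xp_reconstruct(const cplxd hspin[2][3], cplxd fspin[4][3]) {
  const cplxd minus_i(0.0, -1.0);
  for (int c = 0; c < 3; c++) {
    fspin[0][c] = hspin[0][c];
    fspin[1][c] = hspin[1][c];
    fspin[2][c] = minus_i * hspin[1][c];
    fspin[3][c] = minus_i * hspin[0][c];
  }
}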
#define XP_RECON_ACCUM { \
asm(\
VONE(one)\
VADD(psi_00,UChi_00,psi_00) VADD(psi_01,UChi_01,psi_01) VADD(psi_02,UChi_02,psi_02) \
VADD(psi_10,UChi_10,psi_10) VADD(psi_11,UChi_11,psi_11) VADD(psi_12,UChi_12,psi_12) \
VMADD_II_MIR(one,UChi_10,psi_20,psi_20) \
VMADD_II_MIR(one,UChi_11,psi_21,psi_21) \
VMADD_II_MIR(one,UChi_12,psi_22,psi_22) \
VMADD_II_MIR(one,UChi_00,psi_30,psi_30) \
VMADD_II_MIR(one,UChi_01,psi_31,psi_31) \
VMADD_II_MIR(one,UChi_02,psi_32,psi_32) \
); \
}
#define XM_RECON_ACCUM { \
asm(\
VONE(one)\
VADD(psi_00,UChi_00,psi_00) VADD(psi_01,UChi_01,psi_01) VADD(psi_02,UChi_02,psi_02) \
VADD(psi_10,UChi_10,psi_10) VADD(psi_11,UChi_11,psi_11) VADD(psi_12,UChi_12,psi_12) \
VMADD_MII_IR(one,UChi_10,psi_20,psi_20) \
VMADD_MII_IR(one,UChi_11,psi_21,psi_21) \
VMADD_MII_IR(one,UChi_12,psi_22,psi_22) \
VMADD_MII_IR(one,UChi_00,psi_30,psi_30) \
VMADD_MII_IR(one,UChi_01,psi_31,psi_31) \
VMADD_MII_IR(one,UChi_02,psi_32,psi_32) \
); \
}
// fspin(2)+=hspin(1);
// fspin(3)-=hspin(0);
#define YP_RECON_ACCUM {\
asm(\
VADD(psi_00,UChi_00,psi_00) VADD(psi_01,UChi_01,psi_01) VADD(psi_02,UChi_02,psi_02) \
VADD(psi_10,UChi_10,psi_10) VADD(psi_11,UChi_11,psi_11) VADD(psi_12,UChi_12,psi_12) \
VADD(psi_20,UChi_10,psi_20) VADD(psi_21,UChi_11,psi_21) VADD(psi_22,UChi_12,psi_22) \
VSUB(psi_30,UChi_00,psi_30) VSUB(psi_31,UChi_01,psi_31) VSUB(psi_32,UChi_02,psi_32) \
);\
}
#define YM_RECON_ACCUM {\
asm(\
VADD(psi_00,UChi_00,psi_00) VADD(psi_01,UChi_01,psi_01) VADD(psi_02,UChi_02,psi_02) \
VADD(psi_10,UChi_10,psi_10) VADD(psi_11,UChi_11,psi_11) VADD(psi_12,UChi_12,psi_12) \
VSUB(psi_20,UChi_10,psi_20) VSUB(psi_21,UChi_11,psi_21) VSUB(psi_22,UChi_12,psi_22) \
VADD(psi_30,UChi_00,psi_30) VADD(psi_31,UChi_01,psi_31) VADD(psi_32,UChi_02,psi_32) \
);\
}
// fspin(2)-=timesI(hspin(0));
// fspin(3)+=timesI(hspin(1));
#define ZP_RECON_ACCUM {\
asm(\
VONE(one)\
VADD(psi_00,UChi_00,psi_00) VADD(psi_01,UChi_01,psi_01) VADD(psi_02,UChi_02,psi_02) \
VADD(psi_10,UChi_10,psi_10) VADD(psi_11,UChi_11,psi_11) VADD(psi_12,UChi_12,psi_12) \
VMADD_II_MIR(one,UChi_00,psi_20,psi_20) \
VMADD_II_MIR(one,UChi_01,psi_21,psi_21) \
VMADD_II_MIR(one,UChi_02,psi_22,psi_22) \
VMADD_MII_IR(one,UChi_10,psi_30,psi_30) \
VMADD_MII_IR(one,UChi_11,psi_31,psi_31) \
VMADD_MII_IR(one,UChi_12,psi_32,psi_32) \
);\
}
#define ZM_RECON_ACCUM {\
asm(\
VONE(one)\
VADD(psi_00,UChi_00,psi_00) VADD(psi_01,UChi_01,psi_01) VADD(psi_02,UChi_02,psi_02) \
VADD(psi_10,UChi_10,psi_10) VADD(psi_11,UChi_11,psi_11) VADD(psi_12,UChi_12,psi_12) \
VMADD_MII_IR(one,UChi_00,psi_20,psi_20) \
VMADD_MII_IR(one,UChi_01,psi_21,psi_21) \
VMADD_MII_IR(one,UChi_02,psi_22,psi_22) \
VMADD_II_MIR(one,UChi_10,psi_30,psi_30) \
VMADD_II_MIR(one,UChi_11,psi_31,psi_31) \
VMADD_II_MIR(one,UChi_12,psi_32,psi_32) \
);\
}
// fspin(2)+=hspin(0);
// fspin(3)+=hspin(1);
#define TP_RECON_ACCUM {\
asm(\
VADD(psi_00,UChi_00,psi_00) VADD(psi_01,UChi_01,psi_01) VADD(psi_02,UChi_02,psi_02) \
VADD(psi_10,UChi_10,psi_10) VADD(psi_11,UChi_11,psi_11) VADD(psi_12,UChi_12,psi_12) \
VADD(psi_20,UChi_00,psi_20) VADD(psi_21,UChi_01,psi_21) VADD(psi_22,UChi_02,psi_22) \
VADD(psi_30,UChi_10,psi_30) VADD(psi_31,UChi_11,psi_31) VADD(psi_32,UChi_12,psi_32) \
);\
}
#define TM_RECON_ACCUM {\
asm(\
VONE(one)\
VADD(psi_00,UChi_00,psi_00) VADD(psi_01,UChi_01,psi_01) VADD(psi_02,UChi_02,psi_02) \
VADD(psi_10,UChi_10,psi_10) VADD(psi_11,UChi_11,psi_11) VADD(psi_12,UChi_12,psi_12) \
VSUB(psi_20,UChi_00,psi_20) VSUB(psi_21,UChi_01,psi_21) VSUB(psi_22,UChi_02,psi_22) \
VSUB(psi_30,UChi_10,psi_30) VSUB(psi_31,UChi_11,psi_31) VSUB(psi_32,UChi_12,psi_32) \
);\
}
uint64_t GetPFInfo(int nent,int plocal);
uint64_t GetInfo(int ptype,int local,int perm,int Xp,int ent,int plocal);
#define COMPLEX_TYPE int
int signs[4];
void testme(int osites,int ssU)
{
int ss, local, perm, ptype;
uint64_t base;
uint64_t basep;
const uint64_t plocal =(uint64_t) & in._odata[0];
// vComplexF isigns[2] = { signs[0], signs[1] };
//COMPLEX_TYPE is vComplexF or vComplexD depending
//on the chosen precision
COMPLEX_TYPE *isigns = &signs[0];
MASK_REGS;
int nmax=osites;
for(int site=0;site<Ns;site++) {
int sU =ssU;
int ssn=ssU+1;
if(ssn>=nmax) ssn=0;
int sUn=ssn;
for(int s=0;s<Ls;s++) {
ss =sU*Ls+s;
ssn=sUn*Ls+s;
////////////////////////////////
// Xp
////////////////////////////////
int ent=ss*8;// 2*Ndim
int nent=ssn*8;
PF_GAUGE(Xp);
base = GetInfo(ptype,local,perm,Xp,ent,plocal); ent++;
PREFETCH1_CHIMU(base);
basep = GetPFInfo(nent,plocal); nent++;
if ( local ) {
LOAD64(%r10,isigns);
#ifdef KERNEL_DAG
XP_PROJMEM(base);
#else
XM_PROJMEM(base);
#endif
MAYBEPERM(PERMUTE_DIR3,perm);
} else {
LOAD_CHI(base);
}
base = GetInfo(ptype,local,perm,Yp,ent,plocal); ent++;
PREFETCH_CHIMU(base);
{
MULT_2SPIN_DIR_PFXP(Xp,basep);
}
LOAD64(%r10,isigns);
#ifdef KERNEL_DAG
XP_RECON;
#else
XM_RECON;
#endif
////////////////////////////////
// Yp
////////////////////////////////
basep = GetPFInfo(nent,plocal); nent++;
if ( local ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
YP_PROJMEM(base);
#else
YM_PROJMEM(base);
#endif
MAYBEPERM(PERMUTE_DIR2,perm);
} else {
LOAD_CHI(base);
}
base = GetInfo(ptype,local,perm,Zp,ent,plocal); ent++;
PREFETCH_CHIMU(base);
{
MULT_2SPIN_DIR_PFYP(Yp,basep);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
YP_RECON_ACCUM;
#else
YM_RECON_ACCUM;
#endif
////////////////////////////////
// Zp
////////////////////////////////
basep = GetPFInfo(nent,plocal); nent++;
if ( local ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
ZP_PROJMEM(base);
#else
ZM_PROJMEM(base);
#endif
MAYBEPERM(PERMUTE_DIR1,perm);
} else {
LOAD_CHI(base);
}
base = GetInfo(ptype,local,perm,Tp,ent,plocal); ent++;
PREFETCH_CHIMU(base);
{
MULT_2SPIN_DIR_PFZP(Zp,basep);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
ZP_RECON_ACCUM;
#else
ZM_RECON_ACCUM;
#endif
////////////////////////////////
// Tp
////////////////////////////////
basep = GetPFInfo(nent,plocal); nent++;
if ( local ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
TP_PROJMEM(base);
#else
TM_PROJMEM(base);
#endif
MAYBEPERM(PERMUTE_DIR0,perm);
} else {
LOAD_CHI(base);
}
base = GetInfo(ptype,local,perm,Xm,ent,plocal); ent++;
PREFETCH_CHIMU(base);
{
MULT_2SPIN_DIR_PFTP(Tp,basep);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
TP_RECON_ACCUM;
#else
TM_RECON_ACCUM;
#endif
////////////////////////////////
// Xm
////////////////////////////////
#ifndef STREAM_STORE
basep= (uint64_t) &out._odata[ss];
#endif
// basep= GetPFInfo(nent,plocal); nent++;
if ( local ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
XM_PROJMEM(base);
#else
XP_PROJMEM(base);
#endif
MAYBEPERM(PERMUTE_DIR3,perm);
} else {
LOAD_CHI(base);
}
base = GetInfo(ptype,local,perm,Ym,ent,plocal); ent++;
PREFETCH_CHIMU(base);
{
MULT_2SPIN_DIR_PFXM(Xm,basep);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
XM_RECON_ACCUM;
#else
XP_RECON_ACCUM;
#endif
////////////////////////////////
// Ym
////////////////////////////////
basep= GetPFInfo(nent,plocal); nent++;
if ( local ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
YM_PROJMEM(base);
#else
YP_PROJMEM(base);
#endif
MAYBEPERM(PERMUTE_DIR2,perm);
} else {
LOAD_CHI(base);
}
base = GetInfo(ptype,local,perm,Zm,ent,plocal); ent++;
PREFETCH_CHIMU(base);
{
MULT_2SPIN_DIR_PFYM(Ym,basep);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
YM_RECON_ACCUM;
#else
YP_RECON_ACCUM;
#endif
////////////////////////////////
// Zm
////////////////////////////////
basep= GetPFInfo(nent,plocal); nent++;
if ( local ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
ZM_PROJMEM(base);
#else
ZP_PROJMEM(base);
#endif
MAYBEPERM(PERMUTE_DIR1,perm);
} else {
LOAD_CHI(base);
}
base = GetInfo(ptype,local,perm,Tm,ent,plocal); ent++;
PREFETCH_CHIMU(base);
{
MULT_2SPIN_DIR_PFZM(Zm,basep);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
ZM_RECON_ACCUM;
#else
ZP_RECON_ACCUM;
#endif
////////////////////////////////
// Tm
////////////////////////////////
basep= GetPFInfo(nent,plocal); nent++;
if ( local ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
TM_PROJMEM(base);
#else
TP_PROJMEM(base);
#endif
MAYBEPERM(PERMUTE_DIR0,perm);
} else {
LOAD_CHI(base);
}
base= (uint64_t) &out._odata[ss];
#ifndef STREAM_STORE
PREFETCH_CHIMU(base);
#endif
{
MULT_2SPIN_DIR_PFTM(Tm,basep);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
TM_RECON_ACCUM;
#else
TP_RECON_ACCUM;
#endif
basep= GetPFInfo(nent,plocal); nent++;
SAVE_RESULT(base,basep);
}
ssU++;
}
}
#endif

769
Grid/simd/Grid_avx.h Normal file

@@ -0,0 +1,769 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/simd/Grid_avx.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Guido Cossu <cossu@iroiro-pc.kek.jp>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: neo <cossu@post.kek.jp>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <immintrin.h>
#ifdef AVXFMA4
#include <x86intrin.h>
#endif
// _mm256_set_m128i(hi,lo); // not defined in all versions of immintrin.h
#ifndef _mm256_set_m128i
#define _mm256_set_m128i(hi,lo) _mm256_insertf128_si256(_mm256_castsi128_si256(lo),(hi),1)
#endif
namespace Grid {
namespace Optimization {
template<class vtype>
union uconv {
__m256 f;
vtype v;
};
union u256f {
__m256 v;
float f[8];
};
union u256d {
__m256d v;
double f[4];
};
struct Vsplat{
// Complex float
inline __m256 operator()(float a, float b) {
return _mm256_set_ps(b,a,b,a,b,a,b,a);
}
// Real float
inline __m256 operator()(float a){
return _mm256_set_ps(a,a,a,a,a,a,a,a);
}
//Complex double
inline __m256d operator()(double a, double b){
return _mm256_set_pd(b,a,b,a);
}
//Real double
inline __m256d operator()(double a){
return _mm256_set_pd(a,a,a,a);
}
//Integer
inline __m256i operator()(Integer a){
return _mm256_set1_epi32(a);
}
};
struct Vstore{
//Float
inline void operator()(__m256 a, float* F){
_mm256_store_ps(F,a);
}
//Double
inline void operator()(__m256d a, double* D){
_mm256_store_pd(D,a);
}
//Integer
inline void operator()(__m256i a, Integer* I){
_mm256_store_si256((__m256i*)I,a);
}
};
struct Vstream{
//Float
inline void operator()(float * a, __m256 b){
_mm256_stream_ps(a,b);
}
//Double
inline void operator()(double * a, __m256d b){
_mm256_stream_pd(a,b);
}
};
struct Vset{
// Complex float
inline __m256 operator()(Grid::ComplexF *a){
return _mm256_set_ps(a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
}
// Complex double
inline __m256d operator()(Grid::ComplexD *a){
return _mm256_set_pd(a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
}
// Real float
inline __m256 operator()(float *a){
return _mm256_set_ps(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
}
// Real double
inline __m256d operator()(double *a){
return _mm256_set_pd(a[3],a[2],a[1],a[0]);
}
// Integer
inline __m256i operator()(Integer *a){
return _mm256_set_epi32(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
}
};
template <typename Out_type, typename In_type>
struct Reduce{
// Need templated class to overload output type
// General form must generate error if compiled
inline Out_type operator()(In_type in){
printf("Error, using wrong Reduce function\n");
exit(1);
return 0;
}
};
/////////////////////////////////////////////////////
// Arithmetic operations
/////////////////////////////////////////////////////
struct Sum{
//Complex/Real float
inline __m256 operator()(__m256 a, __m256 b){
return _mm256_add_ps(a,b);
}
//Complex/Real double
inline __m256d operator()(__m256d a, __m256d b){
return _mm256_add_pd(a,b);
}
//Integer
inline __m256i operator()(__m256i a, __m256i b){
#if defined (AVX1) || defined (AVXFMA) || defined (AVXFMA4)
__m128i a0,a1;
__m128i b0,b1;
a0 = _mm256_extractf128_si256(a,0);
b0 = _mm256_extractf128_si256(b,0);
a1 = _mm256_extractf128_si256(a,1);
b1 = _mm256_extractf128_si256(b,1);
a0 = _mm_add_epi32(a0,b0);
a1 = _mm_add_epi32(a1,b1);
return _mm256_set_m128i(a1,a0);
#endif
#if defined (AVX2)
return _mm256_add_epi32(a,b);
#endif
}
};
struct Sub{
//Complex/Real float
inline __m256 operator()(__m256 a, __m256 b){
return _mm256_sub_ps(a,b);
}
//Complex/Real double
inline __m256d operator()(__m256d a, __m256d b){
return _mm256_sub_pd(a,b);
}
//Integer
inline __m256i operator()(__m256i a, __m256i b){
#if defined (AVX1) || defined (AVXFMA) || defined (AVXFMA4)
__m128i a0,a1;
__m128i b0,b1;
a0 = _mm256_extractf128_si256(a,0);
b0 = _mm256_extractf128_si256(b,0);
a1 = _mm256_extractf128_si256(a,1);
b1 = _mm256_extractf128_si256(b,1);
a0 = _mm_sub_epi32(a0,b0);
a1 = _mm_sub_epi32(a1,b1);
return _mm256_set_m128i(a1,a0);
#endif
#if defined (AVX2)
return _mm256_sub_epi32(a,b);
#endif
}
};
struct MultRealPart{
inline __m256 operator()(__m256 a, __m256 b){
__m256 ymm0;
ymm0 = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
return _mm256_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br
}
inline __m256d operator()(__m256d a, __m256d b){
__m256d ymm0;
ymm0 = _mm256_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, ar,ar b'00,00
return _mm256_mul_pd(ymm0,b); // ymm0 <- ar bi, ar br
}
};
struct MaddRealPart{
inline __m256 operator()(__m256 a, __m256 b, __m256 c){
__m256 ymm0 = _mm256_moveldup_ps(a); // ymm0 <- ar ar,
return _mm256_add_ps(_mm256_mul_ps( ymm0, b),c);
}
inline __m256d operator()(__m256d a, __m256d b, __m256d c){
__m256d ymm0 = _mm256_shuffle_pd( a, a, 0x0 );
return _mm256_add_pd(_mm256_mul_pd( ymm0, b),c);
}
};
struct MultComplex{
// Complex float
inline __m256 operator()(__m256 a, __m256 b){
#if defined (AVX1)
__m256 ymm0,ymm1,ymm2;
ymm0 = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
ymm0 = _mm256_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br
// FIXME AVX2 could MAC
ymm1 = _mm256_shuffle_ps(b,b,_MM_SELECT_FOUR_FOUR(2,3,0,1)); // ymm1 <- br,bi
ymm2 = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(3,3,1,1)); // ymm2 <- ai,ai
ymm1 = _mm256_mul_ps(ymm1,ymm2); // ymm1 <- br ai, ai bi
return _mm256_addsub_ps(ymm0,ymm1);
#endif
#if defined (AVXFMA4)
__m256 a_real = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ar ar,
__m256 a_imag = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(3,3,1,1)); // ai ai
__m256 tmp = _mm256_shuffle_ps( b,b, _MM_SELECT_FOUR_FOUR(2,3,0,1));
a_imag = _mm256_mul_ps( a_imag,tmp ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
return _mm256_maddsub_ps( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr
#endif
#if defined (AVX2) || defined (AVXFMA)
__m256 a_real = _mm256_moveldup_ps( a ); // Ar Ar
__m256 a_imag = _mm256_movehdup_ps( a ); // Ai Ai
a_imag = _mm256_mul_ps( a_imag, _mm256_shuffle_ps( b,b, _MM_SELECT_FOUR_FOUR(2,3,0,1) )); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
return _mm256_fmaddsub_ps( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr
#endif
}
// Complex double
inline __m256d operator()(__m256d a, __m256d b) {
// Multiplication of (ak+ibk)*(ck+idk)
// a + i b can be stored as a data structure
// From intel optimisation reference guide
/*
movsldup xmm0, Src1; load real parts into the destination,
; a1, a1, a0, a0
movaps xmm1, src2; load the 2nd pair of complex values, ; i.e. d1, c1, d0, c0
mulps xmm0, xmm1; temporary results, a1d1, a1c1, a0d0, ; a0c0
shufps xmm1, xmm1, b1; reorder the real and imaginary ; parts, c1, d1, c0, d0
movshdup xmm2, Src1; load the imaginary parts into the ; destination, b1, b1, b0, b0
mulps xmm2, xmm1; temporary results, b1c1, b1d1, b0c0, ; b0d0
addsubps xmm0, xmm2; b1c1+a1d1, a1c1 -b1d1, b0c0+a0d
VSHUFPD (VEX.256 encoded version)
IF IMM0[0] = 0
THEN DEST[63:0]=SRC1[63:0] ELSE DEST[63:0]=SRC1[127:64] FI;
IF IMM0[1] = 0
THEN DEST[127:64]=SRC2[63:0] ELSE DEST[127:64]=SRC2[127:64] FI;
IF IMM0[2] = 0
THEN DEST[191:128]=SRC1[191:128] ELSE DEST[191:128]=SRC1[255:192] FI;
IF IMM0[3] = 0
THEN DEST[255:192]=SRC2[191:128] ELSE DEST[255:192]=SRC2[255:192] FI; // 0x5 r<->i ; 0xC unchanged
*/
#if defined (AVX1)
__m256d ymm0,ymm1,ymm2;
ymm0 = _mm256_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, ar,ar b'00,00
ymm0 = _mm256_mul_pd(ymm0,b); // ymm0 <- ar bi, ar br
ymm1 = _mm256_shuffle_pd(b,b,0x5); // ymm1 <- br,bi b'01,01
ymm2 = _mm256_shuffle_pd(a,a,0xF); // ymm2 <- ai,ai b'11,11
ymm1 = _mm256_mul_pd(ymm1,ymm2); // ymm1 <- br ai, ai bi
return _mm256_addsub_pd(ymm0,ymm1);
#endif
#if defined (AVXFMA4)
__m256d a_real = _mm256_shuffle_pd(a,a,0x0);//arar
__m256d a_imag = _mm256_shuffle_pd(a,a,0xF);//aiai
a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
return _mm256_maddsub_pd( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr
#endif
#if defined (AVX2) || defined (AVXFMA)
__m256d a_real = _mm256_movedup_pd( a ); // Ar Ar
__m256d a_imag = _mm256_shuffle_pd(a,a,0xF);//aiai
a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
return _mm256_fmaddsub_pd( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr
#endif
}
};
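Each AVX register holds interleaved (re,im) pairs, and the shuffle plus addsub / fmaddsub sequences above compute the standard complex product lane by lane. A scalar reference for one pair, assuming that layout (illustrative helper, not Grid API):

// Scalar reference for one interleaved (re,im) pair, matching what the
// shuffle + addsub / fmaddsub sequences compute:
//   out_re = ar*br - ai*bi,  out_im = ar*bi + ai*br
inline void mult_complex_ref(double ar, double ai, double br, double bi,
                             double &out_re, double &out_im) {
  out_re = ar * br - ai * bi;
  out_im = ar * bi + ai * br;
}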
#if 0
struct ComplexDot {
inline void Prep(__m256 ari,__m256 &air) {
cdotRIperm(ari,air);
}
inline void Mul(__m256 ari,__m256 air,__m256 b,__m256 &riir,__m256 &iirr) {
riir=air*b;
iirr=ari*b;
};
inline void Madd(__m256 ari,__m256 air,__m256 b,__m256 &riir,__m256 &iirr) {
mac(riir,air,b);
mac(iirr,ari,b);
}
inline void End(__m256 ari,__m256 &air) {
// cdotRI
}
};
#endif
struct Mult{
inline void mac(__m256 &a, __m256 b, __m256 c){
#if defined (AVX1)
a= _mm256_add_ps(_mm256_mul_ps(b,c),a);
#endif
#if defined (AVXFMA4)
a= _mm256_macc_ps(b,c,a);
#endif
#if defined (AVX2) || defined (AVXFMA)
a= _mm256_fmadd_ps( b, c, a);
#endif
}
inline void mac(__m256d &a, __m256d b, __m256d c){
#if defined (AVX1)
a= _mm256_add_pd(_mm256_mul_pd(b,c),a);
#endif
#if defined (AVXFMA4)
a= _mm256_macc_pd(b,c,a);
#endif
#if defined (AVX2) || defined (AVXFMA)
a= _mm256_fmadd_pd( b, c, a);
#endif
}
// Real float
inline __m256 operator()(__m256 a, __m256 b){
return _mm256_mul_ps(a,b);
}
// Real double
inline __m256d operator()(__m256d a, __m256d b){
return _mm256_mul_pd(a,b);
}
// Integer
inline __m256i operator()(__m256i a, __m256i b){
#if defined (AVX1) || defined (AVXFMA)
__m128i a0,a1;
__m128i b0,b1;
a0 = _mm256_extractf128_si256(a,0);
b0 = _mm256_extractf128_si256(b,0);
a1 = _mm256_extractf128_si256(a,1);
b1 = _mm256_extractf128_si256(b,1);
a0 = _mm_mullo_epi32(a0,b0);
a1 = _mm_mullo_epi32(a1,b1);
return _mm256_set_m128i(a1,a0);
#endif
#if defined (AVX2)
return _mm256_mullo_epi32(a,b);
#endif
}
};
struct Div {
// Real float
inline __m256 operator()(__m256 a, __m256 b) {
return _mm256_div_ps(a, b);
}
// Real double
inline __m256d operator()(__m256d a, __m256d b){
return _mm256_div_pd(a,b);
}
};
struct Conj{
// Complex single
inline __m256 operator()(__m256 in){
return _mm256_xor_ps(_mm256_addsub_ps(_mm256_setzero_ps(),in), _mm256_set1_ps(-0.f));
}
// Complex double
inline __m256d operator()(__m256d in){
return _mm256_xor_pd(_mm256_addsub_pd(_mm256_setzero_pd(),in), _mm256_set1_pd(-0.f));
}
// do not define for integer input
};
struct TimesMinusI{
//Complex single
inline __m256 operator()(__m256 in, __m256 ret){
__m256 tmp =_mm256_addsub_ps(_mm256_setzero_ps(),in); // r,-i
return _mm256_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(2,3,0,1)); //-i,r
}
//Complex double
inline __m256d operator()(__m256d in, __m256d ret){
__m256d tmp = _mm256_addsub_pd(_mm256_setzero_pd(),in); // r,-i
return _mm256_shuffle_pd(tmp,tmp,0x5);
}
};
struct TimesI{
//Complex single
inline __m256 operator()(__m256 in, __m256 ret){
__m256 tmp =_mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); // i,r
return _mm256_addsub_ps(_mm256_setzero_ps(),tmp); // i,-r
}
//Complex double
inline __m256d operator()(__m256d in, __m256d ret){
__m256d tmp = _mm256_shuffle_pd(in,in,0x5);
return _mm256_addsub_pd(_mm256_setzero_pd(),tmp); // i,-r
}
};
//////////////////////////////////////////////
// Some Template specialization
//////////////////////////////////////////////
struct Permute{
static inline __m256 Permute0(__m256 in){
return _mm256_permute2f128_ps(in,in,0x01); //ABCD EFGH -> EFGH ABCD
};
static inline __m256 Permute1(__m256 in){
return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2)); //ABCD EFGH -> CDAB GHEF
};
static inline __m256 Permute2(__m256 in){
return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); //ABCD EFGH -> BADC FEHG
};
static inline __m256 Permute3(__m256 in){
return in;
};
static inline __m256d Permute0(__m256d in){
return _mm256_permute2f128_pd(in,in,0x01); //AB CD -> CD AB
};
static inline __m256d Permute1(__m256d in){ //AB CD -> BA DC
return _mm256_shuffle_pd(in,in,0x5);
};
static inline __m256d Permute2(__m256d in){
return in;
};
static inline __m256d Permute3(__m256d in){
return in;
};
};
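The Permute levels swap progressively smaller blocks of the eight-float register, as the ABCD EFGH comments indicate. A scalar sketch of the same index maps (illustrative only):

// Scalar sketch of the lane maps encoded by Permute0/1/2 for an 8-float
// vector; Permute3 is the identity at this width.
inline void permute_lane_maps(const float in[8], float p0[8], float p1[8], float p2[8]) {
  for (int i = 0; i < 8; i++) {
    p0[i] = in[i ^ 4]; // swap 128-bit halves : ABCD EFGH -> EFGH ABCD
    p1[i] = in[i ^ 2]; // swap 64-bit pairs   : ABCD EFGH -> CDAB GHEF
    p2[i] = in[i ^ 1]; // swap neighbours     : ABCD EFGH -> BADC FEHG
  }
}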
#define USE_FP16
struct PrecisionChange {
static inline __m256i StoH (__m256 a,__m256 b) {
__m256i h;
#ifdef USE_FP16
__m128i ha = _mm256_cvtps_ph(a,0);
__m128i hb = _mm256_cvtps_ph(b,0);
h =(__m256i) _mm256_castps128_ps256((__m128)ha);
h =(__m256i) _mm256_insertf128_ps((__m256)h,(__m128)hb,1);
#else
assert(0);
#endif
return h;
}
static inline void HtoS (__m256i h,__m256 &sa,__m256 &sb) {
#ifdef USE_FP16
sa = _mm256_cvtph_ps((__m128i)_mm256_extractf128_ps((__m256)h,0));
sb = _mm256_cvtph_ps((__m128i)_mm256_extractf128_ps((__m256)h,1));
#else
assert(0);
#endif
}
static inline __m256 DtoS (__m256d a,__m256d b) {
__m128 sa = _mm256_cvtpd_ps(a);
__m128 sb = _mm256_cvtpd_ps(b);
__m256 s = _mm256_castps128_ps256(sa);
s = _mm256_insertf128_ps(s,sb,1);
return s;
}
static inline void StoD (__m256 s,__m256d &a,__m256d &b) {
a = _mm256_cvtps_pd(_mm256_extractf128_ps(s,0));
b = _mm256_cvtps_pd(_mm256_extractf128_ps(s,1));
}
static inline __m256i DtoH (__m256d a,__m256d b,__m256d c,__m256d d) {
__m256 sa,sb;
sa = DtoS(a,b);
sb = DtoS(c,d);
return StoH(sa,sb);
}
static inline void HtoD (__m256i h,__m256d &a,__m256d &b,__m256d &c,__m256d &d) {
__m256 sa,sb;
HtoS(h,sa,sb);
StoD(sa,a,b);
StoD(sb,c,d);
}
};
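A minimal round trip through the half-precision pack/unpack just defined; this usage sketch assumes F16C support and only relies on the PrecisionChange helpers above:

// Usage sketch (assumes F16C): pack sixteen floats into fp16 and unpack
// them again via the PrecisionChange helpers above.
inline void fp16_roundtrip_sketch(const float in[16], float out[16]) {
  __m256  a = _mm256_loadu_ps(in);
  __m256  b = _mm256_loadu_ps(in + 8);
  __m256i h = Grid::Optimization::PrecisionChange::StoH(a, b);
  __m256  ra, rb;
  Grid::Optimization::PrecisionChange::HtoS(h, ra, rb);
  _mm256_storeu_ps(out, ra);
  _mm256_storeu_ps(out + 8, rb);
}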
struct Exchange{
// 3210 ordering
static inline void Exchange0(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){
//Invertible
//AB CD -> AC BD
//AC BD -> AB CD
out1= _mm256_permute2f128_ps(in1,in2,0x20);
out2= _mm256_permute2f128_ps(in1,in2,0x31);
};
static inline void Exchange1(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){
//Invertible
// ABCD EFGH ->ABEF CDGH
// ABEF CDGH ->ABCD EFGH
out1= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0));
out2= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2));
};
static inline void Exchange2(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){
// Invertible ?
// ABCD EFGH -> ACEG BDFH
// ACEG BDFH -> AEBF CGDH
// out1= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0));
// out2= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1));
// Bollocks; need
// AECG BFDH -> ABCD EFGH
out1= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0)); /*ACEG*/
out2= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1)); /*BDFH*/
out1= _mm256_shuffle_ps(out1,out1,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/
out2= _mm256_shuffle_ps(out2,out2,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*BFDH*/
};
static inline void Exchange3(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){
assert(0);
return;
};
static inline void Exchange0(__m256d &out1,__m256d &out2,__m256d in1,__m256d in2){
out1= _mm256_permute2f128_pd(in1,in2,0x20);
out2= _mm256_permute2f128_pd(in1,in2,0x31);
return;
};
static inline void Exchange1(__m256d &out1,__m256d &out2,__m256d in1,__m256d in2){
out1= _mm256_shuffle_pd(in1,in2,0x0);
out2= _mm256_shuffle_pd(in1,in2,0xF);
};
static inline void Exchange2(__m256d &out1,__m256d &out2,__m256d in1,__m256d in2){
assert(0);
return;
};
static inline void Exchange3(__m256d &out1,__m256d &out2,__m256d in1,__m256d in2){
assert(0);
return;
};
};
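Exchange0 re-pairs the 128-bit halves of its two inputs (AB CD -> AC BD in the notation of the comments above) and is its own inverse; a scalar sketch of that map (illustrative name only):

// Scalar sketch of Exchange0 on two 8-float vectors: out1 takes the low
// halves of in1 and in2, out2 takes the high halves; applying the map twice
// restores the inputs.
inline void exchange0_map(const float in1[8], const float in2[8],
                          float out1[8], float out2[8]) {
  for (int i = 0; i < 4; i++) {
    out1[i]     = in1[i];     // A
    out1[i + 4] = in2[i];     // C
    out2[i]     = in1[i + 4]; // B
    out2[i + 4] = in2[i + 4]; // D
  }
}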
#if defined (AVX2)
#define _mm256_alignr_epi32_grid(ret,a,b,n) ret=(__m256) _mm256_alignr_epi8((__m256i)a,(__m256i)b,(n*4)%16)
#define _mm256_alignr_epi64_grid(ret,a,b,n) ret=(__m256d) _mm256_alignr_epi8((__m256i)a,(__m256i)b,(n*8)%16)
#endif
#if defined (AVX1) || defined (AVXFMA)
#define _mm256_alignr_epi32_grid(ret,a,b,n) { \
__m128 aa, bb; \
\
aa = _mm256_extractf128_ps(a,1); \
bb = _mm256_extractf128_ps(b,1); \
aa = (__m128)_mm_alignr_epi8((__m128i)aa,(__m128i)bb,(n*4)%16); \
ret = _mm256_insertf128_ps(ret,aa,1); \
\
aa = _mm256_extractf128_ps(a,0); \
bb = _mm256_extractf128_ps(b,0); \
aa = (__m128)_mm_alignr_epi8((__m128i)aa,(__m128i)bb,(n*4)%16); \
ret = _mm256_insertf128_ps(ret,aa,0); \
}
#define _mm256_alignr_epi64_grid(ret,a,b,n) { \
__m128d aa, bb; \
\
aa = _mm256_extractf128_pd(a,1); \
bb = _mm256_extractf128_pd(b,1); \
aa = (__m128d)_mm_alignr_epi8((__m128i)aa,(__m128i)bb,(n*8)%16); \
ret = _mm256_insertf128_pd(ret,aa,1); \
\
aa = _mm256_extractf128_pd(a,0); \
bb = _mm256_extractf128_pd(b,0); \
aa = (__m128d)_mm_alignr_epi8((__m128i)aa,(__m128i)bb,(n*8)%16); \
ret = _mm256_insertf128_pd(ret,aa,0); \
}
#endif
struct Rotate{
static inline __m256 rotate(__m256 in,int n){
switch(n){
case 0: return tRotate<0>(in);break;
case 1: return tRotate<1>(in);break;
case 2: return tRotate<2>(in);break;
case 3: return tRotate<3>(in);break;
case 4: return tRotate<4>(in);break;
case 5: return tRotate<5>(in);break;
case 6: return tRotate<6>(in);break;
case 7: return tRotate<7>(in);break;
default: assert(0);
}
}
static inline __m256d rotate(__m256d in,int n){
switch(n){
case 0: return tRotate<0>(in);break;
case 1: return tRotate<1>(in);break;
case 2: return tRotate<2>(in);break;
case 3: return tRotate<3>(in);break;
default: assert(0);
}
}
template<int n>
static inline __m256 tRotate(__m256 in){
__m256 tmp = Permute::Permute0(in);
__m256 ret;
if ( n > 3 ) {
_mm256_alignr_epi32_grid(ret,in,tmp,n);
} else {
_mm256_alignr_epi32_grid(ret,tmp,in,n);
}
return ret;
}
template<int n>
static inline __m256d tRotate(__m256d in){
__m256d tmp = Permute::Permute0(in);
__m256d ret;
if ( n > 1 ) {
_mm256_alignr_epi64_grid(ret,in,tmp,n);
} else {
_mm256_alignr_epi64_grid(ret,tmp,in,n);
}
return ret;
};
};
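Rotate::rotate above moves lane n of the input down to lane 0, wrapping around; in scalar terms (sketch only):

// Scalar sketch of Rotate::rotate for the 8-float case: element i of the
// result is element (i+n) mod 8 of the input.
inline void rotate_ref(const float in[8], float out[8], int n) {
  for (int i = 0; i < 8; i++) out[i] = in[(i + n) & 7];
}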
//Complex float Reduce
template<>
inline Grid::ComplexF Reduce<Grid::ComplexF, __m256>::operator()(__m256 in){
__m256 v1,v2;
v1=Optimization::Permute::Permute0(in); // avx 256; quad complex single
v1= _mm256_add_ps(v1,in);
v2=Optimization::Permute::Permute1(v1);
v1 = _mm256_add_ps(v1,v2);
u256f conv; conv.v = v1;
return Grid::ComplexF(conv.f[0],conv.f[1]);
}
//Real float Reduce
template<>
inline Grid::RealF Reduce<Grid::RealF, __m256>::operator()(__m256 in){
__m256 v1,v2;
v1 = Optimization::Permute::Permute0(in); // avx 256; octo-float
v1 = _mm256_add_ps(v1,in);
v2 = Optimization::Permute::Permute1(v1);
v1 = _mm256_add_ps(v1,v2);
v2 = Optimization::Permute::Permute2(v1);
v1 = _mm256_add_ps(v1,v2);
u256f conv; conv.v=v1;
return conv.f[0];
}
//Complex double Reduce
template<>
inline Grid::ComplexD Reduce<Grid::ComplexD, __m256d>::operator()(__m256d in){
__m256d v1;
v1 = Optimization::Permute::Permute0(in); // avx 256; paired complex double
v1 = _mm256_add_pd(v1,in);
u256d conv; conv.v = v1;
return Grid::ComplexD(conv.f[0],conv.f[1]);
}
//Real double Reduce
template<>
inline Grid::RealD Reduce<Grid::RealD, __m256d>::operator()(__m256d in){
__m256d v1,v2;
v1 = Optimization::Permute::Permute0(in); // avx 256; quad double
v1 = _mm256_add_pd(v1,in);
v2 = Optimization::Permute::Permute1(v1);
v1 = _mm256_add_pd(v1,v2);
u256d conv; conv.v = v1;
return conv.f[0];
}
//Integer Reduce
template<>
inline Integer Reduce<Integer, __m256i>::operator()(__m256i in){
__m128i ret;
#if defined (AVX2)
// AVX2 horizontal adds within upper and lower halves of register; use
// SSE to add upper and lower halves for result.
__m256i v1, v2;
__m128i u1, u2;
v1 = _mm256_hadd_epi32(in, in);
v2 = _mm256_hadd_epi32(v1, v1);
u1 = _mm256_castsi256_si128(v2); // lower half
u2 = _mm256_extracti128_si256(v2, 1); // upper half
ret = _mm_add_epi32(u1, u2);
#else
// No AVX horizontal add; extract upper and lower halves of register & use
// SSE intrinsics.
__m128i u1, u2, u3;
u1 = _mm256_extractf128_si256(in, 0); // lower half
u2 = _mm256_extractf128_si256(in, 1); // upper half
u3 = _mm_add_epi32(u1, u2);
u1 = _mm_hadd_epi32(u3, u3);
ret = _mm_hadd_epi32(u1, u1);
#endif
return _mm_cvtsi128_si32(ret);
}
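For reference, the integer Reduce specialisation above is just the SIMD form of a plain horizontal sum over the eight 32-bit lanes:

// Scalar equivalent of the __m256i Reduce specialisation: sum eight 32-bit lanes.
inline int reduce_i32_ref(const int v[8]) {
  int sum = 0;
  for (int i = 0; i < 8; i++) sum += v[i];
  return sum;
}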
}
//////////////////////////////////////////////////////////////////////////////////////
// Here assign types
typedef __m256i SIMD_Htype; // Half precision type
typedef __m256 SIMD_Ftype; // Single precision type
typedef __m256d SIMD_Dtype; // Double precision type
typedef __m256i SIMD_Itype; // Integer type
// prefetching
inline void v_prefetch0(int size, const char *ptr){
for(int i=0;i<size;i+=64){ // Define L1 linesize above
_mm_prefetch(ptr+i+4096,_MM_HINT_T1);
_mm_prefetch(ptr+i+512,_MM_HINT_T0);
}
}
inline void prefetch_HINT_T0(const char *ptr){
_mm_prefetch(ptr, _MM_HINT_T0);
}
// Function name aliases
typedef Optimization::Vsplat VsplatSIMD;
typedef Optimization::Vstore VstoreSIMD;
typedef Optimization::Vset VsetSIMD;
typedef Optimization::Vstream VstreamSIMD;
template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S, T>;
// Arithmetic operations
typedef Optimization::Sum SumSIMD;
typedef Optimization::Sub SubSIMD;
typedef Optimization::Div DivSIMD;
typedef Optimization::Mult MultSIMD;
typedef Optimization::MultComplex MultComplexSIMD;
typedef Optimization::MultRealPart MultRealPartSIMD;
typedef Optimization::MaddRealPart MaddRealPartSIMD;
typedef Optimization::Conj ConjSIMD;
typedef Optimization::TimesMinusI TimesMinusISIMD;
typedef Optimization::TimesI TimesISIMD;
} // namespace Grid

640
Grid/simd/Grid_avx512.h Normal file

@@ -0,0 +1,640 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/simd/Grid_avx512.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: neo <cossu@post.kek.jp>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <immintrin.h>
namespace Grid{
namespace Optimization {
union u512f {
__m512 v;
float f[16];
};
union u512d {
__m512d v;
double f[8];
};
struct Vsplat{
//Complex float
inline __m512 operator()(float a, float b){
return _mm512_set_ps(b,a,b,a,b,a,b,a,b,a,b,a,b,a,b,a);
}
// Real float
inline __m512 operator()(float a){
return _mm512_set1_ps(a);
}
//Complex double
inline __m512d operator()(double a, double b){
return _mm512_set_pd(b,a,b,a,b,a,b,a);
}
//Real double
inline __m512d operator()(double a){
return _mm512_set1_pd(a);
}
//Integer
inline __m512i operator()(Integer a){
return _mm512_set1_epi32(a);
}
};
struct Vstore{
//Float
inline void operator()(__m512 a, float* F){
_mm512_store_ps(F,a);
}
//Double
inline void operator()(__m512d a, double* D){
_mm512_store_pd(D,a);
}
//Integer
inline void operator()(__m512i a, Integer* I){
_mm512_store_si512((__m512i *)I,a);
}
};
struct Vstream{
//Float
inline void operator()(float * a, __m512 b){
_mm512_stream_ps(a,b);
// _mm512_store_ps(a,b);
}
//Double
inline void operator()(double * a, __m512d b){
_mm512_stream_pd(a,b);
// _mm512_store_pd(a,b);
}
};
struct Vset{
// Complex float
inline __m512 operator()(Grid::ComplexF *a){
return _mm512_set_ps(a[7].imag(),a[7].real(),a[6].imag(),a[6].real(),
a[5].imag(),a[5].real(),a[4].imag(),a[4].real(),
a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),
a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
}
// Complex double
inline __m512d operator()(Grid::ComplexD *a){
return _mm512_set_pd(a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),
a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
}
// Real float
inline __m512 operator()(float *a){
return _mm512_set_ps( a[15],a[14],a[13],a[12],a[11],a[10],a[9],a[8],
a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
}
// Real double
inline __m512d operator()(double *a){
return _mm512_set_pd(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
}
// Integer
inline __m512i operator()(Integer *a){
return _mm512_set_epi32( a[15],a[14],a[13],a[12],a[11],a[10],a[9],a[8],
a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
}
};
template <typename Out_type, typename In_type>
struct Reduce{
//Need templated class to overload output type
//General form must generate error if compiled
inline Out_type operator()(In_type in){
printf("Error, using wrong Reduce function\n");
exit(1);
return 0;
}
};
/////////////////////////////////////////////////////
// Arithmetic operations
/////////////////////////////////////////////////////
struct Sum{
//Complex/Real float
inline __m512 operator()(__m512 a, __m512 b){
return _mm512_add_ps(a,b);
}
//Complex/Real double
inline __m512d operator()(__m512d a, __m512d b){
return _mm512_add_pd(a,b);
}
//Integer
inline __m512i operator()(__m512i a, __m512i b){
return _mm512_add_epi32(a,b);
}
};
struct Sub{
//Complex/Real float
inline __m512 operator()(__m512 a, __m512 b){
return _mm512_sub_ps(a,b);
}
//Complex/Real double
inline __m512d operator()(__m512d a, __m512d b){
return _mm512_sub_pd(a,b);
}
//Integer
inline __m512i operator()(__m512i a, __m512i b){
return _mm512_sub_epi32(a,b);
}
};
// Note, we can beat the shuf overhead in chain with two temporaries
// Ar Ai , Br Bi, Ai Ar // one shuf
//tmpr Ar Br, Ai Bi // Mul/Mac/Mac
//tmpi Br Ai, Bi Ar // Mul/Mac/Mac
// add tmpi,shuf(tmpi)
// sub tmpr,shuf(tmpi)
// shuf(tmpr,tmpi). // Could drop/trade for write mask
// Gives
// 2mul,4 mac +add+sub = 8 flop type insns
// 3shuf + 2 (+shuf) = 5/6 simd perm and 1/2 the load.
struct MultRealPart{
inline __m512 operator()(__m512 a, __m512 b){
__m512 ymm0;
ymm0 = _mm512_moveldup_ps(a); // ymm0 <- ar ar,
return _mm512_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br
}
inline __m512d operator()(__m512d a, __m512d b){
__m512d ymm0;
ymm0 = _mm512_shuffle_pd(a,a,0x00); // ymm0 <- ar ar, ar,ar b'00,00
return _mm512_mul_pd(ymm0,b); // ymm0 <- ar bi, ar br
}
};
struct MaddRealPart{
inline __m512 operator()(__m512 a, __m512 b, __m512 c){
__m512 ymm0 = _mm512_moveldup_ps(a); // ymm0 <- ar ar,
return _mm512_fmadd_ps( ymm0, b, c);
}
inline __m512d operator()(__m512d a, __m512d b, __m512d c){
__m512d ymm0 = _mm512_shuffle_pd( a, a, 0x00 );
return _mm512_fmadd_pd( ymm0, b, c);
}
};
struct MultComplex{
// Complex float
inline __m512 operator()(__m512 a, __m512 b){
// dup, dup, perm, mul, madd
__m512 a_real = _mm512_moveldup_ps( a ); // Ar Ar
__m512 a_imag = _mm512_movehdup_ps( a ); // Ai Ai
a_imag = _mm512_mul_ps( a_imag, _mm512_permute_ps( b, 0xB1 ) ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
return _mm512_fmaddsub_ps( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr
}
// Complex double
inline __m512d operator()(__m512d a, __m512d b){
__m512d a_real = _mm512_shuffle_pd( a, a, 0x00 );
__m512d a_imag = _mm512_shuffle_pd( a, a, 0xFF );
a_imag = _mm512_mul_pd( a_imag, _mm512_permute_pd( b, 0x55 ) );
return _mm512_fmaddsub_pd( a_real, b, a_imag );
}
};
struct Mult{
inline void mac(__m512 &a, __m512 b, __m512 c){
a= _mm512_fmadd_ps( b, c, a);
}
inline void mac(__m512d &a, __m512d b, __m512d c){
a= _mm512_fmadd_pd( b, c, a);
}
// Real float
inline __m512 operator()(__m512 a, __m512 b){
return _mm512_mul_ps(a,b);
}
// Real double
inline __m512d operator()(__m512d a, __m512d b){
return _mm512_mul_pd(a,b);
}
// Integer
inline __m512i operator()(__m512i a, __m512i b){
return _mm512_mullo_epi32(a,b);
}
};
struct Div{
// Real float
inline __m512 operator()(__m512 a, __m512 b){
return _mm512_div_ps(a,b);
}
// Real double
inline __m512d operator()(__m512d a, __m512d b){
return _mm512_div_pd(a,b);
}
};
struct Conj{
// Complex single
inline __m512 operator()(__m512 in){
return _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // Zero out 0+real 0-imag
}
// Complex double
inline __m512d operator()(__m512d in){
return _mm512_mask_sub_pd(in, 0xaa,_mm512_setzero_pd(), in);
}
// do not define for integer input
};
struct TimesMinusI{
//Complex single
inline __m512 operator()(__m512 in, __m512 ret){
//__m512 tmp = _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // real -imag
//return _mm512_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(2,3,1,0)); // 0x4E??
__m512 tmp = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp);
}
//Complex double
inline __m512d operator()(__m512d in, __m512d ret){
//__m512d tmp = _mm512_mask_sub_pd(in,0xaa,_mm512_setzero_pd(),in); // real -imag
//return _mm512_shuffle_pd(tmp,tmp,0x55);
__m512d tmp = _mm512_shuffle_pd(in,in,0x55);
return _mm512_mask_sub_pd(tmp,0xaa,_mm512_setzero_pd(),tmp);
}
};
struct TimesI{
//Complex single
inline __m512 operator()(__m512 in, __m512 ret){
__m512 tmp = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
return _mm512_mask_sub_ps(tmp,0x5555,_mm512_setzero_ps(),tmp);
}
//Complex double
inline __m512d operator()(__m512d in, __m512d ret){
__m512d tmp = _mm512_shuffle_pd(in,in,0x55);
return _mm512_mask_sub_pd(tmp,0x55,_mm512_setzero_pd(),tmp);
}
};
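TimesI and TimesMinusI act on interleaved (re,im) pairs by swapping the two parts and flipping one sign, which is what the shuffle plus masked subtract above implements; a scalar sketch (illustrative names):

// Scalar sketch of TimesI / TimesMinusI for one (re,im) pair:
//   i*(re + i*im)  = -im + i*re
//  -i*(re + i*im)  =  im - i*re
inline void times_i_ref(double re, double im, double &out_re, double &out_im) {
  out_re = -im;
  out_im =  re;
}
inline void times_minus_i_ref(double re, double im, double &out_re, double &out_im) {
  out_re =  im;
  out_im = -re;
}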
// Gpermute utilities; consider coalescing into 1 Gpermute
struct Permute{
static inline __m512 Permute0(__m512 in){
return _mm512_shuffle_f32x4(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
};
static inline __m512 Permute1(__m512 in){
return _mm512_shuffle_f32x4(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
};
static inline __m512 Permute2(__m512 in){
return _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
};
static inline __m512 Permute3(__m512 in){
return _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
};
static inline __m512d Permute0(__m512d in){
return _mm512_shuffle_f64x2(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
};
static inline __m512d Permute1(__m512d in){
return _mm512_shuffle_f64x2(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
};
static inline __m512d Permute2(__m512d in){
return _mm512_shuffle_pd(in,in,0x55);
};
static inline __m512d Permute3(__m512d in){
return in;
};
};
#define USE_FP16
struct PrecisionChange {
static inline __m512i StoH (__m512 a,__m512 b) {
__m512i h;
#ifdef USE_FP16
__m256i ha = _mm512_cvtps_ph(a,0);
__m256i hb = _mm512_cvtps_ph(b,0);
h =(__m512i) _mm512_castps256_ps512((__m256)ha);
h =(__m512i) _mm512_insertf64x4((__m512d)h,(__m256d)hb,1);
#else
assert(0);
#endif
return h;
}
static inline void HtoS (__m512i h,__m512 &sa,__m512 &sb) {
#ifdef USE_FP16
sa = _mm512_cvtph_ps((__m256i)_mm512_extractf64x4_pd((__m512d)h,0));
sb = _mm512_cvtph_ps((__m256i)_mm512_extractf64x4_pd((__m512d)h,1));
#else
assert(0);
#endif
}
static inline __m512 DtoS (__m512d a,__m512d b) {
__m256 sa = _mm512_cvtpd_ps(a);
__m256 sb = _mm512_cvtpd_ps(b);
__m512 s = _mm512_castps256_ps512(sa);
s =(__m512) _mm512_insertf64x4((__m512d)s,(__m256d)sb,1);
return s;
}
static inline void StoD (__m512 s,__m512d &a,__m512d &b) {
a = _mm512_cvtps_pd((__m256)_mm512_extractf64x4_pd((__m512d)s,0));
b = _mm512_cvtps_pd((__m256)_mm512_extractf64x4_pd((__m512d)s,1));
}
static inline __m512i DtoH (__m512d a,__m512d b,__m512d c,__m512d d) {
__m512 sa,sb;
sa = DtoS(a,b);
sb = DtoS(c,d);
return StoH(sa,sb);
}
static inline void HtoD (__m512i h,__m512d &a,__m512d &b,__m512d &c,__m512d &d) {
__m512 sa,sb;
HtoS(h,sa,sb);
StoD(sa,a,b);
StoD(sb,c,d);
}
};
// On extracting face: Ah Al , Bh Bl -> Ah Bh, Al Bl
// On merging buffers: Ah,Bh , Al Bl -> Ah Al, Bh, Bl
// The operation is its own inverse
struct Exchange{
// 3210 ordering
static inline void Exchange0(__m512 &out1,__m512 &out2,__m512 in1,__m512 in2){
out1= _mm512_shuffle_f32x4(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0));
out2= _mm512_shuffle_f32x4(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2));
};
static inline void Exchange1(__m512 &out1,__m512 &out2,__m512 in1,__m512 in2){
out1= _mm512_shuffle_f32x4(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0));
out2= _mm512_shuffle_f32x4(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1));
out1= _mm512_shuffle_f32x4(out1,out1,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/
out2= _mm512_shuffle_f32x4(out2,out2,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/
};
static inline void Exchange2(__m512 &out1,__m512 &out2,__m512 in1,__m512 in2){
out1= _mm512_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0));
out2= _mm512_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2));
};
static inline void Exchange3(__m512 &out1,__m512 &out2,__m512 in1,__m512 in2){
out1= _mm512_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0));
out2= _mm512_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1));
out1= _mm512_shuffle_ps(out1,out1,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/
out2= _mm512_shuffle_ps(out2,out2,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/
};
static inline void Exchange0(__m512d &out1,__m512d &out2,__m512d in1,__m512d in2){
out1= _mm512_shuffle_f64x2(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0));
out2= _mm512_shuffle_f64x2(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2));
};
static inline void Exchange1(__m512d &out1,__m512d &out2,__m512d in1,__m512d in2){
out1= _mm512_shuffle_f64x2(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0));
out2= _mm512_shuffle_f64x2(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1));
out1= _mm512_shuffle_f64x2(out1,out1,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/
out2= _mm512_shuffle_f64x2(out2,out2,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/
};
static inline void Exchange2(__m512d &out1,__m512d &out2,__m512d in1,__m512d in2){
out1 = _mm512_shuffle_pd(in1,in2,0x00);
out2 = _mm512_shuffle_pd(in1,in2,0xFF);
};
static inline void Exchange3(__m512d &out1,__m512d &out2,__m512d in1,__m512d in2){
assert(0);
return;
};
};
struct Rotate{
static inline __m512 rotate(__m512 in,int n){
switch(n){
case 0: return tRotate<0>(in);break;
case 1: return tRotate<1>(in);break;
case 2: return tRotate<2>(in);break;
case 3: return tRotate<3>(in);break;
case 4: return tRotate<4>(in);break;
case 5: return tRotate<5>(in);break;
case 6: return tRotate<6>(in);break;
case 7: return tRotate<7>(in);break;
case 8 : return tRotate<8>(in);break;
case 9 : return tRotate<9>(in);break;
case 10: return tRotate<10>(in);break;
case 11: return tRotate<11>(in);break;
case 12: return tRotate<12>(in);break;
case 13: return tRotate<13>(in);break;
case 14: return tRotate<14>(in);break;
case 15: return tRotate<15>(in);break;
default: assert(0);
}
}
static inline __m512d rotate(__m512d in,int n){
switch(n){
case 0: return tRotate<0>(in);break;
case 1: return tRotate<1>(in);break;
case 2: return tRotate<2>(in);break;
case 3: return tRotate<3>(in);break;
case 4: return tRotate<4>(in);break;
case 5: return tRotate<5>(in);break;
case 6: return tRotate<6>(in);break;
case 7: return tRotate<7>(in);break;
default: assert(0);
}
}
template<int n> static inline __m512 tRotate(__m512 in){
return (__m512)_mm512_alignr_epi32((__m512i)in,(__m512i)in,n);
};
template<int n> static inline __m512d tRotate(__m512d in){
return (__m512d)_mm512_alignr_epi64((__m512i)in,(__m512i)in,n);
};
};
//////////////////////////////////////////////
// Some Template specialization
// Hack for GCC and Clang until _mm512_reduce_add_ps etc. are implemented in released versions
#ifndef __INTEL_COMPILER
#warning "Slow reduction due to incomplete reduce intrinsics"
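// The fallback reductions below use a log2(width) tree: each step adds the vector
// to a lane-permuted copy of itself (Permute0, then Permute1, ...), halving the
// number of distinct partial sums; the result is then read from the first lane(s)
// through the u512f / u512d unions.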
//Complex float Reduce
template<>
inline Grid::ComplexF Reduce<Grid::ComplexF, __m512>::operator()(__m512 in){
__m512 v1,v2;
v1=Optimization::Permute::Permute0(in); // avx 512; 8 complex singles
v1= _mm512_add_ps(v1,in);
v2=Optimization::Permute::Permute1(v1);
v1 = _mm512_add_ps(v1,v2);
v2=Optimization::Permute::Permute2(v1);
v1 = _mm512_add_ps(v1,v2);
u512f conv; conv.v = v1;
return Grid::ComplexF(conv.f[0],conv.f[1]);
}
//Real float Reduce
template<>
inline Grid::RealF Reduce<Grid::RealF, __m512>::operator()(__m512 in){
__m512 v1,v2;
v1 = Optimization::Permute::Permute0(in); // avx 512; 16 floats
v1 = _mm512_add_ps(v1,in);
v2 = Optimization::Permute::Permute1(v1);
v1 = _mm512_add_ps(v1,v2);
v2 = Optimization::Permute::Permute2(v1);
v1 = _mm512_add_ps(v1,v2);
v2 = Optimization::Permute::Permute3(v1);
v1 = _mm512_add_ps(v1,v2);
u512f conv; conv.v=v1;
return conv.f[0];
}
//Complex double Reduce
template<>
inline Grid::ComplexD Reduce<Grid::ComplexD, __m512d>::operator()(__m512d in){
__m512d v1,v2;
v1 = Optimization::Permute::Permute0(in); // avx 512; 4 complex doubles
v1 = _mm512_add_pd(v1,in);
v2 = Optimization::Permute::Permute1(v1);
v1 = _mm512_add_pd(v1,v2);
u512d conv; conv.v = v1;
return Grid::ComplexD(conv.f[0],conv.f[1]);
}
//Real double Reduce
template<>
inline Grid::RealD Reduce<Grid::RealD, __m512d>::operator()(__m512d in){
__m512d v1,v2;
v1 = Optimization::Permute::Permute0(in); // avx 512; 8 doubles
v1 = _mm512_add_pd(v1,in);
v2 = Optimization::Permute::Permute1(v1);
v1 = _mm512_add_pd(v1,v2);
v2 = Optimization::Permute::Permute2(v1);
v1 = _mm512_add_pd(v1,v2);
u512d conv; conv.v = v1;
return conv.f[0];
}
//Integer Reduce
template<>
inline Integer Reduce<Integer, __m512i>::operator()(__m512i in){
// No full vector reduce, use AVX to add upper and lower halves of register
// and perform AVX reduction.
__m256i v1, v2, v3;
__m128i u1, u2, ret;
v1 = _mm512_castsi512_si256(in); // lower half
v2 = _mm512_extracti32x8_epi32(in, 1); // upper half
v3 = _mm256_add_epi32(v1, v2);
v1 = _mm256_hadd_epi32(v3, v3);
v2 = _mm256_hadd_epi32(v1, v1);
u1 = _mm256_castsi256_si128(v2); // lower half
u2 = _mm256_extracti128_si256(v2, 1); // upper half
ret = _mm_add_epi32(u1, u2);
return _mm_cvtsi128_si32(ret);
}
#else
//Complex float Reduce
template<>
inline Grid::ComplexF Reduce<Grid::ComplexF, __m512>::operator()(__m512 in){
return Grid::ComplexF(_mm512_mask_reduce_add_ps(0x5555, in),_mm512_mask_reduce_add_ps(0xAAAA, in));
}
//Real float Reduce
template<>
inline Grid::RealF Reduce<Grid::RealF, __m512>::operator()(__m512 in){
return _mm512_reduce_add_ps(in);
}
//Complex double Reduce
template<>
inline Grid::ComplexD Reduce<Grid::ComplexD, __m512d>::operator()(__m512d in){
return Grid::ComplexD(_mm512_mask_reduce_add_pd(0x55, in),_mm512_mask_reduce_add_pd(0xAA, in));
}
//Real double Reduce
template<>
inline Grid::RealD Reduce<Grid::RealD, __m512d>::operator()(__m512d in){
return _mm512_reduce_add_pd(in);
}
//Integer Reduce
template<>
inline Integer Reduce<Integer, __m512i>::operator()(__m512i in){
return _mm512_reduce_add_epi32(in);
}
#endif
}
//////////////////////////////////////////////////////////////////////////////////////
// Here assign types
typedef __m512i SIMD_Htype; // Half precision type
typedef __m512 SIMD_Ftype; // Single precision type
typedef __m512d SIMD_Dtype; // Double precision type
typedef __m512i SIMD_Itype; // Integer type
// prefetch
inline void v_prefetch0(int size, const char *ptr){
for(int i=0;i<size;i+=64){ // Define L1 linesize above
_mm_prefetch(ptr+i+4096,_MM_HINT_T1);
_mm_prefetch(ptr+i+512,_MM_HINT_T0);
}
}
inline void prefetch_HINT_T0(const char *ptr){
_mm_prefetch(ptr,_MM_HINT_T0);
}
// Function name aliases
typedef Optimization::Vsplat VsplatSIMD;
typedef Optimization::Vstore VstoreSIMD;
typedef Optimization::Vset VsetSIMD;
typedef Optimization::Vstream VstreamSIMD;
template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
// Arithmetic operations
typedef Optimization::Sum SumSIMD;
typedef Optimization::Sub SubSIMD;
typedef Optimization::Mult MultSIMD;
typedef Optimization::Div DivSIMD;
typedef Optimization::MultComplex MultComplexSIMD;
typedef Optimization::MultRealPart MultRealPartSIMD;
typedef Optimization::MaddRealPart MaddRealPartSIMD;
typedef Optimization::Conj ConjSIMD;
typedef Optimization::TimesMinusI TimesMinusISIMD;
typedef Optimization::TimesI TimesISIMD;
}

530
Grid/simd/Grid_generic.h Normal file

@ -0,0 +1,530 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/simd/Grid_generic.h
Copyright (C) 2015
Copyright (C) 2017
Author: Antonin Portelli <antonin.portelli@me.com>
Andrew Lawson <andrew.lawson1991@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include "Grid_generic_types.h"
namespace Grid {
namespace Optimization {
struct Vsplat{
// Complex
template <typename T>
inline vec<T> operator()(T a, T b){
vec<T> out;
VECTOR_FOR(i, W<T>::r, 2)
{
out.v[i] = a;
out.v[i+1] = b;
}
return out;
}
// Real
template <typename T>
inline vec<T> operator()(T a){
vec<T> out;
VECTOR_FOR(i, W<T>::r, 1)
{
out.v[i] = a;
}
return out;
}
};
struct Vstore{
// Real
template <typename T>
inline void operator()(vec<T> a, T *D){
*((vec<T> *)D) = a;
}
};
struct Vstream{
// Real
template <typename T>
inline void operator()(T * a, vec<T> b){
*((vec<T> *)a) = b;
}
};
struct Vset{
// Complex
template <typename T>
inline vec<T> operator()(std::complex<T> *a){
vec<T> out;
VECTOR_FOR(i, W<T>::c, 1)
{
out.v[2*i] = a[i].real();
out.v[2*i+1] = a[i].imag();
}
return out;
}
// Real
template <typename T>
inline vec<T> operator()(T *a){
vec<T> out;
out = *((vec<T> *)a);
return out;
}
};
/////////////////////////////////////////////////////
// Arithmetic operations
/////////////////////////////////////////////////////
struct Sum{
// Complex/Real
template <typename T>
inline vec<T> operator()(vec<T> a, vec<T> b){
vec<T> out;
VECTOR_FOR(i, W<T>::r, 1)
{
out.v[i] = a.v[i] + b.v[i];
}
return out;
}
};
struct Sub{
// Complex/Real
template <typename T>
inline vec<T> operator()(vec<T> a, vec<T> b){
vec<T> out;
VECTOR_FOR(i, W<T>::r, 1)
{
out.v[i] = a.v[i] - b.v[i];
}
return out;
}
};
struct Mult{
// Real
template <typename T>
inline vec<T> operator()(vec<T> a, vec<T> b){
vec<T> out;
VECTOR_FOR(i, W<T>::r, 1)
{
out.v[i] = a.v[i]*b.v[i];
}
return out;
}
};
#define cmul(a, b, c, i)\
c[i] = a[i]*b[i] - a[i+1]*b[i+1];\
c[i+1] = a[i]*b[i+1] + a[i+1]*b[i];
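// Illustrative note (not in the original source): cmul computes one interleaved
// complex product. With a = (ar, ai) and b = (br, bi) stored at offsets i, i+1,
//   c[i]   = ar*br - ai*bi   (real part)
//   c[i+1] = ar*bi + ai*br   (imaginary part)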
struct MultRealPart{
template <typename T>
inline vec<T> operator()(vec<T> a, vec<T> b){
vec<T> out;
VECTOR_FOR(i, W<T>::c, 1)
{
out.v[2*i] = a.v[2*i]*b.v[2*i];
out.v[2*i+1] = a.v[2*i]*b.v[2*i+1];
}
return out;
}
};
struct MaddRealPart{
template <typename T>
inline vec<T> operator()(vec<T> a, vec<T> b, vec<T> c){
vec<T> out;
VECTOR_FOR(i, W<T>::c, 1)
{
out.v[2*i] = a.v[2*i]*b.v[2*i] + c.v[2*i];
out.v[2*i+1] = a.v[2*i]*b.v[2*i+1] + c.v[2*i+1];
}
return out;
}
};
struct MultComplex{
// Complex
template <typename T>
inline vec<T> operator()(vec<T> a, vec<T> b){
vec<T> out;
VECTOR_FOR(i, W<T>::c, 1)
{
cmul(a.v, b.v, out.v, 2*i);
}
return out;
}
};
#undef cmul
struct Div{
// Real
template <typename T>
inline vec<T> operator()(vec<T> a, vec<T> b){
vec<T> out;
VECTOR_FOR(i, W<T>::r, 1)
{
out.v[i] = a.v[i]/b.v[i];
}
return out;
}
};
#define conj(a, b, i)\
b[i] = a[i];\
b[i+1] = -a[i+1];
struct Conj{
// Complex
template <typename T>
inline vec<T> operator()(vec<T> a){
vec<T> out;
VECTOR_FOR(i, W<T>::c, 1)
{
conj(a.v, out.v, 2*i);
}
return out;
}
};
#undef conj
#define timesmi(a, b, i)\
b[i] = a[i+1];\
b[i+1] = -a[i];
struct TimesMinusI{
// Complex
template <typename T>
inline vec<T> operator()(vec<T> a, vec<T> b){
vec<T> out;
VECTOR_FOR(i, W<T>::c, 1)
{
timesmi(a.v, out.v, 2*i);
}
return out;
}
};
#undef timesmi
#define timesi(a, b, i)\
b[i] = -a[i+1];\
b[i+1] = a[i];
struct TimesI{
// Complex
template <typename T>
inline vec<T> operator()(vec<T> a, vec<T> b){
vec<T> out;
VECTOR_FOR(i, W<T>::c, 1)
{
timesi(a.v, out.v, 2*i);
}
return out;
}
};
#undef timesi
struct PrecisionChange {
static inline vech StoH (const vecf &a,const vecf &b) {
vech ret;
#ifdef USE_FP16
vech *ha = (vech *)&a;
vech *hb = (vech *)&b;
const int nf = W<float>::r;
// VECTOR_FOR(i, nf,1){ ret.v[i] = ( (uint16_t *) &a.v[i])[1] ; }
// VECTOR_FOR(i, nf,1){ ret.v[i+nf] = ( (uint16_t *) &b.v[i])[1] ; }
VECTOR_FOR(i, nf,1){ ret.v[i] = ha->v[2*i+1]; }
VECTOR_FOR(i, nf,1){ ret.v[i+nf] = hb->v[2*i+1]; }
#else
assert(0);
#endif
return ret;
}
static inline void HtoS (vech h,vecf &sa,vecf &sb) {
#ifdef USE_FP16
const int nf = W<float>::r;
const int nh = W<uint16_t>::r;
vech *ha = (vech *)&sa;
vech *hb = (vech *)&sb;
VECTOR_FOR(i, nf, 1){ sb.v[i]= sa.v[i] = 0; }
// VECTOR_FOR(i, nf, 1){ ( (uint16_t *) (&sa.v[i]))[1] = h.v[i];}
// VECTOR_FOR(i, nf, 1){ ( (uint16_t *) (&sb.v[i]))[1] = h.v[i+nf];}
VECTOR_FOR(i, nf, 1){ ha->v[2*i+1]=h.v[i]; }
VECTOR_FOR(i, nf, 1){ hb->v[2*i+1]=h.v[i+nf]; }
#else
assert(0);
#endif
}
static inline vecf DtoS (vecd a,vecd b) {
const int nd = W<double>::r;
const int nf = W<float>::r;
vecf ret;
VECTOR_FOR(i, nd,1){ ret.v[i] = a.v[i] ; }
VECTOR_FOR(i, nd,1){ ret.v[i+nd] = b.v[i] ; }
return ret;
}
static inline void StoD (vecf s,vecd &a,vecd &b) {
const int nd = W<double>::r;
VECTOR_FOR(i, nd,1){ a.v[i] = s.v[i] ; }
VECTOR_FOR(i, nd,1){ b.v[i] = s.v[i+nd] ; }
}
static inline vech DtoH (vecd a,vecd b,vecd c,vecd d) {
vecf sa,sb;
sa = DtoS(a,b);
sb = DtoS(c,d);
return StoH(sa,sb);
}
static inline void HtoD (vech h,vecd &a,vecd &b,vecd &c,vecd &d) {
vecf sa,sb;
HtoS(h,sa,sb);
StoD(sa,a,b);
StoD(sb,c,d);
}
};
//////////////////////////////////////////////
// Exchange support
struct Exchange{
template <typename T,int n>
static inline void ExchangeN(vec<T> &out1,vec<T> &out2,vec<T> &in1,vec<T> &in2){
const int w = W<T>::r;
unsigned int mask = w >> (n + 1);
// std::cout << " Exchange "<<n<<" nsimd "<<w<<" mask 0x" <<std::hex<<mask<<std::dec<<std::endl;
VECTOR_FOR(i, w, 1) {
int j1 = i&(~mask);
if ( (i&mask) == 0 ) { out1.v[i]=in1.v[j1];}
else { out1.v[i]=in2.v[j1];}
int j2 = i|mask;
if ( (i&mask) == 0 ) { out2.v[i]=in1.v[j2];}
else { out2.v[i]=in2.v[j2];}
}
}
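// Illustrative example (not in the original source), for w = 8 and n = 0 so that
// mask = 4, writing in1 = {a0..a7} and in2 = {b0..b7}:
//   out1 = {a0 a1 a2 a3 b0 b1 b2 b3}
//   out2 = {a4 a5 a6 a7 b4 b5 b6 b7}
// i.e. the lower halves are gathered into out1 and the upper halves into out2;
// applying the same exchange to (out1,out2) recovers (in1,in2).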
template <typename T>
static inline void Exchange0(vec<T> &out1,vec<T> &out2,vec<T> &in1,vec<T> &in2){
ExchangeN<T,0>(out1,out2,in1,in2);
};
template <typename T>
static inline void Exchange1(vec<T> &out1,vec<T> &out2,vec<T> &in1,vec<T> &in2){
ExchangeN<T,1>(out1,out2,in1,in2);
};
template <typename T>
static inline void Exchange2(vec<T> &out1,vec<T> &out2,vec<T> &in1,vec<T> &in2){
ExchangeN<T,2>(out1,out2,in1,in2);
};
template <typename T>
static inline void Exchange3(vec<T> &out1,vec<T> &out2,vec<T> &in1,vec<T> &in2){
ExchangeN<T,3>(out1,out2,in1,in2);
};
};
//////////////////////////////////////////////
// Some Template specialization
#define perm(a, b, n, w)\
unsigned int _mask = w >> (n + 1);\
VECTOR_FOR(i, w, 1)\
{\
b[i] = a[i^_mask];\
}
#define DECL_PERMUTE_N(n)\
template <typename T>\
static inline vec<T> Permute##n(vec<T> in) {\
vec<T> out;\
perm(in.v, out.v, n, W<T>::r);\
return out;\
}
struct Permute{
DECL_PERMUTE_N(0);
DECL_PERMUTE_N(1);
DECL_PERMUTE_N(2);
DECL_PERMUTE_N(3);
};
#undef perm
#undef DECL_PERMUTE_N
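// Illustrative example (not in the original source), for w = 8:
//   Permute0({a0..a7}) = {a4 a5 a6 a7 a0 a1 a2 a3}   (mask = 4, swap halves)
//   Permute1({a0..a7}) = {a2 a3 a0 a1 a6 a7 a4 a5}   (mask = 2, swap quarters)
//   Permute2({a0..a7}) = {a1 a0 a3 a2 a5 a4 a7 a6}   (mask = 1, swap neighbours)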
#define rot(a, b, n, w)\
VECTOR_FOR(i, w, 1)\
{\
b[i] = a[(i + n)%w];\
}
struct Rotate{
template <int n, typename T> static inline vec<T> tRotate(vec<T> in){
return rotate(in, n);
}
template <typename T>
static inline vec<T> rotate(vec<T> in, int n){
vec<T> out;
rot(in.v, out.v, n, W<T>::r);
return out;
}
};
#undef rot
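// Illustrative example (not in the original source), for w = 4:
//   rotate({a0 a1 a2 a3}, 1) = {a1 a2 a3 a0}
// i.e. a left rotation by n elements, analogous to the alignr-based tRotate
// of the hardware-specific implementations.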
#define acc(v, a, off, step, n)\
for (unsigned int i = off; i < n; i += step)\
{\
a += v[i];\
}
template <typename Out_type, typename In_type>
struct Reduce{
//Need templated class to overload output type
//General form must generate error if compiled
inline Out_type operator()(In_type in){
printf("Error, using wrong Reduce function\n");
exit(1);
return 0;
}
};
//Complex float Reduce
template <>
inline Grid::ComplexF Reduce<Grid::ComplexF, vecf>::operator()(vecf in){
float a = 0.f, b = 0.f;
acc(in.v, a, 0, 2, W<float>::r);
acc(in.v, b, 1, 2, W<float>::r);
return Grid::ComplexF(a, b);
}
//Real float Reduce
template<>
inline Grid::RealF Reduce<Grid::RealF, vecf>::operator()(vecf in){
float a = 0.;
acc(in.v, a, 0, 1, W<float>::r);
return a;
}
//Complex double Reduce
template<>
inline Grid::ComplexD Reduce<Grid::ComplexD, vecd>::operator()(vecd in){
double a = 0., b = 0.;
acc(in.v, a, 0, 2, W<double>::r);
acc(in.v, b, 1, 2, W<double>::r);
return Grid::ComplexD(a, b);
}
//Real double Reduce
template<>
inline Grid::RealD Reduce<Grid::RealD, vecd>::operator()(vecd in){
double a = 0.f;
acc(in.v, a, 0, 1, W<double>::r);
return a;
}
//Integer Reduce
template<>
inline Integer Reduce<Integer, veci>::operator()(veci in){
Integer a = 0;
acc(in.v, a, 0, 1, W<Integer>::r);
return a;
}
#undef acc // EIGEN compatibility
}
//////////////////////////////////////////////////////////////////////////////////////
// Here assign types
typedef Optimization::vech SIMD_Htype; // Reduced precision type
typedef Optimization::vecf SIMD_Ftype; // Single precision type
typedef Optimization::vecd SIMD_Dtype; // Double precision type
typedef Optimization::veci SIMD_Itype; // Integer type
// prefetch utilities
inline void v_prefetch0(int size, const char *ptr){};
inline void prefetch_HINT_T0(const char *ptr){};
// Function name aliases
typedef Optimization::Vsplat VsplatSIMD;
typedef Optimization::Vstore VstoreSIMD;
typedef Optimization::Vset VsetSIMD;
typedef Optimization::Vstream VstreamSIMD;
template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
// Arithmetic operations
typedef Optimization::Sum SumSIMD;
typedef Optimization::Sub SubSIMD;
typedef Optimization::Div DivSIMD;
typedef Optimization::Mult MultSIMD;
typedef Optimization::MultComplex MultComplexSIMD;
typedef Optimization::MultRealPart MultRealPartSIMD;
typedef Optimization::MaddRealPart MaddRealPartSIMD;
typedef Optimization::Conj ConjSIMD;
typedef Optimization::TimesMinusI TimesMinusISIMD;
typedef Optimization::TimesI TimesISIMD;
}

85
Grid/simd/Grid_generic_types.h Normal file

@ -0,0 +1,85 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/simd/Grid_generic_types.h
Copyright (C) 2017
Author: Antonin Portelli <antonin.portelli@me.com>
Andrew Lawson <andrew.lawson1991@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
static_assert(GEN_SIMD_WIDTH % 16u == 0, "SIMD vector size is not an integer multiple of 16 bytes");
//#define VECTOR_LOOPS
// playing with compiler pragmas
#ifdef VECTOR_LOOPS
#ifdef __clang__
#define VECTOR_FOR(i, w, inc)\
_Pragma("clang loop unroll(full) vectorize(enable) interleave(enable) vectorize_width(w)")\
for (unsigned int i = 0; i < w; i += inc)
#elif defined __INTEL_COMPILER
#define VECTOR_FOR(i, w, inc)\
_Pragma("simd vectorlength(w*8)")\
for (unsigned int i = 0; i < w; i += inc)
#else
#define VECTOR_FOR(i, w, inc)\
for (unsigned int i = 0; i < w; i += inc)
#endif
#else
#define VECTOR_FOR(i, w, inc)\
for (unsigned int i = 0; i < w; i += inc)
#endif
namespace Grid {
namespace Optimization {
// type traits giving the number of elements for each vector type
template <typename T> struct W;
template <> struct W<double> {
constexpr static unsigned int c = GEN_SIMD_WIDTH/16u;
constexpr static unsigned int r = GEN_SIMD_WIDTH/8u;
};
template <> struct W<float> {
constexpr static unsigned int c = GEN_SIMD_WIDTH/8u;
constexpr static unsigned int r = GEN_SIMD_WIDTH/4u;
};
template <> struct W<Integer> {
constexpr static unsigned int r = GEN_SIMD_WIDTH/4u;
};
template <> struct W<uint16_t> {
constexpr static unsigned int c = GEN_SIMD_WIDTH/4u;
constexpr static unsigned int r = GEN_SIMD_WIDTH/2u;
};
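// Illustrative example (not in the original source): with GEN_SIMD_WIDTH = 32u
// (256-bit generic vectors)
//   W<double>::r   = 4 reals,  W<double>::c = 2 complex
//   W<float>::r    = 8 reals,  W<float>::c  = 4 complex
//   W<Integer>::r  = 8 integers
//   W<uint16_t>::r = 16 half-precision words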
// SIMD vector types
template <typename T>
struct vec {
alignas(GEN_SIMD_WIDTH) T v[W<T>::r];
};
typedef vec<float> vecf;
typedef vec<double> vecd;
typedef vec<uint16_t> vech; // half precision comms
typedef vec<Integer> veci;
}}

448
Grid/simd/Grid_imci.h Normal file

@ -0,0 +1,448 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/simd/Grid_imci.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <immintrin.h>
#include <zmmintrin.h>
namespace Grid{
namespace Optimization {
struct Vsplat{
//Complex float
inline __m512 operator()(float a, float b){
return _mm512_set_ps(b,a,b,a,b,a,b,a,b,a,b,a,b,a,b,a);
}
// Real float
inline __m512 operator()(float a){
return _mm512_set1_ps(a);
}
//Complex double
inline __m512d operator()(double a, double b){
return _mm512_set_pd(b,a,b,a,b,a,b,a);
}
//Real double
inline __m512d operator()(double a){
return _mm512_set1_pd(a);
}
//Integer
inline __m512i operator()(Integer a){
return _mm512_set1_epi32(a);
}
};
struct Vstore{
//Float
inline void operator()(__m512 a, float* F){
_mm512_store_ps(F,a);
}
//Double
inline void operator()(__m512d a, double* D){
_mm512_store_pd(D,a);
}
//Integer
inline void operator()(__m512i a, Integer* I){
_mm512_store_si512((__m512i *)I,a);
}
};
struct Vstream{
//Float
inline void operator()(float * a, __m512 b){
_mm512_storenrngo_ps(a,b);
}
//Double
inline void operator()(double * a, __m512d b){
_mm512_storenrngo_pd(a,b);
}
};
struct Vset{
// Complex float
inline __m512 operator()(Grid::ComplexF *a){
return _mm512_set_ps(a[7].imag(),a[7].real(),a[6].imag(),a[6].real(),
a[5].imag(),a[5].real(),a[4].imag(),a[4].real(),
a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),
a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
}
// Complex double
inline __m512d operator()(Grid::ComplexD *a){
return _mm512_set_pd(a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),
a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
}
// Real float
inline __m512 operator()(float *a){
return _mm512_set_ps( a[15],a[14],a[13],a[12],a[11],a[10],a[9],a[8],
a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
}
// Real double
inline __m512d operator()(double *a){
return _mm512_set_pd(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
}
// Integer
inline __m512i operator()(Integer *a){
return _mm512_set_epi32( a[15],a[14],a[13],a[12],a[11],a[10],a[9],a[8],
a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
}
};
template <typename Out_type, typename In_type>
struct Reduce{
//Need templated class to overload output type
//General form must generate error if compiled
inline Out_type operator()(In_type in){
printf("Error, using wrong Reduce function\n");
exit(1);
return 0;
}
};
/////////////////////////////////////////////////////
// Arithmetic operations
/////////////////////////////////////////////////////
struct Sum{
//Complex/Real float
inline __m512 operator()(__m512 a, __m512 b){
return _mm512_add_ps(a,b);
}
//Complex/Real double
inline __m512d operator()(__m512d a, __m512d b){
return _mm512_add_pd(a,b);
}
//Integer
inline __m512i operator()(__m512i a, __m512i b){
return _mm512_add_epi32(a,b);
}
};
struct Sub{
//Complex/Real float
inline __m512 operator()(__m512 a, __m512 b){
return _mm512_sub_ps(a,b);
}
//Complex/Real double
inline __m512d operator()(__m512d a, __m512d b){
return _mm512_sub_pd(a,b);
}
//Integer
inline __m512i operator()(__m512i a, __m512i b){
return _mm512_sub_epi32(a,b);
}
};
struct MultComplex{
// Complex float
inline __m512 operator()(__m512 a, __m512 b){
__m512 vzero,ymm0,ymm1,real, imag;
vzero = _mm512_setzero_ps();
ymm0 = _mm512_swizzle_ps(a, _MM_SWIZ_REG_CDAB); //
real = (__m512)_mm512_mask_or_epi32((__m512i)a, 0xAAAA,(__m512i)vzero,(__m512i)ymm0);
imag = _mm512_mask_sub_ps(a, 0x5555,vzero, ymm0);
ymm1 = _mm512_mul_ps(real, b);
ymm0 = _mm512_swizzle_ps(b, _MM_SWIZ_REG_CDAB); // OK
return _mm512_fmadd_ps(ymm0,imag,ymm1);
}
// Complex double
inline __m512d operator()(__m512d a, __m512d b){
/* This is from
* Automatic SIMD Vectorization of Fast Fourier Transforms for the Larrabee and AVX Instruction Sets
* @inproceedings{McFarlin:2011:ASV:1995896.1995938,
* author = {McFarlin, Daniel S. and Arbatov, Volodymyr and Franchetti, Franz and P\"{u}schel, Markus},
* title = {Automatic SIMD Vectorization of Fast Fourier Transforms for the Larrabee and AVX Instruction Sets},
* booktitle = {Proceedings of the International Conference on Supercomputing},
* series = {ICS '11},
* year = {2011},
* isbn = {978-1-4503-0102-2},
* location = {Tucson, Arizona, USA},
* pages = {265--274},
* numpages = {10},
* url = {http://doi.acm.org/10.1145/1995896.1995938},
* doi = {10.1145/1995896.1995938},
* acmid = {1995938},
* publisher = {ACM},
* address = {New York, NY, USA},
* keywords = {autovectorization, fourier transform, program generation, simd, super-optimization},
* }
*/
__m512d vzero,ymm0,ymm1,real,imag;
vzero =_mm512_setzero_pd();
ymm0 = _mm512_swizzle_pd(a, _MM_SWIZ_REG_CDAB); //
real =(__m512d)_mm512_mask_or_epi64((__m512i)a, 0xAA,(__m512i)vzero,(__m512i) ymm0);
imag = _mm512_mask_sub_pd(a, 0x55,vzero, ymm0);
ymm1 = _mm512_mul_pd(real, b);
ymm0 = _mm512_swizzle_pd(b, _MM_SWIZ_REG_CDAB); // OK
return _mm512_fmadd_pd(ymm0,imag,ymm1);
}
};
struct Mult{
inline void mac(__m512 &a, __m512 b, __m512 c){
a= _mm512_fmadd_ps( b, c, a);
}
inline void mac(__m512d &a, __m512d b, __m512d c){
a= _mm512_fmadd_pd( b, c, a);
}
// Real float
inline __m512 operator()(__m512 a, __m512 b){
return _mm512_mul_ps(a,b);
}
// Real double
inline __m512d operator()(__m512d a, __m512d b){
return _mm512_mul_pd(a,b);
}
// Integer
inline __m512i operator()(__m512i a, __m512i b){
return _mm512_mullo_epi32(a,b);
}
};
struct Div{
// Real float
inline __m512 operator()(__m512 a, __m512 b){
return _mm512_div_ps(a,b);
}
// Real double
inline __m512d operator()(__m512d a, __m512d b){
return _mm512_div_pd(a,b);
}
};
struct Conj{
// Complex single
inline __m512 operator()(__m512 in){
return _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // Zero out 0+real 0-imag
}
// Complex double
inline __m512d operator()(__m512d in){
return _mm512_mask_sub_pd(in, 0xaa,_mm512_setzero_pd(), in);
}
// do not define for integer input
};
struct TimesMinusI{
//Complex single
inline __m512 operator()(__m512 in, __m512 ret){
__m512 tmp = _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // real -imag
return _mm512_swizzle_ps(tmp, _MM_SWIZ_REG_CDAB);// OK
}
//Complex double
inline __m512d operator()(__m512d in, __m512d ret){
__m512d tmp = _mm512_mask_sub_pd(in,0xaa,_mm512_setzero_pd(),in); // real -imag
return _mm512_swizzle_pd(tmp, _MM_SWIZ_REG_CDAB);// OK
}
};
struct TimesI{
//Complex single
inline __m512 operator()(__m512 in, __m512 ret){
__m512 tmp = _mm512_swizzle_ps(in, _MM_SWIZ_REG_CDAB);// OK
return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp); // real -imag
}
//Complex double
inline __m512d operator()(__m512d in, __m512d ret){
__m512d tmp = _mm512_swizzle_pd(in, _MM_SWIZ_REG_CDAB);// OK
return _mm512_mask_sub_pd(tmp,0xaa,_mm512_setzero_pd(),tmp); // real -imag
}
};
struct Permute{
static inline __m512 Permute0(__m512 in){
return _mm512_permute4f128_ps(in,(_MM_PERM_ENUM)_MM_SELECT_FOUR_FOUR(1,0,3,2));
};
static inline __m512 Permute1(__m512 in){
return _mm512_permute4f128_ps(in,(_MM_PERM_ENUM)_MM_SELECT_FOUR_FOUR(2,3,0,1));
};
static inline __m512 Permute2(__m512 in){
return _mm512_swizzle_ps(in,_MM_SWIZ_REG_BADC);
};
static inline __m512 Permute3(__m512 in){
return _mm512_swizzle_ps(in,_MM_SWIZ_REG_CDAB);
};
static inline __m512d Permute0(__m512d in){// Hack no intrinsic for 256 swaps of __m512d
return (__m512d)_mm512_permute4f128_ps((__m512)in,(_MM_PERM_ENUM)_MM_SELECT_FOUR_FOUR(1,0,3,2));
};
static inline __m512d Permute1(__m512d in){
return _mm512_swizzle_pd(in,_MM_SWIZ_REG_BADC);
};
static inline __m512d Permute2(__m512d in){
return _mm512_swizzle_pd(in,_MM_SWIZ_REG_CDAB);
};
static inline __m512d Permute3(__m512d in){
return in;
};
};
struct Rotate{
static inline __m512 rotate(__m512 in,int n){
switch(n){
case 0: return tRotate<0>(in);break;
case 1: return tRotate<1>(in);break;
case 2: return tRotate<2>(in);break;
case 3: return tRotate<3>(in);break;
case 4: return tRotate<4>(in);break;
case 5: return tRotate<5>(in);break;
case 6: return tRotate<6>(in);break;
case 7: return tRotate<7>(in);break;
case 8 : return tRotate<8>(in);break;
case 9 : return tRotate<9>(in);break;
case 10: return tRotate<10>(in);break;
case 11: return tRotate<11>(in);break;
case 12: return tRotate<12>(in);break;
case 13: return tRotate<13>(in);break;
case 14: return tRotate<14>(in);break;
case 15: return tRotate<15>(in);break;
default: assert(0);
}
}
static inline __m512d rotate(__m512d in,int n){
switch(n){
case 0: return tRotate<0>(in);break;
case 1: return tRotate<1>(in);break;
case 2: return tRotate<2>(in);break;
case 3: return tRotate<3>(in);break;
case 4: return tRotate<4>(in);break;
case 5: return tRotate<5>(in);break;
case 6: return tRotate<6>(in);break;
case 7: return tRotate<7>(in);break;
default: assert(0);
}
}
template<int n> static inline __m512 tRotate(__m512 in){
return (__m512)_mm512_alignr_epi32((__m512i)in,(__m512i)in,n);
};
template<int n> static inline __m512d tRotate(__m512d in){
return (__m512d)_mm512_alignr_epi32((__m512i)in,(__m512i)in,2*n);
};
};
//////////////////////////////////////////////
// Some Template specialization
//Complex float Reduce
template<>
inline Grid::ComplexF Reduce<Grid::ComplexF, __m512>::operator()(__m512 in){
return Grid::ComplexF(_mm512_mask_reduce_add_ps(0x5555, in),_mm512_mask_reduce_add_ps(0xAAAA, in));
}
//Real float Reduce
template<>
inline Grid::RealF Reduce<Grid::RealF, __m512>::operator()(__m512 in){
return _mm512_reduce_add_ps(in);
}
//Complex double Reduce
template<>
inline Grid::ComplexD Reduce<Grid::ComplexD, __m512d>::operator()(__m512d in){
return Grid::ComplexD(_mm512_mask_reduce_add_pd(0x55, in),_mm512_mask_reduce_add_pd(0xAA, in));
}
//Real double Reduce
template<>
inline Grid::RealD Reduce<Grid::RealD, __m512d>::operator()(__m512d in){
return _mm512_reduce_add_pd(in);
}
//Integer Reduce
template<>
inline Integer Reduce<Integer, __m512i>::operator()(__m512i in){
return _mm512_reduce_add_epi32(in);
}
}
//////////////////////////////////////////////////////////////////////////////////////
// Here assign types
typedef __m512 SIMD_Ftype; // Single precision type
typedef __m512d SIMD_Dtype; // Double precision type
typedef __m512i SIMD_Itype; // Integer type
// prefetch
inline void v_prefetch0(int size, const char *ptr){
for(int i=0;i<size;i+=64){ // Define L1 linesize above
_mm_prefetch(ptr+i+4096,_MM_HINT_T1);
_mm_prefetch(ptr+i+512,_MM_HINT_T0);
}
}
inline void prefetch_HINT_T0(const char *ptr){
_mm_prefetch(ptr,_MM_HINT_T0);
}
// Function name aliases
typedef Optimization::Vsplat VsplatSIMD;
typedef Optimization::Vstore VstoreSIMD;
typedef Optimization::Vset VsetSIMD;
typedef Optimization::Vstream VstreamSIMD;
template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
// Arithmetic operations
typedef Optimization::Sum SumSIMD;
typedef Optimization::Sub SubSIMD;
typedef Optimization::Div DivSIMD;
typedef Optimization::Mult MultSIMD;
typedef Optimization::MultComplex MultComplexSIMD;
typedef Optimization::Conj ConjSIMD;
typedef Optimization::TimesMinusI TimesMinusISIMD;
typedef Optimization::TimesI TimesISIMD;
}

599
Grid/simd/Grid_neon.h Normal file

@ -0,0 +1,599 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/simd/Grid_neon.h
Copyright (C) 2015
Author: Nils Meyer <nils.meyer@ur.de>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: neo <cossu@post.kek.jp>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
/*
ARMv8 NEON intrinsics layer by
Nils Meyer <nils.meyer@ur.de>,
University of Regensburg, Germany
SFB/TRR55
*/
#ifndef GEN_SIMD_WIDTH
#define GEN_SIMD_WIDTH 16u
#endif
#include "Grid_generic_types.h"
#include <arm_neon.h>
namespace Grid {
namespace Optimization {
template<class vtype>
union uconv {
float32x4_t f;
vtype v;
};
union u128f {
float32x4_t v;
float f[4];
};
union u128d {
float64x2_t v;
double f[2];
};
// half precision
union u128h {
float16x8_t v;
uint16_t f[8];
};
struct Vsplat{
//Complex float
inline float32x4_t operator()(float a, float b){
float tmp[4]={a,b,a,b};
return vld1q_f32(tmp);
}
// Real float
inline float32x4_t operator()(float a){
return vdupq_n_f32(a);
}
//Complex double
inline float64x2_t operator()(double a, double b){
double tmp[2]={a,b};
return vld1q_f64(tmp);
}
//Real double
inline float64x2_t operator()(double a){
return vdupq_n_f64(a);
}
//Integer
inline uint32x4_t operator()(Integer a){
return vdupq_n_u32(a);
}
};
struct Vstore{
//Float
inline void operator()(float32x4_t a, float* F){
vst1q_f32(F, a);
}
//Double
inline void operator()(float64x2_t a, double* D){
vst1q_f64(D, a);
}
//Integer
inline void operator()(uint32x4_t a, Integer* I){
vst1q_u32(I, a);
}
};
struct Vstream{ // N:equivalents to _mm_stream_p* in NEON?
//Float // N:generic
inline void operator()(float * a, float32x4_t b){
memcpy(a,&b,4*sizeof(float));
}
//Double // N:generic
inline void operator()(double * a, float64x2_t b){
memcpy(a,&b,2*sizeof(double));
}
};
// Nils: Vset untested; not used currently in Grid at all;
// git commit 4a8c4ccfba1d05159348d21a9698028ea847e77b
struct Vset{
// Complex float
inline float32x4_t operator()(Grid::ComplexF *a){
float tmp[4]={a[1].imag(),a[1].real(),a[0].imag(),a[0].real()};
return vld1q_f32(tmp);
}
// Complex double
inline float64x2_t operator()(Grid::ComplexD *a){
double tmp[2]={a[0].imag(),a[0].real()};
return vld1q_f64(tmp);
}
// Real float
inline float32x4_t operator()(float *a){
float tmp[4]={a[3],a[2],a[1],a[0]};
return vld1q_f32(tmp);
}
// Real double
inline float64x2_t operator()(double *a){
double tmp[2]={a[1],a[0]};
return vld1q_f64(tmp);
}
// Integer
inline uint32x4_t operator()(Integer *a){
return vld1q_dup_u32(a);
}
};
template <typename Out_type, typename In_type>
struct Reduce{
//Need templated class to overload output type
//General form must generate error if compiled
inline Out_type operator()(In_type in){
printf("Error, using wrong Reduce function\n");
exit(1);
return 0;
}
};
/////////////////////////////////////////////////////
// Arithmetic operations
/////////////////////////////////////////////////////
struct Sum{
//Complex/Real float
inline float32x4_t operator()(float32x4_t a, float32x4_t b){
return vaddq_f32(a,b);
}
//Complex/Real double
inline float64x2_t operator()(float64x2_t a, float64x2_t b){
return vaddq_f64(a,b);
}
//Integer
inline uint32x4_t operator()(uint32x4_t a, uint32x4_t b){
return vaddq_u32(a,b);
}
};
struct Sub{
//Complex/Real float
inline float32x4_t operator()(float32x4_t a, float32x4_t b){
return vsubq_f32(a,b);
}
//Complex/Real double
inline float64x2_t operator()(float64x2_t a, float64x2_t b){
return vsubq_f64(a,b);
}
//Integer
inline uint32x4_t operator()(uint32x4_t a, uint32x4_t b){
return vsubq_u32(a,b);
}
};
struct MultRealPart{
inline float32x4_t operator()(float32x4_t a, float32x4_t b){
float32x4_t re = vtrn1q_f32(a, a);
return vmulq_f32(re, b);
}
inline float64x2_t operator()(float64x2_t a, float64x2_t b){
float64x2_t re = vzip1q_f64(a, a);
return vmulq_f64(re, b);
}
};
struct MaddRealPart{
inline float32x4_t operator()(float32x4_t a, float32x4_t b, float32x4_t c){
float32x4_t re = vtrn1q_f32(a, a);
return vfmaq_f32(c, re, b);
}
inline float64x2_t operator()(float64x2_t a, float64x2_t b, float64x2_t c){
float64x2_t re = vzip1q_f64(a, a);
return vfmaq_f64(c, re, b);
}
};
struct Div{
// Real float
inline float32x4_t operator()(float32x4_t a, float32x4_t b){
return vdivq_f32(a, b);
}
// Real double
inline float64x2_t operator()(float64x2_t a, float64x2_t b){
return vdivq_f64(a, b);
}
};
struct MultComplex{
// Complex float
inline float32x4_t operator()(float32x4_t a, float32x4_t b){
float32x4_t r0, r1, r2, r3, r4;
// a = ar ai Ar Ai
// b = br bi Br Bi
// collect real/imag part, negate bi and Bi
r0 = vtrn1q_f32(b, b); // br br Br Br
r1 = vnegq_f32(b); // -br -bi -Br -Bi
r2 = vtrn2q_f32(b, r1); // bi -bi Bi -Bi
// the fun part
r3 = vmulq_f32(r2, a); // bi*ar -bi*ai ...
r4 = vrev64q_f32(r3); // -bi*ai bi*ar ...
// fma(a,b,c) = a+b*c
return vfmaq_f32(r4, r0, a); // ar*br-ai*bi ai*br+ar*bi ...
// no fma, use mul and add
// float32x4_t r5;
// r5 = vmulq_f32(r0, a);
// return vaddq_f32(r4, r5);
}
// Complex double
inline float64x2_t operator()(float64x2_t a, float64x2_t b){
float64x2_t r0, r1, r2, r3, r4;
// b = br bi
// collect real/imag part, negate bi
r0 = vtrn1q_f64(b, b); // br br
r1 = vnegq_f64(b); // -br -bi
r2 = vtrn2q_f64(b, r1); // bi -bi
// the fun part
r3 = vmulq_f64(r2, a); // bi*ar -bi*ai
r4 = vextq_f64(r3,r3,1); // -bi*ai bi*ar
// fma(a,b,c) = a+b*c
return vfmaq_f64(r4, r0, a); // ar*br-ai*bi ai*br+ar*bi
// no fma, use mul and add
// float64x2_t r5;
// r5 = vmulq_f64(r0, a);
// return vaddq_f64(r4, r5);
}
};
struct Mult{
// Real float
inline float32x4_t mac(float32x4_t a, float32x4_t b, float32x4_t c){
//return vaddq_f32(vmulq_f32(b,c),a);
return vfmaq_f32(a, b, c);
}
inline float64x2_t mac(float64x2_t a, float64x2_t b, float64x2_t c){
//return vaddq_f64(vmulq_f64(b,c),a);
return vfmaq_f64(a, b, c);
}
inline float32x4_t operator()(float32x4_t a, float32x4_t b){
return vmulq_f32(a,b);
}
// Real double
inline float64x2_t operator()(float64x2_t a, float64x2_t b){
return vmulq_f64(a,b);
}
// Integer
inline uint32x4_t operator()(uint32x4_t a, uint32x4_t b){
return vmulq_u32(a,b);
}
};
struct Conj{
// Complex single
inline float32x4_t operator()(float32x4_t in){
// ar ai br bi -> ar -ai br -bi
float32x4_t r0, r1;
r0 = vnegq_f32(in); // -ar -ai -br -bi
r1 = vrev64q_f32(r0); // -ai -ar -bi -br
return vtrn1q_f32(in, r1); // ar -ai br -bi
}
// Complex double
inline float64x2_t operator()(float64x2_t in){
float64x2_t r0, r1;
r0 = vextq_f64(in, in, 1); // ai ar
r1 = vnegq_f64(r0); // -ai -ar
return vextq_f64(r0, r1, 1); // ar -ai
}
// do not define for integer input
};
struct TimesMinusI{
//Complex single
inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
// ar ai br bi -> ai -ar bi -br
float32x4_t r0, r1;
r0 = vnegq_f32(in); // -ar -ai -br -bi
r1 = vrev64q_f32(in); // ai ar bi br
return vtrn1q_f32(r1, r0); // ai -ar bi -br
}
//Complex double
inline float64x2_t operator()(float64x2_t in, float64x2_t ret){
// a ib -> b -ia
float64x2_t tmp;
tmp = vnegq_f64(in);
return vextq_f64(in, tmp, 1);
}
};
struct TimesI{
//Complex single
inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
// ar ai br bi -> -ai ar -bi br
float32x4_t r0, r1;
r0 = vnegq_f32(in); // -ar -ai -br -bi
r1 = vrev64q_f32(r0); // -ai -ar -bi -br
return vtrn1q_f32(r1, in); // -ai ar -bi br
}
//Complex double
inline float64x2_t operator()(float64x2_t in, float64x2_t ret){
// a ib -> -b ia
float64x2_t tmp;
tmp = vnegq_f64(in);
return vextq_f64(tmp, in, 1);
}
};
struct Permute{
static inline float32x4_t Permute0(float32x4_t in){ // N:ok
// AB CD -> CD AB
return vextq_f32(in, in, 2);
};
static inline float32x4_t Permute1(float32x4_t in){ // N:ok
// AB CD -> BA DC
return vrev64q_f32(in);
};
static inline float32x4_t Permute2(float32x4_t in){ // N:not used by Boyle
return in;
};
static inline float32x4_t Permute3(float32x4_t in){ // N:not used by Boyle
return in;
};
static inline float64x2_t Permute0(float64x2_t in){ // N:ok
// AB -> BA
return vextq_f64(in, in, 1);
};
static inline float64x2_t Permute1(float64x2_t in){ // N:not used by Boyle
return in;
};
static inline float64x2_t Permute2(float64x2_t in){ // N:not used by Boyle
return in;
};
static inline float64x2_t Permute3(float64x2_t in){ // N:not used by Boyle
return in;
};
};
struct Rotate{
static inline float32x4_t rotate(float32x4_t in,int n){ // N:ok
switch(n){
case 0: // AB CD -> AB CD
return tRotate<0>(in);
break;
case 1: // AB CD -> BC DA
return tRotate<1>(in);
break;
case 2: // AB CD -> CD AB
return tRotate<2>(in);
break;
case 3: // AB CD -> DA BC
return tRotate<3>(in);
break;
default: assert(0);
}
}
static inline float64x2_t rotate(float64x2_t in,int n){ // N:ok
switch(n){
case 0: // AB -> AB
return tRotate<0>(in);
break;
case 1: // AB -> BA
return tRotate<1>(in);
break;
default: assert(0);
}
}
template<int n> static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n%4); };
template<int n> static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n%2); };
};
struct PrecisionChange {
static inline float16x8_t StoH (const float32x4_t &a,const float32x4_t &b) {
float16x4_t h = vcvt_f16_f32(a);
return vcvt_high_f16_f32(h, b);
}
static inline void HtoS (float16x8_t h,float32x4_t &sa,float32x4_t &sb) {
sb = vcvt_high_f32_f16(h);
// the lower half of h must first be rotated into the upper half before vcvt_high_f32_f16 can convert it
// vextq_f16 not supported by clang 3.8 / 4.0 / arm clang
// float16x8_t h1 = vextq_f16(h, h, 4); // correct, but not supported by clang
// workaround for clang
uint32x4_t h1u = reinterpret_cast<uint32x4_t>(h);
float16x8_t h1 = reinterpret_cast<float16x8_t>(vextq_u32(h1u, h1u, 2));
sa = vcvt_high_f32_f16(h1);
}
static inline float32x4_t DtoS (float64x2_t a,float64x2_t b) {
float32x2_t s = vcvt_f32_f64(a);
return vcvt_high_f32_f64(s, b);
}
static inline void StoD (float32x4_t s,float64x2_t &a,float64x2_t &b) {
b = vcvt_high_f64_f32(s);
// there is no direct conversion from lower float32x4_t to float64x2_t
float32x4_t s1 = vextq_f32(s, s, 2);
a = vcvt_high_f64_f32(s1);
}
static inline float16x8_t DtoH (float64x2_t a,float64x2_t b,float64x2_t c,float64x2_t d) {
float32x4_t s1 = DtoS(a, b);
float32x4_t s2 = DtoS(c, d);
return StoH(s1, s2);
}
static inline void HtoD (float16x8_t h,float64x2_t &a,float64x2_t &b,float64x2_t &c,float64x2_t &d) {
float32x4_t s1, s2;
HtoS(h, s1, s2);
StoD(s1, a, b);
StoD(s2, c, d);
}
};
//////////////////////////////////////////////
// Exchange support
struct Exchange{
static inline void Exchange0(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
// in1: ABCD -> out1: ABEF
// in2: EFGH -> out2: CDGH
// z: CDAB
float32x4_t z = vextq_f32(in1, in1, 2);
// out1: ABEF
out1 = vextq_f32(z, in2, 2);
// z: GHEF
z = vextq_f32(in2, in2, 2);
// out2: CDGH
out2 = vextq_f32(in1, z, 2);
};
static inline void Exchange1(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
// in1: ABCD -> out1: AECG
// in2: EFGH -> out2: BFDH
out1 = vtrn1q_f32(in1, in2);
out2 = vtrn2q_f32(in1, in2);
};
static inline void Exchange2(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
assert(0);
return;
};
static inline void Exchange3(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
assert(0);
return;
};
// double precision
static inline void Exchange0(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
// in1: AB -> out1: AC
// in2: CD -> out2: BD
out1 = vzip1q_f64(in1, in2);
out2 = vzip2q_f64(in1, in2);
};
static inline void Exchange1(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
assert(0);
return;
};
static inline void Exchange2(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
assert(0);
return;
};
static inline void Exchange3(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
assert(0);
return;
};
};
//////////////////////////////////////////////
// Some Template specialization
//Complex float Reduce
template<>
inline Grid::ComplexF Reduce<Grid::ComplexF, float32x4_t>::operator()(float32x4_t in){
float32x4_t v1; // two complex
v1 = Optimization::Permute::Permute0(in);
v1 = vaddq_f32(v1,in);
u128f conv; conv.v=v1;
return Grid::ComplexF(conv.f[0],conv.f[1]);
}
//Real float Reduce
template<>
inline Grid::RealF Reduce<Grid::RealF, float32x4_t>::operator()(float32x4_t in){
return vaddvq_f32(in);
}
//Complex double Reduce
template<>
inline Grid::ComplexD Reduce<Grid::ComplexD, float64x2_t>::operator()(float64x2_t in){
u128d conv; conv.v = in;
return Grid::ComplexD(conv.f[0],conv.f[1]);
}
//Real double Reduce
template<>
inline Grid::RealD Reduce<Grid::RealD, float64x2_t>::operator()(float64x2_t in){
return vaddvq_f64(in);
}
//Integer Reduce
template<>
inline Integer Reduce<Integer, uint32x4_t>::operator()(uint32x4_t in){
return vaddvq_u32(in);
}
}
//////////////////////////////////////////////////////////////////////////////////////
// Here assign types
// typedef Optimization::vech SIMD_Htype; // Reduced precision type
typedef float16x8_t SIMD_Htype; // Half precision type
typedef float32x4_t SIMD_Ftype; // Single precision type
typedef float64x2_t SIMD_Dtype; // Double precision type
typedef uint32x4_t SIMD_Itype; // Integer type
inline void v_prefetch0(int size, const char *ptr){}; // prefetch utilities
inline void prefetch_HINT_T0(const char *ptr){};
// Function name aliases
typedef Optimization::Vsplat VsplatSIMD;
typedef Optimization::Vstore VstoreSIMD;
typedef Optimization::Vset VsetSIMD;
typedef Optimization::Vstream VstreamSIMD;
template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
// Arithmetic operations
typedef Optimization::Sum SumSIMD;
typedef Optimization::Sub SubSIMD;
typedef Optimization::Div DivSIMD;
typedef Optimization::Mult MultSIMD;
typedef Optimization::MultComplex MultComplexSIMD;
typedef Optimization::MultRealPart MultRealPartSIMD;
typedef Optimization::MaddRealPart MaddRealPartSIMD;
typedef Optimization::Conj ConjSIMD;
typedef Optimization::TimesMinusI TimesMinusISIMD;
typedef Optimization::TimesI TimesISIMD;
}

619
Grid/simd/Grid_qpx.h Normal file

@ -0,0 +1,619 @@
/*******************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/simd/Grid_qpx.h
Copyright (C) 2016
Copyright (C) 2017
Author: Antonin Portelli <antonin.portelli@me.com>
Andrew Lawson <andrew.lawson1991@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
******************************************************************************/
#ifndef GEN_SIMD_WIDTH
#define GEN_SIMD_WIDTH 32u
#endif
#include "Grid_generic_types.h" // Definitions for simulated integer SIMD.
namespace Grid {
#ifdef QPX
#include <spi/include/kernel/location.h>
#include <spi/include/l1p/types.h>
#include <hwi/include/bqc/l1p_mmio.h>
#include <hwi/include/bqc/A2_inlines.h>
#endif
namespace Optimization {
typedef struct
{
float v0,v1,v2,v3;
} vector4float;
inline std::ostream & operator<<(std::ostream& stream, const vector4double a)
{
stream << "{"<<vec_extract(a,0)<<","<<vec_extract(a,1)<<","<<vec_extract(a,2)<<","<<vec_extract(a,3)<<"}";
return stream;
};
inline std::ostream & operator<<(std::ostream& stream, const vector4float a)
{
stream << "{"<< a.v0 <<","<< a.v1 <<","<< a.v2 <<","<< a.v3 <<"}";
return stream;
};
struct Vsplat{
//Complex float
inline vector4float operator()(float a, float b){
return (vector4float){a, b, a, b};
}
// Real float
inline vector4float operator()(float a){
return (vector4float){a, a, a, a};
}
//Complex double
inline vector4double operator()(double a, double b){
return (vector4double){a, b, a, b};
}
//Real double
inline vector4double operator()(double a){
return (vector4double){a, a, a, a};
}
//Integer
inline veci operator()(Integer a){
veci out;
VECTOR_FOR(i, W<Integer>::r, 1)
{
out.v[i] = a;
}
return out;
}
};
struct Vstore{
//Float
inline void operator()(vector4double a, float *f){
vec_st(a, 0, f);
}
inline void operator()(vector4double a, vector4float &f){
vec_st(a, 0, (float *)(&f));
}
inline void operator()(vector4float a, float *f){
f[0] = a.v0;
f[1] = a.v1;
f[2] = a.v2;
f[3] = a.v3;
}
//Double
inline void operator()(vector4double a, double *d){
vec_st(a, 0, d);
}
//Integer
inline void operator()(veci a, Integer *i){
*((veci *)i) = a;
}
};
struct Vstream{
//Float
inline void operator()(float *f, vector4double a){
vec_st(a, 0, f);
}
inline void operator()(vector4float f, vector4double a){
vec_st(a, 0, (float *)(&f));
}
inline void operator()(float *f, vector4float a){
f[0] = a.v0;
f[1] = a.v1;
f[2] = a.v2;
f[3] = a.v3;
}
//Double
inline void operator()(double *d, vector4double a){
vec_st(a, 0, d);
}
};
struct Vset{
// Complex float
inline vector4float operator()(Grid::ComplexF *a){
return (vector4float){a[0].real(), a[0].imag(), a[1].real(), a[1].imag()};
}
// Complex double
inline vector4double operator()(Grid::ComplexD *a){
return vec_ld(0, (double *)a);
}
// Real float
inline vector4float operator()(float *a){
return (vector4float){a[0], a[1], a[2], a[3]};
}
inline vector4double operator()(vector4float a){
return vec_ld(0, (float *)(&a));
}
// Real double
inline vector4double operator()(double *a){
return vec_ld(0, a);
}
// Integer
inline veci operator()(Integer *a){
veci out;
out = *((veci *)a);
return out;
}
};
template <typename Out_type, typename In_type>
struct Reduce{
//Need templated class to overload output type
//General form must generate error if compiled
inline Out_type operator()(In_type in){
printf("Error, using wrong Reduce function\n");
exit(1);
return 0;
}
};
/////////////////////////////////////////////////////
// Arithmetic operations
/////////////////////////////////////////////////////
#define FLOAT_WRAP_3(fn, pref)\
pref vector4float fn(vector4float a, vector4float b, vector4float c) \
{\
vector4double ad, bd, rd, cd; \
vector4float r;\
\
ad = Vset()(a);\
bd = Vset()(b);\
cd = Vset()(c);\
rd = fn(ad, bd, cd); \
Vstore()(rd, r);\
\
return r;\
}
#define FLOAT_WRAP_2(fn, pref)\
pref vector4float fn(vector4float a, vector4float b)\
{\
vector4double ad, bd, rd;\
vector4float r;\
\
ad = Vset()(a);\
bd = Vset()(b);\
rd = fn(ad, bd);\
Vstore()(rd, r);\
\
return r;\
}
#define FLOAT_WRAP_1(fn, pref)\
pref vector4float fn(vector4float a)\
{\
vector4double ad, rd;\
vector4float r;\
\
ad = Vset()(a);\
rd = fn(ad);\
Vstore()(rd, r);\
\
return r;\
}
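// Illustrative sketch (not in the original source): inside struct Sum below,
// FLOAT_WRAP_2(operator(), inline) expands roughly to
//   inline vector4float operator()(vector4float a, vector4float b)
//   {
//     vector4double ad = Vset()(a), bd = Vset()(b);
//     vector4double rd = operator()(ad, bd); // double-precision QPX overload
//     vector4float r; Vstore()(rd, r);
//     return r;
//   }
// i.e. single-precision operations are emulated by promoting to vector4double,
// applying the QPX intrinsic, and storing back to single precision.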
struct Sum{
//Complex/Real double
inline vector4double operator()(vector4double a, vector4double b){
return vec_add(a, b);
}
//Complex/Real float
FLOAT_WRAP_2(operator(), inline)
//Integer
inline veci operator()(veci a, veci b){
veci out;
VECTOR_FOR(i, W<Integer>::r, 1)
{
out.v[i] = a.v[i] + b.v[i];
}
return out;
}
};
struct Sub{
//Complex/Real double
inline vector4double operator()(vector4double a, vector4double b){
return vec_sub(a, b);
}
//Complex/Real float
FLOAT_WRAP_2(operator(), inline)
//Integer
inline veci operator()(veci a, veci b){
veci out;
VECTOR_FOR(i, W<Integer>::r, 1)
{
out.v[i] = a.v[i] - b.v[i];
}
return out;
}
};
struct MultRealPart{
// Complex double
inline vector4double operator()(vector4double a, vector4double b){
// return vec_xmul(b, a);
return vec_xmul(a, b);
}
FLOAT_WRAP_2(operator(), inline)
};
struct MaddRealPart{
// Complex double
inline vector4double operator()(vector4double a, vector4double b,vector4double c){
return vec_xmadd(a, b, c);
}
FLOAT_WRAP_3(operator(), inline)
};
struct MultComplex{
// Complex double
inline vector4double operator()(vector4double a, vector4double b){
return vec_xxnpmadd(a, b, vec_xmul(b, a));
}
// Complex float
FLOAT_WRAP_2(operator(), inline)
};
struct Mult{
// Real double
inline vector4double operator()(vector4double a, vector4double b){
return vec_mul(a, b);
}
// Real float
FLOAT_WRAP_2(operator(), inline)
// Integer
inline veci operator()(veci a, veci b){
veci out;
VECTOR_FOR(i, W<Integer>::r, 1)
{
out.v[i] = a.v[i]*b.v[i];
}
return out;
}
};
struct Div{
// Real double
inline vector4double operator()(vector4double a, vector4double b){
return vec_swdiv(a, b);
}
// Real float
FLOAT_WRAP_2(operator(), inline)
// Integer
inline veci operator()(veci a, veci b){
veci out;
VECTOR_FOR(i, W<Integer>::r, 1)
{
out.v[i] = a.v[i]/b.v[i];
}
return out;
}
};
struct Conj{
// Complex double
inline vector4double operator()(vector4double v){
return vec_mul(v, (vector4double){1., -1., 1., -1.});
}
// Complex float
FLOAT_WRAP_1(operator(), inline)
};
struct TimesMinusI{
//Complex double
inline vector4double operator()(vector4double v, vector4double ret){
return vec_xxcpnmadd(v, (vector4double){1., 1., 1., 1.},
(vector4double){0., 0., 0., 0.});
}
// Complex float
FLOAT_WRAP_2(operator(), inline)
};
struct TimesI{
//Complex double
inline vector4double operator()(vector4double v, vector4double ret){
return vec_xxcpnmadd(v, (vector4double){-1., -1., -1., -1.},
(vector4double){0., 0., 0., 0.});
}
// Complex float
FLOAT_WRAP_2(operator(), inline)
};
#define USE_FP16
struct PrecisionChange {
static inline vech StoH (const vector4float &a, const vector4float &b) {
vech ret;
std::cout << GridLogError << "QPX single to half precision conversion not yet supported." << std::endl;
assert(0);
return ret;
}
static inline void HtoS (vech h, vector4float &sa, vector4float &sb) {
std::cout << GridLogError << "QPX half to single precision conversion not yet supported." << std::endl;
assert(0);
}
static inline vector4float DtoS (vector4double a, vector4double b) {
vector4float ret;
std::cout << GridLogError << "QPX double to single precision conversion not yet supported." << std::endl;
assert(0);
return ret;
}
static inline void StoD (vector4float s, vector4double &a, vector4double &b) {
std::cout << GridLogError << "QPX single to double precision conversion not yet supported." << std::endl;
assert(0);
}
static inline vech DtoH (vector4double a, vector4double b,
vector4double c, vector4double d) {
vech ret;
std::cout << GridLogError << "QPX double to half precision conversion not yet supported." << std::endl;
assert(0);
return ret;
}
static inline void HtoD (vech h, vector4double &a, vector4double &b,
vector4double &c, vector4double &d) {
std::cout << GridLogError << "QPX half to double precision conversion not yet supported." << std::endl;
assert(0);
}
};
//////////////////////////////////////////////
// Exchange support
#define FLOAT_WRAP_EXCHANGE(fn) \
static inline void fn(vector4float &out1, vector4float &out2, \
vector4float in1, vector4float in2) \
{ \
vector4double out1d, out2d, in1d, in2d; \
in1d = Vset()(in1); \
in2d = Vset()(in2); \
fn(out1d, out2d, in1d, in2d); \
Vstore()(out1d, out1); \
Vstore()(out2d, out2); \
}
struct Exchange{
// double precision
static inline void Exchange0(vector4double &out1, vector4double &out2,
vector4double in1, vector4double in2) {
out1 = vec_perm(in1, in2, vec_gpci(0145));
out2 = vec_perm(in1, in2, vec_gpci(02367));
}
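// Illustrative note (not in the original source): vec_gpci takes an octal
// literal whose digits index the concatenation {in1[0..3], in2[0..3]}. Writing
// in1 = {a0..a3}, in2 = {b0..b3}, vec_gpci(0145) above selects elements 0,1,4,5
// giving out1 = {a0,a1,b0,b1}, and vec_gpci(02367) gives out2 = {a2,a3,b2,b3}.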
static inline void Exchange1(vector4double &out1, vector4double &out2,
vector4double in1, vector4double in2) {
out1 = vec_perm(in1, in2, vec_gpci(0426));
out2 = vec_perm(in1, in2, vec_gpci(01537));
}
static inline void Exchange2(vector4double &out1, vector4double &out2,
vector4double in1, vector4double in2) {
assert(0);
}
static inline void Exchange3(vector4double &out1, vector4double &out2,
vector4double in1, vector4double in2) {
assert(0);
}
// single precision
FLOAT_WRAP_EXCHANGE(Exchange0);
FLOAT_WRAP_EXCHANGE(Exchange1);
FLOAT_WRAP_EXCHANGE(Exchange2);
FLOAT_WRAP_EXCHANGE(Exchange3);
};
struct Permute{
//Complex double
static inline vector4double Permute0(vector4double v){ //0123 -> 2301
return vec_perm(v, v, vec_gpci(02301));
};
static inline vector4double Permute1(vector4double v){ //0123 -> 1032
return vec_perm(v, v, vec_gpci(01032));
};
static inline vector4double Permute2(vector4double v){
return v;
};
static inline vector4double Permute3(vector4double v){
return v;
};
// Complex float
FLOAT_WRAP_1(Permute0, static inline)
FLOAT_WRAP_1(Permute1, static inline)
FLOAT_WRAP_1(Permute2, static inline)
FLOAT_WRAP_1(Permute3, static inline)
};
struct Rotate{
template<int n> static inline vector4double tRotate(vector4double v){
if ( n==1 ) return vec_perm(v, v, vec_gpci(01230));
if ( n==2 ) return vec_perm(v, v, vec_gpci(02301));
if ( n==3 ) return vec_perm(v, v, vec_gpci(03012));
return v;
};
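// Illustrative note (not in the original source): vec_gpci(01230) selects
// elements 1,2,3,0 of v, i.e. a left rotation by one; 02301 and 03012 rotate
// by two and three elements respectively.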
template<int n> static inline vector4float tRotate(vector4float a)
{
vector4double ad, rd;
vector4float r;
ad = Vset()(a);
rd = tRotate<n>(ad);
Vstore()(rd, r);
return r;
};
static inline vector4double rotate(vector4double v, int n){
switch(n){
case 0:
return v;
break;
case 1:
return tRotate<1>(v);
break;
case 2:
return tRotate<2>(v);
break;
case 3:
return tRotate<3>(v);
break;
default: assert(0);
}
}
static inline vector4float rotate(vector4float v, int n){
vector4double vd, rd;
vector4float r;
vd = Vset()(v);
rd = rotate(vd, n);
Vstore()(rd, r);
return r;
}
};
//Complex float Reduce
template<>
inline Grid::ComplexF
Reduce<Grid::ComplexF, vector4float>::operator()(vector4float v) { //2 complex
vector4float v1;
v1 = Optimization::Permute::Permute0(v);
v1 = Optimization::Sum()(v1, v);
return Grid::ComplexF(v1.v0, v1.v1);
}
//Real float Reduce
template<>
inline Grid::RealF
Reduce<Grid::RealF, vector4float>::operator()(vector4float v){ //4 floats
vector4float v1,v2;
v1 = Optimization::Permute::Permute0(v);
v1 = Optimization::Sum()(v1, v);
v2 = Optimization::Permute::Permute1(v1);
v1 = Optimization::Sum()(v1, v2);
return v1.v0;
}
//Complex double Reduce
template<>
inline Grid::ComplexD
Reduce<Grid::ComplexD, vector4double>::operator()(vector4double v){ //2 complex
vector4double v1;
v1 = Optimization::Permute::Permute0(v);
v1 = vec_add(v1, v);
return Grid::ComplexD(vec_extract(v1, 0), vec_extract(v1, 1));
}
//Real double Reduce
template<>
inline Grid::RealD
Reduce<Grid::RealD, vector4double>::operator()(vector4double v){ //4 doubles
vector4double v1,v2;
v1 = Optimization::Permute::Permute0(v);
v1 = vec_add(v1, v);
v2 = Optimization::Permute::Permute1(v1);
v1 = vec_add(v1, v2);
return vec_extract(v1, 0);
}
//Integer Reduce
template<>
inline Integer Reduce<Integer, veci>::operator()(veci in){
Integer a = 0;
for (unsigned int i = 0; i < W<Integer>::r; ++i)
{
a += in.v[i];
}
return a;
}
}
////////////////////////////////////////////////////////////////////////////////
// Here assign types
typedef Optimization::vech SIMD_Htype; // Half precision type
typedef Optimization::vector4float SIMD_Ftype; // Single precision type
typedef vector4double SIMD_Dtype; // Double precision type
typedef Optimization::veci SIMD_Itype; // Integer type
// prefetch utilities
inline void v_prefetch0(int size, const char *ptr){};
inline void prefetch_HINT_T0(const char *ptr){};
// Function name aliases
typedef Optimization::Vsplat VsplatSIMD;
typedef Optimization::Vstore VstoreSIMD;
typedef Optimization::Vset VsetSIMD;
typedef Optimization::Vstream VstreamSIMD;
template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
// Arithmetic operations
typedef Optimization::Sum SumSIMD;
typedef Optimization::Sub SubSIMD;
typedef Optimization::Mult MultSIMD;
typedef Optimization::Div DivSIMD;
typedef Optimization::MultComplex MultComplexSIMD;
typedef Optimization::MultRealPart MultRealPartSIMD;
typedef Optimization::MaddRealPart MaddRealPartSIMD;
typedef Optimization::Conj ConjSIMD;
typedef Optimization::TimesMinusI TimesMinusISIMD;
typedef Optimization::TimesI TimesISIMD;
}

617
Grid/simd/Grid_sse4.h Normal file

@ -0,0 +1,617 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/simd/Grid_sse4.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: neo <cossu@post.kek.jp>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
//----------------------------------------------------------------------
/*! @file Grid_sse4.h
@brief Optimization libraries for the SSE4 instruction set
Using intrinsics
*/
// Time-stamp: <2015-06-16 23:27:54 neo>
//----------------------------------------------------------------------
#include <pmmintrin.h>
namespace Grid {
namespace Optimization {
template<class vtype>
union uconv {
__m128 f;
vtype v;
};
union u128f {
__m128 v;
float f[4];
};
union u128d {
__m128d v;
double f[2];
};
struct Vsplat{
//Complex float
inline __m128 operator()(float a, float b){
return _mm_set_ps(b,a,b,a);
}
// Real float
inline __m128 operator()(float a){
return _mm_set_ps(a,a,a,a);
}
//Complex double
inline __m128d operator()(double a, double b){
return _mm_set_pd(b,a);
}
//Real double
inline __m128d operator()(double a){
return _mm_set_pd(a,a);
}
//Integer
inline __m128i operator()(Integer a){
return _mm_set1_epi32(a);
}
};
struct Vstore{
//Float
inline void operator()(__m128 a, float* F){
_mm_store_ps(F,a);
}
//Double
inline void operator()(__m128d a, double* D){
_mm_store_pd(D,a);
}
//Integer
inline void operator()(__m128i a, Integer* I){
_mm_store_si128((__m128i *)I,a);
}
};
struct Vstream{
//Float
inline void operator()(float * a, __m128 b){
_mm_stream_ps(a,b);
}
//Double
inline void operator()(double * a, __m128d b){
_mm_stream_pd(a,b);
}
};
struct Vset{
// Complex float
inline __m128 operator()(Grid::ComplexF *a){
return _mm_set_ps(a[1].imag(), a[1].real(),a[0].imag(),a[0].real());
}
// Complex double
inline __m128d operator()(Grid::ComplexD *a){
return _mm_set_pd(a[0].imag(),a[0].real());
}
// Real float
inline __m128 operator()(float *a){
return _mm_set_ps(a[3],a[2],a[1],a[0]);
}
// Real double
inline __m128d operator()(double *a){
return _mm_set_pd(a[1],a[0]);
}
// Integer
inline __m128i operator()(Integer *a){
return _mm_set_epi32(a[3],a[2],a[1],a[0]);
}
};
template <typename Out_type, typename In_type>
struct Reduce{
//Need templated class to overload output type
//General form must generate error if compiled
inline Out_type operator()(In_type in){
printf("Error, using wrong Reduce function\n");
exit(1);
return 0;
}
};
/////////////////////////////////////////////////////
// Arithmetic operations
/////////////////////////////////////////////////////
struct Sum{
//Complex/Real float
inline __m128 operator()(__m128 a, __m128 b){
return _mm_add_ps(a,b);
}
//Complex/Real double
inline __m128d operator()(__m128d a, __m128d b){
return _mm_add_pd(a,b);
}
//Integer
inline __m128i operator()(__m128i a, __m128i b){
return _mm_add_epi32(a,b);
}
};
struct Sub{
//Complex/Real float
inline __m128 operator()(__m128 a, __m128 b){
return _mm_sub_ps(a,b);
}
//Complex/Real double
inline __m128d operator()(__m128d a, __m128d b){
return _mm_sub_pd(a,b);
}
//Integer
inline __m128i operator()(__m128i a, __m128i b){
return _mm_sub_epi32(a,b);
}
};
struct MultRealPart{
inline __m128 operator()(__m128 a, __m128 b){
__m128 ymm0;
ymm0 = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
return _mm_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br
}
inline __m128d operator()(__m128d a, __m128d b){
__m128d ymm0;
ymm0 = _mm_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, ar,ar b'00,00
return _mm_mul_pd(ymm0,b); // ymm0 <- ar bi, ar br
}
};
struct MaddRealPart{
inline __m128 operator()(__m128 a, __m128 b, __m128 c){
__m128 ymm0 = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
return _mm_add_ps(_mm_mul_ps( ymm0, b),c);
}
inline __m128d operator()(__m128d a, __m128d b, __m128d c){
__m128d ymm0 = _mm_shuffle_pd( a, a, 0x0 );
return _mm_add_pd(_mm_mul_pd( ymm0, b),c);
}
};
struct MultComplex{
// Complex float
inline __m128 operator()(__m128 a, __m128 b){
__m128 ymm0,ymm1,ymm2;
ymm0 = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
ymm0 = _mm_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br
ymm1 = _mm_shuffle_ps(b,b,_MM_SELECT_FOUR_FOUR(2,3,0,1)); // ymm1 <- br,bi
ymm2 = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(3,3,1,1)); // ymm2 <- ai,ai
ymm1 = _mm_mul_ps(ymm1,ymm2); // ymm1 <- br ai, ai bi
return _mm_addsub_ps(ymm0,ymm1);
}
// Complex double
inline __m128d operator()(__m128d a, __m128d b){
__m128d ymm0,ymm1,ymm2;
ymm0 = _mm_shuffle_pd(a,a,0x0); // ymm0 <- ar ar,
ymm0 = _mm_mul_pd(ymm0,b); // ymm0 <- ar bi, ar br
ymm1 = _mm_shuffle_pd(b,b,0x1); // ymm1 <- br,bi b01
ymm2 = _mm_shuffle_pd(a,a,0x3); // ymm2 <- ai,ai b11
ymm1 = _mm_mul_pd(ymm1,ymm2); // ymm1 <- br ai, ai bi
return _mm_addsub_pd(ymm0,ymm1);
}
};
struct Mult{
inline void mac(__m128 &a, __m128 b, __m128 c){
a= _mm_add_ps(_mm_mul_ps(b,c),a);
}
inline void mac(__m128d &a, __m128d b, __m128d c){
a= _mm_add_pd(_mm_mul_pd(b,c),a);
}
// Real float
inline __m128 operator()(__m128 a, __m128 b){
return _mm_mul_ps(a,b);
}
// Real double
inline __m128d operator()(__m128d a, __m128d b){
return _mm_mul_pd(a,b);
}
// Integer
inline __m128i operator()(__m128i a, __m128i b){
return _mm_mullo_epi32(a,b);
}
};
struct Div{
// Real float
inline __m128 operator()(__m128 a, __m128 b){
return _mm_div_ps(a,b);
}
// Real double
inline __m128d operator()(__m128d a, __m128d b){
return _mm_div_pd(a,b);
}
};
struct Conj{
// Complex single
inline __m128 operator()(__m128 in){
return _mm_xor_ps(_mm_addsub_ps(_mm_setzero_ps(),in), _mm_set1_ps(-0.f));
}
// Complex double
inline __m128d operator()(__m128d in){
return _mm_xor_pd(_mm_addsub_pd(_mm_setzero_pd(),in), _mm_set1_pd(-0.f));//untested
}
// do not define for integer input
};
struct TimesMinusI{
//Complex single
inline __m128 operator()(__m128 in, __m128 ret){
__m128 tmp =_mm_addsub_ps(_mm_setzero_ps(),in); // r,-i
return _mm_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(2,3,0,1));
}
//Complex double
inline __m128d operator()(__m128d in, __m128d ret){
__m128d tmp =_mm_addsub_pd(_mm_setzero_pd(),in); // r,-i
return _mm_shuffle_pd(tmp,tmp,0x1);
}
};
struct TimesI{
//Complex single
inline __m128 operator()(__m128 in, __m128 ret){
__m128 tmp =_mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
return _mm_addsub_ps(_mm_setzero_ps(),tmp); // r,-i
}
//Complex double
inline __m128d operator()(__m128d in, __m128d ret){
__m128d tmp = _mm_shuffle_pd(in,in,0x1);
return _mm_addsub_pd(_mm_setzero_pd(),tmp); // r,-i
}
};
struct Permute{
static inline __m128 Permute0(__m128 in){
return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2)); //AB CD -> CD AB
};
static inline __m128 Permute1(__m128 in){
return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); //AB CD -> BA DC
};
static inline __m128 Permute2(__m128 in){
return in;
};
static inline __m128 Permute3(__m128 in){
return in;
};
static inline __m128d Permute0(__m128d in){ //AB -> BA
return _mm_shuffle_pd(in,in,0x1);
};
static inline __m128d Permute1(__m128d in){
return in;
};
static inline __m128d Permute2(__m128d in){
return in;
};
static inline __m128d Permute3(__m128d in){
return in;
};
};
#define _my_alignr_epi32(a,b,n) _mm_alignr_epi8(a,b,(n*4)%16)
#define _my_alignr_epi64(a,b,n) _mm_alignr_epi8(a,b,(n*8)%16)
#ifdef SFW_FP16
struct Grid_half {
Grid_half(){}
Grid_half(uint16_t raw) : x(raw) {}
uint16_t x;
};
union FP32 {
unsigned int u;
float f;
};
// PAB - Lifted and adapted from Eigen, which is GPL V2
inline float sfw_half_to_float(Grid_half h) {
const FP32 magic = { 113 << 23 };
const unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift
FP32 o;
o.u = (h.x & 0x7fff) << 13; // exponent/mantissa bits
unsigned int exp = shifted_exp & o.u; // just the exponent
o.u += (127 - 15) << 23; // exponent adjust
// handle exponent special cases
if (exp == shifted_exp) { // Inf/NaN?
o.u += (128 - 16) << 23; // extra exp adjust
} else if (exp == 0) { // Zero/Denormal?
o.u += 1 << 23; // extra exp adjust
o.f -= magic.f; // renormalize
}
o.u |= (h.x & 0x8000) << 16; // sign bit
return o.f;
}
inline Grid_half sfw_float_to_half(float ff) {
FP32 f; f.f = ff;
const FP32 f32infty = { 255 << 23 };
const FP32 f16max = { (127 + 16) << 23 };
const FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 };
unsigned int sign_mask = 0x80000000u;
Grid_half o;
o.x = static_cast<unsigned short>(0x0u);
unsigned int sign = f.u & sign_mask;
f.u ^= sign;
// NOTE all the integer compares in this function can be safely
// compiled into signed compares since all operands are below
// 0x80000000. Important if you want fast straight SSE2 code
// (since there's no unsigned PCMPGTD).
if (f.u >= f16max.u) { // result is Inf or NaN (all exponent bits set)
o.x = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf
} else { // (De)normalized number or zero
if (f.u < (113 << 23)) { // resulting FP16 is subnormal or zero
// use a magic value to align our 10 mantissa bits at the bottom of
// the float. as long as FP addition is round-to-nearest-even this
// just works.
f.f += denorm_magic.f;
// and one integer subtract of the bias later, we have our final float!
o.x = static_cast<unsigned short>(f.u - denorm_magic.u);
} else {
unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd
// update exponent, rounding bias part 1
f.u += ((unsigned int)(15 - 127) << 23) + 0xfff;
// rounding bias part 2
f.u += mant_odd;
// take the bits!
o.x = static_cast<unsigned short>(f.u >> 13);
}
}
o.x |= static_cast<unsigned short>(sign >> 16);
return o;
}
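// Worked example for the branch above: for ff = 1.0f, f.u = 0x3F800000 is below
// f16max and not subnormal, so mant_odd = 0, the exponent rebias gives
// f.u = 0x07800FFF, and o.x = f.u >> 13 = 0x3C00, the IEEE-754 half encoding of 1.0.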
static inline __m128i Grid_mm_cvtps_ph(__m128 f,int discard) {
__m128i ret=(__m128i)_mm_setzero_ps();
float *fp = (float *)&f;
Grid_half *hp = (Grid_half *)&ret;
hp[0] = sfw_float_to_half(fp[0]);
hp[1] = sfw_float_to_half(fp[1]);
hp[2] = sfw_float_to_half(fp[2]);
hp[3] = sfw_float_to_half(fp[3]);
return ret;
}
static inline __m128 Grid_mm_cvtph_ps(__m128i h,int discard) {
__m128 ret=_mm_setzero_ps();
float *fp = (float *)&ret;
Grid_half *hp = (Grid_half *)&h;
fp[0] = sfw_half_to_float(hp[0]);
fp[1] = sfw_half_to_float(hp[1]);
fp[2] = sfw_half_to_float(hp[2]);
fp[3] = sfw_half_to_float(hp[3]);
return ret;
}
#else
#define Grid_mm_cvtps_ph _mm_cvtps_ph
#define Grid_mm_cvtph_ps _mm_cvtph_ps
#endif
struct PrecisionChange {
static inline __m128i StoH (__m128 a,__m128 b) {
__m128i ha = Grid_mm_cvtps_ph(a,0);
__m128i hb = Grid_mm_cvtps_ph(b,0);
__m128i h =(__m128i) _mm_shuffle_ps((__m128)ha,(__m128)hb,_MM_SELECT_FOUR_FOUR(1,0,1,0));
return h;
}
static inline void HtoS (__m128i h,__m128 &sa,__m128 &sb) {
sa = Grid_mm_cvtph_ps(h,0);
h = (__m128i)_my_alignr_epi32((__m128i)h,(__m128i)h,2);
sb = Grid_mm_cvtph_ps(h,0);
}
static inline __m128 DtoS (__m128d a,__m128d b) {
__m128 sa = _mm_cvtpd_ps(a);
__m128 sb = _mm_cvtpd_ps(b);
__m128 s = _mm_shuffle_ps(sa,sb,_MM_SELECT_FOUR_FOUR(1,0,1,0));
return s;
}
static inline void StoD (__m128 s,__m128d &a,__m128d &b) {
a = _mm_cvtps_pd(s);
s = (__m128)_my_alignr_epi32((__m128i)s,(__m128i)s,2);
b = _mm_cvtps_pd(s);
}
static inline __m128i DtoH (__m128d a,__m128d b,__m128d c,__m128d d) {
__m128 sa,sb;
sa = DtoS(a,b);
sb = DtoS(c,d);
return StoH(sa,sb);
}
static inline void HtoD (__m128i h,__m128d &a,__m128d &b,__m128d &c,__m128d &d) {
__m128 sa,sb;
HtoS(h,sa,sb);
StoD(sa,a,b);
StoD(sb,c,d);
}
};
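// Illustrative sketch (hypothetical variables) of a round trip through the
// converters above:
//   __m128d d0 = _mm_set_pd(1.0, 2.0), d1 = _mm_set_pd(3.0, 4.0);
//   __m128  s  = PrecisionChange::DtoS(d0, d1); // four singles
//   __m128d e0, e1;
//   PrecisionChange::StoD(s, e0, e1);           // back to two double pairs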
struct Exchange{
// 3210 ordering
static inline void Exchange0(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){
out1= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0));
out2= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2));
};
static inline void Exchange1(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){
out1= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0)); /*ACEG*/
out2= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1)); /*BDFH*/
out1= _mm_shuffle_ps(out1,out1,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/
out2= _mm_shuffle_ps(out2,out2,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*BFDH*/
};
static inline void Exchange2(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){
assert(0);
return;
};
static inline void Exchange3(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){
assert(0);
return;
};
static inline void Exchange0(__m128d &out1,__m128d &out2,__m128d in1,__m128d in2){
out1= _mm_shuffle_pd(in1,in2,0x0);
out2= _mm_shuffle_pd(in1,in2,0x3);
};
static inline void Exchange1(__m128d &out1,__m128d &out2,__m128d in1,__m128d in2){
assert(0);
return;
};
static inline void Exchange2(__m128d &out1,__m128d &out2,__m128d in1,__m128d in2){
assert(0);
return;
};
static inline void Exchange3(__m128d &out1,__m128d &out2,__m128d in1,__m128d in2){
assert(0);
return;
};
};
struct Rotate{
static inline __m128 rotate(__m128 in,int n){
switch(n){
case 0: return tRotate<0>(in);break;
case 1: return tRotate<1>(in);break;
case 2: return tRotate<2>(in);break;
case 3: return tRotate<3>(in);break;
default: assert(0);
}
}
static inline __m128d rotate(__m128d in,int n){
switch(n){
case 0: return tRotate<0>(in);break;
case 1: return tRotate<1>(in);break;
default: assert(0);
}
}
template<int n> static inline __m128 tRotate(__m128 in){ return (__m128)_my_alignr_epi32((__m128i)in,(__m128i)in,n); };
template<int n> static inline __m128d tRotate(__m128d in){ return (__m128d)_my_alignr_epi64((__m128i)in,(__m128i)in,n); };
};
//////////////////////////////////////////////
// Some Template specialization
//Complex float Reduce
template<>
inline Grid::ComplexF Reduce<Grid::ComplexF, __m128>::operator()(__m128 in){
__m128 v1; // two complex
v1= Optimization::Permute::Permute0(in);
v1= _mm_add_ps(v1,in);
u128f conv; conv.v=v1;
return Grid::ComplexF(conv.f[0],conv.f[1]);
}
//Real float Reduce
template<>
inline Grid::RealF Reduce<Grid::RealF, __m128>::operator()(__m128 in){
__m128 v1,v2; // quad single
v1= Optimization::Permute::Permute0(in);
v1= _mm_add_ps(v1,in);
v2= Optimization::Permute::Permute1(v1);
v1 = _mm_add_ps(v1,v2);
u128f conv; conv.v=v1;
return conv.f[0];
}
//Complex double Reduce
template<>
inline Grid::ComplexD Reduce<Grid::ComplexD, __m128d>::operator()(__m128d in){
u128d conv; conv.v = in;
return Grid::ComplexD(conv.f[0],conv.f[1]);
}
//Real double Reduce
template<>
inline Grid::RealD Reduce<Grid::RealD, __m128d>::operator()(__m128d in){
__m128d v1;
v1 = Optimization::Permute::Permute0(in);
v1 = _mm_add_pd(v1,in);
u128d conv; conv.v = v1;
return conv.f[0];
}
//Integer Reduce
template<>
inline Integer Reduce<Integer, __m128i>::operator()(__m128i in){
__m128i v1 = _mm_hadd_epi32(in, in);
__m128i v2 = _mm_hadd_epi32(v1, v1);
return _mm_cvtsi128_si32(v2);
}
}
//////////////////////////////////////////////////////////////////////////////////////
// Here assign types
typedef __m128i SIMD_Htype; // Half precision type
typedef __m128 SIMD_Ftype; // Single precision type
typedef __m128d SIMD_Dtype; // Double precision type
typedef __m128i SIMD_Itype; // Integer type
// prefetch utilities
inline void v_prefetch0(int size, const char *ptr){};
inline void prefetch_HINT_T0(const char *ptr){
_mm_prefetch(ptr,_MM_HINT_T0);
}
// Function name aliases
typedef Optimization::Vsplat VsplatSIMD;
typedef Optimization::Vstore VstoreSIMD;
typedef Optimization::Vset VsetSIMD;
typedef Optimization::Vstream VstreamSIMD;
template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
// Arithmetic operations
typedef Optimization::Sum SumSIMD;
typedef Optimization::Sub SubSIMD;
typedef Optimization::Div DivSIMD;
typedef Optimization::Mult MultSIMD;
typedef Optimization::MultComplex MultComplexSIMD;
typedef Optimization::MultRealPart MultRealPartSIMD;
typedef Optimization::MaddRealPart MaddRealPartSIMD;
typedef Optimization::Conj ConjSIMD;
typedef Optimization::TimesMinusI TimesMinusISIMD;
typedef Optimization::TimesI TimesISIMD;
}

868
Grid/simd/Grid_vector_types.h Normal file

@ -0,0 +1,868 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/simd/Grid_vector_type.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Guido Cossu <cossu@iroiro-pc.kek.jp>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: neo <cossu@post.kek.jp>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
//---------------------------------------------------------------------------
/*! @file Grid_vector_types.h
@brief Defines templated class Grid_simd to deal with inner vector types
*/
// Time-stamp: <2015-07-10 17:45:33 neo>
//---------------------------------------------------------------------------
#ifndef GRID_VECTOR_TYPES
#define GRID_VECTOR_TYPES
#ifdef GEN
#include "Grid_generic.h"
#endif
#ifdef SSE4
#include "Grid_sse4.h"
#endif
#if defined(AVX1) || defined (AVXFMA) || defined(AVX2) || defined(AVXFMA4)
#include "Grid_avx.h"
#endif
#if defined AVX512
#include "Grid_avx512.h"
#endif
#if defined IMCI
#include "Grid_imci.h"
#endif
#ifdef NEONV8
#include "Grid_neon.h"
#endif
#if defined QPX
#include "Grid_qpx.h"
#endif
#include "l1p.h"
namespace Grid {
//////////////////////////////////////
// To take the floating point type of real/complex type
//////////////////////////////////////
template <typename T>
struct RealPart {
typedef T type;
};
template <typename T>
struct RealPart<std::complex<T> > {
typedef T type;
};
#include <type_traits>
//////////////////////////////////////
// demote a vector to real type
//////////////////////////////////////
// type alias used to simplify the syntax of std::enable_if
template <typename T> using Invoke = typename T::type;
template <typename Condition, typename ReturnType> using EnableIf = Invoke<std::enable_if<Condition::value, ReturnType> >;
template <typename Condition, typename ReturnType> using NotEnableIf = Invoke<std::enable_if<!Condition::value, ReturnType> >;
////////////////////////////////////////////////////////
// Check for complexity with type traits
template <typename T> struct is_complex : public std::false_type {};
template <> struct is_complex<std::complex<double> > : public std::true_type {};
template <> struct is_complex<std::complex<float> > : public std::true_type {};
template <typename T> using IfReal = Invoke<std::enable_if<std::is_floating_point<T>::value, int> >;
template <typename T> using IfComplex = Invoke<std::enable_if<is_complex<T>::value, int> >;
template <typename T> using IfInteger = Invoke<std::enable_if<std::is_integral<T>::value, int> >;
template <typename T1,typename T2> using IfSame = Invoke<std::enable_if<std::is_same<T1,T2>::value, int> >;
template <typename T> using IfNotReal = Invoke<std::enable_if<!std::is_floating_point<T>::value, int> >;
template <typename T> using IfNotComplex = Invoke<std::enable_if<!is_complex<T>::value, int> >;
template <typename T> using IfNotInteger = Invoke<std::enable_if<!std::is_integral<T>::value, int> >;
template <typename T1,typename T2> using IfNotSame = Invoke<std::enable_if<!std::is_same<T1,T2>::value, int> >;
////////////////////////////////////////////////////////
// Define the operation templates functors
// general forms to allow for vsplat syntax
// need explicit declaration of types when used since
// clang cannot automatically determine the output type sometimes
template <class Out, class Input1, class Input2, class Input3, class Operation>
Out trinary(Input1 src_1, Input2 src_2, Input3 src_3, Operation op) {
return op(src_1, src_2, src_3);
}
template <class Out, class Input1, class Input2, class Operation>
Out binary(Input1 src_1, Input2 src_2, Operation op) {
return op(src_1, src_2);
}
template <class Out, class Input, class Operation>
Out unary(Input src, Operation op) {
return op(src);
}
///////////////////////////////////////////////
/*
@brief Grid_simd class for the SIMD vector type operations
*/
template <class Scalar_type, class Vector_type>
class Grid_simd {
public:
typedef typename RealPart<Scalar_type>::type Real;
typedef Vector_type vector_type;
typedef Scalar_type scalar_type;
typedef union conv_t_union {
Vector_type v;
Scalar_type s[sizeof(Vector_type) / sizeof(Scalar_type)];
conv_t_union(){};
} conv_t;
Vector_type v;
static inline constexpr int Nsimd(void) {
return sizeof(Vector_type) / sizeof(Scalar_type);
}
Grid_simd &operator=(const Grid_simd &&rhs) {
v = rhs.v;
return *this;
};
Grid_simd &operator=(const Grid_simd &rhs) {
v = rhs.v;
return *this;
}; // faster than not declaring it and leaving to the compiler
Grid_simd() = default;
Grid_simd(const Grid_simd &rhs) : v(rhs.v){}; // compiles in movaps
Grid_simd(const Grid_simd &&rhs) : v(rhs.v){};
/////////////////////////////
// Constructors
/////////////////////////////
Grid_simd &operator=(Zero &z) {
vzero(*this);
return (*this);
}
// Enable if complex type
template <typename S = Scalar_type>
Grid_simd(const typename std::enable_if<is_complex<S>::value, S>::type a) {
vsplat(*this, a);
};
Grid_simd(const Real a) { vsplat(*this, Scalar_type(a)); };
///////////////////////////////////////////////
// mac, mult, sub, add, adj
///////////////////////////////////////////////
// FIXME -- alias this to an inline MAC struct.
friend inline void mac(Grid_simd *__restrict__ y,
const Grid_simd *__restrict__ a,
const Grid_simd *__restrict__ x) {
*y = (*a) * (*x) + (*y);
};
friend inline void mult(Grid_simd *__restrict__ y,
const Grid_simd *__restrict__ l,
const Grid_simd *__restrict__ r) {
*y = (*l) * (*r);
}
friend inline void sub(Grid_simd *__restrict__ y,
const Grid_simd *__restrict__ l,
const Grid_simd *__restrict__ r) {
*y = (*l) - (*r);
}
friend inline void add(Grid_simd *__restrict__ y,
const Grid_simd *__restrict__ l,
const Grid_simd *__restrict__ r) {
*y = (*l) + (*r);
}
friend inline void mac(Grid_simd *__restrict__ y,
const Scalar_type *__restrict__ a,
const Grid_simd *__restrict__ x) {
*y = (*a) * (*x) + (*y);
};
friend inline void mult(Grid_simd *__restrict__ y,
const Scalar_type *__restrict__ l,
const Grid_simd *__restrict__ r) {
*y = (*l) * (*r);
}
friend inline void sub(Grid_simd *__restrict__ y,
const Scalar_type *__restrict__ l,
const Grid_simd *__restrict__ r) {
*y = (*l) - (*r);
}
friend inline void add(Grid_simd *__restrict__ y,
const Scalar_type *__restrict__ l,
const Grid_simd *__restrict__ r) {
*y = (*l) + (*r);
}
friend inline void mac(Grid_simd *__restrict__ y,
const Grid_simd *__restrict__ a,
const Scalar_type *__restrict__ x) {
*y = (*a) * (*x) + (*y);
};
friend inline void mult(Grid_simd *__restrict__ y,
const Grid_simd *__restrict__ l,
const Scalar_type *__restrict__ r) {
*y = (*l) * (*r);
}
friend inline void sub(Grid_simd *__restrict__ y,
const Grid_simd *__restrict__ l,
const Scalar_type *__restrict__ r) {
*y = (*l) - (*r);
}
friend inline void add(Grid_simd *__restrict__ y,
const Grid_simd *__restrict__ l,
const Scalar_type *__restrict__ r) {
*y = (*l) + (*r);
}
////////////////////////////////////////////////////////////////////////
// FIXME: gonna remove these load/store, get, set, prefetch
////////////////////////////////////////////////////////////////////////
friend inline void vset(Grid_simd &ret, Scalar_type *a) {
ret.v = unary<Vector_type>(a, VsetSIMD());
}
///////////////////////
// Vstore
///////////////////////
friend inline void vstore(const Grid_simd &ret, Scalar_type *a) {
binary<void>(ret.v, (Real *)a, VstoreSIMD());
}
///////////////////////
// Vprefetch
///////////////////////
friend inline void vprefetch(const Grid_simd &v) {
prefetch_HINT_T0((const char *)&v.v);
}
///////////////////////
// Reduce
///////////////////////
friend inline Scalar_type Reduce(const Grid_simd &in) {
return unary<Scalar_type>(in.v, ReduceSIMD<Scalar_type, Vector_type>());
}
////////////////////////////
// operator scalar * simd
////////////////////////////
friend inline Grid_simd operator*(const Scalar_type &a, Grid_simd b) {
Grid_simd va;
vsplat(va, a);
return va * b;
}
friend inline Grid_simd operator*(Grid_simd b, const Scalar_type &a) {
return a * b;
}
//////////////////////////////////
// Divides
//////////////////////////////////
friend inline Grid_simd operator/(const Scalar_type &a, Grid_simd b) {
Grid_simd va;
vsplat(va, a);
return va / b;
}
friend inline Grid_simd operator/(Grid_simd b, const Scalar_type &a) {
Grid_simd va;
vsplat(va, a);
return b / va;
}
///////////////////////
// Unary negation
///////////////////////
friend inline Grid_simd operator-(const Grid_simd &r) {
Grid_simd ret;
vzero(ret);
ret = ret - r;
return ret;
}
// *=,+=,-= operators
inline Grid_simd &operator*=(const Grid_simd &r) {
*this = (*this) * r;
return *this;
// return (*this)*r; ?
}
inline Grid_simd &operator+=(const Grid_simd &r) {
*this = *this + r;
return *this;
}
inline Grid_simd &operator-=(const Grid_simd &r) {
*this = *this - r;
return *this;
}
///////////////////////////////////////
// Not all functions are supported
// through SIMD and must breakout to
// scalar type and back again. This
// provides support
///////////////////////////////////////
template <class functor>
friend inline Grid_simd SimdApply(const functor &func, const Grid_simd &v) {
Grid_simd ret;
Grid_simd::conv_t conv;
Grid_simd::scalar_type s;
conv.v = v.v;
for (int i = 0; i < Nsimd(); i++) {
s = conv.s[i];
conv.s[i] = func(s);
}
ret.v = conv.v;
return ret;
}
template <class functor>
friend inline Grid_simd SimdApplyBinop(const functor &func,
const Grid_simd &x,
const Grid_simd &y) {
Grid_simd ret;
Grid_simd::conv_t cx;
Grid_simd::conv_t cy;
Grid_simd::scalar_type sx,sy;
cx.v = x.v;
cy.v = y.v;
for (int i = 0; i < Nsimd(); i++) {
sx = cx.s[i];
sy = cy.s[i];
cx.s[i] = func(sx,sy);
}
ret.v = cx.v;
return ret;
}
///////////////////////
// Exchange
// Al Ah , Bl Bh -> Al Bl , Ah Bh
///////////////////////
friend inline void exchange(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2,int n)
{
if (n==3) {
Optimization::Exchange::Exchange3(out1.v,out2.v,in1.v,in2.v);
} else if(n==2) {
Optimization::Exchange::Exchange2(out1.v,out2.v,in1.v,in2.v);
} else if(n==1) {
Optimization::Exchange::Exchange1(out1.v,out2.v,in1.v,in2.v);
} else if(n==0) {
Optimization::Exchange::Exchange0(out1.v,out2.v,in1.v,in2.v);
}
}
friend inline void exchange0(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2){
Optimization::Exchange::Exchange0(out1.v,out2.v,in1.v,in2.v);
}
friend inline void exchange1(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2){
Optimization::Exchange::Exchange1(out1.v,out2.v,in1.v,in2.v);
}
friend inline void exchange2(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2){
Optimization::Exchange::Exchange2(out1.v,out2.v,in1.v,in2.v);
}
friend inline void exchange3(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2){
Optimization::Exchange::Exchange3(out1.v,out2.v,in1.v,in2.v);
}
////////////////////////////////////////////////////////////////////
// General permute; assumes vector length is same across
// all subtypes; may not be a good assumption, but could
// add the vector width as a template param for BG/Q for example
////////////////////////////////////////////////////////////////////
friend inline void permute0(Grid_simd &y, Grid_simd b) {
y.v = Optimization::Permute::Permute0(b.v);
}
friend inline void permute1(Grid_simd &y, Grid_simd b) {
y.v = Optimization::Permute::Permute1(b.v);
}
friend inline void permute2(Grid_simd &y, Grid_simd b) {
y.v = Optimization::Permute::Permute2(b.v);
}
friend inline void permute3(Grid_simd &y, Grid_simd b) {
y.v = Optimization::Permute::Permute3(b.v);
}
friend inline void permute(Grid_simd &y, Grid_simd b, int perm) {
if (perm & RotateBit) {
int dist = perm & 0xF;
y = rotate(b, dist);
return;
}
else if(perm==3) permute3(y, b);
else if(perm==2) permute2(y, b);
else if(perm==1) permute1(y, b);
else if(perm==0) permute0(y, b);
}
///////////////////////////////
// Getting single lanes
///////////////////////////////
inline Scalar_type getlane(int lane) {
return ((Scalar_type*)&v)[lane];
}
inline void putlane(const Scalar_type &S, int lane){
((Scalar_type*)&v)[lane] = S;
}
}; // end of Grid_simd class definition
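// Illustrative sketch (hypothetical variables): once the concrete typedefs such
// as vRealD are declared below, a Grid_simd value is used much like a scalar:
//   vRealD a, b, c;
//   vsplat(a, 2.0); vsplat(b, 3.0);
//   c = a * b + a;            // lane-wise 2*3 + 2 = 8 in every lane
//   RealD s = Reduce(c);      // sums the lanes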
inline void permute(ComplexD &y,ComplexD b, int perm) { y=b; }
inline void permute(ComplexF &y,ComplexF b, int perm) { y=b; }
inline void permute(RealD &y,RealD b, int perm) { y=b; }
inline void permute(RealF &y,RealF b, int perm) { y=b; }
////////////////////////////////////////////////////////////////////
// General rotate
////////////////////////////////////////////////////////////////////
template <class S, class V, IfNotComplex<S> = 0>
inline Grid_simd<S, V> rotate(Grid_simd<S, V> b, int nrot) {
nrot = nrot % Grid_simd<S, V>::Nsimd();
Grid_simd<S, V> ret;
ret.v = Optimization::Rotate::rotate(b.v, nrot);
return ret;
}
template <class S, class V, IfComplex<S> = 0>
inline Grid_simd<S, V> rotate(Grid_simd<S, V> b, int nrot) {
nrot = nrot % Grid_simd<S, V>::Nsimd();
Grid_simd<S, V> ret;
ret.v = Optimization::Rotate::rotate(b.v, 2 * nrot);
return ret;
}
template <class S, class V, IfNotComplex<S> =0>
inline void rotate( Grid_simd<S,V> &ret,Grid_simd<S,V> b,int nrot)
{
nrot = nrot % Grid_simd<S,V>::Nsimd();
ret.v = Optimization::Rotate::rotate(b.v,nrot);
}
template <class S, class V, IfComplex<S> =0>
inline void rotate(Grid_simd<S,V> &ret,Grid_simd<S,V> b,int nrot)
{
nrot = nrot % Grid_simd<S,V>::Nsimd();
ret.v = Optimization::Rotate::rotate(b.v,2*nrot);
}
template <class S, class V>
inline void vbroadcast(Grid_simd<S,V> &ret,const Grid_simd<S,V> &src,int lane){
S* typepun =(S*) &src;
vsplat(ret,typepun[lane]);
}
template <class S, class V, IfComplex<S> =0>
inline void rbroadcast(Grid_simd<S,V> &ret,const Grid_simd<S,V> &src,int lane){
S* typepun =(S*) &src;
ret.v = unary<V>(real(typepun[lane]), VsplatSIMD());
}
///////////////////////
// Splat
///////////////////////
// this is only for the complex version
template <class S, class V, IfComplex<S> = 0, class ABtype>
inline void vsplat(Grid_simd<S, V> &ret, ABtype a, ABtype b) {
ret.v = binary<V>(a, b, VsplatSIMD());
}
// overload if complex
template <class S, class V>
inline void vsplat(Grid_simd<S, V> &ret, EnableIf<is_complex<S>, S> c) {
vsplat(ret, real(c), imag(c));
}
template <class S, class V>
inline void rsplat(Grid_simd<S, V> &ret, EnableIf<is_complex<S>, S> c) {
vsplat(ret, real(c), real(c));
}
// if real fill with a, if complex fill with a in the real part (first function
// above)
template <class S, class V>
inline void vsplat(Grid_simd<S, V> &ret, NotEnableIf<is_complex<S>, S> a) {
ret.v = unary<V>(a, VsplatSIMD());
}
//////////////////////////
///////////////////////////////////////////////
// Initialise to 1,0,i for the correct types
///////////////////////////////////////////////
// For complex types
template <class S, class V, IfComplex<S> = 0>
inline void vone(Grid_simd<S, V> &ret) {
vsplat(ret, S(1.0, 0.0));
}
template <class S, class V, IfComplex<S> = 0>
inline void vzero(Grid_simd<S, V> &ret) {
vsplat(ret, S(0.0, 0.0));
} // use xor?
template <class S, class V, IfComplex<S> = 0>
inline void vcomplex_i(Grid_simd<S, V> &ret) {
vsplat(ret, S(0.0, 1.0));
}
template <class S, class V, IfComplex<S> = 0>
inline void visign(Grid_simd<S, V> &ret) {
vsplat(ret, S(1.0, -1.0));
}
template <class S, class V, IfComplex<S> = 0>
inline void vrsign(Grid_simd<S, V> &ret) {
vsplat(ret, S(-1.0, 1.0));
}
// if not complex overload here
template <class S, class V, IfReal<S> = 0>
inline void vone(Grid_simd<S, V> &ret) {
vsplat(ret, S(1.0));
}
template <class S, class V, IfReal<S> = 0>
inline void vzero(Grid_simd<S, V> &ret) {
vsplat(ret, S(0.0));
}
// For integral types
template <class S, class V, IfInteger<S> = 0>
inline void vone(Grid_simd<S, V> &ret) {
vsplat(ret, 1);
}
template <class S, class V, IfInteger<S> = 0>
inline void vzero(Grid_simd<S, V> &ret) {
vsplat(ret, 0);
}
template <class S, class V, IfInteger<S> = 0>
inline void vtrue(Grid_simd<S, V> &ret) {
vsplat(ret, 0xFFFFFFFF);
}
template <class S, class V, IfInteger<S> = 0>
inline void vfalse(Grid_simd<S, V> &ret) {
vsplat(ret, 0);
}
template <class S, class V>
inline void zeroit(Grid_simd<S, V> &z) {
vzero(z);
}
///////////////////////
// Vstream
///////////////////////
template <class S, class V, IfReal<S> = 0>
inline void vstream(Grid_simd<S, V> &out, const Grid_simd<S, V> &in) {
binary<void>((S *)&out.v, in.v, VstreamSIMD());
}
template <class S, class V, IfComplex<S> = 0>
inline void vstream(Grid_simd<S, V> &out, const Grid_simd<S, V> &in) {
typedef typename S::value_type T;
binary<void>((T *)&out.v, in.v, VstreamSIMD());
}
template <class S, class V, IfInteger<S> = 0>
inline void vstream(Grid_simd<S, V> &out, const Grid_simd<S, V> &in) {
out = in;
}
////////////////////////////////////
// Arithmetic operator overloads +,-,*
////////////////////////////////////
template <class S, class V>
inline Grid_simd<S, V> operator+(Grid_simd<S, V> a, Grid_simd<S, V> b) {
Grid_simd<S, V> ret;
ret.v = binary<V>(a.v, b.v, SumSIMD());
return ret;
};
template <class S, class V>
inline Grid_simd<S, V> operator-(Grid_simd<S, V> a, Grid_simd<S, V> b) {
Grid_simd<S, V> ret;
ret.v = binary<V>(a.v, b.v, SubSIMD());
return ret;
};
// Distinguish between complex types and others
template <class S, class V, IfComplex<S> = 0>
inline Grid_simd<S, V> real_mult(Grid_simd<S, V> a, Grid_simd<S, V> b) {
Grid_simd<S, V> ret;
ret.v = binary<V>(a.v, b.v, MultRealPartSIMD());
return ret;
};
template <class S, class V, IfComplex<S> = 0>
inline Grid_simd<S, V> real_madd(Grid_simd<S, V> a, Grid_simd<S, V> b, Grid_simd<S,V> c) {
Grid_simd<S, V> ret;
ret.v = trinary<V>(a.v, b.v, c.v, MaddRealPartSIMD());
return ret;
};
// Distinguish between complex types and others
template <class S, class V, IfComplex<S> = 0>
inline Grid_simd<S, V> operator*(Grid_simd<S, V> a, Grid_simd<S, V> b) {
Grid_simd<S, V> ret;
ret.v = binary<V>(a.v, b.v, MultComplexSIMD());
return ret;
};
// Real/Integer types
template <class S, class V, IfNotComplex<S> = 0>
inline Grid_simd<S, V> operator*(Grid_simd<S, V> a, Grid_simd<S, V> b) {
Grid_simd<S, V> ret;
ret.v = binary<V>(a.v, b.v, MultSIMD());
return ret;
};
// Distinguish between complex types and others
template <class S, class V, IfComplex<S> = 0>
inline Grid_simd<S, V> operator/(Grid_simd<S, V> a, Grid_simd<S, V> b) {
typedef Grid_simd<S, V> simd;
simd ret;
simd den;
ret = a * conjugate(b) ;
den = b * conjugate(b) ;
auto real_den = toReal(den);
ret.v=binary<V>(ret.v, real_den.v, DivSIMD());
return ret;
};
// Real/Integer types
template <class S, class V, IfNotComplex<S> = 0>
inline Grid_simd<S, V> operator/(Grid_simd<S, V> a, Grid_simd<S, V> b) {
Grid_simd<S, V> ret;
ret.v = binary<V>(a.v, b.v, DivSIMD());
return ret;
};
///////////////////////
// Conjugate
///////////////////////
template <class S, class V, IfComplex<S> = 0>
inline Grid_simd<S, V> conjugate(const Grid_simd<S, V> &in) {
Grid_simd<S, V> ret;
ret.v = unary<V>(in.v, ConjSIMD());
return ret;
}
template <class S, class V, IfNotComplex<S> = 0>
inline Grid_simd<S, V> conjugate(const Grid_simd<S, V> &in) {
return in; // for real objects
}
// Suppress adj for integer types... // odd; why conjugate above but not adj??
template <class S, class V, IfNotInteger<S> = 0>
inline Grid_simd<S, V> adj(const Grid_simd<S, V> &in) {
return conjugate(in);
}
///////////////////////
// timesMinusI
///////////////////////
template <class S, class V, IfComplex<S> = 0>
inline void timesMinusI(Grid_simd<S, V> &ret, const Grid_simd<S, V> &in) {
ret.v = binary<V>(in.v, ret.v, TimesMinusISIMD());
}
template <class S, class V, IfComplex<S> = 0>
inline Grid_simd<S, V> timesMinusI(const Grid_simd<S, V> &in) {
Grid_simd<S, V> ret;
timesMinusI(ret, in);
return ret;
}
template <class S, class V, IfNotComplex<S> = 0>
inline Grid_simd<S, V> timesMinusI(const Grid_simd<S, V> &in) {
return in;
}
///////////////////////
// timesI
///////////////////////
template <class S, class V, IfComplex<S> = 0>
inline void timesI(Grid_simd<S, V> &ret, const Grid_simd<S, V> &in) {
ret.v = binary<V>(in.v, ret.v, TimesISIMD());
}
template <class S, class V, IfComplex<S> = 0>
inline Grid_simd<S, V> timesI(const Grid_simd<S, V> &in) {
Grid_simd<S, V> ret;
timesI(ret, in);
return ret;
}
template <class S, class V, IfNotComplex<S> = 0>
inline Grid_simd<S, V> timesI(const Grid_simd<S, V> &in) {
return in;
}
/////////////////////
// Inner, outer
/////////////////////
template <class S, class V>
inline Grid_simd<S, V> innerProduct(const Grid_simd<S, V> &l,
const Grid_simd<S, V> &r) {
return conjugate(l) * r;
}
template <class S, class V>
inline Grid_simd<S, V> outerProduct(const Grid_simd<S, V> &l,
const Grid_simd<S, V> &r) {
return l * conjugate(r);
}
template <class S, class V>
inline Grid_simd<S, V> trace(const Grid_simd<S, V> &arg) {
return arg;
}
////////////////////////////////////////////////////////////
// copy/splat complex real parts into real;
// insert real into complex and zero imag;
////////////////////////////////////////////////////////////
// real = toReal( complex )
template <class S, class V, IfReal<S> = 0>
inline Grid_simd<S, V> toReal(const Grid_simd<std::complex<S>, V> &in) {
typedef Grid_simd<S, V> simd;
simd ret;
typename simd::conv_t conv;
conv.v = in.v; // copy the vector content (bytewise)
for (int i = 0; i < simd::Nsimd(); i += 2) {
conv.s[i + 1] = conv.s[i]; // duplicate (r,r);(r,r);(r,r); etc...
}
ret.v = conv.v;
return ret;
}
// complex = toComplex( real )
template <class R, class V, IfReal<R> = 0> // must be a real arg
inline Grid_simd<std::complex<R>, V> toComplex(const Grid_simd<R, V> &in) {
typedef Grid_simd<R, V> Rsimd;
typedef Grid_simd<std::complex<R>, V> Csimd;
typename Rsimd::conv_t conv; // address as real
conv.v = in.v;
for (int i = 0; i < Rsimd::Nsimd(); i += 2) {
assert(conv.s[i + 1] == conv.s[i]);
// trap any cases where real was not duplicated
// indicating the SIMD grids of real and imag assignment did not correctly
// match
conv.s[i + 1] = 0.0; // zero imaginary parts
}
Csimd ret;
ret.v = conv.v;
return ret;
}
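// Note on the pair above: on a 4-wide real vector, toReal turns the complex
// lane layout (r0,i0),(r1,i1) into (r0,r0),(r1,r1), and toComplex asserts that
// duplicated pattern before zeroing the copies back to (r0,0),(r1,0).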
///////////////////////////////
// Define available types
///////////////////////////////
typedef Grid_simd<float, SIMD_Ftype> vRealF;
typedef Grid_simd<double, SIMD_Dtype> vRealD;
typedef Grid_simd<std::complex<float>, SIMD_Ftype> vComplexF;
typedef Grid_simd<std::complex<double>, SIMD_Dtype> vComplexD;
typedef Grid_simd<Integer, SIMD_Itype> vInteger;
// Half precision; no arithmetic support
typedef Grid_simd<uint16_t, SIMD_Htype> vRealH;
typedef Grid_simd<std::complex<uint16_t>, SIMD_Htype> vComplexH;
inline void precisionChange(vRealF *out,vRealD *in,int nvec)
{
assert((nvec&0x1)==0);
for(int m=0;m*2<nvec;m++){
int n=m*2;
out[m].v=Optimization::PrecisionChange::DtoS(in[n].v,in[n+1].v);
}
}
inline void precisionChange(vRealH *out,vRealD *in,int nvec)
{
assert((nvec&0x3)==0);
for(int m=0;m*4<nvec;m++){
int n=m*4;
out[m].v=Optimization::PrecisionChange::DtoH(in[n].v,in[n+1].v,in[n+2].v,in[n+3].v);
}
}
inline void precisionChange(vRealH *out,vRealF *in,int nvec)
{
assert((nvec&0x1)==0);
for(int m=0;m*2<nvec;m++){
int n=m*2;
out[m].v=Optimization::PrecisionChange::StoH(in[n].v,in[n+1].v);
}
}
inline void precisionChange(vRealD *out,vRealF *in,int nvec)
{
assert((nvec&0x1)==0);
for(int m=0;m*2<nvec;m++){
int n=m*2;
Optimization::PrecisionChange::StoD(in[m].v,out[n].v,out[n+1].v);
}
}
inline void precisionChange(vRealD *out,vRealH *in,int nvec)
{
assert((nvec&0x3)==0);
for(int m=0;m*4<nvec;m++){
int n=m*4;
Optimization::PrecisionChange::HtoD(in[m].v,out[n].v,out[n+1].v,out[n+2].v,out[n+3].v);
}
}
inline void precisionChange(vRealF *out,vRealH *in,int nvec)
{
assert((nvec&0x1)==0);
for(int m=0;m*2<nvec;m++){
int n=m*2;
Optimization::PrecisionChange::HtoS(in[m].v,out[n].v,out[n+1].v);
}
}
inline void precisionChange(vComplexF *out,vComplexD *in,int nvec){ precisionChange((vRealF *)out,(vRealD *)in,nvec);}
inline void precisionChange(vComplexH *out,vComplexD *in,int nvec){ precisionChange((vRealH *)out,(vRealD *)in,nvec);}
inline void precisionChange(vComplexH *out,vComplexF *in,int nvec){ precisionChange((vRealH *)out,(vRealF *)in,nvec);}
inline void precisionChange(vComplexD *out,vComplexF *in,int nvec){ precisionChange((vRealD *)out,(vRealF *)in,nvec);}
inline void precisionChange(vComplexD *out,vComplexH *in,int nvec){ precisionChange((vRealD *)out,(vRealH *)in,nvec);}
inline void precisionChange(vComplexF *out,vComplexH *in,int nvec){ precisionChange((vRealF *)out,(vRealH *)in,nvec);}
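// Note on the precisionChange family above: nvec counts the vectors on the
// higher-precision side of the conversion, so narrowing packs pairs (or
// quadruples for half precision) into fewer vectors. Illustrative call with
// hypothetical buffers:
//   std::vector<vRealD> dv(8);
//   std::vector<vRealF> fv(4);
//   precisionChange(&fv[0], &dv[0], 8); // 8 double vectors -> 4 single vectors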
// Check our vector types are of an appropriate size.
#if defined QPX
static_assert(2*sizeof(SIMD_Ftype) == sizeof(SIMD_Dtype), "SIMD vector lengths incorrect");
static_assert(2*sizeof(SIMD_Ftype) == sizeof(SIMD_Itype), "SIMD vector lengths incorrect");
#else
static_assert(sizeof(SIMD_Ftype) == sizeof(SIMD_Dtype), "SIMD vector lengths incorrect");
static_assert(sizeof(SIMD_Ftype) == sizeof(SIMD_Itype), "SIMD vector lengths incorrect");
#endif
/////////////////////////////////////////
// Some traits to recognise the types
/////////////////////////////////////////
template <typename T>
struct is_simd : public std::false_type {};
template <> struct is_simd<vRealF> : public std::true_type {};
template <> struct is_simd<vRealD> : public std::true_type {};
template <> struct is_simd<vComplexF> : public std::true_type {};
template <> struct is_simd<vComplexD> : public std::true_type {};
template <> struct is_simd<vInteger> : public std::true_type {};
template <typename T> using IfSimd = Invoke<std::enable_if<is_simd<T>::value, int> >;
template <typename T> using IfNotSimd = Invoke<std::enable_if<!is_simd<T>::value, unsigned> >;
}
#endif

223
Grid/simd/Grid_vector_unops.h Normal file

@ -0,0 +1,223 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/simd/Grid_vector_unops.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: neo <cossu@post.kek.jp>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_VECTOR_UNOPS
#define GRID_VECTOR_UNOPS
#include <cmath>
namespace Grid {
template <class scalar>
struct SqrtRealFunctor {
scalar operator()(const scalar &a) const { return sqrt(real(a)); }
};
template <class scalar>
struct RSqrtRealFunctor {
scalar operator()(const scalar &a) const {
return scalar(1.0 / sqrt(real(a)));
}
};
template <class scalar>
struct CosRealFunctor {
scalar operator()(const scalar &a) const { return cos(real(a)); }
};
template <class scalar>
struct SinRealFunctor {
scalar operator()(const scalar &a) const { return sin(real(a)); }
};
template <class scalar>
struct AcosRealFunctor {
scalar operator()(const scalar &a) const { return acos(real(a)); }
};
template <class scalar>
struct AsinRealFunctor {
scalar operator()(const scalar &a) const { return asin(real(a)); }
};
template <class scalar>
struct LogRealFunctor {
scalar operator()(const scalar &a) const { return log(real(a)); }
};
template <class scalar>
struct ExpFunctor {
scalar operator()(const scalar &a) const { return exp(a); }
};
template <class scalar>
struct NotFunctor {
scalar operator()(const scalar &a) const { return (!a); }
};
template <class scalar>
struct AbsRealFunctor {
scalar operator()(const scalar &a) const { return std::abs(real(a)); }
};
template <class scalar>
struct PowRealFunctor {
double y;
PowRealFunctor(double _y) : y(_y){};
scalar operator()(const scalar &a) const { return pow(real(a), y); }
};
template <class scalar>
struct ModIntFunctor {
Integer y;
ModIntFunctor(Integer _y) : y(_y){};
scalar operator()(const scalar &a) const { return Integer(a) % y; }
};
template <class scalar>
struct DivIntFunctor {
Integer y;
DivIntFunctor(Integer _y) : y(_y){};
scalar operator()(const scalar &a) const { return Integer(a) / y; }
};
template <class scalar>
struct RealFunctor {
scalar operator()(const scalar &a) const { return std::real(a); }
};
template <class scalar>
struct ImagFunctor {
scalar operator()(const scalar &a) const { return std::imag(a); }
};
template <class S, class V>
inline Grid_simd<S, V> real(const Grid_simd<S, V> &r) {
return SimdApply(RealFunctor<S>(), r);
}
template <class S, class V>
inline Grid_simd<S, V> imag(const Grid_simd<S, V> &r) {
return SimdApply(ImagFunctor<S>(), r);
}
template <class S, class V>
inline Grid_simd<S, V> sqrt(const Grid_simd<S, V> &r) {
return SimdApply(SqrtRealFunctor<S>(), r);
}
template <class S, class V>
inline Grid_simd<S, V> rsqrt(const Grid_simd<S, V> &r) {
return SimdApply(RSqrtRealFunctor<S>(), r);
}
template <class Scalar>
inline Scalar rsqrt(const Scalar &r) {
return RSqrtRealFunctor<Scalar>()(r);
}
template <class S, class V>
inline Grid_simd<S, V> cos(const Grid_simd<S, V> &r) {
return SimdApply(CosRealFunctor<S>(), r);
}
template <class S, class V>
inline Grid_simd<S, V> sin(const Grid_simd<S, V> &r) {
return SimdApply(SinRealFunctor<S>(), r);
}
template <class S, class V>
inline Grid_simd<S, V> acos(const Grid_simd<S, V> &r) {
return SimdApply(AcosRealFunctor<S>(), r);
}
template <class S, class V>
inline Grid_simd<S, V> asin(const Grid_simd<S, V> &r) {
return SimdApply(AsinRealFunctor<S>(), r);
}
template <class S, class V>
inline Grid_simd<S, V> log(const Grid_simd<S, V> &r) {
return SimdApply(LogRealFunctor<S>(), r);
}
template <class S, class V>
inline Grid_simd<S, V> abs(const Grid_simd<S, V> &r) {
return SimdApply(AbsRealFunctor<S>(), r);
}
template <class S, class V>
inline Grid_simd<S, V> exp(const Grid_simd<S, V> &r) {
return SimdApply(ExpFunctor<S>(), r);
}
template <class S, class V>
inline Grid_simd<S, V> Not(const Grid_simd<S, V> &r) {
return SimdApply(NotFunctor<S>(), r);
}
template <class S, class V>
inline Grid_simd<S, V> pow(const Grid_simd<S, V> &r, double y) {
return SimdApply(PowRealFunctor<S>(y), r);
}
template <class S, class V>
inline Grid_simd<S, V> mod(const Grid_simd<S, V> &r, Integer y) {
return SimdApply(ModIntFunctor<S>(y), r);
}
template <class S, class V>
inline Grid_simd<S, V> div(const Grid_simd<S, V> &r, Integer y) {
return SimdApply(DivIntFunctor<S>(y), r);
}
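// Note on the wrappers above: each one routes a scalar functor through
// SimdApply, which unpacks the vector, applies the functor lane by lane and
// repacks, e.g. sqrt(x) on a vRealD evaluates the scalar sqrt in every lane.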
////////////////////////////////////////////////////////////////////////////
// Allows us to assign into **conformable** real vectors from complex
////////////////////////////////////////////////////////////////////////////
template <class scalar>
struct AndFunctor {
scalar operator()(const scalar &x, const scalar &y) const { return x & y; }
};
template <class scalar>
struct OrFunctor {
scalar operator()(const scalar &x, const scalar &y) const { return x | y; }
};
template <class scalar>
struct AndAndFunctor {
scalar operator()(const scalar &x, const scalar &y) const { return x && y; }
};
template <class scalar>
struct OrOrFunctor {
scalar operator()(const scalar &x, const scalar &y) const { return x || y; }
};
////////////////////////////////
// Calls to simd binop functors
////////////////////////////////
template <class S, class V>
inline Grid_simd<S, V> operator&(const Grid_simd<S, V> &x,
const Grid_simd<S, V> &y) {
return SimdApplyBinop(AndFunctor<S>(), x, y);
}
template <class S, class V>
inline Grid_simd<S, V> operator&&(const Grid_simd<S, V> &x,
const Grid_simd<S, V> &y) {
return SimdApplyBinop(AndAndFunctor<S>(), x, y);
}
template <class S, class V>
inline Grid_simd<S, V> operator|(const Grid_simd<S, V> &x,
const Grid_simd<S, V> &y) {
return SimdApplyBinop(OrFunctor<S>(), x, y);
}
template <class S, class V>
inline Grid_simd<S, V> operator||(const Grid_simd<S, V> &x,
const Grid_simd<S, V> &y) {
return SimdApplyBinop(OrOrFunctor<S>(), x, y);
}
}
#endif

598
Grid/simd/IBM_qpx.h Normal file

@ -0,0 +1,598 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/simd/BGQQPX.h
Copyright (C) 2015
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_ASM_BGQ_QPX_H
#define GRID_ASM_BGQ_QPX_H
#include <stdint.h>
/*********************************************************
* Register definitions
*********************************************************/
#define psi_00 0
#define psi_01 1
#define psi_02 2
#define psi_10 3
#define psi_11 4
#define psi_12 5
#define psi_20 6
#define psi_21 7
#define psi_22 8
#define psi_30 9
#define psi_31 10
#define psi_32 11
#define Chi_00 12
#define Chi_01 13
#define Chi_02 14
#define Chi_10 15
#define Chi_11 16
#define Chi_12 17
#define UChi_00 18
#define UChi_01 19
#define UChi_02 20
#define UChi_10 21
#define UChi_11 22
#define UChi_12 23
#define U0 24
#define U1 25
#define U2 26
#define one 27
#define perm_reg 28
#define REP %%r16
#define IMM %%r17
#define pREP %r16
#define pIMM %r17
#define PPC_INST_DCBTLS 0x7c00014c
#define PPC_INST_DCBLC 0x7c00030c
#define __PPC_CT(t) (((t) & 0x0f) << 21)
#define ___PPC_RA(a) (((a) & 0x1f) << 16)
#define ___PPC_RB(b) (((b) & 0x1f) << 11)
#define LOCK_SET ".long (" HASH(PPC_INST_DCBTLS) "|" HASH(___PPC_RB(16)) ")\n"
#define LOCK_CLEAR ".long (" HASH(PPC_INST_DCBLC) "|" HASH(___PPC_RB(16)) ")\n"
/*Alias regs for incoming fourspinor on neighbour site*/
#define Chi_20 UChi_00
#define Chi_21 UChi_01
#define Chi_22 UChi_02
#define Chi_30 UChi_10
#define Chi_31 UChi_11
#define Chi_32 UChi_12
/*********************************************************
* Architectural macros
*********************************************************/
#define HASHit(A) #A
#define HASH(A) HASHit(A)
#define LOAD64(A,ptr)
#define MASK_REGS /*NOOP ON BGQ*/
#define PF_GAUGE(A) /*NOOP ON BGQ*/
#define PREFETCH1_CHIMU(base) /*NOOP ON BGQ*/
#define PREFETCH_CHIMU(base) /*NOOP ON BGQ*/
#define VLOADf(OFF,PTR,DEST) "qvlfsx " #DEST "," #PTR "," #OFF " ;\n"
#define VLOADuf(OFF,PTR,DEST) "qvlfsux " #DEST "," #PTR "," #OFF " ;\n"
#define VSTOREf(OFF,PTR,SRC) "qvstfsx " #SRC "," #PTR "," #OFF " ;\n"
#define VSTOREuf(OFF,PTR,SRC) "qvstfsux " #SRC "," #PTR "," #OFF " ;\n"
#define VSPLATf(A,B,DEST) "qvlfcsxa " #DEST "," #A "," #B ";\n"
#define VSIZEf (16)
#define VPERMIi(p) "qvgpci " #p ", 1217;\n"
#define VPERMi(A,p) "qvfperm " #A "," #A "," #A "," #p ";\n"
#define VPERMI(p) VPERMIi(p)
#define VPERM(A,p) VPERMi(A,p)
#define VLOADd(OFF,PTR,DEST) "qvlfdx " #DEST "," #PTR "," #OFF " ;\n"
#define VLOADud(OFF,PTR,DEST) "qvlfdux " #DEST "," #PTR "," #OFF " ;\n"
#define VSTOREd(OFF,PTR,SRC) "qvstfdx " #SRC "," #PTR "," #OFF " ;\n"
#define VSTOREud(OFF,PTR,SRC) "qvstfdux " #SRC "," #PTR "," #OFF " ;\n"
#define VSPLATd(A,B,DEST) "qvlfcdxa " #DEST "," #A "," #B ";\n"
#define VSIZEd (32)
// QPX manual ordering QRT comes first (dest)
#define VZEROi(DEST) "qvfset " #DEST "; \n qvfsub " #DEST "," #DEST "," #DEST ";\n"
#define VONEi(DEST) "qvfset " #DEST "; \n"
#define VMOVi(DEST,A) "qvfmr " #DEST "," #A ";\n"
#define VADDi(DEST,A,B) "qvfadd " #DEST "," #A "," #B ";\n"
#define VSUBi(DEST,A,B) "qvfsub " #DEST "," #A "," #B ";\n"
#define VMULi(DEST,A,B) "qvfmul " #DEST "," #A "," #B ";\n"
#define VMUL_RR_RIi(DEST,A,B) "qvfxmul " #DEST "," #A "," #B ";\n"
#define VMADDi(DEST,A,B,C) "qvfmadd " #DEST "," #A "," #B ","#C ";\n"
#define VMADD_RR_RIi(DEST,A,B,C) "qvfxmadd " #DEST "," #A "," #B ","#C ";\n"
#define VMADD_MII_IRi(DEST,A,B,C) "qvfxxnpmadd " #DEST "," #B "," #A ","#C ";\n"
#define VMADD_II_MIRi(DEST,A,B,C) "qvfxxcpnmadd " #DEST "," #B "," #A ","#C ";\n"
#define VZERO(C) VZEROi(C)
#define VONE(C) VONEi(C)
#define VMOV(C,A) VMOVi(C,A)
#define VADD(A,B,C) VADDi(A,B,C)
#define VSUB(A,B,C) VSUBi(A,B,C)
#define VMUL(A,B,C) VMULi(A,B,C)
#define VMUL_RR_RI(A,B,C) VMUL_RR_RIi(A,B,C)
#define VMADD(A,B,C,D) VMADDi(A,B,C,D)
#define VMADD_RR_RI(A,B,C,D) VMADD_RR_RIi(A,B,C,D)
#define VMADD_MII_IR(A,B,C,D) VMADD_MII_IRi(A,B,C,D)
#define VMADD_II_MIR(A,B,C,D) VMADD_II_MIRi(A,B,C,D)
/*********************************************************
* Macro sequences encoding QCD
*********************************************************/
#define LOCK_GAUGE(dir) \
{ \
uint64_t byte_addr = (uint64_t)&U._odata[sU]; \
int count = (sizeof(U._odata[0])+63)/64; \
asm (" mtctr %0 \n" \
" mr " HASH(REP) ", %1\n" \
" li " HASH(IMM) ", 64\n" \
"0:\n" \
LOCK_SET \
" add " HASH(REP) "," HASH(IMM) "," HASH(REP) "\n" \
" bdnz 0b\n" \
: : "b" (count), "b" (byte_addr) ); \
}
#define UNLOCK_GAUGE(dir) \
{ \
uint64_t byte_addr = (uint64_t)&U._odata[sU]; \
int count = (sizeof(U._odata[0])+63)/64; \
asm (" mtctr %0 \n" \
" mr " HASH(REP) ", %1\n" \
" li " HASH(IMM) ", 64\n" \
"0:\n" \
LOCK_CLEAR \
" add " HASH(REP) "," HASH(IMM) "," HASH(REP) "\n" \
" bdnz 0b\n" \
: : "b" (count), "b" (byte_addr) ); \
}
#define ZERO_PSI \
VZERO(psi_00) \
VZERO(psi_01) \
VZERO(psi_02) \
VZERO(psi_10) \
VZERO(psi_11) \
VZERO(psi_12) \
VZERO(psi_20) \
VZERO(psi_21) \
VZERO(psi_22) \
VZERO(psi_30) \
VZERO(psi_31) \
VZERO(psi_32)
#define MULT_2SPIN_QPX_LSd(ptr,p) MULT_2SPIN_QPX_INTERNAL(ptr,p,VSPLAT,16)
#define MULT_2SPIN_QPX_LSf(ptr,p) MULT_2SPIN_QPX_INTERNAL(ptr,p,VSPLAT,8)
#define MULT_2SPIN_QPXd(ptr,p) MULT_2SPIN_QPX_INTERNAL(ptr,p,VLOAD,32)
#define MULT_2SPIN_QPXf(ptr,p) MULT_2SPIN_QPX_INTERNAL(ptr,p,VLOAD,16)
#define MULT_2SPIN_QPX_INTERNAL(ptr,p,ULOAD,USKIP) { \
uint64_t ub = ((uint64_t)ptr); \
asm ( \
ULOAD(%0,%3,U0) \
ULOAD(%1,%3,U1) \
ULOAD(%2,%3,U2) \
VMUL_RR_RI(UChi_00,U0,Chi_00) \
VMUL_RR_RI(UChi_01,U1,Chi_00) \
VMUL_RR_RI(UChi_02,U2,Chi_00) \
VMUL_RR_RI(UChi_10,U0,Chi_10) \
VMUL_RR_RI(UChi_11,U1,Chi_10) \
VMUL_RR_RI(UChi_12,U2,Chi_10) \
VMADD_MII_IR(UChi_00,U0,Chi_00,UChi_00) \
VMADD_MII_IR(UChi_01,U1,Chi_00,UChi_01) \
VMADD_MII_IR(UChi_02,U2,Chi_00,UChi_02) \
VMADD_MII_IR(UChi_10,U0,Chi_10,UChi_10) \
VMADD_MII_IR(UChi_11,U1,Chi_10,UChi_11) \
VMADD_MII_IR(UChi_12,U2,Chi_10,UChi_12) \
: : "b" (0), "b" (USKIP*3), "b" (USKIP*6), "b" (ub )); \
asm ( \
ULOAD(%0,%3,U0) \
ULOAD(%1,%3,U1) \
ULOAD(%2,%3,U2) \
VMADD_RR_RI(UChi_00,U0,Chi_01,UChi_00) \
VMADD_RR_RI(UChi_01,U1,Chi_01,UChi_01) \
VMADD_RR_RI(UChi_02,U2,Chi_01,UChi_02) \
VMADD_RR_RI(UChi_10,U0,Chi_11,UChi_10) \
VMADD_RR_RI(UChi_11,U1,Chi_11,UChi_11) \
VMADD_RR_RI(UChi_12,U2,Chi_11,UChi_12) \
VMADD_MII_IR(UChi_00,U0,Chi_01,UChi_00) \
VMADD_MII_IR(UChi_01,U1,Chi_01,UChi_01) \
VMADD_MII_IR(UChi_02,U2,Chi_01,UChi_02) \
VMADD_MII_IR(UChi_10,U0,Chi_11,UChi_10) \
VMADD_MII_IR(UChi_11,U1,Chi_11,UChi_11) \
VMADD_MII_IR(UChi_12,U2,Chi_11,UChi_12) \
: : "b" (USKIP*1), "b" (USKIP*4), "b" (USKIP*7), "b" (ub )); \
asm ( \
ULOAD(%0,%3,U0) \
ULOAD(%1,%3,U1) \
ULOAD(%2,%3,U2) \
VMADD_RR_RI(UChi_00,U0,Chi_02,UChi_00) \
VMADD_RR_RI(UChi_01,U1,Chi_02,UChi_01) \
VMADD_RR_RI(UChi_02,U2,Chi_02,UChi_02) \
VMADD_RR_RI(UChi_10,U0,Chi_12,UChi_10) \
VMADD_RR_RI(UChi_11,U1,Chi_12,UChi_11) \
VMADD_RR_RI(UChi_12,U2,Chi_12,UChi_12) \
VMADD_MII_IR(UChi_00,U0,Chi_02,UChi_00) \
VMADD_MII_IR(UChi_01,U1,Chi_02,UChi_01) \
VMADD_MII_IR(UChi_02,U2,Chi_02,UChi_02) \
VMADD_MII_IR(UChi_10,U0,Chi_12,UChi_10) \
VMADD_MII_IR(UChi_11,U1,Chi_12,UChi_11) \
VMADD_MII_IR(UChi_12,U2,Chi_12,UChi_12) \
: : "b" (USKIP*2), "b" (USKIP*5), "b" (USKIP*8), "b" (ub )); \
}
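// The three asm blocks above gather the link elements at offsets {0,3,6},
// {1,4,7} and {2,5,8} (in units of USKIP) and accumulate them against the
// Chi colour components 0, 1 and 2 respectively; in each block the
// VMUL/VMADD_RR_RI forms do the "real" half and the VMADD_MII_IR forms the
// "imaginary" half of the complex multiply-accumulate into UChi.
// Illustrative use (a sketch only -- sU, Xp and pf stand for the site index,
// direction and prefetch pointer of the surrounding kernel):
//   LOAD_CHI(base);
//   MULT_2SPIN_QPXd(&U._odata[sU](Xp), pf);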
#define MULT_2SPIN_DIR_PF(A,p) MULT_2SPIN_PF(&U._odata[sU](A),p)
#define MULT_2SPIN_PF(ptr,pf) MULT_2SPIN(ptr,pf)
#define SAVE_RESULT(base,basep) {\
uint64_t ub = ((uint64_t)base) - (VSIZE); \
asm("mr " HASH(REP) ", %0;\n" \
"li " HASH(IMM) "," HASH(VSIZE)" ;\n" \
VSTOREu(IMM,REP,psi_00) \
VSTOREu(IMM,REP,psi_01) \
VSTOREu(IMM,REP,psi_02) \
VSTOREu(IMM,REP,psi_10) \
VSTOREu(IMM,REP,psi_11) \
VSTOREu(IMM,REP,psi_12) \
VSTOREu(IMM,REP,psi_20) \
VSTOREu(IMM,REP,psi_21) \
VSTOREu(IMM,REP,psi_22) \
VSTOREu(IMM,REP,psi_30) \
VSTOREu(IMM,REP,psi_31) \
VSTOREu(IMM,REP,psi_32) \
: : "b" (ub) : HASH(pIMM), HASH(pREP) ); \
}
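// SAVE_RESULT pre-biases the base address by -VSIZE and then uses the
// update-form stores (qvstf*ux), so the first store lands on `base` and each
// subsequent store advances REP by IMM = VSIZE.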
/*
 *Annoying BG/Q loads with no immediate indexing and a big performance hit
 *when a second miss to an L1 line occurs
*/
#define LOAD_CHI(base) { \
uint64_t ub = ((uint64_t)base) - (2*VSIZE); \
asm("mr " HASH(REP) ",%0 ;\n" \
"li " HASH(IMM) ",(2*" HASH(VSIZE) ");\n" \
VLOADu(IMM,REP,Chi_00) \
VLOADu(IMM,REP,Chi_02) \
VLOADu(IMM,REP,Chi_11) : : "b" (ub) : HASH(pIMM), HASH(pREP) ); \
ub = ((uint64_t)base) - VSIZE; \
asm("mr " HASH(REP) ", %0;\n" \
"li " HASH(IMM) ",(2*" HASH(VSIZE) ");\n" \
VLOADu(IMM,REP,Chi_01) \
VLOADu(IMM,REP,Chi_10) \
VLOADu(IMM,REP,Chi_12) : : "b" (ub) : HASH(pIMM), HASH(pREP) ); \
}
#define LOAD_CHIMU(base) { \
uint64_t ub = ((uint64_t)base) - (2*VSIZE); \
asm("mr " HASH(REP) ",%0;\n" \
"li " HASH(IMM) ",(2*" HASH(VSIZE) ");\n" \
VLOADu(IMM,REP,Chi_00) \
VLOADu(IMM,REP,Chi_02) \
VLOADu(IMM,REP,Chi_11) \
VLOADu(IMM,REP,Chi_20) \
VLOADu(IMM,REP,Chi_22) \
VLOADu(IMM,REP,Chi_31) : : "b" (ub) : HASH(pIMM), HASH(pREP) ); \
ub = ((uint64_t)base) - VSIZE; \
asm("mr " HASH(REP) ", %0;\n" \
"li " HASH(IMM) ", (2*" HASH(VSIZE) ");\n" \
VLOADu(IMM,REP,Chi_01) \
VLOADu(IMM,REP,Chi_10) \
VLOADu(IMM,REP,Chi_12) \
VLOADu(IMM,REP,Chi_21) \
VLOADu(IMM,REP,Chi_30) \
VLOADu(IMM,REP,Chi_32) : : "b" (ub) : HASH(pIMM), HASH(pREP) ); \
}
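// LOAD_CHI/LOAD_CHIMU stride by 2*VSIZE and split the loads across two asm
// blocks offset by VSIZE, so back-to-back loads within a block touch different
// L1 lines; the other half of each line is picked up by the second block,
// sidestepping the second-miss penalty noted above.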
// hspin(0)=fspin(0)+timesI(fspin(3));
// hspin(1)=fspin(1)+timesI(fspin(2));
#define XP_PROJMEM(base) { \
LOAD_CHIMU(base); \
asm ( \
VONE(one) \
VMADD_MII_IR(Chi_00,one,Chi_30,Chi_00) \
VMADD_MII_IR(Chi_01,one,Chi_31,Chi_01) \
VMADD_MII_IR(Chi_02,one,Chi_32,Chi_02) \
VMADD_MII_IR(Chi_10,one,Chi_20,Chi_10) \
VMADD_MII_IR(Chi_11,one,Chi_21,Chi_11) \
VMADD_MII_IR(Chi_12,one,Chi_22,Chi_12) \
); \
}
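// hspin(0)=fspin(0)-timesI(fspin(3));
// hspin(1)=fspin(1)-timesI(fspin(2));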
#define XM_PROJMEM(base) { \
LOAD_CHIMU(base); \
asm ( \
VONE(one) \
VMADD_II_MIR(Chi_00,one,Chi_30,Chi_00) \
VMADD_II_MIR(Chi_01,one,Chi_31,Chi_01) \
VMADD_II_MIR(Chi_02,one,Chi_32,Chi_02) \
VMADD_II_MIR(Chi_10,one,Chi_20,Chi_10) \
VMADD_II_MIR(Chi_11,one,Chi_21,Chi_11) \
VMADD_II_MIR(Chi_12,one,Chi_22,Chi_12) \
); \
}
// hspin(0)=fspin(0)-fspin(3);
// hspin(1)=fspin(1)+fspin(2);
#define YP_PROJMEM(base) { \
LOAD_CHIMU(base); \
asm ( \
VSUB(Chi_00,Chi_00,Chi_30) \
VSUB(Chi_01,Chi_01,Chi_31) \
VSUB(Chi_02,Chi_02,Chi_32) \
VADD(Chi_10,Chi_10,Chi_20) \
VADD(Chi_11,Chi_11,Chi_21) \
VADD(Chi_12,Chi_12,Chi_22) \
); \
}
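// hspin(0)=fspin(0)+fspin(3);
// hspin(1)=fspin(1)-fspin(2);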
#define YM_PROJMEM(base) { \
LOAD_CHIMU(base); \
asm ( \
VADD(Chi_00,Chi_00,Chi_30) \
VADD(Chi_01,Chi_01,Chi_31) \
VADD(Chi_02,Chi_02,Chi_32) \
VSUB(Chi_10,Chi_10,Chi_20) \
VSUB(Chi_11,Chi_11,Chi_21) \
VSUB(Chi_12,Chi_12,Chi_22) ); \
}
/*Gz
* 0 0 i 0 [0]+-i[2]
* 0 0 0 -i [1]-+i[3]
* -i 0 0 0
* 0 i 0 0
*/
#define ZP_PROJMEM(base) { \
LOAD_CHIMU(base); \
asm ( \
VONE(one) \
VMADD_MII_IR(Chi_00,one,Chi_20,Chi_00) \
VMADD_MII_IR(Chi_01,one,Chi_21,Chi_01) \
VMADD_MII_IR(Chi_02,one,Chi_22,Chi_02) \
VMADD_II_MIR(Chi_10,one,Chi_30,Chi_10) \
VMADD_II_MIR(Chi_11,one,Chi_31,Chi_11) \
VMADD_II_MIR(Chi_12,one,Chi_32,Chi_12) \
); \
}
#define ZM_PROJMEM(base) { \
LOAD_CHIMU(base); \
asm ( \
VONE(one) \
VMADD_II_MIR(Chi_00,one,Chi_20,Chi_00) \
VMADD_II_MIR(Chi_01,one,Chi_21,Chi_01) \
VMADD_II_MIR(Chi_02,one,Chi_22,Chi_02) \
VMADD_MII_IR(Chi_10,one,Chi_30,Chi_10) \
VMADD_MII_IR(Chi_11,one,Chi_31,Chi_11) \
VMADD_MII_IR(Chi_12,one,Chi_32,Chi_12) \
); \
}
/*Gt
* 0 0 1 0 [0]+-[2]
* 0 0 0 1 [1]+-[3]
* 1 0 0 0
* 0 1 0 0
*/
#define TP_PROJMEM(base) { \
LOAD_CHIMU(base); \
asm ( \
VADD(Chi_00,Chi_00,Chi_20) \
VADD(Chi_01,Chi_01,Chi_21) \
VADD(Chi_02,Chi_02,Chi_22) \
VADD(Chi_10,Chi_10,Chi_30) \
VADD(Chi_11,Chi_11,Chi_31) \
VADD(Chi_12,Chi_12,Chi_32) \
); \
}
#define TM_PROJMEM(base) { \
LOAD_CHIMU(base); \
asm ( \
VSUB(Chi_00,Chi_00,Chi_20) \
VSUB(Chi_01,Chi_01,Chi_21) \
VSUB(Chi_02,Chi_02,Chi_22) \
VSUB(Chi_10,Chi_10,Chi_30) \
VSUB(Chi_11,Chi_11,Chi_31) \
VSUB(Chi_12,Chi_12,Chi_32) \
); \
}
/*
fspin(0)=hspin(0);
fspin(1)=hspin(1);
fspin(2)=timesMinusI(hspin(1));
fspin(3)=timesMinusI(hspin(0));
fspin(0)+=hspin(0);
fspin(1)+=hspin(1);
fspin(2)-=timesI(hspin(1));
fspin(3)-=timesI(hspin(0));
*/
#define XP_RECON { \
asm(\
VONE(one)\
VMOV(psi_00,UChi_00) VMOV(psi_01,UChi_01) VMOV(psi_02,UChi_02)\
VMOV(psi_10,UChi_10) VMOV(psi_11,UChi_11) VMOV(psi_12,UChi_12)\
VZERO(psi_20) VZERO(psi_21) VZERO(psi_22) \
VZERO(psi_30) VZERO(psi_31) VZERO(psi_32) \
VMADD_II_MIR(psi_20,one,UChi_10,psi_20) \
VMADD_II_MIR(psi_21,one,UChi_11,psi_21) \
VMADD_II_MIR(psi_22,one,UChi_12,psi_22) \
VMADD_II_MIR(psi_30,one,UChi_00,psi_30) \
VMADD_II_MIR(psi_31,one,UChi_01,psi_31) \
VMADD_II_MIR(psi_32,one,UChi_02,psi_32) \
); \
}
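// fspin(0)=hspin(0);
// fspin(1)=hspin(1);
// fspin(2)=timesI(hspin(1));
// fspin(3)=timesI(hspin(0));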
#define XM_RECON { \
asm(\
VONE(one)\
VMOV(psi_00,UChi_00) VMOV(psi_01,UChi_01) VMOV(psi_02,UChi_02)\
VMOV(psi_10,UChi_10) VMOV(psi_11,UChi_11) VMOV(psi_12,UChi_12)\
VZERO(psi_20) VZERO(psi_21) VZERO(psi_22) \
VZERO(psi_30) VZERO(psi_31) VZERO(psi_32) \
VMADD_MII_IR(psi_20,one,UChi_10,psi_20) \
VMADD_MII_IR(psi_21,one,UChi_11,psi_21) \
VMADD_MII_IR(psi_22,one,UChi_12,psi_22) \
VMADD_MII_IR(psi_30,one,UChi_00,psi_30) \
VMADD_MII_IR(psi_31,one,UChi_01,psi_31) \
VMADD_MII_IR(psi_32,one,UChi_02,psi_32) \
); \
}
#define XP_RECON_ACCUM { \
asm(\
VONE(one)\
VADD(psi_00,psi_00,UChi_00) VADD(psi_01,psi_01,UChi_01) VADD(psi_02,psi_02,UChi_02) \
VADD(psi_10,psi_10,UChi_10) VADD(psi_11,psi_11,UChi_11) VADD(psi_12,psi_12,UChi_12) \
VMADD_II_MIR(psi_20,one,UChi_10,psi_20) \
VMADD_II_MIR(psi_21,one,UChi_11,psi_21) \
VMADD_II_MIR(psi_22,one,UChi_12,psi_22) \
VMADD_II_MIR(psi_30,one,UChi_00,psi_30) \
VMADD_II_MIR(psi_31,one,UChi_01,psi_31) \
VMADD_II_MIR(psi_32,one,UChi_02,psi_32) \
); \
}
#define XM_RECON_ACCUM { \
asm(\
VONE(one)\
VADD(psi_00,psi_00,UChi_00) VADD(psi_01,psi_01,UChi_01) VADD(psi_02,psi_02,UChi_02) \
VADD(psi_10,psi_10,UChi_10) VADD(psi_11,psi_11,UChi_11) VADD(psi_12,psi_12,UChi_12) \
VMADD_MII_IR(psi_20,one,UChi_10,psi_20) \
VMADD_MII_IR(psi_21,one,UChi_11,psi_21) \
VMADD_MII_IR(psi_22,one,UChi_12,psi_22) \
VMADD_MII_IR(psi_30,one,UChi_00,psi_30) \
VMADD_MII_IR(psi_31,one,UChi_01,psi_31) \
VMADD_MII_IR(psi_32,one,UChi_02,psi_32) \
); \
}
// fspin(2)+=hspin(1);
// fspin(3)-=hspin(0);
#define YP_RECON_ACCUM {\
asm(\
VADD(psi_00,psi_00,UChi_00) VADD(psi_01,psi_01,UChi_01) VADD(psi_02,psi_02,UChi_02) \
VADD(psi_10,psi_10,UChi_10) VADD(psi_11,psi_11,UChi_11) VADD(psi_12,psi_12,UChi_12) \
VADD(psi_20,psi_20,UChi_10) VADD(psi_21,psi_21,UChi_11) VADD(psi_22,psi_22,UChi_12) \
VSUB(psi_30,psi_30,UChi_00) VSUB(psi_31,psi_31,UChi_01) VSUB(psi_32,psi_32,UChi_02) \
);\
}
#define YM_RECON_ACCUM {\
asm(\
VADD(psi_00,psi_00,UChi_00) VADD(psi_01,psi_01,UChi_01) VADD(psi_02,psi_02,UChi_02) \
VADD(psi_10,psi_10,UChi_10) VADD(psi_11,psi_11,UChi_11) VADD(psi_12,psi_12,UChi_12) \
VSUB(psi_20,psi_20,UChi_10) VSUB(psi_21,psi_21,UChi_11) VSUB(psi_22,psi_22,UChi_12) \
VADD(psi_30,psi_30,UChi_00) VADD(psi_31,psi_31,UChi_01) VADD(psi_32,psi_32,UChi_02) \
);\
}
// fspin(2)-=timesI(hspin(0));
// fspin(3)+=timesI(hspin(1));
#define ZP_RECON_ACCUM {\
asm(\
VONE(one)\
VADD(psi_00,psi_00,UChi_00) VADD(psi_01,psi_01,UChi_01) VADD(psi_02,psi_02,UChi_02) \
VADD(psi_10,psi_10,UChi_10) VADD(psi_11,psi_11,UChi_11) VADD(psi_12,psi_12,UChi_12) \
VMADD_II_MIR(psi_20,one,UChi_00,psi_20) \
VMADD_II_MIR(psi_21,one,UChi_01,psi_21) \
VMADD_II_MIR(psi_22,one,UChi_02,psi_22) \
VMADD_MII_IR(psi_30,one,UChi_10,psi_30) \
VMADD_MII_IR(psi_31,one,UChi_11,psi_31) \
VMADD_MII_IR(psi_32,one,UChi_12,psi_32) \
);\
}
#define ZM_RECON_ACCUM {\
asm(\
VONE(one)\
VADD(psi_00,psi_00,UChi_00) VADD(psi_01,psi_01,UChi_01) VADD(psi_02,psi_02,UChi_02) \
VADD(psi_10,psi_10,UChi_10) VADD(psi_11,psi_11,UChi_11) VADD(psi_12,psi_12,UChi_12) \
VMADD_MII_IR(psi_20,one,UChi_00,psi_20) \
VMADD_MII_IR(psi_21,one,UChi_01,psi_21) \
VMADD_MII_IR(psi_22,one,UChi_02,psi_22) \
VMADD_II_MIR(psi_30,one,UChi_10,psi_30) \
VMADD_II_MIR(psi_31,one,UChi_11,psi_31) \
VMADD_II_MIR(psi_32,one,UChi_12,psi_32) \
);\
}
// fspin(2)+=hspin(0);
// fspin(3)+=hspin(1);
#define TP_RECON_ACCUM {\
asm(\
VADD(psi_00,psi_00,UChi_00) VADD(psi_01,psi_01,UChi_01) VADD(psi_02,psi_02,UChi_02) \
VADD(psi_10,psi_10,UChi_10) VADD(psi_11,psi_11,UChi_11) VADD(psi_12,psi_12,UChi_12) \
VADD(psi_20,psi_20,UChi_00) VADD(psi_21,psi_21,UChi_01) VADD(psi_22,psi_22,UChi_02) \
VADD(psi_30,psi_30,UChi_10) VADD(psi_31,psi_31,UChi_11) VADD(psi_32,psi_32,UChi_12) \
);\
}
#define TM_RECON_ACCUM {\
asm(\
VADD(psi_00,psi_00,UChi_00) VADD(psi_01,psi_01,UChi_01) VADD(psi_02,psi_02,UChi_02) \
VADD(psi_10,psi_10,UChi_10) VADD(psi_11,psi_11,UChi_11) VADD(psi_12,psi_12,UChi_12) \
VSUB(psi_20,psi_20,UChi_00) VSUB(psi_21,psi_21,UChi_01) VSUB(psi_22,psi_22,UChi_02) \
VSUB(psi_30,psi_30,UChi_10) VSUB(psi_31,psi_31,UChi_11) VSUB(psi_32,psi_32,UChi_12) \
);\
}
#define ADD_RESULTi(PTR,pf) \
LOAD_CHIMU(PTR) \
asm( \
  VADD(psi_00,Chi_00,psi_00) VADD(psi_01,Chi_01,psi_01) VADD(psi_02,Chi_02,psi_02) \
  VADD(psi_10,Chi_10,psi_10) VADD(psi_11,Chi_11,psi_11) VADD(psi_12,Chi_12,psi_12) \
  VADD(psi_20,Chi_20,psi_20) VADD(psi_21,Chi_21,psi_21) VADD(psi_22,Chi_22,psi_22) \
  VADD(psi_30,Chi_30,psi_30) VADD(psi_31,Chi_31,psi_31) VADD(psi_32,Chi_32,psi_32) ); \
SAVE_RESULT(PTR,pf);
#define PERMUTE_DIR3
#define PERMUTE_DIR2
#define PERMUTE_DIR1
#define PERMUTE_DIR0 { \
asm( \
VPERMI(perm_reg) \
VPERM(Chi_00,perm_reg) VPERM(Chi_01,perm_reg) VPERM(Chi_02,perm_reg) \
VPERM(Chi_10,perm_reg) VPERM(Chi_11,perm_reg) VPERM(Chi_12,perm_reg) ); \
}
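// qvgpci 1217 = 0b010'011'000'001, i.e. the element selection (2,3,0,1), which
// (presumably) swaps the two complex pairs within the 4-wide QPX vector for the
// innermost-direction permute; PERMUTE_DIR1-DIR3 need no SIMD permute here.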
#endif

@ -0,0 +1,46 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/simd/Avx512Asm.h
Copyright (C) 2015
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
// No guard; ok multi-include
#undef VSIZE
#undef VLOAD
#undef VLOADu
#undef VSPLAT
#undef VSTORE
#undef VSTOREu
#undef MULT_2SPIN_QPX_LS
#undef MULT_2SPIN_QPX
#define VSIZE VSIZEd
#define VLOAD(A,B,C) VLOADd(A,B,C)
#define VLOADu(A,B,C) VLOADud(A,B,C)
#define VSPLAT(A,B,DEST) VSPLATd(A,B,DEST)
#define VSTORE(A,B,C) VSTOREd(A,B,C)
#define VSTOREu(A,B,C) VSTOREud(A,B,C)
#define MULT_2SPIN_QPX_LS(ptr,p) MULT_2SPIN_QPX_LSd(ptr,p)
#define MULT_2SPIN_QPX(ptr,p) MULT_2SPIN_QPXd(ptr,p)

@ -0,0 +1,46 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/simd/Avx512Asm.h
Copyright (C) 2015
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
// No guard; ok multi-include
#undef VSIZE
#undef VLOAD
#undef VLOADu
#undef VSPLAT
#undef VSTORE
#undef VSTOREu
#undef MULT_2SPIN_QPX_LS
#undef MULT_2SPIN_QPX
#define VSIZE VSIZEf
#define VLOAD(A,B,C) VLOADf(A,B,C)
#define VLOADu(A,B,C) VLOADuf(A,B,C)
#define VSPLAT(A,B,DEST) VSPLATf(A,B,DEST)
#define VSTORE(A,B,C) VSTOREf(A,B,C)
#define VSTOREu(A,B,C) VSTOREuf(A,B,C)
#define MULT_2SPIN_QPX_LS(ptr,p) MULT_2SPIN_QPX_LSf(ptr,p)
#define MULT_2SPIN_QPX(ptr,p) MULT_2SPIN_QPXf(ptr,p)

205
Grid/simd/Intel512avx.h Normal file
@ -0,0 +1,205 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/simd/Avx512Asm.h
Copyright (C) 2015
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_ASM_AV512_H
#define GRID_ASM_AV512_H
////////////////////////////////////////////////////////////
// Knights Landing specials
////////////////////////////////////////////////////////////
#define ZLOADf(OFF,PTR,ri,ir) VLOADf(OFF,PTR,ir) VSHUFf(ir,ri)
#define ZLOADd(OFF,PTR,ri,ir) VLOADd(OFF,PTR,ir) VSHUFd(ir,ri)
#define ZMULf(Ari,Air,B,Criir,Ciirr) VMULf(Ari,B,Criir) VMULf(Air,B,Ciirr)
#define ZMULd(Ari,Air,B,Criir,Ciirr) VMULd(Ari,B,Criir) VMULd(Air,B,Ciirr)
#define ZMADDf(Ari,Air,B,Criir,Ciirr) VMADDf(Ari,B,Criir) VMADDf(Air,B,Ciirr)
#define ZMADDd(Ari,Air,B,Criir,Ciirr) VMADDd(Ari,B,Criir) VMADDd(Air,B,Ciirr)
#define ZENDf(Criir,Ciirr, tmp) ZEND1f(Criir,Ciirr, tmp) ZEND2f(Criir,Ciirr, tmp)
#define ZENDd(Criir,Ciirr, tmp) ZEND1d(Criir,Ciirr, tmp) ZEND2d(Criir,Ciirr, tmp)
#define ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
VSHUFMEMf(O,P,tmp) \
VMULMEMf(O,P,B,Biirr) \
VMULMEMf(O,P,C,Ciirr) \
VMULf(tmp,B,Briir) \
VMULf(tmp,C,Criir)
#define ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
VSHUFMEMd(O,P,tmp) \
VMULMEMd(O,P,B,Biirr) \
VMULMEMd(O,P,C,Ciirr) \
VMULd(tmp,B,Briir) \
VMULd(tmp,C,Criir)
#define ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
VSHUFMEMf(O,P,tmp) \
VMADDMEMf(O,P,B,Biirr) \
VMADDMEMf(O,P,C,Ciirr) \
VMADDf(tmp,B,Briir) \
VMADDf(tmp,C,Criir)
#define ZMADDMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) \
VSHUFMEMd(O,P,tmp) \
VMADDMEMd(O,P,B,Biirr) \
VMADDMEMd(O,P,C,Ciirr) \
VMADDd(tmp,B,Briir) \
VMADDd(tmp,C,Criir)
// Merges accumulation for a complex dot-product chain; less efficient under AVX-512
#define ZEND1f(Criir,Ciirr, tmp) "vshufps $0xb1," #Criir "," #Criir "," #tmp ";\n"\
"vaddps " #tmp "," #Criir "," #Criir"{%k6}" ";\n"
#define ZEND2f(Criir,Ciirr, tmp) "vshufps $0xb1," #Ciirr "," #Ciirr "," #tmp ";\n"\
"vsubps " #tmp "," #Ciirr "," #Criir"{%k7}" ";\n"
#define ZEND1d(Criir,Ciirr, tmp) "vshufpd $0x55," #Criir "," #Criir "," #tmp ";\n"\
"vaddps " #tmp "," #Criir "," #Criir"{%k6}" ";\n"
#define ZEND2d(Criir,Ciirr, tmp) "vshufpd $0x55," #Ciirr "," #Ciirr "," #tmp ";\n"\
"vsubpd " #tmp "," #Ciirr "," #Criir"{%k7};\n" // ri+ir ; ri+ir,rr-ii
#define VMOVRDUPd(OFF,A,DEST) "vpshufd $0x44," #OFF "*64(" #A ")," #DEST ";\n" // 32 bit level: 1,0,3,2
#define VMOVIDUPd(OFF,A,DEST) "vpshufd $0xee," #OFF "*64(" #A ")," #DEST ";\n" // 32 bit level: 3,2,3,2
#define VMOVRDUPf(OFF,PTR,DEST) "vmovsldup " #OFF "*64(" #PTR "), " #DEST ";\n"
#define VMOVIDUPf(OFF,PTR,DEST) "vmovshdup " #OFF "*64(" #PTR "), " #DEST ";\n"
#define VRDUPd(SRC,DEST) "vpshufd $0x44," #SRC"," #DEST ";\n" // 32 bit level: 1,0,3,2
#define VRDUPf(SRC,DEST) "vmovsldup " #SRC ", " #DEST ";\n"
#define VIDUPd(SRC,DEST) "vpshufd $0xee," #SRC"," #DEST ";\n" // 32 bit level: 3,2,3,2
#define VIDUPf(SRC,DEST) "vmovshdup " #SRC ", " #DEST ";\n"
#define VBCASTRDUPd(OFF,A,DEST) "vbroadcastsd (" #OFF "*16+0)(" #A ")," #DEST ";\n"
#define VBCASTIDUPd(OFF,A,DEST) "vbroadcastsd (" #OFF "*16+8)(" #A ")," #DEST ";\n"
#define VBCASTRDUPf(OFF,PTR,DEST) "vbroadcastss (" #OFF "*8 +0)(" #PTR "), " #DEST ";\n"
#define VBCASTIDUPf(OFF,PTR,DEST) "vbroadcastss (" #OFF "*8 +4)(" #PTR "), " #DEST ";\n"
#define VBCASTCDUPf(OFF,A,DEST) "vbroadcastsd (" #OFF "*64 )(" #A ")," #DEST ";\n"
#define VBCASTZDUPf(OFF,A,DEST) "vbroadcastf32x4 (" #OFF "*64 )(" #A ")," #DEST ";\n"
#define VBCASTCDUP(OFF,A,DEST) VBCASTCDUPf(OFF,A,DEST)
#define VBCASTZDUP(OFF,A,DEST) VBCASTZDUPf(OFF,A,DEST)
#define VMADDSUBf(A,B,accum) "vfmaddsub231ps " #A "," #B "," #accum ";\n"
#define VMADDSUBd(A,B,accum) "vfmaddsub231pd " #A "," #B "," #accum ";\n"
#define VMADDSUBMEMf(O,P,B,accum) "vfmaddsub231ps " #O"*64("#P "),"#B "," #accum ";\n"
#define VMADDSUBMEMd(O,P,B,accum) "vfmaddsub231pd " #O"*64("#P "),"#B "," #accum ";\n"
#define VMADDRDUPf(O,P,B,accum) "vfmadd231ps (" #O"*8+0)("#P "){1to16},"#B "," #accum ";\n"
#define VMADDIDUPf(O,P,B,accum) "vfmadd231ps (" #O"*8+4)("#P "){1to16},"#B "," #accum ";\n"
#define VMADDSUBRDUPf(O,P,B,accum) "vfmaddsub231ps (" #O"*8+0)("#P "){1to16},"#B "," #accum ";\n"
#define VMADDSUBIDUPf(O,P,B,accum) "vfmaddsub231ps (" #O"*8+4)("#P "){1to16},"#B "," #accum ";\n"
#define VMULRDUPf(O,P,B,accum) "vmulps (" #O"*8+0)("#P "){1to16},"#B "," #accum ";\n"
#define VMULIDUPf(O,P,B,accum) "vmulps (" #O"*8+4)("#P "){1to16},"#B "," #accum ";\n"
#define VMADDRDUPd(O,P,B,accum) "vfmadd231pd (" #O"*16+0)("#P "){1to8},"#B "," #accum ";\n"
#define VMADDIDUPd(O,P,B,accum) "vfmadd231pd (" #O"*16+8)("#P "){1to8},"#B "," #accum ";\n"
#define VMADDSUBRDUPd(O,P,B,accum) "vfmaddsub231pd (" #O"*16+0)("#P "){1to8},"#B "," #accum ";\n"
#define VMADDSUBIDUPd(O,P,B,accum) "vfmaddsub231pd (" #O"*16+8)("#P "){1to8},"#B "," #accum ";\n"
#define VMULRDUPd(O,P,B,accum) "vmulpd (" #O"*16+0)("#P "){1to8},"#B "," #accum ";\n"
#define VMULIDUPd(O,P,B,accum) "vmulpd (" #O"*16+8)("#P "){1to8},"#B "," #accum ";\n"
/*
* TimesI is used only in the XP recon
* Could zero the regs and use RECON_ACCUM
*/
#define VTIMESI0f(A,DEST, Z) VSHUFf(A,DEST)
#define VTIMESI1f(A,DEST, Z) "vaddps " #DEST "," #Z "," #DEST"{%k6}" ";\n"
#define VTIMESI2f(A,DEST, Z) "vsubps " #DEST "," #Z "," #DEST"{%k7}" ";\n"
#define VTIMESI0d(A,DEST, Z) VSHUFd(A,DEST)
#define VTIMESI1d(A,DEST, Z) "vaddpd " #DEST "," #Z "," #DEST"{%k6}" ";\n"
#define VTIMESI2d(A,DEST, Z) "vsubpd " #DEST "," #Z "," #DEST"{%k7}" ";\n"
#define VTIMESMINUSI0f(A,DEST,Z) VSHUFf(A,DEST)
#define VTIMESMINUSI1f(A,DEST,Z) "vsubps " #DEST "," #Z "," #DEST"{%k6}" ";\n"
#define VTIMESMINUSI2f(A,DEST,Z) "vaddps " #DEST "," #Z "," #DEST"{%k7}" ";\n"
#define VTIMESMINUSI0d(A,DEST,Z) VSHUFd(A,DEST)
#define VTIMESMINUSI1d(A,DEST,Z) "vsubpd " #DEST "," #Z "," #DEST"{%k6}" ";\n"
#define VTIMESMINUSI2d(A,DEST,Z) "vaddpd " #DEST "," #Z "," #DEST"{%k7}" ";\n"
#if 0
#define VACCTIMESMINUSI0f(A,ACC,tmp) VSHUFf(A,tmp)
#define VACCTIMESMINUSI1f(A,ACC,tmp) "vsubps " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
#define VACCTIMESMINUSI2f(A,ACC,tmp) "vaddps " #tmp "," #ACC "," #ACC"{%k7}" ";\n"
#define VACCTIMESMINUSI0d(A,ACC,tmp) VSHUFd(A,tmp)
#define VACCTIMESMINUSI1d(A,ACC,tmp) "vsubpd " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
#define VACCTIMESMINUSI2d(A,ACC,tmp) "vaddpd " #tmp "," #ACC "," #ACC"{%k7}" ";\n"
#define VACCTIMESI0f(A,ACC,tmp) VSHUFf(A,tmp)
#define VACCTIMESI1f(A,ACC,tmp) "vaddps " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
#define VACCTIMESI2f(A,ACC,tmp) "vsubps " #tmp "," #ACC "," #ACC"{%k7}" ";\n"
#define VACCTIMESI0d(A,ACC,tmp) VSHUFd(A,tmp)
#define VACCTIMESI1d(A,ACC,tmp) "vaddpd " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
#define VACCTIMESI2d(A,ACC,tmp) "vsubpd " #tmp "," #ACC "," #ACC"{%k7}" ";\n"
#else
// o_p must point to floating 1.0f/d
//
// Ai, Ar -> tmp (r i)
// tmp *1.0
// ACC i - Ar ; ACC r + Ai
#define VACCTIMESMINUSI0f(A,ACC,tmp) VSHUFf(A,tmp)
#define VACCTIMESMINUSI1f(A,ACC,tmp) VMADDMEMf(1,%r10,tmp,ACC)
#define VACCTIMESMINUSI2f(A,ACC,tmp)
#define VACCTIMESMINUSI0d(A,ACC,tmp) VSHUFd(A,tmp)
#define VACCTIMESMINUSI1d(A,ACC,tmp) VMADDMEMd(1,%r10,tmp,ACC)
#define VACCTIMESMINUSI2d(A,ACC,tmp)
// Ai, Ar -> tmp (r i)
// tmp *1.0
// ACC i + Ar ; ACC r - Ai
#define VACCTIMESI0f(A,ACC,tmp) VSHUFf(A,tmp)
#define VACCTIMESI1f(A,ACC,tmp) VMADDMEMf(0,%r10,tmp,ACC)
#define VACCTIMESI2f(A,ACC,tmp)
#define VACCTIMESI0d(A,ACC,tmp) VSHUFd(A,tmp)
#define VACCTIMESI1d(A,ACC,tmp) VMADDMEMd(0,%r10,tmp,ACC)
#define VACCTIMESI2d(A,ACC,tmp)
#endif
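// In the active (#else) variant above, the 0-step VSHUF swaps re/im into tmp
// and the 1-step is a single fused VMADDMEM against constants held at (%r10):
// offset 0 (in 64-byte vector units) is used by VACCTIMESI, offset 1 by
// VACCTIMESMINUSI, so %r10 must point at the +/-1.0 sign vectors referred to
// in the note above.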
#define VPERM0f(A,B) "vshuff32x4 $0x4e," #A "," #B "," #B ";\n"
#define VPERM1f(A,B) "vshuff32x4 $0xb1," #A "," #B "," #B ";\n"
#define VPERM2f(A,B) "vshufps $0x4e," #A "," #B "," #B ";\n"
#define VPERM3f(A,B) "vshufps $0xb1," #A "," #B "," #B ";\n"
#define VPERM0d(A,B) "vshuff64x2 $0x4e," #A "," #B "," #B ";\n"
#define VPERM1d(A,B) "vshuff64x2 $0xb1," #A "," #B "," #B ";\n"
#define VPERM2d(A,B) "vshufpd $0x55," #A "," #B "," #B ";\n"
#define VPERM3d(A,B) VMOVd(A,B)
#endif

159
Grid/simd/Intel512common.h Normal file
@ -0,0 +1,159 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/simd/Avx512Asm.h
Copyright (C) 2015
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_ASM_INTEL_COMMON_512_H
#define GRID_ASM_INTEL_COMMON_512_H
////////////////////////////////////////////////////////////////////////////////////////////////////
// Performance options
////////////////////////////////////////////////////////////////////////////////////////////////////
#undef AVX512_PF_L2_WRITE
////////////////////////////////////////////////////////////////////////////////////////////////////
// Opcodes common
////////////////////////////////////////////////////////////////////////////////////////////////////
#define MASK_REGS \
__asm__ ("mov $0xAAAA, %%eax \n"\
"kmovw %%eax, %%k6 \n"\
"mov $0x5555, %%eax \n"\
"kmovw %%eax, %%k7 \n" : : : "%eax");
//#define label(B) __asm__ ( __func__ _LINE__ #B ":\n" );
#define VZEROf(A) "vpxorq " #A "," #A "," #A ";\n"
#define VZEROd(A) "vpxorq " #A "," #A "," #A ";\n"
#define VTIMESIf(A,DEST, Z) \
VTIMESI0f(A,DEST, Z) \
VTIMESI1f(A,DEST, Z) \
VTIMESI2f(A,DEST, Z)
#define VTIMESId(A,DEST, Z) \
VTIMESI0d(A,DEST, Z) \
VTIMESI1d(A,DEST, Z) \
VTIMESI2d(A,DEST, Z)
#define VTIMESMINUSIf(A,DEST, Z) \
VTIMESMINUSI0f(A,DEST, Z) \
VTIMESMINUSI1f(A,DEST, Z) \
VTIMESMINUSI2f(A,DEST, Z)
#define VTIMESMINUSId(A,DEST, Z) \
VTIMESMINUSI0d(A,DEST, Z) \
VTIMESMINUSI1d(A,DEST, Z) \
VTIMESMINUSI2d(A,DEST, Z)
#define VACCTIMESIf(A,ACC,tmp) \
VACCTIMESI0f(A,ACC,tmp) \
VACCTIMESI1f(A,ACC,tmp) \
VACCTIMESI2f(A,ACC,tmp)
#define VACCTIMESId(A,ACC,tmp) \
VACCTIMESI0d(A,ACC,tmp) \
VACCTIMESI1d(A,ACC,tmp) \
VACCTIMESI2d(A,ACC,tmp)
#define VACCTIMESMINUSIf(A,ACC,tmp) \
VACCTIMESMINUSI0f(A,ACC,tmp) \
VACCTIMESMINUSI1f(A,ACC,tmp) \
VACCTIMESMINUSI2f(A,ACC,tmp)
#define VACCTIMESMINUSId(A,ACC,tmp) \
VACCTIMESMINUSI0d(A,ACC,tmp) \
VACCTIMESMINUSI1d(A,ACC,tmp) \
VACCTIMESMINUSI2d(A,ACC,tmp)
#define LOAD64a(A,ptr) "movq %0, %" #A : : "r"(ptr) : #A
#define LOAD64i(A,ptr) __asm__ ( LOAD64a(A,ptr));
#define LOAD64(A,ptr) LOAD64i(A,ptr)
#define VMOVf(A,DEST) "vmovaps " #A ", " #DEST ";\n"
#define VMOVd(A,DEST) "vmovapd " #A ", " #DEST ";\n"
#define VPREFETCH1(O,A) "prefetcht0 "#O"*64("#A");\n"
#define VPREFETCH2(O,A) "prefetcht1 "#O"*64("#A");\n"
#ifdef AVX512_PF_L2_WRITE
#define VPREFETCHW(O,A) "prefetchwt1 "#O"*64("#A");\n"
#else
#define VPREFETCHW(O,A)
#endif
#define VPREFETCHNTA(O,A)
#define VPREFETCH(O,A)
#define VEVICT(O,A)
//"vprefetche0 "#O"*64("#A");\n" "vprefetche1 ("#O"+12)*64("#A");\n"
// "clevict0 "#O"*64("#A");\n"
#define VLOADf(OFF,PTR,DEST) "vmovups " #OFF "*64(" #PTR "), " #DEST ";\n"
#define VLOADd(OFF,PTR,DEST) "vmovupd " #OFF "*64(" #PTR "), " #DEST ";\n"
#define VADDf(A,B,DEST) "vaddps " #A "," #B "," #DEST ";\n"
#define VADDd(A,B,DEST) "vaddpd " #A "," #B "," #DEST ";\n"
#define VSUBf(A,B,DEST) "vsubps " #A "," #B "," #DEST ";\n"
#define VSUBd(A,B,DEST) "vsubpd " #A "," #B "," #DEST ";\n"
#define VADDMEMf(O,A,B,DEST) "vaddps "#O"*64("#A ")," #B "," #DEST ";\n"
#define VADDMEMd(O,A,B,DEST) "vaddpd "#O"*64("#A ")," #B "," #DEST ";\n"
#define VSUBMEMf(O,A,B,DEST) "vsubps "#O"*64("#A ")," #B "," #DEST ";\n"
#define VSUBMEMd(O,A,B,DEST) "vsubpd "#O"*64("#A ")," #B "," #DEST ";\n"
#define VMULf(A,B,DEST) "vmulps " #A "," #B "," #DEST ";\n"
#define VMULd(A,B,DEST) "vmulpd " #A "," #B "," #DEST ";\n"
#define VMADDf(A,B,DEST) "vfmadd231ps " #A "," #B "," #DEST ";\n"
#define VMADDd(A,B,DEST) "vfmadd231pd " #A "," #B "," #DEST ";\n"
#define VMULMEMf(O,A,B,DEST) "vmulps " #O"*64("#A ")," #B "," #DEST ";\n"
#define VMULMEMd(O,A,B,DEST) "vmulpd " #O"*64("#A ")," #B "," #DEST ";\n"
#define VMADDMEMf(O,A,B,DEST) "vfmadd231ps " #O"*64("#A "),"#B "," #DEST ";\n"
#define VMADDMEMd(O,A,B,DEST) "vfmadd231pd " #O"*64("#A "),"#B "," #DEST ";\n"
#define ZLOADf(OFF,PTR,ri,ir) VLOADf(OFF,PTR,ir) VSHUFf(ir,ri)
#define ZLOADd(OFF,PTR,ri,ir) VLOADd(OFF,PTR,ir) VSHUFd(ir,ri)
#define STREAM_STORE
#ifdef STREAM_STORE
#define VSTOREf(OFF,PTR,SRC) "vmovntps " #SRC "," #OFF "*64(" #PTR ")" ";\n"
#define VSTOREd(OFF,PTR,SRC) "vmovntpd " #SRC "," #OFF "*64(" #PTR ")" ";\n"
#else
#define VSTOREf(OFF,PTR,SRC) "vmovups " #SRC "," #OFF "*64(" #PTR ")" ";\n"
#define VSTOREd(OFF,PTR,SRC) "vmovupd " #SRC "," #OFF "*64(" #PTR ")" ";\n"
#endif
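// With STREAM_STORE defined (the default above) results are written with the
// non-temporal vmovntps/vmovntpd forms, bypassing the cache on the result write.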
// Swaps Re/Im ; could unify this with IMCI
#define VSHUFd(A,DEST) "vpshufd $0x4e," #A "," #DEST ";\n"
#define VSHUFf(A,DEST) "vpshufd $0xb1," #A "," #DEST ";\n"
#define VSHUFMEMd(OFF,A,DEST) "vpshufd $0x4e, " #OFF"*64("#A ")," #DEST ";\n" // 32 bit level: 1,0,3,2
#define VSHUFMEMf(OFF,A,DEST) "vpshufd $0xb1, " #OFF"*64("#A ")," #DEST ";\n" // 32 bit level: 2,3,0,1
#define TRAP " int3 ;\n"
#endif

156
Grid/simd/Intel512double.h Normal file
@ -0,0 +1,156 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/simd/Avx512Asm.h
Copyright (C) 2015
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
// No include guard; this header may be multiply included, as it clears macros via #undef
#undef VZERO
#undef VMOV
#undef VLOAD
#undef VSTORE
#define VZERO(A) VZEROd(A)
#define VMOV(A,B) VMOVd(A,B)
#define VLOAD(OFF,PTR,DEST) VLOADd(OFF,PTR,DEST)
#define VSTORE(OFF,PTR,SRC) VSTOREd(OFF,PTR,SRC)
#undef VADD
#undef VSUB
#undef VMUL
#undef VMADD
#define VADD(A,B,C) VADDd(A,B,C)
#define VSUB(A,B,C) VSUBd(A,B,C)
#define VMUL(Uri,Uir,Chi) VMULd(Uri,Uir,Chi)
#define VMADD(Uri,Uir,Chi) VMADDd(Uri,Uir,Chi)
#undef VTIMESI
#undef VTIMESI0
#undef VTIMESI1
#undef VTIMESI2
#define VTIMESI(A,B,C) VTIMESId(A,B,C)
#define VTIMESI0(A,B,C) VTIMESI0d(A,B,C)
#define VTIMESI1(A,B,C) VTIMESI1d(A,B,C)
#define VTIMESI2(A,B,C) VTIMESI2d(A,B,C)
#undef VTIMESMINUSI
#undef VTIMESMINUSI0
#undef VTIMESMINUSI1
#undef VTIMESMINUSI2
#define VTIMESMINUSI(A,B,C) VTIMESMINUSId(A,B,C)
#define VTIMESMINUSI0(A,B,C) VTIMESMINUSI0d(A,B,C)
#define VTIMESMINUSI1(A,B,C) VTIMESMINUSI1d(A,B,C)
#define VTIMESMINUSI2(A,B,C) VTIMESMINUSI2d(A,B,C)
#undef VACCTIMESI
#undef VACCTIMESI0
#undef VACCTIMESI1
#undef VACCTIMESI2
#define VACCTIMESI(A,B,C) VACCTIMESId(A,B,C)
#define VACCTIMESI0(A,B,C) VACCTIMESI0d(A,B,C)
#define VACCTIMESI1(A,B,C) VACCTIMESI1d(A,B,C)
#define VACCTIMESI2(A,B,C) VACCTIMESI2d(A,B,C)
#undef VACCTIMESMINUSI
#undef VACCTIMESMINUSI0
#undef VACCTIMESMINUSI1
#undef VACCTIMESMINUSI2
#define VACCTIMESMINUSI(A,B,C) VACCTIMESMINUSId(A,B,C)
#define VACCTIMESMINUSI0(A,B,C) VACCTIMESMINUSI0d(A,B,C)
#define VACCTIMESMINUSI1(A,B,C) VACCTIMESMINUSI1d(A,B,C)
#define VACCTIMESMINUSI2(A,B,C) VACCTIMESMINUSI2d(A,B,C)
#undef VACCTIMESI1MEM
#undef VACCTIMESI2MEM
#define VACCTIMESI1MEM(A,ACC,O,P) VACCTIMESI1MEMd(A,ACC,O,P)
#define VACCTIMESI2MEM(A,ACC,O,P) VACCTIMESI2MEMd(A,ACC,O,P)
#undef VACCTIMESMINUSI1MEM
#undef VACCTIMESMINUSI2MEM
#define VACCTIMESMINUSI1MEM(A,ACC,O,P) VACCTIMESMINUSI1MEMd(A,ACC,O,P)
#define VACCTIMESMINUSI2MEM(A,ACC,O,P) VACCTIMESMINUSI2MEMd(A,ACC,O,P)
#undef VPERM0
#undef VPERM1
#undef VPERM2
#undef VPERM3
#define VPERM0(A,B) VPERM0d(A,B)
#define VPERM1(A,B) VPERM1d(A,B)
#define VPERM2(A,B) VPERM2d(A,B)
#define VPERM3(A,B) VPERM3d(A,B)
#undef VSHUFMEM
#undef VADDMEM
#undef VSUBMEM
#define VSHUFMEM(OFF,A,DEST) VSHUFMEMd(OFF,A,DEST)
#define VADDMEM(O,A,B,C) VADDMEMd(O,A,B,C)
#define VSUBMEM(O,A,B,C) VSUBMEMd(O,A,B,C)
#undef VMOVIDUP
#undef VMOVRDUP
#undef VMADDSUB
#undef VSHUF
#define VMOVIDUP(A,B,C) VMOVIDUPd(A,B,C)
#define VMOVRDUP(A,B,C) VMOVRDUPd(A,B,C)
#define VMADDSUB(A,B,accum) VMADDSUBd(A,B,accum)
#define VSHUF(A,B) VSHUFd(A,B)
#undef ZEND1
#undef ZEND2
#undef ZLOAD
#undef ZMUL
#undef ZMADD
#undef ZMULMEM2SP
#undef ZMADDMEM2SP
#define ZEND1(A,B,C) ZEND1d(A,B,C)
#define ZEND2(A,B,C) ZEND2d(A,B,C)
#define ZLOAD(A,B,C,D) ZLOADd(A,B,C,D)
#define ZMUL(A,B,C,D,E) ZMULd(A,B,C,D,E)
#define ZMADD(A,B,C,D,E) ZMADDd(A,B,C,D,E)
#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
#undef VRDUP
#undef VIDUP
#undef VMADDSUBMEM
#undef VMADDMEM
#undef VMULMEM
#define VRDUP(SRC,DEST) VRDUPd(SRC,DEST)
#define VIDUP(SRC,DEST) VIDUPd(SRC,DEST)
#define VMADDSUBMEM(O,P,B,accum) VMADDSUBMEMd(O,P,B,accum)
#define VMADDMEM(O,P,B,accum) VMADDMEMd(O,P,B,accum)
#define VMULMEM(O,P,B,accum) VMULMEMd(O,P,B,accum)
#undef VMADDRDUP
#undef VMADDSUBRDUP
#undef VMADDSUBIDUP
#undef VMULRDUP
#undef VMULIDUP
#define VMADDRDUP(O,P,B,accum) VMADDRDUPd(O,P,B,accum)
#define VMADDSUBRDUP(O,P,B,accum) VMADDSUBRDUPd(O,P,B,accum)
#define VMADDSUBIDUP(O,P,B,accum) VMADDSUBIDUPd(O,P,B,accum)
#define VMULRDUP(O,P,B,accum) VMULRDUPd(O,P,B,accum)
#define VMULIDUP(O,P,B,accum) VMULIDUPd(O,P,B,accum)

127
Grid/simd/Intel512imci.h Normal file
@ -0,0 +1,127 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/simd/Avx512Asm.h
Copyright (C) 2015
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_ASM_AV512_H
#define GRID_ASM_AV512_H
////////////////////////////////////////////////////////////
// Knights Corner specials
////////////////////////////////////////////////////////////
#define ZLOADf(OFF,PTR,ri,ir) VLOADf(OFF,PTR,ir) VSHUFf(ir,ri)
#define ZLOADd(OFF,PTR,ri,ir) VLOADd(OFF,PTR,ir) VSHUFd(ir,ri)
#define ZMULf(Ari,Air,B,Criir,Ciirr) VMULf(Ari,B,Criir) VMULf(Air,B,Ciirr)
#define ZMULd(Ari,Air,B,Criir,Ciirr) VMULd(Ari,B,Criir) VMULd(Air,B,Ciirr)
#define ZMADDf(Ari,Air,B,Criir,Ciirr) VMADDf(Ari,B,Criir) VMADDf(Air,B,Ciirr)
#define ZMADDd(Ari,Air,B,Criir,Ciirr) VMADDd(Ari,B,Criir) VMADDd(Air,B,Ciirr)
#define ZENDf(Criir,Ciirr, tmp) ZEND1f(Criir,Ciirr, tmp) ZEND2f(Criir,Ciirr, tmp)
#define ZENDd(Criir,Ciirr, tmp) ZEND1d(Criir,Ciirr, tmp) ZEND2d(Criir,Ciirr, tmp)
#define ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
VSHUFMEMf(O,P,tmp) \
VMULMEMf(O,P,B,Biirr) \
VMULMEMf(O,P,C,Ciirr) \
VMULf(tmp,B,Briir) \
VMULf(tmp,C,Criir)
#define ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
VSHUFMEMd(O,P,tmp) \
VMULMEMd(O,P,B,Biirr) \
VMULMEMd(O,P,C,Ciirr) \
VMULd(tmp,B,Briir) \
VMULd(tmp,C,Criir)
#define ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
VSHUFMEMf(O,P,tmp) \
VMADDMEMf(O,P,B,Biirr) \
VMADDMEMf(O,P,C,Ciirr) \
VMADDf(tmp,B,Briir) \
VMADDf(tmp,C,Criir)
#define ZMADDMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) \
VSHUFMEMd(O,P,tmp) \
VMADDMEMd(O,P,B,Biirr) \
VMADDMEMd(O,P,C,Ciirr) \
VMADDd(tmp,B,Briir) \
VMADDd(tmp,C,Criir)
#define ZEND1d(Criir,Ciirr, tmp) "vaddpd " #Criir "{cdab} ," #Criir "," #Criir"{%k6}" ";\n"
#define ZEND2d(Criir,Ciirr, tmp) "vsubpd " #Ciirr "{cdab} ," #Ciirr "," #Criir"{%k7}" ";\n"
#define ZEND1f(Criir,Ciirr, tmp) "vaddps " #Criir "{cdab} ," #Criir "," #Criir"{%k6}" ";\n"
#define ZEND2f(Criir,Ciirr, tmp) "vsubps " #Ciirr "{cdab} ," #Ciirr "," #Criir"{%k7}" ";\n"
#define VTIMESI0f(A,DEST, Z)
#define VTIMESI1f(A,DEST, Z) "vaddps " #A "{cdab}," #Z "," #DEST"{%k7}" ";\n"
#define VTIMESI2f(A,DEST, Z) "vsubps " #A "{cdab}," #Z "," #DEST"{%k6}" ";\n"
#define VTIMESI0d(A,DEST, Z)
#define VTIMESI1d(A,DEST, Z) "vaddpd " #A "{cdab}," #Z "," #DEST"{%k7}" ";\n"
#define VTIMESI2d(A,DEST, Z) "vsubpd " #A "{cdab}," #Z "," #DEST"{%k6}" ";\n"
#define VTIMESMINUSI0f(A,DEST,Z)
#define VTIMESMINUSI1f(A,DEST,Z) "vsubps " #A "{cdab}," #Z "," #DEST"{%k7}" ";\n"
#define VTIMESMINUSI2f(A,DEST,Z) "vaddps " #A "{cdab}," #Z "," #DEST"{%k6}" ";\n"
#define VTIMESMINUSI0d(A,DEST,Z)
#define VTIMESMINUSI1d(A,DEST,Z) "vsubpd " #A "{cdab}," #Z "," #DEST"{%k7}" ";\n"
#define VTIMESMINUSI2d(A,DEST,Z) "vaddpd " #A "{cdab}," #Z "," #DEST"{%k6}" ";\n"
#define VACCTIMESI0f(A,ACC,tmp)
#define VACCTIMESI1f(A,ACC,tmp) "vaddps " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n"
#define VACCTIMESI2f(A,ACC,tmp) "vsubps " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n"
#define VACCTIMESI0d(A,ACC,tmp)
#define VACCTIMESI1d(A,ACC,tmp) "vaddpd " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n"
#define VACCTIMESI2d(A,ACC,tmp) "vsubpd " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n"
#define VACCTIMESMINUSI0f(A,ACC,tmp)
#define VACCTIMESMINUSI1f(A,ACC,tmp) "vsubps " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n"
#define VACCTIMESMINUSI2f(A,ACC,tmp) "vaddps " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n"
// Acc = Acc - i A
#define VACCTIMESMINUSI0d(A,ACC,tmp)
#define VACCTIMESMINUSI1d(A,ACC,tmp) "vsubpd " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n"
#define VACCTIMESMINUSI2d(A,ACC,tmp) "vaddpd " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n"
//((1<<6)|(0<<4)|(3<<2)|(2)) == 0100,1110 = 0x4e
//((2<<6)|(3<<4)|(0<<2)|(1)) == 1011,0001 = 0xb1
#define VPERM0f(A,B) "vpermf32x4 $0x4e," #A "," #B ";\n"
#define VPERM1f(A,B) "vpermf32x4 $0xb1," #A "," #B ";\n"
#define VPERM2f(A,B) "vmovaps " #A "{badc}," #B ";\n"
#define VPERM3f(A,B) "vmovaps " #A "{cdab}," #B ";\n"
#define VPERM0d(A,B) "vpermf32x4 $0x4e," #A "," #B ";\n"
#define VPERM1d(A,B) "vmovapd " #A "{badc}," #B ";\n"
#define VPERM2d(A,B) "vmovapd " #A "{cdab}," #B ";\n"
#define VPERM3d(A,B) VMOVd(A,B)
#endif

157
Grid/simd/Intel512single.h Normal file
@ -0,0 +1,157 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/simd/Avx512Asm.h
Copyright (C) 2015
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
// No include guard; this header may be multiply included, as it clears macros via #undef
#undef VZERO
#undef VMOV
#undef VLOAD
#undef VSTORE
#define VZERO(A) VZEROf(A)
#define VMOV(A,B) VMOVf(A,B)
#define VLOAD(OFF,PTR,DEST) VLOADf(OFF,PTR,DEST)
#define VSTORE(OFF,PTR,SRC) VSTOREf(OFF,PTR,SRC)
#undef VADD
#undef VSUB
#undef VMUL
#undef VMADD
#define VADD(A,B,C) VADDf(A,B,C)
#define VSUB(A,B,C) VSUBf(A,B,C)
#define VMUL(Uri,Uir,Chi) VMULf(Uri,Uir,Chi)
#define VMADD(Uri,Uir,Chi) VMADDf(Uri,Uir,Chi)
#undef VTIMESI
#undef VTIMESI0
#undef VTIMESI1
#undef VTIMESI2
#define VTIMESI(A,B,C) VTIMESIf(A,B,C)
#define VTIMESI0(A,B,C) VTIMESI0f(A,B,C)
#define VTIMESI1(A,B,C) VTIMESI1f(A,B,C)
#define VTIMESI2(A,B,C) VTIMESI2f(A,B,C)
#undef VTIMESMINUSI
#undef VTIMESMINUSI0
#undef VTIMESMINUSI1
#undef VTIMESMINUSI2
#define VTIMESMINUSI(A,B,C) VTIMESMINUSIf(A,B,C)
#define VTIMESMINUSI0(A,B,C) VTIMESMINUSI0f(A,B,C)
#define VTIMESMINUSI1(A,B,C) VTIMESMINUSI1f(A,B,C)
#define VTIMESMINUSI2(A,B,C) VTIMESMINUSI2f(A,B,C)
#undef VACCTIMESI
#undef VACCTIMESI0
#undef VACCTIMESI1
#undef VACCTIMESI2
#define VACCTIMESI(A,B,C) VACCTIMESIf(A,B,C)
#define VACCTIMESI0(A,B,C) VACCTIMESI0f(A,B,C)
#define VACCTIMESI1(A,B,C) VACCTIMESI1f(A,B,C)
#define VACCTIMESI2(A,B,C) VACCTIMESI2f(A,B,C)
#undef VACCTIMESMINUSI
#undef VACCTIMESMINUSI0
#undef VACCTIMESMINUSI1
#undef VACCTIMESMINUSI2
#define VACCTIMESMINUSI(A,B,C) VACCTIMESMINUSIf(A,B,C)
#define VACCTIMESMINUSI0(A,B,C) VACCTIMESMINUSI0f(A,B,C)
#define VACCTIMESMINUSI1(A,B,C) VACCTIMESMINUSI1f(A,B,C)
#define VACCTIMESMINUSI2(A,B,C) VACCTIMESMINUSI2f(A,B,C)
#undef VACCTIMESI1MEM
#undef VACCTIMESI2MEM
#define VACCTIMESI1MEM(A,ACC,O,P) VACCTIMESI1MEMf(A,ACC,O,P)
#define VACCTIMESI2MEM(A,ACC,O,P) VACCTIMESI2MEMf(A,ACC,O,P)
#undef VACCTIMESMINUSI1MEM
#undef VACCTIMESMINUSI2MEM
#define VACCTIMESMINUSI1MEM(A,ACC,O,P) VACCTIMESMINUSI1MEMf(A,ACC,O,P)
#define VACCTIMESMINUSI2MEM(A,ACC,O,P) VACCTIMESMINUSI2MEMf(A,ACC,O,P)
#undef VPERM0
#undef VPERM1
#undef VPERM2
#undef VPERM3
#define VPERM0(A,B) VPERM0f(A,B)
#define VPERM1(A,B) VPERM1f(A,B)
#define VPERM2(A,B) VPERM2f(A,B)
#define VPERM3(A,B) VPERM3f(A,B)
#undef VSHUFMEM
#undef VADDMEM
#undef VSUBMEM
#define VSHUFMEM(OFF,A,DEST) VSHUFMEMf(OFF,A,DEST)
#define VADDMEM(O,A,B,C) VADDMEMf(O,A,B,C)
#define VSUBMEM(O,A,B,C) VSUBMEMf(O,A,B,C)
#undef VMOVIDUP
#undef VMOVRDUP
#undef VMADDSUB
#undef VSHUF
#define VMOVIDUP(A,B,C) VMOVIDUPf(A,B,C)
#define VMOVRDUP(A,B,C) VMOVRDUPf(A,B,C)
#define VMADDSUB(A,B,accum) VMADDSUBf(A,B,accum)
#define VSHUF(A,B) VSHUFf(A,B)
#undef ZEND1
#undef ZEND2
#undef ZLOAD
#undef ZMUL
#undef ZMADD
#undef ZMULMEM2SP
#undef ZMADDMEM2SP
#define ZEND1(A,B,C) ZEND1f(A,B,C)
#define ZEND2(A,B,C) ZEND2f(A,B,C)
#define ZLOAD(A,B,C,D) ZLOADf(A,B,C,D)
#define ZMUL(A,B,C,D,E) ZMULf(A,B,C,D,E)
#define ZMADD(A,B,C,D,E) ZMADDf(A,B,C,D,E)
#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
#undef VRDUP
#undef VIDUP
#undef VMADDSUBMEM
#undef VMADDMEM
#undef VMULMEM
#define VRDUP(SRC,DEST) VRDUPf(SRC,DEST)
#define VIDUP(SRC,DEST) VIDUPf(SRC,DEST)
#define VMADDSUBMEM(O,P,B,accum) VMADDSUBMEMf(O,P,B,accum)
#define VMADDMEM(O,P,B,accum) VMADDMEMf(O,P,B,accum)
#define VMULMEM(O,P,B,accum) VMULMEMf(O,P,B,accum)
#undef VMADDRDUP
#undef VMADDSUBRDUP
#undef VMADDSUBIDUP
#undef VMULRDUP
#undef VMULIDUP
#define VMADDRDUP(O,P,B,accum) VMADDRDUPf(O,P,B,accum)
#define VMADDSUBRDUP(O,P,B,accum) VMADDSUBRDUPf(O,P,B,accum)
#define VMADDSUBIDUP(O,P,B,accum) VMADDSUBIDUPf(O,P,B,accum)
#define VMULRDUP(O,P,B,accum) VMULRDUPf(O,P,B,accum)
#define VMULIDUP(O,P,B,accum) VMULIDUPf(O,P,B,accum)

938
Grid/simd/Intel512wilson.h Normal file
@ -0,0 +1,938 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/simd/Avx512Asm.h
Copyright (C) 2015
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_ASM_INTEL_512_QCD_H
#define GRID_ASM_INTEL_512_QCD_H
//////////////////////////////////////////////////////////////////////////////////////////
// Register allocations for the Wilson kernel are precision independent
//////////////////////////////////////////////////////////////////////////////////////////
#define psi_00 %zmm0
#define psi_01 %zmm1
#define psi_02 %zmm2
#define psi_10 %zmm3
#define psi_11 %zmm4
#define psi_12 %zmm5
#define psi_20 %zmm6
#define psi_21 %zmm7
#define psi_22 %zmm8
#define psi_30 %zmm9
#define psi_31 %zmm10
#define psi_32 %zmm11
#define Chi_00 %zmm12
#define Chi_01 %zmm13
#define Chi_02 %zmm14
#define Chi_10 %zmm15
#define Chi_11 %zmm16
#define Chi_12 %zmm17
#define UChi_00 %zmm18
#define UChi_01 %zmm19
#define UChi_02 %zmm20
#define UChi_10 %zmm21
#define UChi_11 %zmm22
#define UChi_12 %zmm23
#define Uir %zmm24
#define Uri %zmm25
#define T1 %zmm24
#define T2 %zmm25
#define Z0 %zmm26
#define Z1 %zmm27
#define Z2 %zmm28
#define Z3 %zmm29
#define Z4 %zmm30
#define Z5 %zmm31
#define TMP Chi_00
#define Chimu_00 Chi_00
#define Chimu_01 Chi_01
#define Chimu_02 Chi_02
#define Chimu_10 Chi_10
#define Chimu_11 Chi_11
#define Chimu_12 Chi_12
#define Chimu_20 UChi_00
#define Chimu_21 UChi_01
#define Chimu_22 UChi_02
#define Chimu_30 UChi_10
#define Chimu_31 UChi_11
#define Chimu_32 UChi_12
#include "Intel512common.h"
#include "Intel512avx.h"
//////////////////////////////////////////////////////////////////
// Macros used to build the Wilson kernel -- could be rationalised and simplified
// a little, as some duplication crept in while trying different
// variants during optimisation. Could be cut back to only those actually used.
//////////////////////////////////////////////////////////////////
#define LOCK_GAUGE(dir)
#define UNLOCK_GAUGE(dir)
// const SiteSpinor * ptr = & in._odata[offset];
#define LOAD_CHIMU(PTR) LOAD64(%r8,PTR) __asm__ ( LOAD_CHIMUi );
#define LOAD_CHI(PTR) LOAD64(%r8,PTR) __asm__ ( LOAD_CHIi );
#define SAVE_UCHI(PTR) SAVE_UCHIi(PTR)
#define SAVE_CHI(PTR) SAVE_CHIi(PTR)
#define SAVE_RESULT(PT,R) SAVE_RESULTi(PT,R)
#define ADD_RESULT(PT,R) ADD_RESULTi(PT,R)
#define ZERO_PSI \
asm( VZERO(psi_00) \
VZERO(psi_01) \
VZERO(psi_02) \
VZERO(psi_10) \
VZERO(psi_11) \
VZERO(psi_12) \
VZERO(psi_20) \
VZERO(psi_21) \
VZERO(psi_22) \
VZERO(psi_30) \
VZERO(psi_31) \
VZERO(psi_32));
#define LOAD_CHIMUi \
LOAD_CHIMU01i \
LOAD_CHIMU23i
#define LOAD_CHIMU01i \
VLOAD(0,%r8,Chimu_00) \
VLOAD(1,%r8,Chimu_01) \
VLOAD(2,%r8,Chimu_02) \
VLOAD(3,%r8,Chimu_10) \
VLOAD(4,%r8,Chimu_11) \
VLOAD(5,%r8,Chimu_12)
#define LOAD_CHIMU23i \
VLOAD(6,%r8,Chimu_20) \
VLOAD(7,%r8,Chimu_21) \
VLOAD(8,%r8,Chimu_22) \
VLOAD(9,%r8,Chimu_30) \
VLOAD(10,%r8,Chimu_31) \
VLOAD(11,%r8,Chimu_32)
#define SHUF_CHIMU23i\
VSHUFMEM(6,%r8,Chimu_20) \
VSHUFMEM(7,%r8,Chimu_21) \
VSHUFMEM(8,%r8,Chimu_22) \
VSHUFMEM(9,%r8,Chimu_30) \
VSHUFMEM(10,%r8,Chimu_31) \
VSHUFMEM(11,%r8,Chimu_32)
#define LOAD_CHIi \
VLOAD(0,%r8,Chi_00) \
VLOAD(1,%r8,Chi_01) \
VLOAD(2,%r8,Chi_02) \
VLOAD(3,%r8,Chi_10) \
VLOAD(4,%r8,Chi_11) \
VLOAD(5,%r8,Chi_12)
#define SAVE_UCHIi(PTR) \
LOAD64(%r8,PTR) \
__asm__ ( \
VSTORE(0,%r8,UChi_00) \
VSTORE(1,%r8,UChi_01) \
VSTORE(2,%r8,UChi_02) \
VSTORE(3,%r8,UChi_10) \
VSTORE(4,%r8,UChi_11) \
VSTORE(5,%r8,UChi_12) );
#define SAVE_CHIi(PTR) \
LOAD64(%r8,PTR) \
__asm__ ( \
VSTORE(0,%r8,Chi_00) \
VSTORE(1,%r8,Chi_01) \
VSTORE(2,%r8,Chi_02) \
VSTORE(3,%r8,Chi_10) \
VSTORE(4,%r8,Chi_11) \
VSTORE(5,%r8,Chi_12) );
#define MULT_2SPIN_DIR_PF(A,p) MULT_2SPIN_PF(&U._odata[sU](A),p)
#define MULT_2SPIN_PF(ptr,pf) MULT_2SPIN(ptr,pf)
//////////////////////////////////////////////////////////////////
// Dirac algebra
//////////////////////////////////////////////////////////////////
// hspin(0)=fspin(0)+timesI(fspin(3));
// hspin(1)=fspin(1)+timesI(fspin(2));
#define XP_PROJMEM(PTR) \
LOAD64(%r8,PTR) \
__asm__ ( \
LOAD_CHIi \
SHUF_CHIMU23i \
VACCTIMESI1(Chi_00,Chi_00,Chimu_30) \
VACCTIMESI1(Chi_01,Chi_01,Chimu_31) \
VACCTIMESI1(Chi_02,Chi_02,Chimu_32) \
VACCTIMESI1(Chi_10,Chi_10,Chimu_20) \
VACCTIMESI1(Chi_11,Chi_11,Chimu_21) \
VACCTIMESI1(Chi_12,Chi_12,Chimu_22) \
VACCTIMESI2(Chi_00,Chi_00,Chimu_30) \
VACCTIMESI2(Chi_01,Chi_01,Chimu_31) \
VACCTIMESI2(Chi_02,Chi_02,Chimu_32) \
VACCTIMESI2(Chi_10,Chi_10,Chimu_20) \
VACCTIMESI2(Chi_11,Chi_11,Chimu_21) \
VACCTIMESI2(Chi_12,Chi_12,Chimu_22) );
#define YP_PROJMEM(ptr) \
LOAD64(%r8,ptr) \
__asm__ ( \
LOAD_CHIMU01i \
VSUBMEM(9,%r8 ,Chimu_00,Chi_00) \
VSUBMEM(10,%r8,Chimu_01,Chi_01) \
VSUBMEM(11,%r8,Chimu_02,Chi_02) \
VADDMEM(6,%r8,Chimu_10,Chi_10) \
VADDMEM(7,%r8,Chimu_11,Chi_11) \
VADDMEM(8,%r8,Chimu_12,Chi_12) );
#define ZP_PROJMEM(PTR) \
LOAD64(%r8,PTR) \
__asm__ ( \
LOAD_CHIi \
SHUF_CHIMU23i \
VACCTIMESI1(Chi_00,Chi_00,Chimu_20) \
VACCTIMESI1(Chi_01,Chi_01,Chimu_21) \
VACCTIMESI1(Chi_02,Chi_02,Chimu_22) \
VACCTIMESMINUSI1(Chi_10,Chi_10,Chimu_30) \
VACCTIMESMINUSI1(Chi_11,Chi_11,Chimu_31) \
VACCTIMESMINUSI1(Chi_12,Chi_12,Chimu_32) \
VACCTIMESI2(Chi_00,Chi_00,Chimu_20) \
VACCTIMESI2(Chi_01,Chi_01,Chimu_21) \
VACCTIMESI2(Chi_02,Chi_02,Chimu_22) \
VACCTIMESMINUSI2(Chi_10,Chi_10,Chimu_30) \
VACCTIMESMINUSI2(Chi_11,Chi_11,Chimu_31) \
VACCTIMESMINUSI2(Chi_12,Chi_12,Chimu_32) );
#define TP_PROJMEM(ptr) \
LOAD64(%r8,ptr) \
__asm__ ( \
LOAD_CHIMU01i \
VADDMEM(6,%r8 ,Chimu_00,Chi_00) \
VADDMEM(7,%r8,Chimu_01,Chi_01) \
VADDMEM(8,%r8,Chimu_02,Chi_02) \
VADDMEM(9,%r8,Chimu_10,Chi_10) \
VADDMEM(10,%r8,Chimu_11,Chi_11) \
VADDMEM(11,%r8,Chimu_12,Chi_12) );
// hspin(0)=fspin(0)-timesI(fspin(3))
// hspin(1)=fspin(1)-timesI(fspin(2))
#define XM_PROJMEM(PTR) \
LOAD64(%r8,PTR)\
__asm__ ( \
LOAD_CHIi \
SHUF_CHIMU23i \
VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_30)\
VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_31)\
VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_32)\
VACCTIMESMINUSI1(Chi_10,Chi_10,Chimu_20)\
VACCTIMESMINUSI1(Chi_11,Chi_11,Chimu_21)\
VACCTIMESMINUSI1(Chi_12,Chi_12,Chimu_22)\
VACCTIMESMINUSI2(Chi_00,Chi_00,Chimu_30)\
VACCTIMESMINUSI2(Chi_01,Chi_01,Chimu_31)\
VACCTIMESMINUSI2(Chi_02,Chi_02,Chimu_32)\
VACCTIMESMINUSI2(Chi_10,Chi_10,Chimu_20)\
VACCTIMESMINUSI2(Chi_11,Chi_11,Chimu_21)\
VACCTIMESMINUSI2(Chi_12,Chi_12,Chimu_22) );
#define YM_PROJMEM(ptr) \
LOAD64(%r8,ptr) \
__asm__ ( \
LOAD_CHIMU01i \
VADDMEM(9,%r8 ,Chimu_00,Chi_00) \
VADDMEM(10,%r8,Chimu_01,Chi_01) \
VADDMEM(11,%r8,Chimu_02,Chi_02) \
VSUBMEM(6,%r8,Chimu_10,Chi_10) \
VSUBMEM(7,%r8,Chimu_11,Chi_11) \
VSUBMEM(8,%r8,Chimu_12,Chi_12) );
#define ZM_PROJMEM(PTR) \
LOAD64(%r8,PTR) \
__asm__ ( \
LOAD_CHIi \
SHUF_CHIMU23i \
VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_20)\
VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_21)\
VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_22)\
VACCTIMESI1(Chi_10,Chi_10,Chimu_30)\
VACCTIMESI1(Chi_11,Chi_11,Chimu_31)\
VACCTIMESI1(Chi_12,Chi_12,Chimu_32)\
VACCTIMESMINUSI2(Chi_00,Chi_00,Chimu_20)\
VACCTIMESMINUSI2(Chi_01,Chi_01,Chimu_21)\
VACCTIMESMINUSI2(Chi_02,Chi_02,Chimu_22)\
VACCTIMESI2(Chi_10,Chi_10,Chimu_30)\
VACCTIMESI2(Chi_11,Chi_11,Chimu_31)\
VACCTIMESI2(Chi_12,Chi_12,Chimu_32) );
#define TM_PROJMEM(ptr) \
LOAD64(%r8,ptr) \
__asm__ ( \
LOAD_CHIMU01i \
VSUBMEM(6,%r8,Chimu_00,Chi_00) \
VSUBMEM(7,%r8,Chimu_01,Chi_01) \
VSUBMEM(8,%r8,Chimu_02,Chi_02) \
VSUBMEM(9,%r8,Chimu_10,Chi_10) \
VSUBMEM(10,%r8,Chimu_11,Chi_11) \
VSUBMEM(11,%r8,Chimu_12,Chi_12) );
// fspin(0)=hspin(0)
// fspin(1)=hspin(1)
// fspin(2)=timesMinusI(hspin(1))
// fspin(3)=timesMinusI(hspin(0))
#define XP_RECON __asm__ ( \
VZERO(TMP) \
VTIMESMINUSI0(UChi_00,psi_30,TMP) \
VTIMESMINUSI0(UChi_10,psi_20,TMP) \
VTIMESMINUSI0(UChi_01,psi_31,TMP) \
VTIMESMINUSI0(UChi_11,psi_21,TMP) \
VTIMESMINUSI0(UChi_02,psi_32,TMP) \
VTIMESMINUSI0(UChi_12,psi_22,TMP) \
VMOV(UChi_00,psi_00) \
VMOV(UChi_10,psi_10) \
VMOV(UChi_01,psi_01) \
VMOV(UChi_11,psi_11) \
VMOV(UChi_02,psi_02) \
VMOV(UChi_12,psi_12) \
VTIMESMINUSI1(UChi_10,psi_20,TMP) \
VTIMESMINUSI1(UChi_11,psi_21,TMP) \
VTIMESMINUSI1(UChi_12,psi_22,TMP) \
VTIMESMINUSI1(UChi_00,psi_30,TMP) \
VTIMESMINUSI1(UChi_01,psi_31,TMP) \
VTIMESMINUSI1(UChi_02,psi_32,TMP) \
VTIMESMINUSI2(UChi_10,psi_20,TMP) \
VTIMESMINUSI2(UChi_11,psi_21,TMP) \
VTIMESMINUSI2(UChi_12,psi_22,TMP) \
VTIMESMINUSI2(UChi_00,psi_30,TMP) \
VTIMESMINUSI2(UChi_01,psi_31,TMP) \
VTIMESMINUSI2(UChi_02,psi_32,TMP) \
);
// NB could save 6 ops using addsub => 12 cycles
#define XP_RECON_ACCUM __asm__ ( \
VZERO(TMP)\
VACCTIMESMINUSI0(UChi_00,psi_30,Z3)\
VACCTIMESMINUSI0(UChi_10,psi_20,Z0)\
VACCTIMESMINUSI0(UChi_01,psi_31,Z4)\
VACCTIMESMINUSI0(UChi_11,psi_21,Z1)\
VACCTIMESMINUSI0(UChi_02,psi_32,Z5)\
VACCTIMESMINUSI0(UChi_12,psi_22,Z2)\
VADD(UChi_00,psi_00,psi_00)\
VADD(UChi_10,psi_10,psi_10)\
VADD(UChi_01,psi_01,psi_01)\
VADD(UChi_11,psi_11,psi_11)\
VADD(UChi_02,psi_02,psi_02)\
VADD(UChi_12,psi_12,psi_12)\
VACCTIMESMINUSI1(UChi_00,psi_30,Z3)\
VACCTIMESMINUSI1(UChi_10,psi_20,Z0)\
VACCTIMESMINUSI1(UChi_01,psi_31,Z4)\
VACCTIMESMINUSI1(UChi_11,psi_21,Z1)\
VACCTIMESMINUSI1(UChi_02,psi_32,Z5)\
VACCTIMESMINUSI1(UChi_12,psi_22,Z2)\
VACCTIMESMINUSI2(UChi_10,psi_20,Z0)\
VACCTIMESMINUSI2(UChi_11,psi_21,Z1)\
VACCTIMESMINUSI2(UChi_12,psi_22,Z2)\
VACCTIMESMINUSI2(UChi_00,psi_30,Z3)\
VACCTIMESMINUSI2(UChi_01,psi_31,Z4)\
VACCTIMESMINUSI2(UChi_02,psi_32,Z5)\
);
#define XM_RECON __asm__ ( \
VZERO(TMP)\
VTIMESI0(UChi_00,psi_30,TMP)\
VTIMESI0(UChi_10,psi_20,TMP)\
VTIMESI0(UChi_01,psi_31,TMP)\
VTIMESI0(UChi_11,psi_21,TMP)\
VTIMESI0(UChi_02,psi_32,TMP)\
VTIMESI0(UChi_12,psi_22,TMP)\
VMOV(UChi_00,psi_00)\
VMOV(UChi_10,psi_10)\
VMOV(UChi_01,psi_01)\
VMOV(UChi_11,psi_11)\
VMOV(UChi_02,psi_02)\
VMOV(UChi_12,psi_12)\
VTIMESI1(UChi_00,psi_30,TMP)\
VTIMESI1(UChi_10,psi_20,TMP)\
VTIMESI1(UChi_01,psi_31,TMP)\
VTIMESI1(UChi_11,psi_21,TMP)\
VTIMESI1(UChi_02,psi_32,TMP)\
VTIMESI1(UChi_12,psi_22,TMP)\
VTIMESI2(UChi_10,psi_20,TMP)\
VTIMESI2(UChi_11,psi_21,TMP)\
VTIMESI2(UChi_12,psi_22,TMP)\
VTIMESI2(UChi_00,psi_30,TMP)\
VTIMESI2(UChi_01,psi_31,TMP)\
VTIMESI2(UChi_02,psi_32,TMP)\
);
#define XM_RECON_ACCUM __asm__ ( \
VACCTIMESI0(UChi_10,psi_20,Z0)\
VACCTIMESI0(UChi_00,psi_30,Z3)\
VACCTIMESI0(UChi_11,psi_21,Z1)\
VACCTIMESI0(UChi_01,psi_31,Z4)\
VACCTIMESI0(UChi_12,psi_22,Z2)\
VACCTIMESI0(UChi_02,psi_32,Z5)\
\
VADD(UChi_10,psi_10,psi_10)\
VADD(UChi_00,psi_00,psi_00)\
VADD(UChi_11,psi_11,psi_11)\
VADD(UChi_01,psi_01,psi_01)\
VADD(UChi_12,psi_12,psi_12)\
VADD(UChi_02,psi_02,psi_02)\
\
VACCTIMESI1(UChi_10,psi_20,Z0)\
VACCTIMESI1(UChi_00,psi_30,Z3)\
VACCTIMESI1(UChi_11,psi_21,Z1)\
VACCTIMESI1(UChi_01,psi_31,Z4)\
VACCTIMESI1(UChi_12,psi_22,Z2)\
VACCTIMESI1(UChi_02,psi_32,Z5)\
VACCTIMESI2(UChi_10,psi_20,Z0)\
VACCTIMESI2(UChi_11,psi_21,Z1)\
VACCTIMESI2(UChi_12,psi_22,Z2)\
VACCTIMESI2(UChi_00,psi_30,Z3)\
VACCTIMESI2(UChi_01,psi_31,Z4)\
VACCTIMESI2(UChi_02,psi_32,Z5)\
);
#define YP_RECON_ACCUM __asm__ ( \
VADD(UChi_00,psi_00,psi_00)\
VADD(UChi_10,psi_10,psi_10)\
VADD(UChi_01,psi_01,psi_01)\
VADD(UChi_11,psi_11,psi_11)\
VADD(UChi_02,psi_02,psi_02)\
VADD(UChi_12,psi_12,psi_12)\
VADD(UChi_10,psi_20,psi_20)\
VADD(UChi_11,psi_21,psi_21)\
VADD(UChi_12,psi_22,psi_22)\
VSUB(UChi_00,psi_30,psi_30)\
VSUB(UChi_01,psi_31,psi_31)\
VSUB(UChi_02,psi_32,psi_32) );
#define YM_RECON_ACCUM __asm__ ( \
VADD(UChi_00,psi_00,psi_00)\
VADD(UChi_10,psi_10,psi_10)\
VADD(UChi_01,psi_01,psi_01)\
VADD(UChi_11,psi_11,psi_11)\
VADD(UChi_02,psi_02,psi_02)\
VADD(UChi_12,psi_12,psi_12)\
VSUB(UChi_10,psi_20,psi_20)\
VSUB(UChi_11,psi_21,psi_21)\
VSUB(UChi_12,psi_22,psi_22)\
VADD(UChi_00,psi_30,psi_30)\
VADD(UChi_01,psi_31,psi_31)\
VADD(UChi_02,psi_32,psi_32) );
#define ZP_RECON_ACCUM __asm__ ( \
VACCTIMESMINUSI0(UChi_00,psi_20,Z0)\
VACCTIMESI0(UChi_10,psi_30,Z3)\
VACCTIMESMINUSI0(UChi_01,psi_21,Z1)\
VACCTIMESI0(UChi_11,psi_31,Z4)\
VACCTIMESMINUSI0(UChi_02,psi_22,Z2)\
VACCTIMESI0(UChi_12,psi_32,Z5)\
VADD(UChi_00,psi_00,psi_00)\
VADD(UChi_10,psi_10,psi_10)\
VADD(UChi_01,psi_01,psi_01)\
VADD(UChi_11,psi_11,psi_11)\
VADD(UChi_02,psi_02,psi_02)\
VADD(UChi_12,psi_12,psi_12)\
VACCTIMESMINUSI1(UChi_00,psi_20,Z0)\
VACCTIMESI1(UChi_10,psi_30,Z3)\
VACCTIMESMINUSI1(UChi_01,psi_21,Z1)\
VACCTIMESI1(UChi_11,psi_31,Z4)\
VACCTIMESMINUSI1(UChi_02,psi_22,Z2)\
VACCTIMESI1(UChi_12,psi_32,Z5)\
VACCTIMESMINUSI2(UChi_00,psi_20,Z0)\
VACCTIMESMINUSI2(UChi_01,psi_21,Z1)\
VACCTIMESMINUSI2(UChi_02,psi_22,Z2)\
VACCTIMESI2(UChi_10,psi_30,Z3)\
VACCTIMESI2(UChi_11,psi_31,Z4)\
VACCTIMESI2(UChi_12,psi_32,Z5)\
);
#define ZM_RECON_ACCUM __asm__ ( \
VACCTIMESI0(UChi_00,psi_20,Z0)\
VACCTIMESMINUSI0(UChi_10,psi_30,Z3)\
VACCTIMESI0(UChi_01,psi_21,Z1)\
VACCTIMESMINUSI0(UChi_11,psi_31,Z4)\
VACCTIMESI0(UChi_02,psi_22,Z2)\
VACCTIMESMINUSI0(UChi_12,psi_32,Z5)\
VADD(UChi_00,psi_00,psi_00)\
VADD(UChi_10,psi_10,psi_10)\
VADD(UChi_01,psi_01,psi_01)\
VADD(UChi_11,psi_11,psi_11)\
VADD(UChi_02,psi_02,psi_02)\
VADD(UChi_12,psi_12,psi_12)\
VACCTIMESI1(UChi_00,psi_20,Z0)\
VACCTIMESMINUSI1(UChi_10,psi_30,Z3)\
VACCTIMESI1(UChi_01,psi_21,Z1)\
VACCTIMESMINUSI1(UChi_11,psi_31,Z4)\
VACCTIMESI1(UChi_02,psi_22,Z2)\
VACCTIMESMINUSI1(UChi_12,psi_32,Z5)\
VACCTIMESI2(UChi_00,psi_20,Z0)\
VACCTIMESI2(UChi_01,psi_21,Z1)\
VACCTIMESI2(UChi_02,psi_22,Z2)\
VACCTIMESMINUSI2(UChi_10,psi_30,Z3)\
VACCTIMESMINUSI2(UChi_11,psi_31,Z4)\
VACCTIMESMINUSI2(UChi_12,psi_32,Z5)\
);
#define TP_RECON_ACCUM __asm__ ( \
VADD(UChi_00,psi_00,psi_00)\
VADD(UChi_10,psi_10,psi_10)\
VADD(UChi_01,psi_01,psi_01)\
VADD(UChi_11,psi_11,psi_11)\
VADD(UChi_02,psi_02,psi_02)\
VADD(UChi_12,psi_12,psi_12)\
VADD(UChi_00,psi_20,psi_20)\
VADD(UChi_10,psi_30,psi_30)\
VADD(UChi_01,psi_21,psi_21)\
VADD(UChi_11,psi_31,psi_31)\
VADD(UChi_02,psi_22,psi_22)\
VADD(UChi_12,psi_32,psi_32) );
#define TM_RECON_ACCUM __asm__ ( \
VADD(UChi_00,psi_00,psi_00)\
VADD(UChi_10,psi_10,psi_10)\
VADD(UChi_01,psi_01,psi_01)\
VADD(UChi_11,psi_11,psi_11)\
VADD(UChi_02,psi_02,psi_02)\
VADD(UChi_12,psi_12,psi_12)\
VSUB(UChi_00,psi_20,psi_20)\
VSUB(UChi_10,psi_30,psi_30)\
VSUB(UChi_01,psi_21,psi_21)\
VSUB(UChi_11,psi_31,psi_31)\
VSUB(UChi_02,psi_22,psi_22)\
VSUB(UChi_12,psi_32,psi_32) );
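// The *_RECON_ACCUM macros for the T direction follow the same pattern; as a
// scalar sketch (illustrative only): the upper two spins always accumulate
// UChi, while the lower two accumulate +UChi for Tp and -UChi for Tm.
#if 0
#include <complex>
typedef std::complex<double> cplx;
inline void T_recon_accum_ref(const cplx uchi[2][3], cplx psi[4][3],
                              int sign /* +1 for Tp, -1 for Tm */) {
  const double s = (sign >= 0) ? 1.0 : -1.0;
  for (int c = 0; c < 3; c++) {
    psi[0][c] += uchi[0][c];
    psi[1][c] += uchi[1][c];
    psi[2][c] += s * uchi[0][c];
    psi[3][c] += s * uchi[1][c];
  }
}
#endif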
#define AVX512_PF_L1
#define AVX512_PF_L2_GAUGE
#define AVX512_PF_L2_TABLE
#undef AVX512_PF_L2_LINEAR
#ifdef AVX512_PF_L2_TABLE
// Table scheme: P1 prefetches the base pointer for the next link into L1;
// M1 prefetches the next site pointer into L2.
#define VPREFETCH_P1(A,B) VPREFETCH1(A,B)
#define VPREFETCH_P2(A,B)
#define VPREFETCH_M1(A,B) VPREFETCH2(A,B)
#define VPREFETCH_M2(A,B)
#endif
#ifdef AVX512_PF_L2_LINEAR
#define VPREFETCH_M1(A,B) VPREFETCH1(A,B)
#define VPREFETCH_M2(A,B) VPREFETCH2(A,B)
#define VPREFETCH_P1(A,B)
#define VPREFETCH_P2(A,B)
#endif
#ifdef AVX512_PF_L2_GAUGE
#define VPREFETCH_G1(A,B) VPREFETCH1(A,B)
#define VPREFETCH_G2(A,B) VPREFETCH2(A,B)
#endif
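// G1/G2 prefetch gauge-link data into L1/L2 respectively; they are used by
// PF_GAUGE below and inside the two-spin multiply kernels.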
#define PF_GAUGE(A) \
LOAD64(%r8,&U._odata[sU](A)) \
__asm__ ( \
VPREFETCH_G1(0,%r8) VPREFETCH_G1(1,%r8) \
VPREFETCH_G1(2,%r8) VPREFETCH_G1(3,%r8) \
);
#define SAVE_RESULTi(PTR,pf) \
LOAD64(%r8,PTR) \
LOAD64(%r9,pf) \
__asm__ ( \
VSTORE(0,%r8,psi_00) VPREFETCH_M1(0,%r9) \
VSTORE(1,%r8,psi_01) VPREFETCH_M1(1,%r9) \
VSTORE(2,%r8,psi_02) VPREFETCH_M1(2,%r9) \
VSTORE(3,%r8,psi_10) VPREFETCH_M1(3,%r9) \
VSTORE(4,%r8,psi_11) VPREFETCH_M1(4,%r9) \
VSTORE(5,%r8,psi_12) VPREFETCH_M1(5,%r9) \
VSTORE(6,%r8,psi_20) VPREFETCH_M1(6,%r9) \
VSTORE(7,%r8,psi_21) VPREFETCH_M1(7,%r9) \
VSTORE(8,%r8,psi_22) VPREFETCH_M1(8,%r9) \
VSTORE(9,%r8,psi_30) VPREFETCH_M1(9,%r9) \
VSTORE(10,%r8,psi_31) VPREFETCH_M1(10,%r9) \
VSTORE(11,%r8,psi_32) VPREFETCH_M1(11,%r9) \
);
#define ADD_RESULTi(PTR,pf) \
LOAD_CHIMU(PTR); \
asm(VADD(psi_00,Chimu_00,psi_00) VADD(psi_01,Chimu_01,psi_01) VADD(psi_02,Chimu_02,psi_02) \
VADD(psi_10,Chimu_10,psi_10) VADD(psi_11,Chimu_11,psi_11) VADD(psi_12,Chimu_12,psi_12) \
VADD(psi_20,Chimu_20,psi_20) VADD(psi_21,Chimu_21,psi_21) VADD(psi_22,Chimu_22,psi_22) \
VADD(psi_30,Chimu_30,psi_30) VADD(psi_31,Chimu_31,psi_31) VADD(psi_32,Chimu_32,psi_32) ); \
SAVE_RESULT(PTR,pf);
#define ADD_RESULTia(PTR,pf) \
LOAD64(%r8,PTR) \
__asm__ ( \
VADDMEM(0,%r8,psi_00,psi_00) \
VADDMEM(1,%r8,psi_01,psi_01) \
VADDMEM(2,%r8,psi_02,psi_02) \
VADDMEM(3,%r8,psi_10,psi_10) \
VADDMEM(4,%r8,psi_11,psi_11) \
VADDMEM(5,%r8,psi_12,psi_12) \
VADDMEM(6,%r8,psi_20,psi_20) \
VADDMEM(7,%r8,psi_21,psi_21) \
VADDMEM(8,%r8,psi_22,psi_22) \
VADDMEM(9,%r8,psi_30,psi_30) \
VADDMEM(10,%r8,psi_31,psi_31) \
VADDMEM(11,%r8,psi_32,psi_32) \
VSTORE(0,%r8,psi_00) \
VSTORE(1,%r8,psi_01) \
VSTORE(2,%r8,psi_02) \
VSTORE(3,%r8,psi_10) \
VSTORE(4,%r8,psi_11) \
VSTORE(5,%r8,psi_12) \
VSTORE(6,%r8,psi_20) \
VSTORE(7,%r8,psi_21) \
VSTORE(8,%r8,psi_22) \
VSTORE(9,%r8,psi_30) \
VSTORE(10,%r8,psi_31) \
VSTORE(11,%r8,psi_32) \
);
#ifdef AVX512_PF_L2_TABLE
#define PREFETCH_CHIMU(A) \
LOAD64(%r9,A) \
__asm__ ( \
VPREFETCH_P1(0,%r9) \
VPREFETCH_P1(1,%r9) \
VPREFETCH_P1(2,%r9) \
VPREFETCH_P1(3,%r9) \
VPREFETCH_P1(4,%r9) \
VPREFETCH_P1(5,%r9) \
VPREFETCH_P1(6,%r9) \
VPREFETCH_P1(7,%r9) \
VPREFETCH_P1(8,%r9) \
VPREFETCH_P1(9,%r9) \
VPREFETCH_P1(10,%r9) \
VPREFETCH_P1(11,%r9));
#else
#define PREFETCH_CHIMU(A)
#endif
#define PREFETCH1_CHIMU(A) \
LOAD64(%r9,A) \
__asm__ ( \
VPREFETCH_P1(0,%r9) \
VPREFETCH_P1(1,%r9) \
VPREFETCH_P1(2,%r9) \
VPREFETCH_P1(3,%r9) \
VPREFETCH_P1(4,%r9) \
VPREFETCH_P1(5,%r9) \
VPREFETCH_P1(6,%r9) \
VPREFETCH_P1(7,%r9) \
VPREFETCH_P1(8,%r9) \
VPREFETCH_P1(9,%r9) \
VPREFETCH_P1(10,%r9) \
VPREFETCH_P1(11,%r9));
#define PERMUTE_DIR0 __asm__ ( \
VPERM0(Chi_00,Chi_00) \
VPERM0(Chi_01,Chi_01) \
VPERM0(Chi_02,Chi_02) \
VPERM0(Chi_10,Chi_10) \
VPERM0(Chi_11,Chi_11) \
VPERM0(Chi_12,Chi_12) );
#define PERMUTE_DIR1 __asm__ ( \
VPERM1(Chi_00,Chi_00) \
VPERM1(Chi_01,Chi_01) \
VPERM1(Chi_02,Chi_02) \
VPERM1(Chi_10,Chi_10) \
VPERM1(Chi_11,Chi_11) \
VPERM1(Chi_12,Chi_12));
#define PERMUTE_DIR2 __asm__ ( \
VPERM2(Chi_00,Chi_00) \
VPERM2(Chi_01,Chi_01) \
VPERM2(Chi_02,Chi_02) \
VPERM2(Chi_10,Chi_10) \
VPERM2(Chi_11,Chi_11) \
VPERM2(Chi_12,Chi_12) );
#define PERMUTE_DIR3 __asm__ ( \
VPERM3(Chi_00,Chi_00) \
VPERM3(Chi_01,Chi_01) \
VPERM3(Chi_02,Chi_02) \
VPERM3(Chi_10,Chi_10) \
VPERM3(Chi_11,Chi_11) \
VPERM3(Chi_12,Chi_12) );
#define MULT_ADDSUB_2SPIN(ptr,pf) \
LOAD64(%r8,ptr) \
LOAD64(%r9,pf) \
__asm__ ( \
VPREFETCH_G2(9,%r8) \
VPREFETCH_G2(10,%r8) \
VPREFETCH_G2(11,%r8) \
VPREFETCH_G2(12,%r8) \
VPREFETCH_G2(13,%r8) \
VPREFETCH_G2(14,%r8) \
VPREFETCH_G2(15,%r8) \
VPREFETCH_G2(16,%r8) \
VPREFETCH_G2(17,%r8) \
VSHUF(Chi_00,T1) \
VMOVIDUP(0,%r8,Z0 ) \
VMOVIDUP(3,%r8,Z1 ) \
VMOVIDUP(6,%r8,Z2 ) VSHUF(Chi_10,T2) \
/*6*/ \
VMUL(Z0,T1,UChi_00) VMOVRDUP(0,%r8,Z3 ) \
VMUL(Z0,T2,UChi_10) VMOVRDUP(3,%r8,Z4 ) \
VMUL(Z1,T1,UChi_01) VMOVRDUP(6,%r8,Z5 ) \
VMUL(Z1,T2,UChi_11) VMOVIDUP(1,%r8,Z0 ) \
VMUL(Z2,T1,UChi_02) VMOVIDUP(4,%r8,Z1 ) \
VMUL(Z2,T2,UChi_12) VMOVIDUP(7,%r8,Z2 ) \
VPREFETCH_M1(0,%r9) \
VPREFETCH_M1(1,%r9) \
VPREFETCH_M1(2,%r9) \
VPREFETCH_M1(3,%r9) \
/*18*/ \
VMADDSUB(Z3,Chi_00,UChi_00) VSHUF(Chi_01,T1) \
VMADDSUB(Z3,Chi_10,UChi_10) \
VMADDSUB(Z4,Chi_00,UChi_01) VMOVRDUP(1,%r8,Z3 ) \
VMADDSUB(Z4,Chi_10,UChi_11) VSHUF(Chi_11,T2) \
VMADDSUB(Z5,Chi_00,UChi_02) VMOVRDUP(4,%r8,Z4 ) \
VMADDSUB(Z5,Chi_10,UChi_12) \
VPREFETCH_M1(4,%r9) \
VPREFETCH_M1(5,%r9) \
VPREFETCH_M1(6,%r9) \
VPREFETCH_M1(7,%r9) \
/*28*/ \
VMADDSUB(Z0,T1,UChi_00) VMOVRDUP(7,%r8,Z5 ) \
VMADDSUB(Z0,T2,UChi_10) \
VMADDSUB(Z1,T1,UChi_01) VMOVIDUP(2,%r8,Z0 ) \
VMADDSUB(Z1,T2,UChi_11) \
VMADDSUB(Z2,T1,UChi_02) VMOVIDUP(5,%r8,Z1 ) \
VMADDSUB(Z2,T2,UChi_12) VMOVIDUP(8,%r8,Z2 ) \
VPREFETCH2(12,%r9) \
VPREFETCH2(13,%r9) \
VPREFETCH2(14,%r9) \
VPREFETCH2(15,%r9) \
VPREFETCH2(16,%r9) \
VPREFETCH2(17,%r9) \
VPREFETCH2(18,%r9) \
VPREFETCH2(19,%r9) \
VPREFETCH2(20,%r9) \
VPREFETCH2(21,%r9) \
VPREFETCH2(22,%r9) \
VPREFETCH2(23,%r9) \
/*38*/ \
VMADDSUB(Z3,Chi_01,UChi_00) VSHUF(Chi_02,T1) \
VMADDSUB(Z3,Chi_11,UChi_10) \
VMADDSUB(Z4,Chi_01,UChi_01) VMOVRDUP(2,%r8,Z3 ) \
VMADDSUB(Z4,Chi_11,UChi_11) VSHUF(Chi_12,T2) \
VMADDSUB(Z5,Chi_01,UChi_02) VMOVRDUP(5,%r8,Z4 ) \
VMADDSUB(Z5,Chi_11,UChi_12) \
VPREFETCH_M1(9,%r8) \
VPREFETCH_M1(10,%r8) \
VPREFETCH_M1(11,%r8) \
VPREFETCH_M1(12,%r8) \
VPREFETCH_M1(13,%r8) \
VPREFETCH_M1(14,%r8) \
VPREFETCH_M1(15,%r8) \
VPREFETCH_M1(16,%r8) \
VPREFETCH_M1(17,%r8) \
/*48*/ \
VMADDSUB(Z0,T1,UChi_00) VMOVRDUP(8,%r8,Z5 ) \
VMADDSUB(Z0,T2,UChi_10) \
VMADDSUB(Z1,T1,UChi_01) \
VMADDSUB(Z1,T2,UChi_11) \
VMADDSUB(Z2,T1,UChi_02) \
VMADDSUB(Z2,T2,UChi_12) \
VPREFETCH_M1(8,%r9) \
VPREFETCH_M1(9,%r9) \
VPREFETCH_M1(10,%r9) \
VPREFETCH_M1(11,%r9) \
/*55*/ \
VMADDSUB(Z3,Chi_02,UChi_00) \
VMADDSUB(Z3,Chi_12,UChi_10) \
VMADDSUB(Z4,Chi_02,UChi_01) \
VMADDSUB(Z4,Chi_12,UChi_11) \
VMADDSUB(Z5,Chi_02,UChi_02) \
VMADDSUB(Z5,Chi_12,UChi_12) \
/*61 insns*/ );
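// What MULT_ADDSUB_2SPIN computes, as a scalar reference (illustrative only):
// a 3x3 complex link matrix stored at the gauge pointer -- the element at
// offset 3*i+j feeds result colour i from source colour j -- multiplies both
// spin components of the projected half spinor; the shuffle/madd-sub sequence
// above is the vectorised complex multiply.
#if 0
#include <complex>
typedef std::complex<double> cplx;
inline void mult_2spin_ref(const cplx U[3][3], const cplx chi[2][3], cplx uchi[2][3]) {
  for (int s = 0; s < 2; s++) {
    for (int i = 0; i < 3; i++) {
      uchi[s][i] = cplx(0.0, 0.0);
      for (int j = 0; j < 3; j++) {
        uchi[s][i] += U[i][j] * chi[s][j];
      }
    }
  }
}
#endif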
#define MULT_ADDSUB_2SPIN_LS(ptr,pf) \
LOAD64(%r8,ptr) \
LOAD64(%r9,pf) \
__asm__ ( \
VSHUF(Chi_00,T1) VSHUF(Chi_10,T2) \
VMULIDUP(0,%r8,T1,UChi_00) VMULIDUP(0,%r8,T2,UChi_10) \
VMULIDUP(3,%r8,T1,UChi_01) VMULIDUP(3,%r8,T2,UChi_11) \
VMULIDUP(6,%r8,T1,UChi_02) VMULIDUP(6,%r8,T2,UChi_12) \
VPREFETCH_M1(0,%r9) \
VPREFETCH_M1(1,%r9) \
VPREFETCH_M1(2,%r9) \
VPREFETCH_M1(3,%r9) \
/*8*/ \
VSHUF(Chi_01,T1) VSHUF(Chi_11,T2) \
VMADDSUBRDUP(0,%r8,Chi_00,UChi_00) VMADDSUBRDUP(0,%r8,Chi_10,UChi_10) \
VMADDSUBRDUP(3,%r8,Chi_00,UChi_01) VMADDSUBRDUP(3,%r8,Chi_10,UChi_11) \
VMADDSUBRDUP(6,%r8,Chi_00,UChi_02) VMADDSUBRDUP(6,%r8,Chi_10,UChi_12) \
VPREFETCH_M1(4,%r9) \
VPREFETCH_M1(5,%r9) \
VPREFETCH_M1(6,%r9) \
VPREFETCH_M1(7,%r9) \
/*16*/ \
VMADDSUBIDUP(1,%r8,T1,UChi_00) VMADDSUBIDUP(1,%r8,T2,UChi_10) \
VMADDSUBIDUP(4,%r8,T1,UChi_01) VMADDSUBIDUP(4,%r8,T2,UChi_11) \
VMADDSUBIDUP(7,%r8,T1,UChi_02) VMADDSUBIDUP(7,%r8,T2,UChi_12) \
VPREFETCH_M1(8,%r9) \
VPREFETCH_M1(9,%r9) \
VPREFETCH_M1(10,%r9) \
VPREFETCH_M1(11,%r9) \
/*22*/ \
VSHUF(Chi_02,T1) VSHUF(Chi_12,T2) \
VMADDSUBRDUP(1,%r8,Chi_01,UChi_00) VMADDSUBRDUP(1,%r8,Chi_11,UChi_10) \
VMADDSUBRDUP(4,%r8,Chi_01,UChi_01) VMADDSUBRDUP(4,%r8,Chi_11,UChi_11) \
VMADDSUBRDUP(7,%r8,Chi_01,UChi_02) VMADDSUBRDUP(7,%r8,Chi_11,UChi_12) \
VPREFETCH_M2(12,%r9) \
VPREFETCH_M2(13,%r9) \
VPREFETCH_M2(14,%r9) \
VPREFETCH_M2(15,%r9) \
/*30*/ \
VMADDSUBIDUP(2,%r8,T1,UChi_00) VMADDSUBIDUP(2,%r8,T2,UChi_10) \
VMADDSUBIDUP(5,%r8,T1,UChi_01) VMADDSUBIDUP(5,%r8,T2,UChi_11) \
VPREFETCH_M2(16,%r9) \
VPREFETCH_M2(17,%r9) \
VPREFETCH_M2(18,%r9) \
VPREFETCH_M2(19,%r9) \
VMADDSUBIDUP(8,%r8,T1,UChi_02) VMADDSUBIDUP(8,%r8,T2,UChi_12) \
/*36*/ \
VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r8,Chi_12,UChi_10) \
VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r8,Chi_12,UChi_11) \
VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r8,Chi_12,UChi_12) \
VPREFETCH_M2(20,%r9) \
VPREFETCH_M2(21,%r9) \
VPREFETCH_M2(22,%r9) \
VPREFETCH_M2(23,%r9) \
VPREFETCH_G1(2,%r8) \
VPREFETCH_G1(3,%r8) \
VPREFETCH_G2(4,%r8) \
VPREFETCH_G2(5,%r8) \
VPREFETCH_G2(6,%r8) \
VPREFETCH_G2(7,%r8) \
/*42 insns*/ );
#define MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf) \
LOAD64(%r8,ptr) \
LOAD64(%r9,pf) \
__asm__ ( \
VSHUF(Chi_00,T1) VSHUF(Chi_10,T2) \
VMULIDUP(0,%r8,T1,UChi_00) VMULIDUP(0,%r8,T2,UChi_10) \
VMULIDUP(3,%r8,T1,UChi_01) VMULIDUP(3,%r8,T2,UChi_11) \
VMULIDUP(6,%r8,T1,UChi_02) VMULIDUP(6,%r8,T2,UChi_12) \
/*8*/ \
VSHUF(Chi_01,T1) VSHUF(Chi_11,T2) \
VMADDSUBRDUP(0,%r8,Chi_00,UChi_00) VMADDSUBRDUP(0,%r8,Chi_10,UChi_10) \
VMADDSUBRDUP(3,%r8,Chi_00,UChi_01) VMADDSUBRDUP(3,%r8,Chi_10,UChi_11) \
VMADDSUBRDUP(6,%r8,Chi_00,UChi_02) VMADDSUBRDUP(6,%r8,Chi_10,UChi_12) \
/*16*/ \
VMADDSUBIDUP(1,%r8,T1,UChi_00) VMADDSUBIDUP(1,%r8,T2,UChi_10) \
VMADDSUBIDUP(4,%r8,T1,UChi_01) VMADDSUBIDUP(4,%r8,T2,UChi_11) \
VMADDSUBIDUP(7,%r8,T1,UChi_02) VMADDSUBIDUP(7,%r8,T2,UChi_12) \
/*22*/ \
VSHUF(Chi_02,T1) VSHUF(Chi_12,T2) \
VMADDSUBRDUP(1,%r8,Chi_01,UChi_00) VMADDSUBRDUP(1,%r8,Chi_11,UChi_10) \
VMADDSUBRDUP(4,%r8,Chi_01,UChi_01) VMADDSUBRDUP(4,%r8,Chi_11,UChi_11) \
VMADDSUBRDUP(7,%r8,Chi_01,UChi_02) VMADDSUBRDUP(7,%r8,Chi_11,UChi_12) \
/*30*/ \
VMADDSUBIDUP(2,%r8,T1,UChi_00) VMADDSUBIDUP(2,%r8,T2,UChi_10) \
VMADDSUBIDUP(5,%r8,T1,UChi_01) VMADDSUBIDUP(5,%r8,T2,UChi_11) \
VMADDSUBIDUP(8,%r8,T1,UChi_02) VMADDSUBIDUP(8,%r8,T2,UChi_12) \
/*36*/ \
VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r8,Chi_12,UChi_10) \
VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r8,Chi_12,UChi_11) \
VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r8,Chi_12,UChi_12) \
/* VPREFETCH1(2,%r8)*/ \
/* VPREFETCH1(3,%r8)*/ \
/*42 insns*/ );
#define Z6 Chi_00
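// Register reuse: inside MULT_ADDSUB_2SPIN_NEW below, Chi_00 has been consumed
// by the time a sixth accumulator is needed, so it is recycled as Z6.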
#define MULT_ADDSUB_2SPIN_NEW(ptr,pf) \
LOAD64(%r8,ptr) \
__asm__ ( \
VSHUFMEM(0,%r8,Z0) \
VRDUP(Chi_00,T1) VIDUP(Chi_00,Chi_00) \
VRDUP(Chi_10,T2) VIDUP(Chi_10,Chi_10) \
VMUL(Z0,Chi_00,Z1) VMUL(Z0,Chi_10,Z2) \
VSHUFMEM(3,%r8,Z0) \
VMUL(Z0,Chi_00,Z3) VMUL(Z0,Chi_10,Z4) \
VSHUFMEM(6,%r8,Z0) \
VMUL(Z0,Chi_00,Z5) VMUL(Z0,Chi_10,Z6) \
VMULMEM(0,%r8,T1,UChi_00) VMULMEM(0,%r8,T2,UChi_10) \
VMULMEM(3,%r8,T1,UChi_01) VMULMEM(3,%r8,T2,UChi_11) \
VMULMEM(6,%r8,T1,UChi_02) VMULMEM(6,%r8,T2,UChi_12) \
/*11 cycles*/ \
VSHUFMEM(1,%r8,Z0) \
VRDUP(Chi_01,T1) VIDUP(Chi_01,Chi_01) \
VRDUP(Chi_11,T2) VIDUP(Chi_11,Chi_11) \
VMADD(Z0,Chi_01,Z1) VMADD(Z0,Chi_11,Z2) \
VSHUFMEM(4,%r8,Z0) \
VMADD(Z0,Chi_01,Z3) VMADD(Z0,Chi_11,Z4) \
VSHUFMEM(7,%r8,Z0) \
VMADD(Z0,Chi_01,Z5) VMADD(Z0,Chi_11,Z6) \
VMADDMEM(1,%r8,T1,UChi_00) VMADDMEM(1,%r8,T2,UChi_10) \
VMADDMEM(4,%r8,T1,UChi_01) VMADDMEM(4,%r8,T2,UChi_11) \
VMADDMEM(7,%r8,T1,UChi_02) VMADDMEM(7,%r8,T2,UChi_12) \
/*22 cycles*/ \
VSHUFMEM(2,%r8,Z0) \
VRDUP(Chi_02,T1) VIDUP(Chi_02,Chi_02) \
VRDUP(Chi_12,T2) VIDUP(Chi_12,Chi_12) \
VMADD(Z0,Chi_02,Z1) VMADD(Z0,Chi_12,Z2) \
VSHUFMEM(5,%r8,Z0) \
VMADD(Z0,Chi_02,Z3) VMADD(Z0,Chi_12,Z4) \
VSHUFMEM(8,%r8,Z0) \
VMADD(Z0,Chi_02,Z5) VMADD(Z0,Chi_12,Z6) \
/*33 cycles*/ \
VMADDSUBMEM(2,%r8,T1,Z1) VMADDSUBMEM(2,%r8,T2,Z2) \
VMADDSUBMEM(5,%r8,T1,Z3) VMADDSUBMEM(5,%r8,T2,Z4) \
VMADDSUBMEM(8,%r8,T1,Z5) VMADDSUBMEM(8,%r8,T2,Z6) \
/*stall*/ \
/*stall*/ \
/*stall*/ \
VADD(Z1,UChi_00,UChi_00) VADD(Z2,UChi_10,UChi_10) \
VADD(Z3,UChi_01,UChi_01) VADD(Z4,UChi_11,UChi_11) \
VADD(Z5,UChi_02,UChi_02) VADD(Z6,UChi_12,UChi_12) )
#endif

255
Grid/simd/Simd.h Normal file
@ -0,0 +1,255 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/Simd.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: neo <cossu@post.kek.jp>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_SIMD_H
#define GRID_SIMD_H
////////////////////////////////////////////////////////////////////////
// Define scalar and vector floating point types
//
// Scalar: RealF, RealD, ComplexF, ComplexD
//
// Vector: vRealF, vRealD, vComplexF, vComplexD
//
// Vector types are arch dependent
////////////////////////////////////////////////////////////////////////
#define _MM_SELECT_FOUR_FOUR(A,B,C,D) ((A<<6)|(B<<4)|(C<<2)|(D))
#define _MM_SELECT_FOUR_FOUR_STRING(A,B,C,D) "((" #A "<<6)|(" #B "<<4)|(" #C "<<2)|(" #D "))"
#define _MM_SELECT_EIGHT_TWO(A,B,C,D,E,F,G,H) ((A<<7)|(B<<6)|(C<<5)|(D<<4)|(E<<3)|(F<<2)|(G<<1)|(H))
#define _MM_SELECT_FOUR_TWO(A,B,C,D) _MM_SELECT_EIGHT_TWO(0,0,0,0,A,B,C,D)
#define _MM_SELECT_TWO_TWO(A,B) _MM_SELECT_FOUR_TWO(0,0,A,B)
#define RotateBit (0x100)
namespace Grid {
typedef uint32_t Integer;
typedef float RealF;
typedef double RealD;
#ifdef GRID_DEFAULT_PRECISION_DOUBLE
typedef RealD Real;
#else
typedef RealF Real;
#endif
typedef std::complex<RealF> ComplexF;
typedef std::complex<RealD> ComplexD;
typedef std::complex<Real> Complex;
inline RealF adj(const RealF & r){ return r; }
inline RealF conjugate(const RealF & r){ return r; }
inline RealF real(const RealF & r){ return r; }
inline RealD adj(const RealD & r){ return r; }
inline RealD conjugate(const RealD & r){ return r; }
inline RealD real(const RealD & r){ return r; }
inline RealD sqrt(const RealD & r){ return std::sqrt(r); }
inline ComplexD conjugate(const ComplexD& r){ return(conj(r)); }
inline ComplexD adj(const ComplexD& r){ return(conjugate(r)); }
inline ComplexF conjugate(const ComplexF& r ){ return(conj(r)); }
inline ComplexF adj(const ComplexF& r ){ return(conjugate(r)); }
inline ComplexD innerProduct(const ComplexD & l, const ComplexD & r) { return conjugate(l)*r; }
inline ComplexF innerProduct(const ComplexF & l, const ComplexF & r) { return conjugate(l)*r; }
inline RealD innerProduct(const RealD & l, const RealD & r) { return l*r; }
inline RealF innerProduct(const RealF & l, const RealF & r) { return l*r; }
inline ComplexD Reduce(const ComplexD& r){ return r; }
inline ComplexF Reduce(const ComplexF& r){ return r; }
inline RealD Reduce(const RealD& r){ return r; }
inline RealF Reduce(const RealF& r){ return r; }
inline RealD toReal(const ComplexD& r){ return real(r); }
inline RealF toReal(const ComplexF& r){ return real(r); }
inline RealD toReal(const RealD& r){ return r; }
inline RealF toReal(const RealF& r){ return r; }
////////////////////////////////////////////////////////////////////////////////
//Provide support functions for basic real and complex data types required by Grid
//Single and double precision versions. Should be able to template this once only.
////////////////////////////////////////////////////////////////////////////////
inline void mac (ComplexD * __restrict__ y,const ComplexD * __restrict__ a,const ComplexD *__restrict__ x){ *y = (*a) * (*x)+(*y); };
inline void mult(ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) * (*r);}
inline void sub (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) - (*r);}
inline void add (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) + (*r);}
// conjugate already supported for complex
inline void mac (ComplexF * __restrict__ y,const ComplexF * __restrict__ a,const ComplexF *__restrict__ x){ *y = (*a) * (*x)+(*y); }
inline void mult(ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) * (*r); }
inline void sub (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) - (*r); }
inline void add (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) + (*r); }
//conjugate already supported for complex
inline ComplexF timesI(const ComplexF &r) { return(r*ComplexF(0.0,1.0));}
inline ComplexD timesI(const ComplexD &r) { return(r*ComplexD(0.0,1.0));}
inline ComplexF timesMinusI(const ComplexF &r){ return(r*ComplexF(0.0,-1.0));}
inline ComplexD timesMinusI(const ComplexD &r){ return(r*ComplexD(0.0,-1.0));}
// define projections to real and imaginary parts
inline ComplexF projReal(const ComplexF &r){return( ComplexF(std::real(r), 0.0));}
inline ComplexD projReal(const ComplexD &r){return( ComplexD(std::real(r), 0.0));}
inline ComplexF projImag(const ComplexF &r){return (ComplexF(std::imag(r), 0.0 ));}
inline ComplexD projImag(const ComplexD &r){return (ComplexD(std::imag(r), 0.0));}
// define auxiliary functions for complex computations
inline void timesI(ComplexF &ret,const ComplexF &r) { ret = timesI(r);}
inline void timesI(ComplexD &ret,const ComplexD &r) { ret = timesI(r);}
inline void timesMinusI(ComplexF &ret,const ComplexF &r){ ret = timesMinusI(r);}
inline void timesMinusI(ComplexD &ret,const ComplexD &r){ ret = timesMinusI(r);}
inline void mac (RealD * __restrict__ y,const RealD * __restrict__ a,const RealD *__restrict__ x){ *y = (*a) * (*x)+(*y);}
inline void mult(RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) * (*r);}
inline void sub (RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) - (*r);}
inline void add (RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) + (*r);}
inline void mac (RealF * __restrict__ y,const RealF * __restrict__ a,const RealF *__restrict__ x){ *y = (*a) * (*x)+(*y); }
inline void mult(RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) * (*r); }
inline void sub (RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) - (*r); }
inline void add (RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) + (*r); }
inline void vstream(ComplexF &l, const ComplexF &r){ l=r;}
inline void vstream(ComplexD &l, const ComplexD &r){ l=r;}
inline void vstream(RealF &l, const RealF &r){ l=r;}
inline void vstream(RealD &l, const RealD &r){ l=r;}
class Zero{};
static Zero zero;
template<class itype> inline void zeroit(itype &arg){ arg=zero;};
template<> inline void zeroit(ComplexF &arg){ arg=0; };
template<> inline void zeroit(ComplexD &arg){ arg=0; };
template<> inline void zeroit(RealF &arg){ arg=0; };
template<> inline void zeroit(RealD &arg){ arg=0; };
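// A minimal usage sketch of the scalar helpers above (mac, innerProduct,
// timesI, toReal, zeroit); illustrative only, not compiled.
#if 0
inline void scalar_helpers_example(void) {
  ComplexD a(1.0, 2.0), x(3.0, -1.0), y(0.5, 0.5);
  mac(&y, &a, &x);                   // y += a*x
  ComplexD ip = innerProduct(a, x);  // conjugate(a)*x = (1-2i)*(3-i) = 1-7i
  ComplexD ia = timesI(a);           // i*(1+2i) = -2+i
  RealD    r  = toReal(ip);          // 1.0
  zeroit(y);                         // y = 0
  (void)ia; (void)r;
}
#endif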
//////////////////////////////////////////////////////////
// Permute
// Permute 0 every ABCDEFGH -> BA DC FE HG
// Permute 1 every ABCDEFGH -> CD AB GH EF
// Permute 2 every ABCDEFGH -> EFGH ABCD
// Permute 3 possible on longer iVector lengths (512bit = 8 double = 16 single)
// Permute 4 possible on half precision @512bit vectors.
//
// Defined inside SIMD specialization files
//////////////////////////////////////////////////////////
template<class VectorSIMD>
inline void Gpermute(VectorSIMD &y,const VectorSIMD &b,int perm);
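// A scalar sketch of the permute numbering documented above, for an
// eight-element vector (e.g. 512-bit double precision): permute n exchanges
// adjacent blocks of 2^n elements. Illustrative only; the real implementations
// live in the per-architecture SIMD specialisations.
#if 0
template <class T>
inline void permute_ref(T y[8], const T b[8], int perm) {
  const int block = 1 << perm;   // 1, 2 or 4 elements
  for (int i = 0; i < 8; i++) {
    y[i] = b[i ^ block];
  }
}
// perm=0: ABCDEFGH -> BADCFEHG ; perm=1: -> CDABGHEF ; perm=2: -> EFGHABCD
#endif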
};
#include <Grid/simd/Grid_vector_types.h>
#include <Grid/simd/Grid_vector_unops.h>
namespace Grid {
// Default precision
#ifdef GRID_DEFAULT_PRECISION_DOUBLE
typedef vRealD vReal;
typedef vComplexD vComplex;
#else
typedef vRealF vReal;
typedef vComplexF vComplex;
#endif
inline std::ostream& operator<< (std::ostream& stream, const vComplexF &o){
int nn=vComplexF::Nsimd();
std::vector<ComplexF,alignedAllocator<ComplexF> > buf(nn);
vstore(o,&buf[0]);
stream<<"<";
for(int i=0;i<nn;i++){
stream<<buf[i];
if(i<nn-1) stream<<",";
}
stream<<">";
return stream;
}
inline std::ostream& operator<< (std::ostream& stream, const vComplexD &o){
int nn=vComplexD::Nsimd();
std::vector<ComplexD,alignedAllocator<ComplexD> > buf(nn);
vstore(o,&buf[0]);
stream<<"<";
for(int i=0;i<nn;i++){
stream<<buf[i];
if(i<nn-1) stream<<",";
}
stream<<">";
return stream;
}
inline std::ostream& operator<< (std::ostream& stream, const vRealF &o){
int nn=vRealF::Nsimd();
std::vector<RealF,alignedAllocator<RealF> > buf(nn);
vstore(o,&buf[0]);
stream<<"<";
for(int i=0;i<nn;i++){
stream<<buf[i];
if(i<nn-1) stream<<",";
}
stream<<">";
return stream;
}
inline std::ostream& operator<< (std::ostream& stream, const vRealD &o){
int nn=vRealD::Nsimd();
std::vector<RealD,alignedAllocator<RealD> > buf(nn);
vstore(o,&buf[0]);
stream<<"<";
for(int i=0;i<nn;i++){
stream<<buf[i];
if(i<nn-1) stream<<",";
}
stream<<">";
return stream;
}
inline std::ostream& operator<< (std::ostream& stream, const vInteger &o){
int nn=vInteger::Nsimd();
std::vector<Integer,alignedAllocator<Integer> > buf(nn);
vstore(o,&buf[0]);
stream<<"<";
for(int i=0;i<nn;i++){
stream<<buf[i];
if(i<nn-1) stream<<",";
}
stream<<">";
return stream;
}
}
#endif

37
Grid/simd/l1p.h Normal file
@ -0,0 +1,37 @@
#pragma once
namespace Grid {
// L1p optimisation
inline void bgq_l1p_optimisation(int mode)
{
#ifdef QPX
#undef L1P_CFG_PF_USR
#define L1P_CFG_PF_USR (0x3fde8000108ll) /* (64 bit reg, 23 bits wide, user/unpriv) */
uint64_t cfg_pf_usr;
if ( mode ) {
cfg_pf_usr =
L1P_CFG_PF_USR_ifetch_depth(0)
| L1P_CFG_PF_USR_ifetch_max_footprint(1)
| L1P_CFG_PF_USR_pf_stream_est_on_dcbt
| L1P_CFG_PF_USR_pf_stream_establish_enable
| L1P_CFG_PF_USR_pf_stream_optimistic
| L1P_CFG_PF_USR_pf_adaptive_throttle(0xF) ;
// if ( sizeof(Float) == sizeof(double) ) {
cfg_pf_usr |= L1P_CFG_PF_USR_dfetch_depth(2)| L1P_CFG_PF_USR_dfetch_max_footprint(3) ;
// } else {
// cfg_pf_usr |= L1P_CFG_PF_USR_dfetch_depth(1)| L1P_CFG_PF_USR_dfetch_max_footprint(2) ;
// }
} else {
cfg_pf_usr = L1P_CFG_PF_USR_dfetch_depth(1)
| L1P_CFG_PF_USR_dfetch_max_footprint(2)
| L1P_CFG_PF_USR_ifetch_depth(0)
| L1P_CFG_PF_USR_ifetch_max_footprint(1)
| L1P_CFG_PF_USR_pf_stream_est_on_dcbt
| L1P_CFG_PF_USR_pf_stream_establish_enable
| L1P_CFG_PF_USR_pf_stream_optimistic
| L1P_CFG_PF_USR_pf_stream_prefetch_enable;
}
*((uint64_t *)L1P_CFG_PF_USR) = cfg_pf_usr;
#endif
}
}
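// Typical use (a sketch, assuming a QPX build; dslash_kernel is a placeholder):
// raise the BG/Q L1p data-prefetch depth and footprint around a memory-bound
// kernel, then restore the more conservative configuration afterwards.
#if 0
void l1p_example(void)
{
  Grid::bgq_l1p_optimisation(1);   // deeper dfetch depth / larger footprint
  // dslash_kernel();              // memory-bound loop goes here
  Grid::bgq_l1p_optimisation(0);   // back to the conservative settings
}
#endif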