mirror of https://github.com/paboyle/Grid.git synced 2025-06-17 15:27:06 +01:00

Hadrons: moving Hadrons to root directory, build system improvements

2018-08-28 15:00:40 +01:00
parent 5f206df775
commit fb7d021b9d
499 changed files with 429 additions and 846 deletions

796
Grid/simd/BGQQPX.h Normal file

@@ -0,0 +1,796 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/simd/BGQQPX.h
Copyright (C) 2015
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_ASM_BGQ_QPX_H
#define GRID_ASM_BGQ_QPX_H
#include <stddint.h>
/*********************************************************
* Architectural macros
*********************************************************/
#define VLOADf(OFF,PTR,DEST)       "qvlfsux    " #DEST "," #OFF "," #PTR " ;\n"
#define VLOADd(OFF,PTR,DEST)       "qvlfdux    " #DEST "," #OFF "," #PTR " ;\n"
#define VSTOREf(OFF,PTR,SRC)       "qvstfsux   " #SRC "," #OFF "," #PTR " ;\n"
#define VSTOREd(OFF,PTR,SRC)       "qvstfdux   " #SRC "," #OFF "," #PTR " ;\n"
#define VSPLATf(A,B,DEST)          "qvlfcsxa   " #DEST "," #A "," #B ";\n"
#define VSPLATd(A,B,DEST)          "qvlfcdxa   " #DEST "," #A "," #B ";\n"
#define LOAD64(A,ptr)
#define VZERO(DEST) "qvfclr " #DEST "; \n"
#define VONE(DEST)                 "qvfset     " #DEST "; \n"
#define VNEG(SRC,DEST)             "qvfneg     " #DEST "," #SRC "; \n"
#define VMOV(A,DEST)               "qvfmr      " #DEST "," #A ";\n"
#define VADD(A,B,DEST) "qvfadd " #DEST "," #A "," #B ";\n"
#define VSUB(A,B,DEST) "qvfsub " #DEST "," #A "," #B ";\n"
#define VMUL(A,B,DEST) "qvfmul " #DEST "," #A "," #B ";\n"
#define VMUL_RR_RI(A,B,DEST) "qvfxmul " #DEST "," #A "," #B ";\n"
#define VMADD(A,B,C,DEST) "qvfmadd " #DEST "," #A "," #B ","#C ";\n"
#define VMADD_RR_RI(A,B,C,DEST) "qvfxmadd " #DEST "," #A "," #B ","#C ";\n"
#define VMADD_MII_IR(A,B,C,DEST) "qvfxxnpmadd " #DEST "," #A "," #B ","#C ";\n"
#define VMADD_II_MIR(A,B,C,DEST) "qvfmadd " #DEST "," #A "," #B ","#C ";\n"
#define CACHE_LOCK(PTR)   asm (" dcbtls %%r0, %0 \n" : : "r" (PTR) );
#define CACHE_UNLOCK(PTR) asm (" dcblc  %%r0, %0 \n" : : "r" (PTR) );
#define CACHE_FLUSH(PTR)  asm (" dcbf   %%r0, %0 \n" : : "r" (PTR) );
#define CACHE_TOUCH(PTR)  asm (" dcbt   %%r0, %0 \n" : : "r" (PTR) );
// Gauge field locking: 2 x 9 complex = 18 words per link, i.e. 18*8 / 18*16 bytes in single/double
// This is 144/288 bytes per link, i.e. 4.5 / 9 32-byte lines
#define MASK_REGS /*NOOP ON BGQ*/
#define PF_GAUGE(A) /*NOOP ON BGQ*/
#define PREFETCH1_CHIMU(base) /*NOOP ON BGQ*/
#define PREFETCH_CHIMU(base) /*NOOP ON BGQ*/
/*********************************************************
* Register definitions
*********************************************************/
#define psi_00 0
#define psi_01 1
#define psi_02 2
#define psi_10 3
#define psi_11 4
#define psi_12 5
#define psi_20 6
#define psi_21 7
#define psi_22 8
#define psi_30 9
#define psi_31 10
#define psi_32 11
#define Chi_00 12
#define Chi_01 13
#define Chi_02 14
#define Chi_10 15
#define Chi_11 16
#define Chi_12 17
#define UChi_00 18
#define UChi_01 19
#define UChi_02 20
#define UChi_10 21
#define UChi_11 22
#define UChi_12 23
#define U0 24
#define U1 25
#define U2 26
#define one 27
#define REP %%r16
#define IMM %%r17
/*Alias regs*/
#define Chimu_00 Chi_00
#define Chimu_01 Chi_01
#define Chimu_02 Chi_02
#define Chimu_10 Chi_10
#define Chimu_11 Chi_11
#define Chimu_12 Chi_12
#define Chimu_20 UChi_00
#define Chimu_21 UChi_01
#define Chimu_22 UChi_02
#define Chimu_30 UChi_10
#define Chimu_31 UChi_11
#define Chimu_32 UChi_12
/*********************************************************
* Macro sequences encoding QCD
*********************************************************/
#define LOCK_GAUGE(dir) \
{ \
uint8_t *byte_addr = (uint8_t *)&U._odata[sU](dir); \
for(int i=0;i< 18*2*BYTES_PER_WORD*8;i+=32){ \
CACHE_LOCK(&byte_addr[i]); \
} \
}
#define UNLOCK_GAUGE(dir) \
{ \
uint8_t *byte_addr = (uint8_t *)&U._odata[sU](dir); \
for(int i=0;i< 18*2*BYTES_PER_WORD*8;i+=32){ \
CACHE_UNLOCK(&byte_addr[i]); \
} \
}
#define MAYBEPERM(A,B)
#define PERMUTE_DIR3
#define PERMUTE_DIR2
#define PERMUTE_DIR1
#define PERMUTE_DIR0
#define MULT_2SPIN_DIR_PFXP(A,p) MULT_2SPIN(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFYP(A,p) MULT_2SPIN(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFZP(A,p) MULT_2SPIN(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFTP(A,p) MULT_2SPIN(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFXM(A,p) MULT_2SPIN(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFYM(A,p) MULT_2SPIN(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFZM(A,p) MULT_2SPIN(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFTM(A,p) MULT_2SPIN(&U._odata[sU](A),p)
#define MULT_SPIN(ptr,p) { \
  uint64_t ub = ((uint64_t)ptr); \
asm ( \
VLOAD(%0,%3,U0) \
VLOAD(%1,%3,U1) \
VLOAD(%2,%3,U2) \
VMUL_RR_RI(U0,Chi_00,UChi_00) \
VMUL_RR_RI(U1,Chi_00,UChi_01) \
VMUL_RR_RI(U2,Chi_00,UChi_02) \
VMUL_RR_RI(U0,Chi_10,UChi_10) \
VMUL_RR_RI(U1,Chi_10,UChi_11) \
VMUL_RR_RI(U2,Chi_10,UChi_12) \
VMADD_MII_IR(U0,Chi_00,UChi_00,UChi_00) \
VMADD_MII_IR(U1,Chi_00,UChi_01,UChi_01) \
VMADD_MII_IR(U2,Chi_00,UChi_02,UChi_02) \
VMADD_MII_IR(U0,Chi_10,UChi_10,UChi_10) \
VMADD_MII_IR(U1,Chi_10,UChi_11,UChi_11) \
VMADD_MII_IR(U2,Chi_10,UChi_12,UChi_12) \
: : "r" (0), "r" (32*3), "r" (32*6), "r" (ub )); \
asm ( \
VLOAD(%0,%3,U0) \
VLOAD(%1,%3,U1) \
VLOAD(%2,%3,U2) \
VMADD_RR_RI(U0,Chi_01,UChi_00,UChi_00) \
VMADD_RR_RI(U1,Chi_01,UChi_01,UChi_01) \
VMADD_RR_RI(U2,Chi_01,UChi_02,UChi_02) \
VMADD_RR_RI(U0,Chi_11,UChi_10,UChi_10) \
VMADD_RR_RI(U1,Chi_11,UChi_11,UChi_11) \
VMADD_RR_RI(U2,Chi_11,UChi_12,UChi_12) \
VMADD_MII_IR(U0,Chi_01,UChi_00,UChi_00) \
VMADD_MII_IR(U1,Chi_01,UChi_01,UChi_01) \
VMADD_MII_IR(U2,Chi_01,UChi_02,UChi_02) \
VMADD_MII_IR(U0,Chi_11,UChi_10,UChi_10) \
VMADD_MII_IR(U1,Chi_11,UChi_11,UChi_11) \
VMADD_MII_IR(U2,Chi_11,UChi_12,UChi_12) \
: : "r" (32), "r" (32*4), "r" (32*7), "r" (ub )); \
asm ( \
VLOAD(%0,%3,U0) \
VLOAD(%1,%3,U1) \
VLOAD(%2,%3,U2) \
VMADD_RR_RI(U0,Chi_02,UChi_00,UChi_00) \
VMADD_RR_RI(U1,Chi_02,UChi_01,UChi_01) \
VMADD_RR_RI(U2,Chi_02,UChi_02,UChi_02) \
VMADD_RR_RI(U0,Chi_12,UChi_10,UChi_10) \
VMADD_RR_RI(U1,Chi_12,UChi_11,UChi_11) \
VMADD_RR_RI(U2,Chi_12,UChi_12,UChi_12) \
VMADD_MII_IR(U0,Chi_02,UChi_00,UChi_00) \
VMADD_MII_IR(U1,Chi_02,UChi_01,UChi_01) \
VMADD_MII_IR(U2,Chi_02,UChi_02,UChi_02) \
VMADD_MII_IR(U0,Chi_12,UChi_10,UChi_10) \
VMADD_MII_IR(U1,Chi_12,UChi_11,UChi_11) \
VMADD_MII_IR(U2,Chi_12,UChi_12,UChi_12) \
: : "r" (32*2), "r" (32*5), "r" (32*8), "r" (ub )); \
}
#define SAVE_RESULT(base,basep) {\
uint64_t ub = ((uint64_t)base) - 32; \
asm("mr %0,"REP";\n\t" \
"li " IMM ",32;\n\t" \
VSTORE(IMM,REP,psi_00) \
VSTORE(IMM,REP,psi_01) \
VSTORE(IMM,REP,psi_02) \
VSTORE(IMM,REP,psi_10) \
VSTORE(IMM,REP,psi_11) \
VSTORE(IMM,REP,psi_12) \
VSTORE(IMM,REP,psi_20) \
VSTORE(IMM,REP,psi_21) \
VSTORE(IMM,REP,psi_22) \
VSTORE(IMM,REP,psi_30) \
VSTORE(IMM,REP,psi_31) \
VSTORE(IMM,REP,psi_32) \
); \
}
/*
 * Annoying: BG/Q loads have no immediate indexing, and there is a big
 * performance hit when a second miss to an L1 line occurs.
 */
#define LOAD_CHI(base) { \
uint64_t ub = ((uint64_t)base) - 64; \
asm("mr %0,"REP";\n\t" \
"li " IMM ",64;\n\t" \
VLOAD(IMM,REP,Chi_00) \
VLOAD(IMM,REP,Chi_02) \
VLOAD(IMM,REP,Chi_11) : : "r" (ub) ); \
ub = ((uint64_t)base) - 32; \
asm("mr %0,"REP";\n\t" \
"li IMM,64;\n\t" \
VLOAD(IMM,REP,Chimu_01) \
VLOAD(IMM,REP,Chimu_10) \
VLOAD(IMM,REP,Chimu_12) : : "r" (ub) ); \
}
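The qvlfsux/qvlfdux loads used above are update-form: they have no immediate offset, so LOAD_CHI biases the base pointer downwards and lets every load add the stride register back before touching memory. A minimal scalar sketch of the resulting address pattern, assuming 32-byte QPX vectors and the base-64 / base-32 biasing above (the function name is illustrative, not Grid API):

#include <cstdint>
#include <cstdio>

// Sketch only: the effective addresses generated by the two asm blocks in
// LOAD_CHI, assuming each update-form load adds the stride to the base
// register before performing a 32-byte access.
inline void load_chi_address_pattern(uint64_t base) {
  uint64_t rep = base - 64;                  // first asm block
  const uint64_t imm = 64;
  for (int i = 0; i < 3; i++) {              // Chi_00, Chi_02, Chi_11
    rep += imm;                              // base, base+64, base+128
    std::printf("even stream load at 0x%llx\n", (unsigned long long)rep);
  }
  rep = base - 32;                           // second asm block
  for (int i = 0; i < 3; i++) {              // Chi_01, Chi_10, Chi_12
    rep += imm;                              // base+32, base+96, base+160
    std::printf("odd stream load at 0x%llx\n", (unsigned long long)rep);
  }
}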
#define LOAD_CHIMU(base) { \
uint64_t ub = ((uint64_t)base) - 64; \
asm("mr %0,"REP";\n\t" \
"li IMM,64;\n\t" \
VLOAD(IMM,REP,Chimu_00) \
VLOAD(IMM,REP,Chimu_02) \
VLOAD(IMM,REP,Chimu_11) \
VLOAD(IMM,REP,Chimu_20) \
VLOAD(IMM,REP,Chimu_22) \
VLOAD(IMM,REP,Chimu_31) : : "r" (ub) ); \
ub = ((uint64_t)base) - 32; \
asm("mr %0,"REP";\n\t" \
"li IMM,64;\n\t" \
VLOAD(IMM,REP,Chimu_01) \
VLOAD(IMM,REP,Chimu_10) \
VLOAD(IMM,REP,Chimu_12) \
VLOAD(IMM,REP,Chimu_21) \
VLOAD(IMM,REP,Chimu_30) \
VLOAD(IMM,REP,Chimu_32) : : "r" (ub) ); \
}
// hspin(0)=fspin(0)+timesI(fspin(3));
// hspin(1)=fspin(1)+timesI(fspin(2));
#define XP_PROJMEM(base) { \
LOAD_CHIMU(base); \
asm ( \
VONE(one) \
VMADD_MII_IR(one,Chimu_30,Chimu_00,Chi_00) \
VMADD_MII_IR(one,Chimu_31,Chimu_01,Chi_01) \
VMADD_MII_IR(one,Chimu_32,Chimu_02,Chi_02) \
VMADD_MII_IR(one,Chimu_20,Chimu_10,Chi_10) \
VMADD_MII_IR(one,Chimu_21,Chimu_11,Chi_11) \
VMADD_MII_IR(one,Chimu_22,Chimu_12,Chi_12) \
); \
}
#define XM_PROJMEM(base) { \
LOAD_CHIMU(base); \
asm ( \
VONE(one) \
VMADD_II_MIR(one,Chimu_30,Chimu_00,Chi_00) \
VMADD_II_MIR(one,Chimu_31,Chimu_01,Chi_01) \
VMADD_II_MIR(one,Chimu_32,Chimu_02,Chi_02) \
VMADD_II_MIR(one,Chimu_20,Chimu_10,Chi_10) \
VMADD_II_MIR(one,Chimu_21,Chimu_11,Chi_11) \
VMADD_II_MIR(one,Chimu_22,Chimu_12,Chi_12) \
); \
}
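For reference, the spin projection encoded by XP_PROJMEM / XM_PROJMEM (see the hspin/fspin comment above) reduces to the following scalar form; this is an illustrative sketch with made-up names, not part of the kernel:

#include <complex>

// Scalar sketch of the X+/X- projections, per colour component:
//   X+ : hspin(0) = fspin(0) + i*fspin(3),  hspin(1) = fspin(1) + i*fspin(2)
//   X- : the same with -i.
using cplx = std::complex<double>;
inline void x_project(const cplx fspin[4][3], cplx hspin[2][3], int sign) {
  const cplx I(0.0, sign * 1.0);
  for (int c = 0; c < 3; c++) {
    hspin[0][c] = fspin[0][c] + I * fspin[3][c];
    hspin[1][c] = fspin[1][c] + I * fspin[2][c];
  }
}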
// hspin(0)=fspin(0)-fspin(3);
// hspin(1)=fspin(1)+fspin(2);
#define YP_PROJMEM(base) { \
LOAD_CHIMU(base); \
asm ( \
VSUB(Chimu_00,Chimu_00,Chi_30) \
VSUB(Chimu_01,Chimu_01,Chi_31) \
VSUB(Chimu_02,Chimu_02,Chi_32) \
VADD(Chimu_10,Chimu_10,Chi_20) \
VADD(Chimu_11,Chimu_11,Chi_21) \
VADD(Chimu_12,Chimu_12,Chi_22) \
); \
}
#define YM_PROJMEM(base) { \
LOAD_CHIMU(base); \
asm ( \
VADD(Chimu_00,Chimu_00,Chi_30) \
VADD(Chimu_01,Chimu_01,Chi_31) \
VADD(Chimu_02,Chimu_02,Chi_32) \
VSUB(Chimu_10,Chimu_10,Chi_20) \
VSUB(Chimu_11,Chimu_11,Chi_21) \
VSUB(Chimu_12,Chimu_12,Chi_22) \
); \
}
/*Gz
* 0 0 i 0 [0]+-i[2]
* 0 0 0 -i [1]-+i[3]
* -i 0 0 0
* 0 i 0 0
*/
#define ZP_PROJMEM(base) { \
LOAD_CHIMU(base); \
asm ( \
VONE(one) \
VMADD_MII_IR(one,Chimu_20,Chimu_00,Chi_00) \
VMADD_MII_IR(one,Chimu_21,Chimu_01,Chi_01) \
VMADD_MII_IR(one,Chimu_22,Chimu_02,Chi_02) \
VMADD_II_MIR(one,Chimu_30,Chimu_10,Chi_10) \
VMADD_II_MIR(one,Chimu_31,Chimu_11,Chi_11) \
VMADD_II_MIR(one,Chimu_32,Chimu_12,Chi_12) \
); \
}
#define ZM_PROJMEM(base) { \
LOAD_CHIMU(base); \
asm ( \
VONE(one) \
VMADD_II_MIR(one,Chimu_20,Chimu_00,Chi_00) \
VMADD_II_MIR(one,Chimu_21,Chimu_01,Chi_01) \
VMADD_II_MIR(one,Chimu_22,Chimu_02,Chi_02) \
VMADD_MII_IR(one,Chimu_30,Chimu_10,Chi_10) \
VMADD_MII_IR(one,Chimu_31,Chimu_11,Chi_11) \
VMADD_MII_IR(one,Chimu_32,Chimu_12,Chi_12) \
); \
}
/*Gt
* 0 0 1 0 [0]+-[2]
* 0 0 0 1 [1]+-[3]
* 1 0 0 0
* 0 1 0 0
*/
#define TP_PROJMEM(base) { \
LOAD_CHIMU(base); \
asm ( \
VADD(Chimu_00,Chimu_00,Chi_20) \
VADD(Chimu_01,Chimu_01,Chi_21) \
VADD(Chimu_02,Chimu_02,Chi_22) \
VADD(Chimu_10,Chimu_10,Chi_30) \
VADD(Chimu_11,Chimu_11,Chi_31) \
VADD(Chimu_12,Chimu_12,Chi_32) \
); \
}
#define TM_PROJMEM(base) { \
LOAD_CHIMU(base); \
asm ( \
VSUB(Chimu_00,Chimu_00,Chi_20) \
VSUB(Chimu_01,Chimu_01,Chi_21) \
VSUB(Chimu_02,Chimu_02,Chi_22) \
VSUB(Chimu_10,Chimu_10,Chi_30) \
VSUB(Chimu_11,Chimu_11,Chi_31) \
VSUB(Chimu_12,Chimu_12,Chi_32) \
); \
}
/*
fspin(0)=hspin(0);
fspin(1)=hspin(1);
fspin(2)=timesMinusI(hspin(1));
fspin(3)=timesMinusI(hspin(0));
fspin(0)+=hspin(0);
fspin(1)+=hspin(1);
fspin(2)-=timesI(hspin(1));
fspin(3)-=timesI(hspin(0));
*/
#define XP_RECON { \
asm(\
VONE(one)\
VMOV(psi_00,UChi_00) VMOV(psi_01,UChi_01) VMOV(psi_02,UChi_02)\
VMOV(psi_10,UChi_10) VMOV(psi_11,UChi_11) VMOV(psi_12,UChi_12)\
VZERO(psi_20) VZERO(psi_21) VZERO(psi_22) \
VZERO(psi_30) VZERO(psi_31) VZERO(psi_32) \
VMADD_II_MIR(one,UChi_10,psi_20,psi_20) \
VMADD_II_MIR(one,UChi_11,psi_21,psi_21) \
VMADD_II_MIR(one,UChi_12,psi_22,psi_22) \
VMADD_II_MIR(one,UChi_00,psi_30,psi_30) \
VMADD_II_MIR(one,UChi_01,psi_31,psi_31) \
VMADD_II_MIR(one,UChi_02,psi_32,psi_32) \
); \
}
#define XM_RECON { \
asm(\
VONE(one)\
VMOV(psi_00,UChi_00) VMOV(psi_01,UChi_01) VMOV(psi_02,UChi_02)\
VMOV(psi_10,UChi_10) VMOV(psi_11,UChi_11) VMOV(psi_12,UChi_12)\
VZERO(psi_20) VZERO(psi_21) VZERO(psi_22) \
VZERO(psi_30) VZERO(psi_31) VZERO(psi_32) \
VMADD_MII_IR(one,UChi_10,psi_20,psi_20) \
VMADD_MII_IR(one,UChi_11,psi_21,psi_21) \
VMADD_MII_IR(one,UChi_12,psi_22,psi_22) \
VMADD_MII_IR(one,UChi_00,psi_30,psi_30) \
VMADD_MII_IR(one,UChi_01,psi_31,psi_31) \
VMADD_MII_IR(one,UChi_02,psi_32,psi_32) \
); \
}
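The reconstruction macros follow the recipe spelled out in the comment above XP_RECON; a scalar sketch of the X+ case (names illustrative only):

#include <complex>

// Scalar sketch of XP_RECON: copy the half spinor into the upper two spin
// components and set the lower two to -i times the swapped upper components.
using cplxd = std::complex<double>;
inline void xp_reconstruct(const cplxd hspin[2][3], cplxd fspin[4][3]) {
  const cplxd minus_i(0.0, -1.0);
  for (int c = 0; c < 3; c++) {
    fspin[0][c] = hspin[0][c];
    fspin[1][c] = hspin[1][c];
    fspin[2][c] = minus_i * hspin[1][c];
    fspin[3][c] = minus_i * hspin[0][c];
  }
}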
#define XP_RECON_ACCUM { \
asm(\
VONE(one)\
VADD(psi_00,UChi_00,psi_00) VADD(psi_01,UChi_01,psi_01) VADD(psi_02,UChi_02,psi_02) \
VADD(psi_10,UChi_10,psi_10) VADD(psi_11,UChi_11,psi_11) VADD(psi_12,UChi_12,psi_12) \
VMADD_II_MIR(one,UChi_10,psi_20,psi_20) \
VMADD_II_MIR(one,UChi_11,psi_21,psi_21) \
VMADD_II_MIR(one,UChi_12,psi_22,psi_22) \
VMADD_II_MIR(one,UChi_00,psi_30,psi_30) \
VMADD_II_MIR(one,UChi_01,psi_31,psi_31) \
VMADD_II_MIR(one,UChi_02,psi_32,psi_32) \
); \
}
#define XM_RECON_ACCUM { \
asm(\
VONE(one)\
VADD(psi_00,UChi_00,psi_00) VADD(psi_01,UChi_01,psi_01) VADD(psi_02,UChi_02,psi_02) \
VADD(psi_10,UChi_10,psi_10) VADD(psi_11,UChi_11,psi_11) VADD(psi_12,UChi_12,psi_12) \
VMADD_MII_IR(one,UChi_10,psi_20,psi_20) \
VMADD_MII_IR(one,UChi_11,psi_21,psi_21) \
VMADD_MII_IR(one,UChi_12,psi_22,psi_22) \
VMADD_MII_IR(one,UChi_00,psi_30,psi_30) \
VMADD_MII_IR(one,UChi_01,psi_31,psi_31) \
VMADD_MII_IR(one,UChi_02,psi_32,psi_32) \
); \
}
// fspin(2)+=hspin(1);
// fspin(3)-=hspin(0);
#define YP_RECON_ACCUM {\
asm(\
VADD(psi_00,UChi_00,psi_00) VADD(psi_01,UChi_01,psi_01) VADD(psi_02,UChi_02,psi_02) \
VADD(psi_10,UChi_10,psi_10) VADD(psi_11,UChi_11,psi_11) VADD(psi_12,UChi_12,psi_12) \
VADD(psi_20,UChi_10,psi_20) VADD(psi_21,UChi_11,psi_21) VADD(psi_22,UChi_12,psi_22) \
VSUB(psi_30,UChi_00,psi_30) VSUB(psi_31,UChi_01,psi_31) VSUB(psi_32,UChi_02,psi_32) \
);\
}
#define YM_RECON_ACCUM {\
asm(\
VADD(psi_00,UChi_00,psi_00) VADD(psi_01,UChi_01,psi_01) VADD(psi_02,UChi_02,psi_02) \
VADD(psi_10,UChi_10,psi_10) VADD(psi_11,UChi_11,psi_11) VADD(psi_12,UChi_12,psi_12) \
VSUB(psi_20,UChi_10,psi_20) VSUB(psi_21,UChi_11,psi_21) VSUB(psi_22,UChi_12,psi_22) \
VADD(psi_30,UChi_00,psi_30) VADD(psi_31,UChi_01,psi_31) VADD(psi_32,UChi_02,psi_32) \
);\
}
// fspin(2)-=timesI(hspin(0));
// fspin(3)+=timesI(hspin(1));
#define ZP_RECON_ACCUM {\
asm(\
VONE(one)\
VADD(psi_00,UChi_00,psi_00) VADD(psi_01,UChi_01,psi_01) VADD(psi_02,UChi_02,psi_02) \
VADD(psi_10,UChi_10,psi_10) VADD(psi_11,UChi_11,psi_11) VADD(psi_12,UChi_12,psi_12) \
VMADD_II_MIR(one,UChi_00,psi_20,psi_20) \
VMADD_II_MIR(one,UChi_01,psi_21,psi_21) \
VMADD_II_MIR(one,UChi_02,psi_22,psi_22) \
VMADD_MII_IR(one,UChi_10,psi_30,psi_30) \
VMADD_MII_IR(one,UChi_11,psi_31,psi_31) \
VMADD_MII_IR(one,UChi_12,psi_32,psi_32) \
);\
}
#define ZM_RECON_ACCUM {\
asm(\
VONE(one)\
VADD(psi_00,UChi_00,psi_00) VADD(psi_01,UChi_01,psi_01) VADD(psi_02,UChi_02,psi_02) \
VADD(psi_10,UChi_10,psi_10) VADD(psi_11,UChi_11,psi_11) VADD(psi_12,UChi_12,psi_12) \
VMADD_MII_IR(one,UChi_00,psi_20,psi_20) \
VMADD_MII_IR(one,UChi_01,psi_21,psi_21) \
VMADD_MII_IR(one,UChi_02,psi_22,psi_22) \
VMADD_II_MIR(one,UChi_10,psi_30,psi_30) \
VMADD_II_MIR(one,UChi_11,psi_31,psi_31) \
VMADD_II_MIR(one,UChi_12,psi_32,psi_32) \
);\
}
// fspin(2)+=hspin(0);
// fspin(3)+=hspin(1);
#define TP_RECON_ACCUM {\
asm(\
VADD(psi_00,UChi_00,psi_00) VADD(psi_01,UChi_01,psi_01) VADD(psi_02,UChi_02,psi_02) \
VADD(psi_10,UChi_10,psi_10) VADD(psi_11,UChi_11,psi_11) VADD(psi_12,UChi_12,psi_12) \
VADD(psi_20,UChi_00,psi_20) VADD(psi_21,UChi_01,psi_21) VADD(psi_22,UChi_02,psi_22) \
VADD(psi_30,UChi_10,psi_30) VADD(psi_31,UChi_11,psi_31) VADD(psi_32,UChi_12,psi_32) \
);\
}
#define TM_RECON_ACCUM {\
asm(\
VONE(one)\
VADD(psi_00,UChi_00,psi_00) VADD(psi_01,UChi_01,psi_01) VADD(psi_02,UChi_02,psi_02) \
VADD(psi_10,UChi_10,psi_10) VADD(psi_11,UChi_11,psi_11) VADD(psi_12,UChi_12,psi_12) \
VSUB(psi_20,UChi_00,psi_20) VSUB(psi_21,UChi_01,psi_21) VSUB(psi_22,UChi_02,psi_22) \
VSUB(psi_30,UChi_10,psi_30) VSUB(psi_31,UChi_11,psi_31) VSUB(psi_32,UChi_12,psi_32) \
);\
}
uint64_t GetPFInfo(int nent,int plocal);
uint64_t GetInfo(int ptype,int local,int perm,int Xp,int ent,int plocal);
#define COMPLEX_TYPE int
int signs[4];
void testme(int osites,int ssU)
{
int ss, local, perm, ptype;
uint64_t base;
uint64_t basep;
const uint64_t plocal =(uint64_t) & in._odata[0];
// vComplexF isigns[2] = { signs[0], signs[1] };
//COMPLEX_TYPE is vComplexF or vComplexD depending
//on the chosen precision
COMPLEX_TYPE *isigns = &signs[0];
MASK_REGS;
int nmax=osites;
for(int site=0;site<Ns;site++) {
int sU =ssU;
int ssn=ssU+1;
if(ssn>=nmax) ssn=0;
int sUn=ssn;
for(int s=0;s<Ls;s++) {
ss =sU*Ls+s;
ssn=sUn*Ls+s;
////////////////////////////////
// Xp
////////////////////////////////
int ent=ss*8;// 2*Ndim
int nent=ssn*8;
PF_GAUGE(Xp);
base = GetInfo(ptype,local,perm,Xp,ent,plocal); ent++;
PREFETCH1_CHIMU(base);
basep = GetPFInfo(nent,plocal); nent++;
if ( local ) {
LOAD64(%r10,isigns);
#ifdef KERNEL_DAG
XP_PROJMEM(base);
#else
XM_PROJMEM(base);
#endif
MAYBEPERM(PERMUTE_DIR3,perm);
} else {
LOAD_CHI(base);
}
base = GetInfo(ptype,local,perm,Yp,ent,plocal); ent++;
PREFETCH_CHIMU(base);
{
MULT_2SPIN_DIR_PFXP(Xp,basep);
}
LOAD64(%r10,isigns);
#ifdef KERNEL_DAG
XP_RECON;
#else
XM_RECON;
#endif
////////////////////////////////
// Yp
////////////////////////////////
basep = GetPFInfo(nent,plocal); nent++;
if ( local ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
YP_PROJMEM(base);
#else
YM_PROJMEM(base);
#endif
MAYBEPERM(PERMUTE_DIR2,perm);
} else {
LOAD_CHI(base);
}
base = GetInfo(ptype,local,perm,Zp,ent,plocal); ent++;
PREFETCH_CHIMU(base);
{
MULT_2SPIN_DIR_PFYP(Yp,basep);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
YP_RECON_ACCUM;
#else
YM_RECON_ACCUM;
#endif
////////////////////////////////
// Zp
////////////////////////////////
basep = GetPFInfo(nent,plocal); nent++;
if ( local ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
ZP_PROJMEM(base);
#else
ZM_PROJMEM(base);
#endif
MAYBEPERM(PERMUTE_DIR1,perm);
} else {
LOAD_CHI(base);
}
base = GetInfo(ptype,local,perm,Tp,ent,plocal); ent++;
PREFETCH_CHIMU(base);
{
MULT_2SPIN_DIR_PFZP(Zp,basep);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
ZP_RECON_ACCUM;
#else
ZM_RECON_ACCUM;
#endif
////////////////////////////////
// Tp
////////////////////////////////
basep = GetPFInfo(nent,plocal); nent++;
if ( local ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
TP_PROJMEM(base);
#else
TM_PROJMEM(base);
#endif
MAYBEPERM(PERMUTE_DIR0,perm);
} else {
LOAD_CHI(base);
}
base = GetInfo(ptype,local,perm,Xm,ent,plocal); ent++;
PREFETCH_CHIMU(base);
{
MULT_2SPIN_DIR_PFTP(Tp,basep);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
TP_RECON_ACCUM;
#else
TM_RECON_ACCUM;
#endif
////////////////////////////////
// Xm
////////////////////////////////
#ifndef STREAM_STORE
basep= (uint64_t) &out._odata[ss];
#endif
// basep= GetPFInfo(nent,plocal); nent++;
if ( local ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
XM_PROJMEM(base);
#else
XP_PROJMEM(base);
#endif
MAYBEPERM(PERMUTE_DIR3,perm);
} else {
LOAD_CHI(base);
}
base = GetInfo(ptype,local,perm,Ym,ent,plocal); ent++;
PREFETCH_CHIMU(base);
{
MULT_2SPIN_DIR_PFXM(Xm,basep);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
XM_RECON_ACCUM;
#else
XP_RECON_ACCUM;
#endif
////////////////////////////////
// Ym
////////////////////////////////
basep= GetPFInfo(nent,plocal); nent++;
if ( local ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
YM_PROJMEM(base);
#else
YP_PROJMEM(base);
#endif
MAYBEPERM(PERMUTE_DIR2,perm);
} else {
LOAD_CHI(base);
}
base = GetInfo(ptype,local,perm,Zm,ent,plocal); ent++;
PREFETCH_CHIMU(base);
{
MULT_2SPIN_DIR_PFYM(Ym,basep);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
YM_RECON_ACCUM;
#else
YP_RECON_ACCUM;
#endif
////////////////////////////////
// Zm
////////////////////////////////
basep= GetPFInfo(nent,plocal); nent++;
if ( local ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
ZM_PROJMEM(base);
#else
ZP_PROJMEM(base);
#endif
MAYBEPERM(PERMUTE_DIR1,perm);
} else {
LOAD_CHI(base);
}
base = GetInfo(ptype,local,perm,Tm,ent,plocal); ent++;
PREFETCH_CHIMU(base);
{
MULT_2SPIN_DIR_PFZM(Zm,basep);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
ZM_RECON_ACCUM;
#else
ZP_RECON_ACCUM;
#endif
////////////////////////////////
// Tm
////////////////////////////////
basep= GetPFInfo(nent,plocal); nent++;
if ( local ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
TM_PROJMEM(base);
#else
TP_PROJMEM(base);
#endif
MAYBEPERM(PERMUTE_DIR0,perm);
} else {
LOAD_CHI(base);
}
base= (uint64_t) &out._odata[ss];
#ifndef STREAM_STORE
PREFETCH_CHIMU(base);
#endif
{
MULT_2SPIN_DIR_PFTM(Tm,basep);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
TM_RECON_ACCUM;
#else
TP_RECON_ACCUM;
#endif
basep= GetPFInfo(nent,plocal); nent++;
SAVE_RESULT(base,basep);
}
ssU++;
}
}
#endif

769
Grid/simd/Grid_avx.h Normal file

@@ -0,0 +1,769 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/simd/Grid_avx.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Guido Cossu <cossu@iroiro-pc.kek.jp>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: neo <cossu@post.kek.jp>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <immintrin.h>
#ifdef AVXFMA4
#include <x86intrin.h>
#endif
// _mm256_set_m128i(hi,lo); // not defined in all versions of immintrin.h
#ifndef _mm256_set_m128i
#define _mm256_set_m128i(hi,lo) _mm256_insertf128_si256(_mm256_castsi128_si256(lo),(hi),1)
#endif
namespace Grid {
namespace Optimization {
template<class vtype>
union uconv {
__m256 f;
vtype v;
};
union u256f {
__m256 v;
float f[8];
};
union u256d {
__m256d v;
double f[4];
};
struct Vsplat{
// Complex float
inline __m256 operator()(float a, float b) {
return _mm256_set_ps(b,a,b,a,b,a,b,a);
}
// Real float
inline __m256 operator()(float a){
return _mm256_set_ps(a,a,a,a,a,a,a,a);
}
//Complex double
inline __m256d operator()(double a, double b){
return _mm256_set_pd(b,a,b,a);
}
//Real double
inline __m256d operator()(double a){
return _mm256_set_pd(a,a,a,a);
}
//Integer
inline __m256i operator()(Integer a){
return _mm256_set1_epi32(a);
}
};
struct Vstore{
//Float
inline void operator()(__m256 a, float* F){
_mm256_store_ps(F,a);
}
//Double
inline void operator()(__m256d a, double* D){
_mm256_store_pd(D,a);
}
//Integer
inline void operator()(__m256i a, Integer* I){
_mm256_store_si256((__m256i*)I,a);
}
};
struct Vstream{
//Float
inline void operator()(float * a, __m256 b){
_mm256_stream_ps(a,b);
}
//Double
inline void operator()(double * a, __m256d b){
_mm256_stream_pd(a,b);
}
};
struct Vset{
// Complex float
inline __m256 operator()(Grid::ComplexF *a){
return _mm256_set_ps(a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
}
// Complex double
inline __m256d operator()(Grid::ComplexD *a){
return _mm256_set_pd(a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
}
// Real float
inline __m256 operator()(float *a){
return _mm256_set_ps(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
}
// Real double
inline __m256d operator()(double *a){
return _mm256_set_pd(a[3],a[2],a[1],a[0]);
}
// Integer
inline __m256i operator()(Integer *a){
return _mm256_set_epi32(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
}
};
template <typename Out_type, typename In_type>
struct Reduce{
// Need templated class to overload output type
// General form must generate error if compiled
inline Out_type operator()(In_type in){
printf("Error, using wrong Reduce function\n");
exit(1);
return 0;
}
};
/////////////////////////////////////////////////////
// Arithmetic operations
/////////////////////////////////////////////////////
struct Sum{
//Complex/Real float
inline __m256 operator()(__m256 a, __m256 b){
return _mm256_add_ps(a,b);
}
//Complex/Real double
inline __m256d operator()(__m256d a, __m256d b){
return _mm256_add_pd(a,b);
}
//Integer
inline __m256i operator()(__m256i a, __m256i b){
#if defined (AVX1) || defined (AVXFMA) || defined (AVXFMA4)
__m128i a0,a1;
__m128i b0,b1;
a0 = _mm256_extractf128_si256(a,0);
b0 = _mm256_extractf128_si256(b,0);
a1 = _mm256_extractf128_si256(a,1);
b1 = _mm256_extractf128_si256(b,1);
a0 = _mm_add_epi32(a0,b0);
a1 = _mm_add_epi32(a1,b1);
return _mm256_set_m128i(a1,a0);
#endif
#if defined (AVX2)
return _mm256_add_epi32(a,b);
#endif
}
};
struct Sub{
//Complex/Real float
inline __m256 operator()(__m256 a, __m256 b){
return _mm256_sub_ps(a,b);
}
//Complex/Real double
inline __m256d operator()(__m256d a, __m256d b){
return _mm256_sub_pd(a,b);
}
//Integer
inline __m256i operator()(__m256i a, __m256i b){
#if defined (AVX1) || defined (AVXFMA) || defined (AVXFMA4)
__m128i a0,a1;
__m128i b0,b1;
a0 = _mm256_extractf128_si256(a,0);
b0 = _mm256_extractf128_si256(b,0);
a1 = _mm256_extractf128_si256(a,1);
b1 = _mm256_extractf128_si256(b,1);
a0 = _mm_sub_epi32(a0,b0);
a1 = _mm_sub_epi32(a1,b1);
return _mm256_set_m128i(a1,a0);
#endif
#if defined (AVX2)
return _mm256_sub_epi32(a,b);
#endif
}
};
struct MultRealPart{
inline __m256 operator()(__m256 a, __m256 b){
__m256 ymm0;
ymm0 = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
return _mm256_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br
}
inline __m256d operator()(__m256d a, __m256d b){
__m256d ymm0;
ymm0 = _mm256_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, ar,ar b'00,00
return _mm256_mul_pd(ymm0,b); // ymm0 <- ar bi, ar br
}
};
struct MaddRealPart{
inline __m256 operator()(__m256 a, __m256 b, __m256 c){
__m256 ymm0 = _mm256_moveldup_ps(a); // ymm0 <- ar ar,
return _mm256_add_ps(_mm256_mul_ps( ymm0, b),c);
}
inline __m256d operator()(__m256d a, __m256d b, __m256d c){
__m256d ymm0 = _mm256_shuffle_pd( a, a, 0x0 );
return _mm256_add_pd(_mm256_mul_pd( ymm0, b),c);
}
};
struct MultComplex{
// Complex float
inline __m256 operator()(__m256 a, __m256 b){
#if defined (AVX1)
__m256 ymm0,ymm1,ymm2;
ymm0 = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
ymm0 = _mm256_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br
// FIXME AVX2 could MAC
ymm1 = _mm256_shuffle_ps(b,b,_MM_SELECT_FOUR_FOUR(2,3,0,1)); // ymm1 <- br,bi
ymm2 = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(3,3,1,1)); // ymm2 <- ai,ai
ymm1 = _mm256_mul_ps(ymm1,ymm2); // ymm1 <- br ai, ai bi
return _mm256_addsub_ps(ymm0,ymm1);
#endif
#if defined (AVXFMA4)
__m256 a_real = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ar ar,
__m256 a_imag = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(3,3,1,1)); // ai ai
__m256 tmp = _mm256_shuffle_ps( b,b, _MM_SELECT_FOUR_FOUR(2,3,0,1));
a_imag = _mm256_mul_ps( a_imag,tmp ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
return _mm256_maddsub_ps( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr
#endif
#if defined (AVX2) || defined (AVXFMA)
__m256 a_real = _mm256_moveldup_ps( a ); // Ar Ar
__m256 a_imag = _mm256_movehdup_ps( a ); // Ai Ai
a_imag = _mm256_mul_ps( a_imag, _mm256_shuffle_ps( b,b, _MM_SELECT_FOUR_FOUR(2,3,0,1) )); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
return _mm256_fmaddsub_ps( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr
#endif
}
// Complex double
inline __m256d operator()(__m256d a, __m256d b) {
// Multiplication of (ak+ibk)*(ck+idk)
// a + i b can be stored as a data structure
// From intel optimisation reference guide
/*
movsldup xmm0, Src1; load real parts into the destination,
; a1, a1, a0, a0
movaps xmm1, src2; load the 2nd pair of complex values, ; i.e. d1, c1, d0, c0
mulps xmm0, xmm1; temporary results, a1d1, a1c1, a0d0, ; a0c0
shufps xmm1, xmm1, b1; reorder the real and imaginary ; parts, c1, d1, c0, d0
movshdup xmm2, Src1; load the imaginary parts into the ; destination, b1, b1, b0, b0
mulps xmm2, xmm1; temporary results, b1c1, b1d1, b0c0, ; b0d0
addsubps xmm0, xmm2; b1c1+a1d1, a1c1 -b1d1, b0c0+a0d
VSHUFPD (VEX.256 encoded version)
IF IMM0[0] = 0
THEN DEST[63:0]=SRC1[63:0] ELSE DEST[63:0]=SRC1[127:64] FI;
IF IMM0[1] = 0
THEN DEST[127:64]=SRC2[63:0] ELSE DEST[127:64]=SRC2[127:64] FI;
IF IMM0[2] = 0
THEN DEST[191:128]=SRC1[191:128] ELSE DEST[191:128]=SRC1[255:192] FI;
IF IMM0[3] = 0
THEN DEST[255:192]=SRC2[191:128] ELSE DEST[255:192]=SRC2[255:192] FI; // 0x5 r<->i ; 0xC unchanged
*/
#if defined (AVX1)
__m256d ymm0,ymm1,ymm2;
ymm0 = _mm256_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, ar,ar b'00,00
ymm0 = _mm256_mul_pd(ymm0,b); // ymm0 <- ar bi, ar br
ymm1 = _mm256_shuffle_pd(b,b,0x5); // ymm1 <- br,bi b'01,01
ymm2 = _mm256_shuffle_pd(a,a,0xF); // ymm2 <- ai,ai b'11,11
ymm1 = _mm256_mul_pd(ymm1,ymm2); // ymm1 <- br ai, ai bi
return _mm256_addsub_pd(ymm0,ymm1);
#endif
#if defined (AVXFMA4)
__m256d a_real = _mm256_shuffle_pd(a,a,0x0);//arar
__m256d a_imag = _mm256_shuffle_pd(a,a,0xF);//aiai
a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
return _mm256_maddsub_pd( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr
#endif
#if defined (AVX2) || defined (AVXFMA)
__m256d a_real = _mm256_movedup_pd( a ); // Ar Ar
__m256d a_imag = _mm256_shuffle_pd(a,a,0xF);//aiai
a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
return _mm256_fmaddsub_pd( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr
#endif
}
};
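Each AVX register holds interleaved (re,im) pairs, and the shuffle plus addsub / fmaddsub sequences above compute the standard complex product lane by lane. A scalar reference for one pair, assuming that layout (illustrative helper, not Grid API):

// Scalar reference for one interleaved (re,im) pair, matching what the
// shuffle + addsub / fmaddsub sequences compute:
//   out_re = ar*br - ai*bi,  out_im = ar*bi + ai*br
inline void mult_complex_ref(double ar, double ai, double br, double bi,
                             double &out_re, double &out_im) {
  out_re = ar * br - ai * bi;
  out_im = ar * bi + ai * br;
}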
#if 0
struct ComplexDot {
inline void Prep(__m256 ari,__m256 &air) {
cdotRIperm(ari,air);
}
inline void Mul(__m256 ari,__m256 air,__m256 b,__m256 &riir,__m256 &iirr) {
riir=air*b;
iirr=ari*b;
};
inline void Madd(__m256 ari,__m256 air,__m256 b,__m256 &riir,__m256 &iirr) {
mac(riir,air,b);
mac(iirr,ari,b);
}
inline void End(__m256 ari,__m256 &air) {
// cdotRI
}
};
#endif
struct Mult{
inline void mac(__m256 &a, __m256 b, __m256 c){
#if defined (AVX1)
a= _mm256_add_ps(_mm256_mul_ps(b,c),a);
#endif
#if defined (AVXFMA4)
a= _mm256_macc_ps(b,c,a);
#endif
#if defined (AVX2) || defined (AVXFMA)
a= _mm256_fmadd_ps( b, c, a);
#endif
}
inline void mac(__m256d &a, __m256d b, __m256d c){
#if defined (AVX1)
a= _mm256_add_pd(_mm256_mul_pd(b,c),a);
#endif
#if defined (AVXFMA4)
a= _mm256_macc_pd(b,c,a);
#endif
#if defined (AVX2) || defined (AVXFMA)
a= _mm256_fmadd_pd( b, c, a);
#endif
}
// Real float
inline __m256 operator()(__m256 a, __m256 b){
return _mm256_mul_ps(a,b);
}
// Real double
inline __m256d operator()(__m256d a, __m256d b){
return _mm256_mul_pd(a,b);
}
// Integer
inline __m256i operator()(__m256i a, __m256i b){
#if defined (AVX1) || defined (AVXFMA)
__m128i a0,a1;
__m128i b0,b1;
a0 = _mm256_extractf128_si256(a,0);
b0 = _mm256_extractf128_si256(b,0);
a1 = _mm256_extractf128_si256(a,1);
b1 = _mm256_extractf128_si256(b,1);
a0 = _mm_mullo_epi32(a0,b0);
a1 = _mm_mullo_epi32(a1,b1);
return _mm256_set_m128i(a1,a0);
#endif
#if defined (AVX2)
return _mm256_mullo_epi32(a,b);
#endif
}
};
struct Div {
// Real float
inline __m256 operator()(__m256 a, __m256 b) {
return _mm256_div_ps(a, b);
}
// Real double
inline __m256d operator()(__m256d a, __m256d b){
return _mm256_div_pd(a,b);
}
};
struct Conj{
// Complex single
inline __m256 operator()(__m256 in){
return _mm256_xor_ps(_mm256_addsub_ps(_mm256_setzero_ps(),in), _mm256_set1_ps(-0.f));
}
// Complex double
inline __m256d operator()(__m256d in){
return _mm256_xor_pd(_mm256_addsub_pd(_mm256_setzero_pd(),in), _mm256_set1_pd(-0.f));
}
// do not define for integer input
};
struct TimesMinusI{
//Complex single
inline __m256 operator()(__m256 in, __m256 ret){
__m256 tmp =_mm256_addsub_ps(_mm256_setzero_ps(),in); // r,-i
return _mm256_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(2,3,0,1)); //-i,r
}
//Complex double
inline __m256d operator()(__m256d in, __m256d ret){
__m256d tmp = _mm256_addsub_pd(_mm256_setzero_pd(),in); // r,-i
return _mm256_shuffle_pd(tmp,tmp,0x5);
}
};
struct TimesI{
//Complex single
inline __m256 operator()(__m256 in, __m256 ret){
__m256 tmp =_mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); // i,r
return _mm256_addsub_ps(_mm256_setzero_ps(),tmp); // i,-r
}
//Complex double
inline __m256d operator()(__m256d in, __m256d ret){
__m256d tmp = _mm256_shuffle_pd(in,in,0x5);
return _mm256_addsub_pd(_mm256_setzero_pd(),tmp); // i,-r
}
};
//////////////////////////////////////////////
// Some Template specialization
//////////////////////////////////////////////
struct Permute{
static inline __m256 Permute0(__m256 in){
return _mm256_permute2f128_ps(in,in,0x01); //ABCD EFGH -> EFGH ABCD
};
static inline __m256 Permute1(__m256 in){
return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2)); //ABCD EFGH -> CDAB GHEF
};
static inline __m256 Permute2(__m256 in){
return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); //ABCD EFGH -> BADC FEHG
};
static inline __m256 Permute3(__m256 in){
return in;
};
static inline __m256d Permute0(__m256d in){
return _mm256_permute2f128_pd(in,in,0x01); //AB CD -> CD AB
};
static inline __m256d Permute1(__m256d in){ //AB CD -> BA DC
return _mm256_shuffle_pd(in,in,0x5);
};
static inline __m256d Permute2(__m256d in){
return in;
};
static inline __m256d Permute3(__m256d in){
return in;
};
};
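The Permute levels swap progressively smaller blocks of the eight-float register, as the ABCD EFGH comments indicate. A scalar sketch of the same index maps (illustrative only):

// Scalar sketch of the lane maps encoded by Permute0/1/2 for an 8-float
// vector; Permute3 is the identity at this width.
inline void permute_lane_maps(const float in[8], float p0[8], float p1[8], float p2[8]) {
  for (int i = 0; i < 8; i++) {
    p0[i] = in[i ^ 4]; // swap 128-bit halves : ABCD EFGH -> EFGH ABCD
    p1[i] = in[i ^ 2]; // swap 64-bit pairs   : ABCD EFGH -> CDAB GHEF
    p2[i] = in[i ^ 1]; // swap neighbours     : ABCD EFGH -> BADC FEHG
  }
}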
#define USE_FP16
struct PrecisionChange {
static inline __m256i StoH (__m256 a,__m256 b) {
__m256i h;
#ifdef USE_FP16
__m128i ha = _mm256_cvtps_ph(a,0);
__m128i hb = _mm256_cvtps_ph(b,0);
h =(__m256i) _mm256_castps128_ps256((__m128)ha);
h =(__m256i) _mm256_insertf128_ps((__m256)h,(__m128)hb,1);
#else
assert(0);
#endif
return h;
}
static inline void HtoS (__m256i h,__m256 &sa,__m256 &sb) {
#ifdef USE_FP16
sa = _mm256_cvtph_ps((__m128i)_mm256_extractf128_ps((__m256)h,0));
sb = _mm256_cvtph_ps((__m128i)_mm256_extractf128_ps((__m256)h,1));
#else
assert(0);
#endif
}
static inline __m256 DtoS (__m256d a,__m256d b) {
__m128 sa = _mm256_cvtpd_ps(a);
__m128 sb = _mm256_cvtpd_ps(b);
__m256 s = _mm256_castps128_ps256(sa);
s = _mm256_insertf128_ps(s,sb,1);
return s;
}
static inline void StoD (__m256 s,__m256d &a,__m256d &b) {
a = _mm256_cvtps_pd(_mm256_extractf128_ps(s,0));
b = _mm256_cvtps_pd(_mm256_extractf128_ps(s,1));
}
static inline __m256i DtoH (__m256d a,__m256d b,__m256d c,__m256d d) {
__m256 sa,sb;
sa = DtoS(a,b);
sb = DtoS(c,d);
return StoH(sa,sb);
}
static inline void HtoD (__m256i h,__m256d &a,__m256d &b,__m256d &c,__m256d &d) {
__m256 sa,sb;
HtoS(h,sa,sb);
StoD(sa,a,b);
StoD(sb,c,d);
}
};
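A minimal round trip through the half-precision pack/unpack just defined; this usage sketch assumes F16C support and only relies on the PrecisionChange helpers above:

// Usage sketch (assumes F16C): pack sixteen floats into fp16 and unpack
// them again via the PrecisionChange helpers above.
inline void fp16_roundtrip_sketch(const float in[16], float out[16]) {
  __m256  a = _mm256_loadu_ps(in);
  __m256  b = _mm256_loadu_ps(in + 8);
  __m256i h = Grid::Optimization::PrecisionChange::StoH(a, b);
  __m256  ra, rb;
  Grid::Optimization::PrecisionChange::HtoS(h, ra, rb);
  _mm256_storeu_ps(out, ra);
  _mm256_storeu_ps(out + 8, rb);
}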
struct Exchange{
// 3210 ordering
static inline void Exchange0(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){
//Invertible
//AB CD -> AC BD
//AC BD -> AB CD
out1= _mm256_permute2f128_ps(in1,in2,0x20);
out2= _mm256_permute2f128_ps(in1,in2,0x31);
};
static inline void Exchange1(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){
//Invertible
// ABCD EFGH ->ABEF CDGH
// ABEF CDGH ->ABCD EFGH
out1= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0));
out2= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2));
};
static inline void Exchange2(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){
// Invertible ?
// ABCD EFGH -> ACEG BDFH
// ACEG BDFH -> AEBF CGDH
// out1= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0));
// out2= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1));
// Bollocks; need
// AECG BFDH -> ABCD EFGH
out1= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0)); /*ACEG*/
out2= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1)); /*BDFH*/
out1= _mm256_shuffle_ps(out1,out1,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/
out2= _mm256_shuffle_ps(out2,out2,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*BFDH*/
};
static inline void Exchange3(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){
assert(0);
return;
};
static inline void Exchange0(__m256d &out1,__m256d &out2,__m256d in1,__m256d in2){
out1= _mm256_permute2f128_pd(in1,in2,0x20);
out2= _mm256_permute2f128_pd(in1,in2,0x31);
return;
};
static inline void Exchange1(__m256d &out1,__m256d &out2,__m256d in1,__m256d in2){
out1= _mm256_shuffle_pd(in1,in2,0x0);
out2= _mm256_shuffle_pd(in1,in2,0xF);
};
static inline void Exchange2(__m256d &out1,__m256d &out2,__m256d in1,__m256d in2){
assert(0);
return;
};
static inline void Exchange3(__m256d &out1,__m256d &out2,__m256d in1,__m256d in2){
assert(0);
return;
};
};
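Exchange0 re-pairs the 128-bit halves of its two inputs (AB CD -> AC BD in the notation of the comments above) and is its own inverse; a scalar sketch of that map (illustrative name only):

// Scalar sketch of Exchange0 on two 8-float vectors: out1 takes the low
// halves of in1 and in2, out2 takes the high halves; applying the map twice
// restores the inputs.
inline void exchange0_map(const float in1[8], const float in2[8],
                          float out1[8], float out2[8]) {
  for (int i = 0; i < 4; i++) {
    out1[i]     = in1[i];     // A
    out1[i + 4] = in2[i];     // C
    out2[i]     = in1[i + 4]; // B
    out2[i + 4] = in2[i + 4]; // D
  }
}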
#if defined (AVX2)
#define _mm256_alignr_epi32_grid(ret,a,b,n) ret=(__m256) _mm256_alignr_epi8((__m256i)a,(__m256i)b,(n*4)%16)
#define _mm256_alignr_epi64_grid(ret,a,b,n) ret=(__m256d) _mm256_alignr_epi8((__m256i)a,(__m256i)b,(n*8)%16)
#endif
#if defined (AVX1) || defined (AVXFMA)
#define _mm256_alignr_epi32_grid(ret,a,b,n) { \
__m128 aa, bb; \
\
aa = _mm256_extractf128_ps(a,1); \
bb = _mm256_extractf128_ps(b,1); \
aa = (__m128)_mm_alignr_epi8((__m128i)aa,(__m128i)bb,(n*4)%16); \
ret = _mm256_insertf128_ps(ret,aa,1); \
\
aa = _mm256_extractf128_ps(a,0); \
bb = _mm256_extractf128_ps(b,0); \
aa = (__m128)_mm_alignr_epi8((__m128i)aa,(__m128i)bb,(n*4)%16); \
ret = _mm256_insertf128_ps(ret,aa,0); \
}
#define _mm256_alignr_epi64_grid(ret,a,b,n) { \
__m128d aa, bb; \
\
aa = _mm256_extractf128_pd(a,1); \
bb = _mm256_extractf128_pd(b,1); \
aa = (__m128d)_mm_alignr_epi8((__m128i)aa,(__m128i)bb,(n*8)%16); \
ret = _mm256_insertf128_pd(ret,aa,1); \
\
aa = _mm256_extractf128_pd(a,0); \
bb = _mm256_extractf128_pd(b,0); \
aa = (__m128d)_mm_alignr_epi8((__m128i)aa,(__m128i)bb,(n*8)%16); \
ret = _mm256_insertf128_pd(ret,aa,0); \
}
#endif
struct Rotate{
static inline __m256 rotate(__m256 in,int n){
switch(n){
case 0: return tRotate<0>(in);break;
case 1: return tRotate<1>(in);break;
case 2: return tRotate<2>(in);break;
case 3: return tRotate<3>(in);break;
case 4: return tRotate<4>(in);break;
case 5: return tRotate<5>(in);break;
case 6: return tRotate<6>(in);break;
case 7: return tRotate<7>(in);break;
default: assert(0);
}
}
static inline __m256d rotate(__m256d in,int n){
switch(n){
case 0: return tRotate<0>(in);break;
case 1: return tRotate<1>(in);break;
case 2: return tRotate<2>(in);break;
case 3: return tRotate<3>(in);break;
default: assert(0);
}
}
template<int n>
static inline __m256 tRotate(__m256 in){
__m256 tmp = Permute::Permute0(in);
__m256 ret;
if ( n > 3 ) {
_mm256_alignr_epi32_grid(ret,in,tmp,n);
} else {
_mm256_alignr_epi32_grid(ret,tmp,in,n);
}
return ret;
}
template<int n>
static inline __m256d tRotate(__m256d in){
__m256d tmp = Permute::Permute0(in);
__m256d ret;
if ( n > 1 ) {
_mm256_alignr_epi64_grid(ret,in,tmp,n);
} else {
_mm256_alignr_epi64_grid(ret,tmp,in,n);
}
return ret;
};
};
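Rotate::rotate above moves lane n of the input down to lane 0, wrapping around; in scalar terms (sketch only):

// Scalar sketch of Rotate::rotate for the 8-float case: element i of the
// result is element (i+n) mod 8 of the input.
inline void rotate_ref(const float in[8], float out[8], int n) {
  for (int i = 0; i < 8; i++) out[i] = in[(i + n) & 7];
}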
//Complex float Reduce
template<>
inline Grid::ComplexF Reduce<Grid::ComplexF, __m256>::operator()(__m256 in){
__m256 v1,v2;
v1=Optimization::Permute::Permute0(in); // avx 256; quad complex single
v1= _mm256_add_ps(v1,in);
v2=Optimization::Permute::Permute1(v1);
v1 = _mm256_add_ps(v1,v2);
u256f conv; conv.v = v1;
return Grid::ComplexF(conv.f[0],conv.f[1]);
}
//Real float Reduce
template<>
inline Grid::RealF Reduce<Grid::RealF, __m256>::operator()(__m256 in){
__m256 v1,v2;
v1 = Optimization::Permute::Permute0(in); // avx 256; octo-float
v1 = _mm256_add_ps(v1,in);
v2 = Optimization::Permute::Permute1(v1);
v1 = _mm256_add_ps(v1,v2);
v2 = Optimization::Permute::Permute2(v1);
v1 = _mm256_add_ps(v1,v2);
u256f conv; conv.v=v1;
return conv.f[0];
}
//Complex double Reduce
template<>
inline Grid::ComplexD Reduce<Grid::ComplexD, __m256d>::operator()(__m256d in){
__m256d v1;
v1 = Optimization::Permute::Permute0(in); // avx 256; paired complex double
v1 = _mm256_add_pd(v1,in);
u256d conv; conv.v = v1;
return Grid::ComplexD(conv.f[0],conv.f[1]);
}
//Real double Reduce
template<>
inline Grid::RealD Reduce<Grid::RealD, __m256d>::operator()(__m256d in){
__m256d v1,v2;
v1 = Optimization::Permute::Permute0(in); // avx 256; quad double
v1 = _mm256_add_pd(v1,in);
v2 = Optimization::Permute::Permute1(v1);
v1 = _mm256_add_pd(v1,v2);
u256d conv; conv.v = v1;
return conv.f[0];
}
//Integer Reduce
template<>
inline Integer Reduce<Integer, __m256i>::operator()(__m256i in){
__m128i ret;
#if defined (AVX2)
// AVX2 horizontal adds within upper and lower halves of register; use
// SSE to add upper and lower halves for result.
__m256i v1, v2;
__m128i u1, u2;
v1 = _mm256_hadd_epi32(in, in);
v2 = _mm256_hadd_epi32(v1, v1);
u1 = _mm256_castsi256_si128(v2); // lower half
u2 = _mm256_extracti128_si256(v2, 1); // upper half
ret = _mm_add_epi32(u1, u2);
#else
// No AVX horizontal add; extract upper and lower halves of register & use
// SSE intrinsics.
__m128i u1, u2, u3;
u1 = _mm256_extractf128_si256(in, 0); // lower half
u2 = _mm256_extractf128_si256(in, 1); // upper half
u3 = _mm_add_epi32(u1, u2);
u1 = _mm_hadd_epi32(u3, u3);
ret = _mm_hadd_epi32(u1, u1);
#endif
return _mm_cvtsi128_si32(ret);
}
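For reference, the integer Reduce specialisation above is just the SIMD form of a plain horizontal sum over the eight 32-bit lanes:

// Scalar equivalent of the __m256i Reduce specialisation: sum eight 32-bit lanes.
inline int reduce_i32_ref(const int v[8]) {
  int sum = 0;
  for (int i = 0; i < 8; i++) sum += v[i];
  return sum;
}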
}
//////////////////////////////////////////////////////////////////////////////////////
// Here assign types
typedef __m256i SIMD_Htype; // Half precision type
typedef __m256 SIMD_Ftype; // Single precision type
typedef __m256d SIMD_Dtype; // Double precision type
typedef __m256i SIMD_Itype; // Integer type
// prefetching
inline void v_prefetch0(int size, const char *ptr){
for(int i=0;i<size;i+=64){ // Define L1 linesize above
_mm_prefetch(ptr+i+4096,_MM_HINT_T1);
_mm_prefetch(ptr+i+512,_MM_HINT_T0);
}
}
inline void prefetch_HINT_T0(const char *ptr){
_mm_prefetch(ptr, _MM_HINT_T0);
}
// Function name aliases
typedef Optimization::Vsplat VsplatSIMD;
typedef Optimization::Vstore VstoreSIMD;
typedef Optimization::Vset VsetSIMD;
typedef Optimization::Vstream VstreamSIMD;
template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S, T>;
// Arithmetic operations
typedef Optimization::Sum SumSIMD;
typedef Optimization::Sub SubSIMD;
typedef Optimization::Div DivSIMD;
typedef Optimization::Mult MultSIMD;
typedef Optimization::MultComplex MultComplexSIMD;
typedef Optimization::MultRealPart MultRealPartSIMD;
typedef Optimization::MaddRealPart MaddRealPartSIMD;
typedef Optimization::Conj ConjSIMD;
typedef Optimization::TimesMinusI TimesMinusISIMD;
typedef Optimization::TimesI TimesISIMD;
} // namespace Grid

640
Grid/simd/Grid_avx512.h Normal file

@@ -0,0 +1,640 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/simd/Grid_avx512.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: neo <cossu@post.kek.jp>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <immintrin.h>
namespace Grid{
namespace Optimization {
union u512f {
__m512 v;
float f[16];
};
union u512d {
__m512d v;
double f[8];
};
struct Vsplat{
//Complex float
inline __m512 operator()(float a, float b){
return _mm512_set_ps(b,a,b,a,b,a,b,a,b,a,b,a,b,a,b,a);
}
// Real float
inline __m512 operator()(float a){
return _mm512_set1_ps(a);
}
//Complex double
inline __m512d operator()(double a, double b){
return _mm512_set_pd(b,a,b,a,b,a,b,a);
}
//Real double
inline __m512d operator()(double a){
return _mm512_set1_pd(a);
}
//Integer
inline __m512i operator()(Integer a){
return _mm512_set1_epi32(a);
}
};
struct Vstore{
//Float
inline void operator()(__m512 a, float* F){
_mm512_store_ps(F,a);
}
//Double
inline void operator()(__m512d a, double* D){
_mm512_store_pd(D,a);
}
//Integer
inline void operator()(__m512i a, Integer* I){
_mm512_store_si512((__m512i *)I,a);
}
};
struct Vstream{
//Float
inline void operator()(float * a, __m512 b){
_mm512_stream_ps(a,b);
// _mm512_store_ps(a,b);
}
//Double
inline void operator()(double * a, __m512d b){
_mm512_stream_pd(a,b);
// _mm512_store_pd(a,b);
}
};
struct Vset{
// Complex float
inline __m512 operator()(Grid::ComplexF *a){
return _mm512_set_ps(a[7].imag(),a[7].real(),a[6].imag(),a[6].real(),
a[5].imag(),a[5].real(),a[4].imag(),a[4].real(),
a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),
a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
}
// Complex double
inline __m512d operator()(Grid::ComplexD *a){
return _mm512_set_pd(a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),
a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
}
// Real float
inline __m512 operator()(float *a){
return _mm512_set_ps( a[15],a[14],a[13],a[12],a[11],a[10],a[9],a[8],
a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
}
// Real double
inline __m512d operator()(double *a){
return _mm512_set_pd(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
}
// Integer
inline __m512i operator()(Integer *a){
return _mm512_set_epi32( a[15],a[14],a[13],a[12],a[11],a[10],a[9],a[8],
a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
}
};
template <typename Out_type, typename In_type>
struct Reduce{
//Need templated class to overload output type
//General form must generate error if compiled
inline Out_type operator()(In_type in){
printf("Error, using wrong Reduce function\n");
exit(1);
return 0;
}
};
/////////////////////////////////////////////////////
// Arithmetic operations
/////////////////////////////////////////////////////
struct Sum{
//Complex/Real float
inline __m512 operator()(__m512 a, __m512 b){
return _mm512_add_ps(a,b);
}
//Complex/Real double
inline __m512d operator()(__m512d a, __m512d b){
return _mm512_add_pd(a,b);
}
//Integer
inline __m512i operator()(__m512i a, __m512i b){
return _mm512_add_epi32(a,b);
}
};
struct Sub{
//Complex/Real float
inline __m512 operator()(__m512 a, __m512 b){
return _mm512_sub_ps(a,b);
}
//Complex/Real double
inline __m512d operator()(__m512d a, __m512d b){
return _mm512_sub_pd(a,b);
}
//Integer
inline __m512i operator()(__m512i a, __m512i b){
return _mm512_sub_epi32(a,b);
}
};
// Note, we can beat the shuf overhead in chain with two temporaries
// Ar Ai , Br Bi, Ai Ar // one shuf
//tmpr Ar Br, Ai Bi // Mul/Mac/Mac
//tmpi Br Ai, Bi Ar // Mul/Mac/Mac
// add tmpi,shuf(tmpi)
// sub tmpr,shuf(tmpi)
// shuf(tmpr,tmpi). // Could drop/trade for write mask
// Gives
// 2mul,4 mac +add+sub = 8 flop type insns
// 3shuf + 2 (+shuf) = 5/6 simd perm and 1/2 the load.
struct MultRealPart{
inline __m512 operator()(__m512 a, __m512 b){
__m512 ymm0;
ymm0 = _mm512_moveldup_ps(a); // ymm0 <- ar ar,
return _mm512_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br
}
inline __m512d operator()(__m512d a, __m512d b){
__m512d ymm0;
ymm0 = _mm512_shuffle_pd(a,a,0x00); // ymm0 <- ar ar, ar,ar b'00,00
return _mm512_mul_pd(ymm0,b); // ymm0 <- ar bi, ar br
}
};
struct MaddRealPart{
inline __m512 operator()(__m512 a, __m512 b, __m512 c){
__m512 ymm0 = _mm512_moveldup_ps(a); // ymm0 <- ar ar,
return _mm512_fmadd_ps( ymm0, b, c);
}
inline __m512d operator()(__m512d a, __m512d b, __m512d c){
__m512d ymm0 = _mm512_shuffle_pd( a, a, 0x00 );
return _mm512_fmadd_pd( ymm0, b, c);
}
};
struct MultComplex{
// Complex float
inline __m512 operator()(__m512 a, __m512 b){
// dup, dup, perm, mul, madd
__m512 a_real = _mm512_moveldup_ps( a ); // Ar Ar
__m512 a_imag = _mm512_movehdup_ps( a ); // Ai Ai
a_imag = _mm512_mul_ps( a_imag, _mm512_permute_ps( b, 0xB1 ) ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
return _mm512_fmaddsub_ps( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr
}
// Complex double
inline __m512d operator()(__m512d a, __m512d b){
__m512d a_real = _mm512_shuffle_pd( a, a, 0x00 );
__m512d a_imag = _mm512_shuffle_pd( a, a, 0xFF );
a_imag = _mm512_mul_pd( a_imag, _mm512_permute_pd( b, 0x55 ) );
return _mm512_fmaddsub_pd( a_real, b, a_imag );
}
};
struct Mult{
inline void mac(__m512 &a, __m512 b, __m512 c){
a= _mm512_fmadd_ps( b, c, a);
}
inline void mac(__m512d &a, __m512d b, __m512d c){
a= _mm512_fmadd_pd( b, c, a);
}
// Real float
inline __m512 operator()(__m512 a, __m512 b){
return _mm512_mul_ps(a,b);
}
// Real double
inline __m512d operator()(__m512d a, __m512d b){
return _mm512_mul_pd(a,b);
}
// Integer
inline __m512i operator()(__m512i a, __m512i b){
return _mm512_mullo_epi32(a,b);
}
};
struct Div{
// Real float
inline __m512 operator()(__m512 a, __m512 b){
return _mm512_div_ps(a,b);
}
// Real double
inline __m512d operator()(__m512d a, __m512d b){
return _mm512_div_pd(a,b);
}
};
struct Conj{
// Complex single
inline __m512 operator()(__m512 in){
return _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // Zero out 0+real 0-imag
}
// Complex double
inline __m512d operator()(__m512d in){
return _mm512_mask_sub_pd(in, 0xaa,_mm512_setzero_pd(), in);
}
// do not define for integer input
};
struct TimesMinusI{
//Complex single
inline __m512 operator()(__m512 in, __m512 ret){
//__m512 tmp = _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // real -imag
//return _mm512_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(2,3,1,0)); // 0x4E??
__m512 tmp = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp);
}
//Complex double
inline __m512d operator()(__m512d in, __m512d ret){
//__m512d tmp = _mm512_mask_sub_pd(in,0xaa,_mm512_setzero_pd(),in); // real -imag
//return _mm512_shuffle_pd(tmp,tmp,0x55);
__m512d tmp = _mm512_shuffle_pd(in,in,0x55);
return _mm512_mask_sub_pd(tmp,0xaa,_mm512_setzero_pd(),tmp);
}
};
struct TimesI{
//Complex single
inline __m512 operator()(__m512 in, __m512 ret){
__m512 tmp = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
return _mm512_mask_sub_ps(tmp,0x5555,_mm512_setzero_ps(),tmp);
}
//Complex double
inline __m512d operator()(__m512d in, __m512d ret){
__m512d tmp = _mm512_shuffle_pd(in,in,0x55);
return _mm512_mask_sub_pd(tmp,0x55,_mm512_setzero_pd(),tmp);
}
};
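TimesI and TimesMinusI act on interleaved (re,im) pairs by swapping the two parts and flipping one sign, which is what the shuffle plus masked subtract above implements; a scalar sketch (illustrative names):

// Scalar sketch of TimesI / TimesMinusI for one (re,im) pair:
//   i*(re + i*im)  = -im + i*re
//  -i*(re + i*im)  =  im - i*re
inline void times_i_ref(double re, double im, double &out_re, double &out_im) {
  out_re = -im;
  out_im =  re;
}
inline void times_minus_i_ref(double re, double im, double &out_re, double &out_im) {
  out_re =  im;
  out_im = -re;
}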
// Gpermute utilities; consider coalescing into 1 Gpermute
struct Permute{
static inline __m512 Permute0(__m512 in){
return _mm512_shuffle_f32x4(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
};
static inline __m512 Permute1(__m512 in){
return _mm512_shuffle_f32x4(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
};
static inline __m512 Permute2(__m512 in){
return _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
};
static inline __m512 Permute3(__m512 in){
return _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
};
static inline __m512d Permute0(__m512d in){
return _mm512_shuffle_f64x2(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
};
static inline __m512d Permute1(__m512d in){
return _mm512_shuffle_f64x2(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
};
static inline __m512d Permute2(__m512d in){
return _mm512_shuffle_pd(in,in,0x55);
};
static inline __m512d Permute3(__m512d in){
return in;
};
};
#define USE_FP16
struct PrecisionChange {
static inline __m512i StoH (__m512 a,__m512 b) {
__m512i h;
#ifdef USE_FP16
__m256i ha = _mm512_cvtps_ph(a,0);
__m256i hb = _mm512_cvtps_ph(b,0);
h =(__m512i) _mm512_castps256_ps512((__m256)ha);
h =(__m512i) _mm512_insertf64x4((__m512d)h,(__m256d)hb,1);
#else
assert(0);
#endif
return h;
}
static inline void HtoS (__m512i h,__m512 &sa,__m512 &sb) {
#ifdef USE_FP16
sa = _mm512_cvtph_ps((__m256i)_mm512_extractf64x4_pd((__m512d)h,0));
sb = _mm512_cvtph_ps((__m256i)_mm512_extractf64x4_pd((__m512d)h,1));
#else
assert(0);
#endif
}
static inline __m512 DtoS (__m512d a,__m512d b) {
__m256 sa = _mm512_cvtpd_ps(a);
__m256 sb = _mm512_cvtpd_ps(b);
__m512 s = _mm512_castps256_ps512(sa);
s =(__m512) _mm512_insertf64x4((__m512d)s,(__m256d)sb,1);
return s;
}
static inline void StoD (__m512 s,__m512d &a,__m512d &b) {
a = _mm512_cvtps_pd((__m256)_mm512_extractf64x4_pd((__m512d)s,0));
b = _mm512_cvtps_pd((__m256)_mm512_extractf64x4_pd((__m512d)s,1));
}
static inline __m512i DtoH (__m512d a,__m512d b,__m512d c,__m512d d) {
__m512 sa,sb;
sa = DtoS(a,b);
sb = DtoS(c,d);
return StoH(sa,sb);
}
static inline void HtoD (__m512i h,__m512d &a,__m512d &b,__m512d &c,__m512d &d) {
__m512 sa,sb;
HtoS(h,sa,sb);
StoD(sa,a,b);
StoD(sb,c,d);
}
};
// On extracting face: Ah Al , Bh Bl -> Ah Bh, Al Bl
// On merging buffers: Ah,Bh , Al Bl -> Ah Al, Bh, Bl
// The operation is its own inverse
struct Exchange{
// 3210 ordering
static inline void Exchange0(__m512 &out1,__m512 &out2,__m512 in1,__m512 in2){
out1= _mm512_shuffle_f32x4(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0));
out2= _mm512_shuffle_f32x4(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2));
};
static inline void Exchange1(__m512 &out1,__m512 &out2,__m512 in1,__m512 in2){
out1= _mm512_shuffle_f32x4(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0));
out2= _mm512_shuffle_f32x4(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1));
out1= _mm512_shuffle_f32x4(out1,out1,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/
out2= _mm512_shuffle_f32x4(out2,out2,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/
};
static inline void Exchange2(__m512 &out1,__m512 &out2,__m512 in1,__m512 in2){
out1= _mm512_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0));
out2= _mm512_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2));
};
static inline void Exchange3(__m512 &out1,__m512 &out2,__m512 in1,__m512 in2){
out1= _mm512_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0));
out2= _mm512_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1));
out1= _mm512_shuffle_ps(out1,out1,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/
out2= _mm512_shuffle_ps(out2,out2,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/
};
static inline void Exchange0(__m512d &out1,__m512d &out2,__m512d in1,__m512d in2){
out1= _mm512_shuffle_f64x2(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0));
out2= _mm512_shuffle_f64x2(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2));
};
static inline void Exchange1(__m512d &out1,__m512d &out2,__m512d in1,__m512d in2){
out1= _mm512_shuffle_f64x2(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0));
out2= _mm512_shuffle_f64x2(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1));
out1= _mm512_shuffle_f64x2(out1,out1,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/
out2= _mm512_shuffle_f64x2(out2,out2,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/
};
static inline void Exchange2(__m512d &out1,__m512d &out2,__m512d in1,__m512d in2){
out1 = _mm512_shuffle_pd(in1,in2,0x00);
out2 = _mm512_shuffle_pd(in1,in2,0xFF);
};
static inline void Exchange3(__m512d &out1,__m512d &out2,__m512d in1,__m512d in2){
assert(0);
return;
};
};
struct Rotate{
static inline __m512 rotate(__m512 in,int n){
switch(n){
case 0: return tRotate<0>(in);break;
case 1: return tRotate<1>(in);break;
case 2: return tRotate<2>(in);break;
case 3: return tRotate<3>(in);break;
case 4: return tRotate<4>(in);break;
case 5: return tRotate<5>(in);break;
case 6: return tRotate<6>(in);break;
case 7: return tRotate<7>(in);break;
case 8 : return tRotate<8>(in);break;
case 9 : return tRotate<9>(in);break;
case 10: return tRotate<10>(in);break;
case 11: return tRotate<11>(in);break;
case 12: return tRotate<12>(in);break;
case 13: return tRotate<13>(in);break;
case 14: return tRotate<14>(in);break;
case 15: return tRotate<15>(in);break;
default: assert(0);
}
}
static inline __m512d rotate(__m512d in,int n){
switch(n){
case 0: return tRotate<0>(in);break;
case 1: return tRotate<1>(in);break;
case 2: return tRotate<2>(in);break;
case 3: return tRotate<3>(in);break;
case 4: return tRotate<4>(in);break;
case 5: return tRotate<5>(in);break;
case 6: return tRotate<6>(in);break;
case 7: return tRotate<7>(in);break;
default: assert(0);
}
}
template<int n> static inline __m512 tRotate(__m512 in){
return (__m512)_mm512_alignr_epi32((__m512i)in,(__m512i)in,n);
};
template<int n> static inline __m512d tRotate(__m512d in){
return (__m512d)_mm512_alignr_epi64((__m512i)in,(__m512i)in,n);
};
};
//////////////////////////////////////////////
// Some Template specialization
// Hack for GCC and Clang until _mm512_reduce_add_ps etc. are implemented in released versions
#ifndef __INTEL_COMPILER
#warning "Slow reduction due to incomplete reduce intrinsics"
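// The fallback reductions below use a log2(width) tree: each step adds the vector
// to a lane-permuted copy of itself (Permute0, then Permute1, ...), halving the
// number of distinct partial sums; the result is then read from the first lane(s)
// through the u512f / u512d unions.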
//Complex float Reduce
template<>
inline Grid::ComplexF Reduce<Grid::ComplexF, __m512>::operator()(__m512 in){
__m512 v1,v2;
v1=Optimization::Permute::Permute0(in); // avx 512; 8 complex singles
v1= _mm512_add_ps(v1,in);
v2=Optimization::Permute::Permute1(v1);
v1 = _mm512_add_ps(v1,v2);
v2=Optimization::Permute::Permute2(v1);
v1 = _mm512_add_ps(v1,v2);
u512f conv; conv.v = v1;
return Grid::ComplexF(conv.f[0],conv.f[1]);
}
//Real float Reduce
template<>
inline Grid::RealF Reduce<Grid::RealF, __m512>::operator()(__m512 in){
__m512 v1,v2;
v1 = Optimization::Permute::Permute0(in); // avx 512; 16 floats
v1 = _mm512_add_ps(v1,in);
v2 = Optimization::Permute::Permute1(v1);
v1 = _mm512_add_ps(v1,v2);
v2 = Optimization::Permute::Permute2(v1);
v1 = _mm512_add_ps(v1,v2);
v2 = Optimization::Permute::Permute3(v1);
v1 = _mm512_add_ps(v1,v2);
u512f conv; conv.v=v1;
return conv.f[0];
}
//Complex double Reduce
template<>
inline Grid::ComplexD Reduce<Grid::ComplexD, __m512d>::operator()(__m512d in){
__m512d v1,v2;
v1 = Optimization::Permute::Permute0(in); // avx 512; 4 complex doubles
v1 = _mm512_add_pd(v1,in);
v2 = Optimization::Permute::Permute1(v1);
v1 = _mm512_add_pd(v1,v2);
u512d conv; conv.v = v1;
return Grid::ComplexD(conv.f[0],conv.f[1]);
}
//Real double Reduce
template<>
inline Grid::RealD Reduce<Grid::RealD, __m512d>::operator()(__m512d in){
__m512d v1,v2;
v1 = Optimization::Permute::Permute0(in); // avx 512; 8 doubles
v1 = _mm512_add_pd(v1,in);
v2 = Optimization::Permute::Permute1(v1);
v1 = _mm512_add_pd(v1,v2);
v2 = Optimization::Permute::Permute2(v1);
v1 = _mm512_add_pd(v1,v2);
u512d conv; conv.v = v1;
return conv.f[0];
}
//Integer Reduce
template<>
inline Integer Reduce<Integer, __m512i>::operator()(__m512i in){
// No full vector reduce, use AVX to add upper and lower halves of register
// and perform AVX reduction.
__m256i v1, v2, v3;
__m128i u1, u2, ret;
v1 = _mm512_castsi512_si256(in); // lower half
v2 = _mm512_extracti32x8_epi32(in, 1); // upper half
v3 = _mm256_add_epi32(v1, v2);
v1 = _mm256_hadd_epi32(v3, v3);
v2 = _mm256_hadd_epi32(v1, v1);
u1 = _mm256_castsi256_si128(v2); // lower half
u2 = _mm256_extracti128_si256(v2, 1); // upper half
ret = _mm_add_epi32(u1, u2);
return _mm_cvtsi128_si32(ret);
}
#else
//Complex float Reduce
template<>
inline Grid::ComplexF Reduce<Grid::ComplexF, __m512>::operator()(__m512 in){
return Grid::ComplexF(_mm512_mask_reduce_add_ps(0x5555, in),_mm512_mask_reduce_add_ps(0xAAAA, in));
}
//Real float Reduce
template<>
inline Grid::RealF Reduce<Grid::RealF, __m512>::operator()(__m512 in){
return _mm512_reduce_add_ps(in);
}
//Complex double Reduce
template<>
inline Grid::ComplexD Reduce<Grid::ComplexD, __m512d>::operator()(__m512d in){
return Grid::ComplexD(_mm512_mask_reduce_add_pd(0x55, in),_mm512_mask_reduce_add_pd(0xAA, in));
}
//Real double Reduce
template<>
inline Grid::RealD Reduce<Grid::RealD, __m512d>::operator()(__m512d in){
return _mm512_reduce_add_pd(in);
}
//Integer Reduce
template<>
inline Integer Reduce<Integer, __m512i>::operator()(__m512i in){
return _mm512_reduce_add_epi32(in);
}
#endif
}
//////////////////////////////////////////////////////////////////////////////////////
// Here assign types
typedef __m512i SIMD_Htype; // Half precision type
typedef __m512 SIMD_Ftype; // Single precision type
typedef __m512d SIMD_Dtype; // Double precision type
typedef __m512i SIMD_Itype; // Integer type
// prefetch
inline void v_prefetch0(int size, const char *ptr){
for(int i=0;i<size;i+=64){ // Define L1 linesize above
_mm_prefetch(ptr+i+4096,_MM_HINT_T1);
_mm_prefetch(ptr+i+512,_MM_HINT_T0);
}
}
inline void prefetch_HINT_T0(const char *ptr){
_mm_prefetch(ptr,_MM_HINT_T0);
}
// Function name aliases
typedef Optimization::Vsplat VsplatSIMD;
typedef Optimization::Vstore VstoreSIMD;
typedef Optimization::Vset VsetSIMD;
typedef Optimization::Vstream VstreamSIMD;
template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
// Arithmetic operations
typedef Optimization::Sum SumSIMD;
typedef Optimization::Sub SubSIMD;
typedef Optimization::Mult MultSIMD;
typedef Optimization::Div DivSIMD;
typedef Optimization::MultComplex MultComplexSIMD;
typedef Optimization::MultRealPart MultRealPartSIMD;
typedef Optimization::MaddRealPart MaddRealPartSIMD;
typedef Optimization::Conj ConjSIMD;
typedef Optimization::TimesMinusI TimesMinusISIMD;
typedef Optimization::TimesI TimesISIMD;
}

530
Grid/simd/Grid_generic.h Normal file

@ -0,0 +1,530 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/simd/Grid_generic.h
Copyright (C) 2015
Copyright (C) 2017
Author: Antonin Portelli <antonin.portelli@me.com>
Andrew Lawson <andrew.lawson1991@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include "Grid_generic_types.h"
namespace Grid {
namespace Optimization {
struct Vsplat{
// Complex
template <typename T>
inline vec<T> operator()(T a, T b){
vec<T> out;
VECTOR_FOR(i, W<T>::r, 2)
{
out.v[i] = a;
out.v[i+1] = b;
}
return out;
}
// Real
template <typename T>
inline vec<T> operator()(T a){
vec<T> out;
VECTOR_FOR(i, W<T>::r, 1)
{
out.v[i] = a;
}
return out;
}
};
struct Vstore{
// Real
template <typename T>
inline void operator()(vec<T> a, T *D){
*((vec<T> *)D) = a;
}
};
struct Vstream{
// Real
template <typename T>
inline void operator()(T * a, vec<T> b){
*((vec<T> *)a) = b;
}
};
struct Vset{
// Complex
template <typename T>
inline vec<T> operator()(std::complex<T> *a){
vec<T> out;
VECTOR_FOR(i, W<T>::c, 1)
{
out.v[2*i] = a[i].real();
out.v[2*i+1] = a[i].imag();
}
return out;
}
// Real
template <typename T>
inline vec<T> operator()(T *a){
vec<T> out;
out = *((vec<T> *)a);
return out;
}
};
/////////////////////////////////////////////////////
// Arithmetic operations
/////////////////////////////////////////////////////
struct Sum{
// Complex/Real
template <typename T>
inline vec<T> operator()(vec<T> a, vec<T> b){
vec<T> out;
VECTOR_FOR(i, W<T>::r, 1)
{
out.v[i] = a.v[i] + b.v[i];
}
return out;
}
};
struct Sub{
// Complex/Real
template <typename T>
inline vec<T> operator()(vec<T> a, vec<T> b){
vec<T> out;
VECTOR_FOR(i, W<T>::r, 1)
{
out.v[i] = a.v[i] - b.v[i];
}
return out;
}
};
struct Mult{
// Real
template <typename T>
inline vec<T> operator()(vec<T> a, vec<T> b){
vec<T> out;
VECTOR_FOR(i, W<T>::r, 1)
{
out.v[i] = a.v[i]*b.v[i];
}
return out;
}
};
#define cmul(a, b, c, i)\
c[i] = a[i]*b[i] - a[i+1]*b[i+1];\
c[i+1] = a[i]*b[i+1] + a[i+1]*b[i];
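// Illustrative note (not in the original source): cmul computes one interleaved
// complex product. With a = (ar, ai) and b = (br, bi) stored at offsets i, i+1,
//   c[i]   = ar*br - ai*bi   (real part)
//   c[i+1] = ar*bi + ai*br   (imaginary part)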
struct MultRealPart{
template <typename T>
inline vec<T> operator()(vec<T> a, vec<T> b){
vec<T> out;
VECTOR_FOR(i, W<T>::c, 1)
{
out.v[2*i] = a.v[2*i]*b.v[2*i];
out.v[2*i+1] = a.v[2*i]*b.v[2*i+1];
}
return out;
}
};
struct MaddRealPart{
template <typename T>
inline vec<T> operator()(vec<T> a, vec<T> b, vec<T> c){
vec<T> out;
VECTOR_FOR(i, W<T>::c, 1)
{
out.v[2*i] = a.v[2*i]*b.v[2*i] + c.v[2*i];
out.v[2*i+1] = a.v[2*i]*b.v[2*i+1] + c.v[2*i+1];
}
return out;
}
};
struct MultComplex{
// Complex
template <typename T>
inline vec<T> operator()(vec<T> a, vec<T> b){
vec<T> out;
VECTOR_FOR(i, W<T>::c, 1)
{
cmul(a.v, b.v, out.v, 2*i);
}
return out;
}
};
#undef cmul
struct Div{
// Real
template <typename T>
inline vec<T> operator()(vec<T> a, vec<T> b){
vec<T> out;
VECTOR_FOR(i, W<T>::r, 1)
{
out.v[i] = a.v[i]/b.v[i];
}
return out;
}
};
#define conj(a, b, i)\
b[i] = a[i];\
b[i+1] = -a[i+1];
struct Conj{
// Complex
template <typename T>
inline vec<T> operator()(vec<T> a){
vec<T> out;
VECTOR_FOR(i, W<T>::c, 1)
{
conj(a.v, out.v, 2*i);
}
return out;
}
};
#undef conj
#define timesmi(a, b, i)\
b[i] = a[i+1];\
b[i+1] = -a[i];
struct TimesMinusI{
// Complex
template <typename T>
inline vec<T> operator()(vec<T> a, vec<T> b){
vec<T> out;
VECTOR_FOR(i, W<T>::c, 1)
{
timesmi(a.v, out.v, 2*i);
}
return out;
}
};
#undef timesmi
#define timesi(a, b, i)\
b[i] = -a[i+1];\
b[i+1] = a[i];
struct TimesI{
// Complex
template <typename T>
inline vec<T> operator()(vec<T> a, vec<T> b){
vec<T> out;
VECTOR_FOR(i, W<T>::c, 1)
{
timesi(a.v, out.v, 2*i);
}
return out;
}
};
#undef timesi
struct PrecisionChange {
static inline vech StoH (const vecf &a,const vecf &b) {
vech ret;
#ifdef USE_FP16
vech *ha = (vech *)&a;
vech *hb = (vech *)&b;
const int nf = W<float>::r;
// VECTOR_FOR(i, nf,1){ ret.v[i] = ( (uint16_t *) &a.v[i])[1] ; }
// VECTOR_FOR(i, nf,1){ ret.v[i+nf] = ( (uint16_t *) &b.v[i])[1] ; }
VECTOR_FOR(i, nf,1){ ret.v[i] = ha->v[2*i+1]; }
VECTOR_FOR(i, nf,1){ ret.v[i+nf] = hb->v[2*i+1]; }
#else
assert(0);
#endif
return ret;
}
static inline void HtoS (vech h,vecf &sa,vecf &sb) {
#ifdef USE_FP16
const int nf = W<float>::r;
const int nh = W<uint16_t>::r;
vech *ha = (vech *)&sa;
vech *hb = (vech *)&sb;
VECTOR_FOR(i, nf, 1){ sb.v[i]= sa.v[i] = 0; }
// VECTOR_FOR(i, nf, 1){ ( (uint16_t *) (&sa.v[i]))[1] = h.v[i];}
// VECTOR_FOR(i, nf, 1){ ( (uint16_t *) (&sb.v[i]))[1] = h.v[i+nf];}
VECTOR_FOR(i, nf, 1){ ha->v[2*i+1]=h.v[i]; }
VECTOR_FOR(i, nf, 1){ hb->v[2*i+1]=h.v[i+nf]; }
#else
assert(0);
#endif
}
static inline vecf DtoS (vecd a,vecd b) {
const int nd = W<double>::r;
const int nf = W<float>::r;
vecf ret;
VECTOR_FOR(i, nd,1){ ret.v[i] = a.v[i] ; }
VECTOR_FOR(i, nd,1){ ret.v[i+nd] = b.v[i] ; }
return ret;
}
static inline void StoD (vecf s,vecd &a,vecd &b) {
const int nd = W<double>::r;
VECTOR_FOR(i, nd,1){ a.v[i] = s.v[i] ; }
VECTOR_FOR(i, nd,1){ b.v[i] = s.v[i+nd] ; }
}
static inline vech DtoH (vecd a,vecd b,vecd c,vecd d) {
vecf sa,sb;
sa = DtoS(a,b);
sb = DtoS(c,d);
return StoH(sa,sb);
}
static inline void HtoD (vech h,vecd &a,vecd &b,vecd &c,vecd &d) {
vecf sa,sb;
HtoS(h,sa,sb);
StoD(sa,a,b);
StoD(sb,c,d);
}
};
//////////////////////////////////////////////
// Exchange support
struct Exchange{
template <typename T,int n>
static inline void ExchangeN(vec<T> &out1,vec<T> &out2,vec<T> &in1,vec<T> &in2){
const int w = W<T>::r;
unsigned int mask = w >> (n + 1);
// std::cout << " Exchange "<<n<<" nsimd "<<w<<" mask 0x" <<std::hex<<mask<<std::dec<<std::endl;
VECTOR_FOR(i, w, 1) {
int j1 = i&(~mask);
if ( (i&mask) == 0 ) { out1.v[i]=in1.v[j1];}
else { out1.v[i]=in2.v[j1];}
int j2 = i|mask;
if ( (i&mask) == 0 ) { out2.v[i]=in1.v[j2];}
else { out2.v[i]=in2.v[j2];}
}
}
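// Illustrative example (not in the original source), for w = 8 and n = 0 so that
// mask = 4, writing in1 = {a0..a7} and in2 = {b0..b7}:
//   out1 = {a0 a1 a2 a3 b0 b1 b2 b3}
//   out2 = {a4 a5 a6 a7 b4 b5 b6 b7}
// i.e. the lower halves are gathered into out1 and the upper halves into out2;
// applying the same exchange to (out1,out2) recovers (in1,in2).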
template <typename T>
static inline void Exchange0(vec<T> &out1,vec<T> &out2,vec<T> &in1,vec<T> &in2){
ExchangeN<T,0>(out1,out2,in1,in2);
};
template <typename T>
static inline void Exchange1(vec<T> &out1,vec<T> &out2,vec<T> &in1,vec<T> &in2){
ExchangeN<T,1>(out1,out2,in1,in2);
};
template <typename T>
static inline void Exchange2(vec<T> &out1,vec<T> &out2,vec<T> &in1,vec<T> &in2){
ExchangeN<T,2>(out1,out2,in1,in2);
};
template <typename T>
static inline void Exchange3(vec<T> &out1,vec<T> &out2,vec<T> &in1,vec<T> &in2){
ExchangeN<T,3>(out1,out2,in1,in2);
};
};
//////////////////////////////////////////////
// Some Template specialization
#define perm(a, b, n, w)\
unsigned int _mask = w >> (n + 1);\
VECTOR_FOR(i, w, 1)\
{\
b[i] = a[i^_mask];\
}
#define DECL_PERMUTE_N(n)\
template <typename T>\
static inline vec<T> Permute##n(vec<T> in) {\
vec<T> out;\
perm(in.v, out.v, n, W<T>::r);\
return out;\
}
struct Permute{
DECL_PERMUTE_N(0);
DECL_PERMUTE_N(1);
DECL_PERMUTE_N(2);
DECL_PERMUTE_N(3);
};
#undef perm
#undef DECL_PERMUTE_N
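// Illustrative example (not in the original source), for w = 8:
//   Permute0({a0..a7}) = {a4 a5 a6 a7 a0 a1 a2 a3}   (mask = 4, swap halves)
//   Permute1({a0..a7}) = {a2 a3 a0 a1 a6 a7 a4 a5}   (mask = 2, swap quarters)
//   Permute2({a0..a7}) = {a1 a0 a3 a2 a5 a4 a7 a6}   (mask = 1, swap neighbours)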
#define rot(a, b, n, w)\
VECTOR_FOR(i, w, 1)\
{\
b[i] = a[(i + n)%w];\
}
struct Rotate{
template <int n, typename T> static inline vec<T> tRotate(vec<T> in){
return rotate(in, n);
}
template <typename T>
static inline vec<T> rotate(vec<T> in, int n){
vec<T> out;
rot(in.v, out.v, n, W<T>::r);
return out;
}
};
#undef rot
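// Illustrative example (not in the original source), for w = 4:
//   rotate({a0 a1 a2 a3}, 1) = {a1 a2 a3 a0}
// i.e. a left rotation by n elements, analogous to the alignr-based tRotate
// of the hardware-specific implementations.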
#define acc(v, a, off, step, n)\
for (unsigned int i = off; i < n; i += step)\
{\
a += v[i];\
}
template <typename Out_type, typename In_type>
struct Reduce{
//Need templated class to overload output type
//General form must generate error if compiled
inline Out_type operator()(In_type in){
printf("Error, using wrong Reduce function\n");
exit(1);
return 0;
}
};
//Complex float Reduce
template <>
inline Grid::ComplexF Reduce<Grid::ComplexF, vecf>::operator()(vecf in){
float a = 0.f, b = 0.f;
acc(in.v, a, 0, 2, W<float>::r);
acc(in.v, b, 1, 2, W<float>::r);
return Grid::ComplexF(a, b);
}
//Real float Reduce
template<>
inline Grid::RealF Reduce<Grid::RealF, vecf>::operator()(vecf in){
float a = 0.;
acc(in.v, a, 0, 1, W<float>::r);
return a;
}
//Complex double Reduce
template<>
inline Grid::ComplexD Reduce<Grid::ComplexD, vecd>::operator()(vecd in){
double a = 0., b = 0.;
acc(in.v, a, 0, 2, W<double>::r);
acc(in.v, b, 1, 2, W<double>::r);
return Grid::ComplexD(a, b);
}
//Real double Reduce
template<>
inline Grid::RealD Reduce<Grid::RealD, vecd>::operator()(vecd in){
double a = 0.f;
acc(in.v, a, 0, 1, W<double>::r);
return a;
}
//Integer Reduce
template<>
inline Integer Reduce<Integer, veci>::operator()(veci in){
Integer a = 0;
acc(in.v, a, 0, 1, W<Integer>::r);
return a;
}
#undef acc // EIGEN compatibility
}
//////////////////////////////////////////////////////////////////////////////////////
// Here assign types
typedef Optimization::vech SIMD_Htype; // Reduced precision type
typedef Optimization::vecf SIMD_Ftype; // Single precision type
typedef Optimization::vecd SIMD_Dtype; // Double precision type
typedef Optimization::veci SIMD_Itype; // Integer type
// prefetch utilities
inline void v_prefetch0(int size, const char *ptr){};
inline void prefetch_HINT_T0(const char *ptr){};
// Function name aliases
typedef Optimization::Vsplat VsplatSIMD;
typedef Optimization::Vstore VstoreSIMD;
typedef Optimization::Vset VsetSIMD;
typedef Optimization::Vstream VstreamSIMD;
template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
// Arithmetic operations
typedef Optimization::Sum SumSIMD;
typedef Optimization::Sub SubSIMD;
typedef Optimization::Div DivSIMD;
typedef Optimization::Mult MultSIMD;
typedef Optimization::MultComplex MultComplexSIMD;
typedef Optimization::MultRealPart MultRealPartSIMD;
typedef Optimization::MaddRealPart MaddRealPartSIMD;
typedef Optimization::Conj ConjSIMD;
typedef Optimization::TimesMinusI TimesMinusISIMD;
typedef Optimization::TimesI TimesISIMD;
}

85
Grid/simd/Grid_generic_types.h Normal file

@ -0,0 +1,85 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/simd/Grid_generic_types.h
Copyright (C) 2017
Author: Antonin Portelli <antonin.portelli@me.com>
Andrew Lawson <andrew.lawson1991@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
static_assert(GEN_SIMD_WIDTH % 16u == 0, "SIMD vector size is not an integer multiple of 16 bytes");
//#define VECTOR_LOOPS
// playing with compiler pragmas
#ifdef VECTOR_LOOPS
#ifdef __clang__
#define VECTOR_FOR(i, w, inc)\
_Pragma("clang loop unroll(full) vectorize(enable) interleave(enable) vectorize_width(w)")\
for (unsigned int i = 0; i < w; i += inc)
#elif defined __INTEL_COMPILER
#define VECTOR_FOR(i, w, inc)\
_Pragma("simd vectorlength(w*8)")\
for (unsigned int i = 0; i < w; i += inc)
#else
#define VECTOR_FOR(i, w, inc)\
for (unsigned int i = 0; i < w; i += inc)
#endif
#else
#define VECTOR_FOR(i, w, inc)\
for (unsigned int i = 0; i < w; i += inc)
#endif
namespace Grid {
namespace Optimization {
// type traits giving the number of elements for each vector type
template <typename T> struct W;
template <> struct W<double> {
constexpr static unsigned int c = GEN_SIMD_WIDTH/16u;
constexpr static unsigned int r = GEN_SIMD_WIDTH/8u;
};
template <> struct W<float> {
constexpr static unsigned int c = GEN_SIMD_WIDTH/8u;
constexpr static unsigned int r = GEN_SIMD_WIDTH/4u;
};
template <> struct W<Integer> {
constexpr static unsigned int r = GEN_SIMD_WIDTH/4u;
};
template <> struct W<uint16_t> {
constexpr static unsigned int c = GEN_SIMD_WIDTH/4u;
constexpr static unsigned int r = GEN_SIMD_WIDTH/2u;
};
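// Illustrative example (not in the original source): with GEN_SIMD_WIDTH = 32u
// (256-bit generic vectors)
//   W<double>::r   = 4 reals,  W<double>::c = 2 complex
//   W<float>::r    = 8 reals,  W<float>::c  = 4 complex
//   W<Integer>::r  = 8 integers
//   W<uint16_t>::r = 16 half-precision words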
// SIMD vector types
template <typename T>
struct vec {
alignas(GEN_SIMD_WIDTH) T v[W<T>::r];
};
typedef vec<float> vecf;
typedef vec<double> vecd;
typedef vec<uint16_t> vech; // half precision comms
typedef vec<Integer> veci;
}}

448
Grid/simd/Grid_imci.h Normal file

@ -0,0 +1,448 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/simd/Grid_imci.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <immintrin.h>
#include <zmmintrin.h>
namespace Grid{
namespace Optimization {
struct Vsplat{
//Complex float
inline __m512 operator()(float a, float b){
return _mm512_set_ps(b,a,b,a,b,a,b,a,b,a,b,a,b,a,b,a);
}
// Real float
inline __m512 operator()(float a){
return _mm512_set1_ps(a);
}
//Complex double
inline __m512d operator()(double a, double b){
return _mm512_set_pd(b,a,b,a,b,a,b,a);
}
//Real double
inline __m512d operator()(double a){
return _mm512_set1_pd(a);
}
//Integer
inline __m512i operator()(Integer a){
return _mm512_set1_epi32(a);
}
};
struct Vstore{
//Float
inline void operator()(__m512 a, float* F){
_mm512_store_ps(F,a);
}
//Double
inline void operator()(__m512d a, double* D){
_mm512_store_pd(D,a);
}
//Integer
inline void operator()(__m512i a, Integer* I){
_mm512_store_si512((__m512i *)I,a);
}
};
struct Vstream{
//Float
inline void operator()(float * a, __m512 b){
_mm512_storenrngo_ps(a,b);
}
//Double
inline void operator()(double * a, __m512d b){
_mm512_storenrngo_pd(a,b);
}
};
struct Vset{
// Complex float
inline __m512 operator()(Grid::ComplexF *a){
return _mm512_set_ps(a[7].imag(),a[7].real(),a[6].imag(),a[6].real(),
a[5].imag(),a[5].real(),a[4].imag(),a[4].real(),
a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),
a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
}
// Complex double
inline __m512d operator()(Grid::ComplexD *a){
return _mm512_set_pd(a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),
a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
}
// Real float
inline __m512 operator()(float *a){
return _mm512_set_ps( a[15],a[14],a[13],a[12],a[11],a[10],a[9],a[8],
a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
}
// Real double
inline __m512d operator()(double *a){
return _mm512_set_pd(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
}
// Integer
inline __m512i operator()(Integer *a){
return _mm512_set_epi32( a[15],a[14],a[13],a[12],a[11],a[10],a[9],a[8],
a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
}
};
template <typename Out_type, typename In_type>
struct Reduce{
//Need templated class to overload output type
//General form must generate error if compiled
inline Out_type operator()(In_type in){
printf("Error, using wrong Reduce function\n");
exit(1);
return 0;
}
};
/////////////////////////////////////////////////////
// Arithmetic operations
/////////////////////////////////////////////////////
struct Sum{
//Complex/Real float
inline __m512 operator()(__m512 a, __m512 b){
return _mm512_add_ps(a,b);
}
//Complex/Real double
inline __m512d operator()(__m512d a, __m512d b){
return _mm512_add_pd(a,b);
}
//Integer
inline __m512i operator()(__m512i a, __m512i b){
return _mm512_add_epi32(a,b);
}
};
struct Sub{
//Complex/Real float
inline __m512 operator()(__m512 a, __m512 b){
return _mm512_sub_ps(a,b);
}
//Complex/Real double
inline __m512d operator()(__m512d a, __m512d b){
return _mm512_sub_pd(a,b);
}
//Integer
inline __m512i operator()(__m512i a, __m512i b){
return _mm512_sub_epi32(a,b);
}
};
struct MultComplex{
// Complex float
inline __m512 operator()(__m512 a, __m512 b){
__m512 vzero,ymm0,ymm1,real, imag;
vzero = _mm512_setzero_ps();
ymm0 = _mm512_swizzle_ps(a, _MM_SWIZ_REG_CDAB); //
real = (__m512)_mm512_mask_or_epi32((__m512i)a, 0xAAAA,(__m512i)vzero,(__m512i)ymm0);
imag = _mm512_mask_sub_ps(a, 0x5555,vzero, ymm0);
ymm1 = _mm512_mul_ps(real, b);
ymm0 = _mm512_swizzle_ps(b, _MM_SWIZ_REG_CDAB); // OK
return _mm512_fmadd_ps(ymm0,imag,ymm1);
}
// Complex double
inline __m512d operator()(__m512d a, __m512d b){
/* This is from
* Automatic SIMD Vectorization of Fast Fourier Transforms for the Larrabee and AVX Instruction Sets
* @inproceedings{McFarlin:2011:ASV:1995896.1995938,
* author = {McFarlin, Daniel S. and Arbatov, Volodymyr and Franchetti, Franz and P\"{u}schel, Markus},
* title = {Automatic SIMD Vectorization of Fast Fourier Transforms for the Larrabee and AVX Instruction Sets},
* booktitle = {Proceedings of the International Conference on Supercomputing},
* series = {ICS '11},
* year = {2011},
* isbn = {978-1-4503-0102-2},
* location = {Tucson, Arizona, USA},
* pages = {265--274},
* numpages = {10},
* url = {http://doi.acm.org/10.1145/1995896.1995938},
* doi = {10.1145/1995896.1995938},
* acmid = {1995938},
* publisher = {ACM},
* address = {New York, NY, USA},
* keywords = {autovectorization, fourier transform, program generation, simd, super-optimization},
* }
*/
__m512d vzero,ymm0,ymm1,real,imag;
vzero =_mm512_setzero_pd();
ymm0 = _mm512_swizzle_pd(a, _MM_SWIZ_REG_CDAB); //
real =(__m512d)_mm512_mask_or_epi64((__m512i)a, 0xAA,(__m512i)vzero,(__m512i) ymm0);
imag = _mm512_mask_sub_pd(a, 0x55,vzero, ymm0);
ymm1 = _mm512_mul_pd(real, b);
ymm0 = _mm512_swizzle_pd(b, _MM_SWIZ_REG_CDAB); // OK
return _mm512_fmadd_pd(ymm0,imag,ymm1);
}
};
struct Mult{
inline void mac(__m512 &a, __m512 b, __m512 c){
a= _mm512_fmadd_ps( b, c, a);
}
inline void mac(__m512d &a, __m512d b, __m512d c){
a= _mm512_fmadd_pd( b, c, a);
}
// Real float
inline __m512 operator()(__m512 a, __m512 b){
return _mm512_mul_ps(a,b);
}
// Real double
inline __m512d operator()(__m512d a, __m512d b){
return _mm512_mul_pd(a,b);
}
// Integer
inline __m512i operator()(__m512i a, __m512i b){
return _mm512_mullo_epi32(a,b);
}
};
struct Div{
// Real float
inline __m512 operator()(__m512 a, __m512 b){
return _mm512_div_ps(a,b);
}
// Real double
inline __m512d operator()(__m512d a, __m512d b){
return _mm512_div_pd(a,b);
}
};
struct Conj{
// Complex single
inline __m512 operator()(__m512 in){
return _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // Zero out 0+real 0-imag
}
// Complex double
inline __m512d operator()(__m512d in){
return _mm512_mask_sub_pd(in, 0xaa,_mm512_setzero_pd(), in);
}
// do not define for integer input
};
struct TimesMinusI{
//Complex single
inline __m512 operator()(__m512 in, __m512 ret){
__m512 tmp = _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // real -imag
return _mm512_swizzle_ps(tmp, _MM_SWIZ_REG_CDAB);// OK
}
//Complex double
inline __m512d operator()(__m512d in, __m512d ret){
__m512d tmp = _mm512_mask_sub_pd(in,0xaa,_mm512_setzero_pd(),in); // real -imag
return _mm512_swizzle_pd(tmp, _MM_SWIZ_REG_CDAB);// OK
}
};
struct TimesI{
//Complex single
inline __m512 operator()(__m512 in, __m512 ret){
__m512 tmp = _mm512_swizzle_ps(in, _MM_SWIZ_REG_CDAB);// OK
return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp); // real -imag
}
//Complex double
inline __m512d operator()(__m512d in, __m512d ret){
__m512d tmp = _mm512_swizzle_pd(in, _MM_SWIZ_REG_CDAB);// OK
return _mm512_mask_sub_pd(tmp,0xaa,_mm512_setzero_pd(),tmp); // real -imag
}
};
struct Permute{
static inline __m512 Permute0(__m512 in){
return _mm512_permute4f128_ps(in,(_MM_PERM_ENUM)_MM_SELECT_FOUR_FOUR(1,0,3,2));
};
static inline __m512 Permute1(__m512 in){
return _mm512_permute4f128_ps(in,(_MM_PERM_ENUM)_MM_SELECT_FOUR_FOUR(2,3,0,1));
};
static inline __m512 Permute2(__m512 in){
return _mm512_swizzle_ps(in,_MM_SWIZ_REG_BADC);
};
static inline __m512 Permute3(__m512 in){
return _mm512_swizzle_ps(in,_MM_SWIZ_REG_CDAB);
};
static inline __m512d Permute0(__m512d in){// Hack no intrinsic for 256 swaps of __m512d
return (__m512d)_mm512_permute4f128_ps((__m512)in,(_MM_PERM_ENUM)_MM_SELECT_FOUR_FOUR(1,0,3,2));
};
static inline __m512d Permute1(__m512d in){
return _mm512_swizzle_pd(in,_MM_SWIZ_REG_BADC);
};
static inline __m512d Permute2(__m512d in){
return _mm512_swizzle_pd(in,_MM_SWIZ_REG_CDAB);
};
static inline __m512d Permute3(__m512d in){
return in;
};
};
struct Rotate{
static inline __m512 rotate(__m512 in,int n){
switch(n){
case 0: return tRotate<0>(in);break;
case 1: return tRotate<1>(in);break;
case 2: return tRotate<2>(in);break;
case 3: return tRotate<3>(in);break;
case 4: return tRotate<4>(in);break;
case 5: return tRotate<5>(in);break;
case 6: return tRotate<6>(in);break;
case 7: return tRotate<7>(in);break;
case 8 : return tRotate<8>(in);break;
case 9 : return tRotate<9>(in);break;
case 10: return tRotate<10>(in);break;
case 11: return tRotate<11>(in);break;
case 12: return tRotate<12>(in);break;
case 13: return tRotate<13>(in);break;
case 14: return tRotate<14>(in);break;
case 15: return tRotate<15>(in);break;
default: assert(0);
}
}
static inline __m512d rotate(__m512d in,int n){
switch(n){
case 0: return tRotate<0>(in);break;
case 1: return tRotate<1>(in);break;
case 2: return tRotate<2>(in);break;
case 3: return tRotate<3>(in);break;
case 4: return tRotate<4>(in);break;
case 5: return tRotate<5>(in);break;
case 6: return tRotate<6>(in);break;
case 7: return tRotate<7>(in);break;
default: assert(0);
}
}
template<int n> static inline __m512 tRotate(__m512 in){
return (__m512)_mm512_alignr_epi32((__m512i)in,(__m512i)in,n);
};
template<int n> static inline __m512d tRotate(__m512d in){
return (__m512d)_mm512_alignr_epi32((__m512i)in,(__m512i)in,2*n);
};
};
//////////////////////////////////////////////
// Some Template specialization
//Complex float Reduce
template<>
inline Grid::ComplexF Reduce<Grid::ComplexF, __m512>::operator()(__m512 in){
return Grid::ComplexF(_mm512_mask_reduce_add_ps(0x5555, in),_mm512_mask_reduce_add_ps(0xAAAA, in));
}
//Real float Reduce
template<>
inline Grid::RealF Reduce<Grid::RealF, __m512>::operator()(__m512 in){
return _mm512_reduce_add_ps(in);
}
//Complex double Reduce
template<>
inline Grid::ComplexD Reduce<Grid::ComplexD, __m512d>::operator()(__m512d in){
return Grid::ComplexD(_mm512_mask_reduce_add_pd(0x55, in),_mm512_mask_reduce_add_pd(0xAA, in));
}
//Real double Reduce
template<>
inline Grid::RealD Reduce<Grid::RealD, __m512d>::operator()(__m512d in){
return _mm512_reduce_add_pd(in);
}
//Integer Reduce
template<>
inline Integer Reduce<Integer, __m512i>::operator()(__m512i in){
return _mm512_reduce_add_epi32(in);
}
}
//////////////////////////////////////////////////////////////////////////////////////
// Here assign types
typedef __m512 SIMD_Ftype; // Single precision type
typedef __m512d SIMD_Dtype; // Double precision type
typedef __m512i SIMD_Itype; // Integer type
// prefetch
inline void v_prefetch0(int size, const char *ptr){
for(int i=0;i<size;i+=64){ // Define L1 linesize above
_mm_prefetch(ptr+i+4096,_MM_HINT_T1);
_mm_prefetch(ptr+i+512,_MM_HINT_T0);
}
}
inline void prefetch_HINT_T0(const char *ptr){
_mm_prefetch(ptr,_MM_HINT_T0);
}
// Function name aliases
typedef Optimization::Vsplat VsplatSIMD;
typedef Optimization::Vstore VstoreSIMD;
typedef Optimization::Vset VsetSIMD;
typedef Optimization::Vstream VstreamSIMD;
template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
// Arithmetic operations
typedef Optimization::Sum SumSIMD;
typedef Optimization::Sub SubSIMD;
typedef Optimization::Div DivSIMD;
typedef Optimization::Mult MultSIMD;
typedef Optimization::MultComplex MultComplexSIMD;
typedef Optimization::Conj ConjSIMD;
typedef Optimization::TimesMinusI TimesMinusISIMD;
typedef Optimization::TimesI TimesISIMD;
}

599
Grid/simd/Grid_neon.h Normal file

@ -0,0 +1,599 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/simd/Grid_neon.h
Copyright (C) 2015
Author: Nils Meyer <nils.meyer@ur.de>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: neo <cossu@post.kek.jp>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
/*
ARMv8 NEON intrinsics layer by
Nils Meyer <nils.meyer@ur.de>,
University of Regensburg, Germany
SFB/TRR55
*/
#ifndef GEN_SIMD_WIDTH
#define GEN_SIMD_WIDTH 16u
#endif
#include "Grid_generic_types.h"
#include <arm_neon.h>
namespace Grid {
namespace Optimization {
template<class vtype>
union uconv {
float32x4_t f;
vtype v;
};
union u128f {
float32x4_t v;
float f[4];
};
union u128d {
float64x2_t v;
double f[2];
};
// half precision
union u128h {
float16x8_t v;
uint16_t f[8];
};
struct Vsplat{
//Complex float
inline float32x4_t operator()(float a, float b){
float tmp[4]={a,b,a,b};
return vld1q_f32(tmp);
}
// Real float
inline float32x4_t operator()(float a){
return vdupq_n_f32(a);
}
//Complex double
inline float64x2_t operator()(double a, double b){
double tmp[2]={a,b};
return vld1q_f64(tmp);
}
//Real double
inline float64x2_t operator()(double a){
return vdupq_n_f64(a);
}
//Integer
inline uint32x4_t operator()(Integer a){
return vdupq_n_u32(a);
}
};
struct Vstore{
//Float
inline void operator()(float32x4_t a, float* F){
vst1q_f32(F, a);
}
//Double
inline void operator()(float64x2_t a, double* D){
vst1q_f64(D, a);
}
//Integer
inline void operator()(uint32x4_t a, Integer* I){
vst1q_u32(I, a);
}
};
struct Vstream{ // N:equivalents to _mm_stream_p* in NEON?
//Float // N:generic
inline void operator()(float * a, float32x4_t b){
memcpy(a,&b,4*sizeof(float));
}
//Double // N:generic
inline void operator()(double * a, float64x2_t b){
memcpy(a,&b,2*sizeof(double));
}
};
// Nils: Vset untested; not used currently in Grid at all;
// git commit 4a8c4ccfba1d05159348d21a9698028ea847e77b
struct Vset{
// Complex float
inline float32x4_t operator()(Grid::ComplexF *a){
float tmp[4]={a[1].imag(),a[1].real(),a[0].imag(),a[0].real()};
return vld1q_f32(tmp);
}
// Complex double
inline float64x2_t operator()(Grid::ComplexD *a){
double tmp[2]={a[0].imag(),a[0].real()};
return vld1q_f64(tmp);
}
// Real float
inline float32x4_t operator()(float *a){
float tmp[4]={a[3],a[2],a[1],a[0]};
return vld1q_f32(tmp);
}
// Real double
inline float64x2_t operator()(double *a){
double tmp[2]={a[1],a[0]};
return vld1q_f64(tmp);
}
// Integer
inline uint32x4_t operator()(Integer *a){
return vld1q_dup_u32(a);
}
};
template <typename Out_type, typename In_type>
struct Reduce{
//Need templated class to overload output type
//General form must generate error if compiled
inline Out_type operator()(In_type in){
printf("Error, using wrong Reduce function\n");
exit(1);
return 0;
}
};
/////////////////////////////////////////////////////
// Arithmetic operations
/////////////////////////////////////////////////////
struct Sum{
//Complex/Real float
inline float32x4_t operator()(float32x4_t a, float32x4_t b){
return vaddq_f32(a,b);
}
//Complex/Real double
inline float64x2_t operator()(float64x2_t a, float64x2_t b){
return vaddq_f64(a,b);
}
//Integer
inline uint32x4_t operator()(uint32x4_t a, uint32x4_t b){
return vaddq_u32(a,b);
}
};
struct Sub{
//Complex/Real float
inline float32x4_t operator()(float32x4_t a, float32x4_t b){
return vsubq_f32(a,b);
}
//Complex/Real double
inline float64x2_t operator()(float64x2_t a, float64x2_t b){
return vsubq_f64(a,b);
}
//Integer
inline uint32x4_t operator()(uint32x4_t a, uint32x4_t b){
return vsubq_u32(a,b);
}
};
struct MultRealPart{
inline float32x4_t operator()(float32x4_t a, float32x4_t b){
float32x4_t re = vtrn1q_f32(a, a);
return vmulq_f32(re, b);
}
inline float64x2_t operator()(float64x2_t a, float64x2_t b){
float64x2_t re = vzip1q_f64(a, a);
return vmulq_f64(re, b);
}
};
struct MaddRealPart{
inline float32x4_t operator()(float32x4_t a, float32x4_t b, float32x4_t c){
float32x4_t re = vtrn1q_f32(a, a);
return vfmaq_f32(c, re, b);
}
inline float64x2_t operator()(float64x2_t a, float64x2_t b, float64x2_t c){
float64x2_t re = vzip1q_f64(a, a);
return vfmaq_f64(c, re, b);
}
};
struct Div{
// Real float
inline float32x4_t operator()(float32x4_t a, float32x4_t b){
return vdivq_f32(a, b);
}
// Real double
inline float64x2_t operator()(float64x2_t a, float64x2_t b){
return vdivq_f64(a, b);
}
};
struct MultComplex{
// Complex float
inline float32x4_t operator()(float32x4_t a, float32x4_t b){
float32x4_t r0, r1, r2, r3, r4;
// a = ar ai Ar Ai
// b = br bi Br Bi
// collect real/imag part, negate bi and Bi
r0 = vtrn1q_f32(b, b); // br br Br Br
r1 = vnegq_f32(b); // -br -bi -Br -Bi
r2 = vtrn2q_f32(b, r1); // bi -bi Bi -Bi
// the fun part
r3 = vmulq_f32(r2, a); // bi*ar -bi*ai ...
r4 = vrev64q_f32(r3); // -bi*ai bi*ar ...
// fma(a,b,c) = a+b*c
return vfmaq_f32(r4, r0, a); // ar*br-ai*bi ai*br+ar*bi ...
// no fma, use mul and add
// float32x4_t r5;
// r5 = vmulq_f32(r0, a);
// return vaddq_f32(r4, r5);
}
// Complex double
inline float64x2_t operator()(float64x2_t a, float64x2_t b){
float64x2_t r0, r1, r2, r3, r4;
// b = br bi
// collect real/imag part, negate bi
r0 = vtrn1q_f64(b, b); // br br
r1 = vnegq_f64(b); // -br -bi
r2 = vtrn2q_f64(b, r1); // bi -bi
// the fun part
r3 = vmulq_f64(r2, a); // bi*ar -bi*ai
r4 = vextq_f64(r3,r3,1); // -bi*ai bi*ar
// fma(a,b,c) = a+b*c
return vfmaq_f64(r4, r0, a); // ar*br-ai*bi ai*br+ar*bi
// no fma, use mul and add
// float64x2_t r5;
// r5 = vmulq_f64(r0, a);
// return vaddq_f64(r4, r5);
}
};
struct Mult{
// Real float
inline float32x4_t mac(float32x4_t a, float32x4_t b, float32x4_t c){
//return vaddq_f32(vmulq_f32(b,c),a);
return vfmaq_f32(a, b, c);
}
inline float64x2_t mac(float64x2_t a, float64x2_t b, float64x2_t c){
//return vaddq_f64(vmulq_f64(b,c),a);
return vfmaq_f64(a, b, c);
}
inline float32x4_t operator()(float32x4_t a, float32x4_t b){
return vmulq_f32(a,b);
}
// Real double
inline float64x2_t operator()(float64x2_t a, float64x2_t b){
return vmulq_f64(a,b);
}
// Integer
inline uint32x4_t operator()(uint32x4_t a, uint32x4_t b){
return vmulq_u32(a,b);
}
};
struct Conj{
// Complex single
inline float32x4_t operator()(float32x4_t in){
// ar ai br bi -> ar -ai br -bi
float32x4_t r0, r1;
r0 = vnegq_f32(in); // -ar -ai -br -bi
r1 = vrev64q_f32(r0); // -ai -ar -bi -br
return vtrn1q_f32(in, r1); // ar -ai br -bi
}
// Complex double
inline float64x2_t operator()(float64x2_t in){
float64x2_t r0, r1;
r0 = vextq_f64(in, in, 1); // ai ar
r1 = vnegq_f64(r0); // -ai -ar
return vextq_f64(r0, r1, 1); // ar -ai
}
// do not define for integer input
};
struct TimesMinusI{
//Complex single
inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
// ar ai br bi -> ai -ar bi -br
float32x4_t r0, r1;
r0 = vnegq_f32(in); // -ar -ai -br -bi
r1 = vrev64q_f32(in); // ai ar bi br
return vtrn1q_f32(r1, r0); // ai -ar bi -br
}
//Complex double
inline float64x2_t operator()(float64x2_t in, float64x2_t ret){
// a ib -> b -ia
float64x2_t tmp;
tmp = vnegq_f64(in);
return vextq_f64(in, tmp, 1);
}
};
struct TimesI{
//Complex single
inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
// ar ai br bi -> -ai ar -bi br
float32x4_t r0, r1;
r0 = vnegq_f32(in); // -ar -ai -br -bi
r1 = vrev64q_f32(r0); // -ai -ar -bi -br
return vtrn1q_f32(r1, in); // -ai ar -bi br
}
//Complex double
inline float64x2_t operator()(float64x2_t in, float64x2_t ret){
// a ib -> -b ia
float64x2_t tmp;
tmp = vnegq_f64(in);
return vextq_f64(tmp, in, 1);
}
};
struct Permute{
static inline float32x4_t Permute0(float32x4_t in){ // N:ok
// AB CD -> CD AB
return vextq_f32(in, in, 2);
};
static inline float32x4_t Permute1(float32x4_t in){ // N:ok
// AB CD -> BA DC
return vrev64q_f32(in);
};
static inline float32x4_t Permute2(float32x4_t in){ // N:not used by Boyle
return in;
};
static inline float32x4_t Permute3(float32x4_t in){ // N:not used by Boyle
return in;
};
static inline float64x2_t Permute0(float64x2_t in){ // N:ok
// AB -> BA
return vextq_f64(in, in, 1);
};
static inline float64x2_t Permute1(float64x2_t in){ // N:not used by Boyle
return in;
};
static inline float64x2_t Permute2(float64x2_t in){ // N:not used by Boyle
return in;
};
static inline float64x2_t Permute3(float64x2_t in){ // N:not used by Boyle
return in;
};
};
struct Rotate{
static inline float32x4_t rotate(float32x4_t in,int n){ // N:ok
switch(n){
case 0: // AB CD -> AB CD
return tRotate<0>(in);
break;
case 1: // AB CD -> BC DA
return tRotate<1>(in);
break;
case 2: // AB CD -> CD AB
return tRotate<2>(in);
break;
case 3: // AB CD -> DA BC
return tRotate<3>(in);
break;
default: assert(0);
}
}
static inline float64x2_t rotate(float64x2_t in,int n){ // N:ok
switch(n){
case 0: // AB -> AB
return tRotate<0>(in);
break;
case 1: // AB -> BA
return tRotate<1>(in);
break;
default: assert(0);
}
}
template<int n> static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n%4); };
template<int n> static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n%2); };
};
struct PrecisionChange {
static inline float16x8_t StoH (const float32x4_t &a,const float32x4_t &b) {
float16x4_t h = vcvt_f16_f32(a);
return vcvt_high_f16_f32(h, b);
}
static inline void HtoS (float16x8_t h,float32x4_t &sa,float32x4_t &sb) {
sb = vcvt_high_f32_f16(h);
// the lower half of h must first be rotated into the upper half before vcvt_high_f32_f16 can convert it
// vextq_f16 not supported by clang 3.8 / 4.0 / arm clang
// float16x8_t h1 = vextq_f16(h, h, 4); // correct, but not supported by clang
// workaround for clang
uint32x4_t h1u = reinterpret_cast<uint32x4_t>(h);
float16x8_t h1 = reinterpret_cast<float16x8_t>(vextq_u32(h1u, h1u, 2));
sa = vcvt_high_f32_f16(h1);
}
static inline float32x4_t DtoS (float64x2_t a,float64x2_t b) {
float32x2_t s = vcvt_f32_f64(a);
return vcvt_high_f32_f64(s, b);
}
static inline void StoD (float32x4_t s,float64x2_t &a,float64x2_t &b) {
b = vcvt_high_f64_f32(s);
// there is no direct conversion from lower float32x4_t to float64x2_t
float32x4_t s1 = vextq_f32(s, s, 2);
a = vcvt_high_f64_f32(s1);
}
static inline float16x8_t DtoH (float64x2_t a,float64x2_t b,float64x2_t c,float64x2_t d) {
float32x4_t s1 = DtoS(a, b);
float32x4_t s2 = DtoS(c, d);
return StoH(s1, s2);
}
static inline void HtoD (float16x8_t h,float64x2_t &a,float64x2_t &b,float64x2_t &c,float64x2_t &d) {
float32x4_t s1, s2;
HtoS(h, s1, s2);
StoD(s1, a, b);
StoD(s2, c, d);
}
};
//////////////////////////////////////////////
// Exchange support
struct Exchange{
static inline void Exchange0(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
// in1: ABCD -> out1: ABEF
// in2: EFGH -> out2: CDGH
// z: CDAB
float32x4_t z = vextq_f32(in1, in1, 2);
// out1: ABEF
out1 = vextq_f32(z, in2, 2);
// z: GHEF
z = vextq_f32(in2, in2, 2);
// out2: CDGH
out2 = vextq_f32(in1, z, 2);
};
static inline void Exchange1(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
// in1: ABCD -> out1: AECG
// in2: EFGH -> out2: BFDH
out1 = vtrn1q_f32(in1, in2);
out2 = vtrn2q_f32(in1, in2);
};
static inline void Exchange2(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
assert(0);
return;
};
static inline void Exchange3(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
assert(0);
return;
};
// double precision
static inline void Exchange0(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
// in1: AB -> out1: AC
// in2: CD -> out2: BD
out1 = vzip1q_f64(in1, in2);
out2 = vzip2q_f64(in1, in2);
};
static inline void Exchange1(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
assert(0);
return;
};
static inline void Exchange2(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
assert(0);
return;
};
static inline void Exchange3(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
assert(0);
return;
};
};
//////////////////////////////////////////////
// Some Template specialization
//Complex float Reduce
template<>
inline Grid::ComplexF Reduce<Grid::ComplexF, float32x4_t>::operator()(float32x4_t in){
float32x4_t v1; // two complex
v1 = Optimization::Permute::Permute0(in);
v1 = vaddq_f32(v1,in);
u128f conv; conv.v=v1;
return Grid::ComplexF(conv.f[0],conv.f[1]);
}
//Real float Reduce
template<>
inline Grid::RealF Reduce<Grid::RealF, float32x4_t>::operator()(float32x4_t in){
return vaddvq_f32(in);
}
//Complex double Reduce
template<>
inline Grid::ComplexD Reduce<Grid::ComplexD, float64x2_t>::operator()(float64x2_t in){
u128d conv; conv.v = in;
return Grid::ComplexD(conv.f[0],conv.f[1]);
}
//Real double Reduce
template<>
inline Grid::RealD Reduce<Grid::RealD, float64x2_t>::operator()(float64x2_t in){
return vaddvq_f64(in);
}
//Integer Reduce
template<>
inline Integer Reduce<Integer, uint32x4_t>::operator()(uint32x4_t in){
return vaddvq_u32(in);
}
}
//////////////////////////////////////////////////////////////////////////////////////
// Here assign types
// typedef Optimization::vech SIMD_Htype; // Reduced precision type
typedef float16x8_t SIMD_Htype; // Half precision type
typedef float32x4_t SIMD_Ftype; // Single precision type
typedef float64x2_t SIMD_Dtype; // Double precision type
typedef uint32x4_t SIMD_Itype; // Integer type
inline void v_prefetch0(int size, const char *ptr){}; // prefetch utilities
inline void prefetch_HINT_T0(const char *ptr){};
// Function name aliases
typedef Optimization::Vsplat VsplatSIMD;
typedef Optimization::Vstore VstoreSIMD;
typedef Optimization::Vset VsetSIMD;
typedef Optimization::Vstream VstreamSIMD;
template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
// Arithmetic operations
typedef Optimization::Sum SumSIMD;
typedef Optimization::Sub SubSIMD;
typedef Optimization::Div DivSIMD;
typedef Optimization::Mult MultSIMD;
typedef Optimization::MultComplex MultComplexSIMD;
typedef Optimization::MultRealPart MultRealPartSIMD;
typedef Optimization::MaddRealPart MaddRealPartSIMD;
typedef Optimization::Conj ConjSIMD;
typedef Optimization::TimesMinusI TimesMinusISIMD;
typedef Optimization::TimesI TimesISIMD;
}

619
Grid/simd/Grid_qpx.h Normal file

@ -0,0 +1,619 @@
/*******************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/simd/Grid_qpx.h
Copyright (C) 2016
Copyright (C) 2017
Author: Antonin Portelli <antonin.portelli@me.com>
Andrew Lawson <andrew.lawson1991@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
******************************************************************************/
#ifndef GEN_SIMD_WIDTH
#define GEN_SIMD_WIDTH 32u
#endif
#include "Grid_generic_types.h" // Definitions for simulated integer SIMD.
namespace Grid {
#ifdef QPX
#include <spi/include/kernel/location.h>
#include <spi/include/l1p/types.h>
#include <hwi/include/bqc/l1p_mmio.h>
#include <hwi/include/bqc/A2_inlines.h>
#endif
namespace Optimization {
typedef struct
{
float v0,v1,v2,v3;
} vector4float;
inline std::ostream & operator<<(std::ostream& stream, const vector4double a)
{
stream << "{"<<vec_extract(a,0)<<","<<vec_extract(a,1)<<","<<vec_extract(a,2)<<","<<vec_extract(a,3)<<"}";
return stream;
};
inline std::ostream & operator<<(std::ostream& stream, const vector4float a)
{
stream << "{"<< a.v0 <<","<< a.v1 <<","<< a.v2 <<","<< a.v3 <<"}";
return stream;
};
struct Vsplat{
//Complex float
inline vector4float operator()(float a, float b){
return (vector4float){a, b, a, b};
}
// Real float
inline vector4float operator()(float a){
return (vector4float){a, a, a, a};
}
//Complex double
inline vector4double operator()(double a, double b){
return (vector4double){a, b, a, b};
}
//Real double
inline vector4double operator()(double a){
return (vector4double){a, a, a, a};
}
//Integer
inline veci operator()(Integer a){
veci out;
VECTOR_FOR(i, W<Integer>::r, 1)
{
out.v[i] = a;
}
return out;
}
};
struct Vstore{
//Float
inline void operator()(vector4double a, float *f){
vec_st(a, 0, f);
}
inline void operator()(vector4double a, vector4float &f){
vec_st(a, 0, (float *)(&f));
}
inline void operator()(vector4float a, float *f){
f[0] = a.v0;
f[1] = a.v1;
f[2] = a.v2;
f[3] = a.v3;
}
//Double
inline void operator()(vector4double a, double *d){
vec_st(a, 0, d);
}
//Integer
inline void operator()(veci a, Integer *i){
*((veci *)i) = a;
}
};
struct Vstream{
//Float
inline void operator()(float *f, vector4double a){
vec_st(a, 0, f);
}
inline void operator()(vector4float f, vector4double a){
vec_st(a, 0, (float *)(&f));
}
inline void operator()(float *f, vector4float a){
f[0] = a.v0;
f[1] = a.v1;
f[2] = a.v2;
f[3] = a.v3;
}
//Double
inline void operator()(double *d, vector4double a){
vec_st(a, 0, d);
}
};
struct Vset{
// Complex float
inline vector4float operator()(Grid::ComplexF *a){
return (vector4float){a[0].real(), a[0].imag(), a[1].real(), a[1].imag()};
}
// Complex double
inline vector4double operator()(Grid::ComplexD *a){
return vec_ld(0, (double *)a);
}
// Real float
inline vector4float operator()(float *a){
return (vector4float){a[0], a[1], a[2], a[3]};
}
inline vector4double operator()(vector4float a){
return vec_ld(0, (float *)(&a));
}
// Real double
inline vector4double operator()(double *a){
return vec_ld(0, a);
}
// Integer
inline veci operator()(Integer *a){
veci out;
out = *((veci *)a);
return out;
}
};
template <typename Out_type, typename In_type>
struct Reduce{
//Need templated class to overload output type
//General form must generate error if compiled
inline Out_type operator()(In_type in){
printf("Error, using wrong Reduce function\n");
exit(1);
return 0;
}
};
/////////////////////////////////////////////////////
// Arithmetic operations
/////////////////////////////////////////////////////
#define FLOAT_WRAP_3(fn, pref)\
pref vector4float fn(vector4float a, vector4float b, vector4float c) \
{\
vector4double ad, bd, rd, cd; \
vector4float r;\
\
ad = Vset()(a);\
bd = Vset()(b);\
cd = Vset()(c);\
rd = fn(ad, bd, cd); \
Vstore()(rd, r);\
\
return r;\
}
#define FLOAT_WRAP_2(fn, pref)\
pref vector4float fn(vector4float a, vector4float b)\
{\
vector4double ad, bd, rd;\
vector4float r;\
\
ad = Vset()(a);\
bd = Vset()(b);\
rd = fn(ad, bd);\
Vstore()(rd, r);\
\
return r;\
}
#define FLOAT_WRAP_1(fn, pref)\
pref vector4float fn(vector4float a)\
{\
vector4double ad, rd;\
vector4float r;\
\
ad = Vset()(a);\
rd = fn(ad);\
Vstore()(rd, r);\
\
return r;\
}
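// Illustrative sketch (not in the original source): inside struct Sum below,
// FLOAT_WRAP_2(operator(), inline) expands roughly to
//   inline vector4float operator()(vector4float a, vector4float b)
//   {
//     vector4double ad = Vset()(a), bd = Vset()(b);
//     vector4double rd = operator()(ad, bd); // double-precision QPX overload
//     vector4float r; Vstore()(rd, r);
//     return r;
//   }
// i.e. single-precision operations are emulated by promoting to vector4double,
// applying the QPX intrinsic, and storing back to single precision.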
struct Sum{
//Complex/Real double
inline vector4double operator()(vector4double a, vector4double b){
return vec_add(a, b);
}
//Complex/Real float
FLOAT_WRAP_2(operator(), inline)
//Integer
inline veci operator()(veci a, veci b){
veci out;
VECTOR_FOR(i, W<Integer>::r, 1)
{
out.v[i] = a.v[i] + b.v[i];
}
return out;
}
};
struct Sub{
//Complex/Real double
inline vector4double operator()(vector4double a, vector4double b){
return vec_sub(a, b);
}
//Complex/Real float
FLOAT_WRAP_2(operator(), inline)
//Integer
inline veci operator()(veci a, veci b){
veci out;
VECTOR_FOR(i, W<Integer>::r, 1)
{
out.v[i] = a.v[i] - b.v[i];
}
return out;
}
};
struct MultRealPart{
// Complex double
inline vector4double operator()(vector4double a, vector4double b){
// return vec_xmul(b, a);
return vec_xmul(a, b);
}
FLOAT_WRAP_2(operator(), inline)
};
struct MaddRealPart{
// Complex double
inline vector4double operator()(vector4double a, vector4double b,vector4double c){
return vec_xmadd(a, b, c);
}
FLOAT_WRAP_3(operator(), inline)
};
struct MultComplex{
// Complex double
inline vector4double operator()(vector4double a, vector4double b){
return vec_xxnpmadd(a, b, vec_xmul(b, a));
}
// Complex float
FLOAT_WRAP_2(operator(), inline)
};
struct Mult{
// Real double
inline vector4double operator()(vector4double a, vector4double b){
return vec_mul(a, b);
}
// Real float
FLOAT_WRAP_2(operator(), inline)
// Integer
inline veci operator()(veci a, veci b){
veci out;
VECTOR_FOR(i, W<Integer>::r, 1)
{
out.v[i] = a.v[i]*b.v[i];
}
return out;
}
};
struct Div{
// Real double
inline vector4double operator()(vector4double a, vector4double b){
return vec_swdiv(a, b);
}
// Real float
FLOAT_WRAP_2(operator(), inline)
// Integer
inline veci operator()(veci a, veci b){
veci out;
VECTOR_FOR(i, W<Integer>::r, 1)
{
out.v[i] = a.v[i]/b.v[i];
}
return out;
}
};
struct Conj{
// Complex double
inline vector4double operator()(vector4double v){
return vec_mul(v, (vector4double){1., -1., 1., -1.});
}
// Complex float
FLOAT_WRAP_1(operator(), inline)
};
struct TimesMinusI{
//Complex double
inline vector4double operator()(vector4double v, vector4double ret){
return vec_xxcpnmadd(v, (vector4double){1., 1., 1., 1.},
(vector4double){0., 0., 0., 0.});
}
// Complex float
FLOAT_WRAP_2(operator(), inline)
};
struct TimesI{
//Complex double
inline vector4double operator()(vector4double v, vector4double ret){
return vec_xxcpnmadd(v, (vector4double){-1., -1., -1., -1.},
(vector4double){0., 0., 0., 0.});
}
// Complex float
FLOAT_WRAP_2(operator(), inline)
};
#define USE_FP16
struct PrecisionChange {
static inline vech StoH (const vector4float &a, const vector4float &b) {
vech ret;
std::cout << GridLogError << "QPX single to half precision conversion not yet supported." << std::endl;
assert(0);
return ret;
}
static inline void HtoS (vech h, vector4float &sa, vector4float &sb) {
std::cout << GridLogError << "QPX half to single precision conversion not yet supported." << std::endl;
assert(0);
}
static inline vector4float DtoS (vector4double a, vector4double b) {
vector4float ret;
std::cout << GridLogError << "QPX double to single precision conversion not yet supported." << std::endl;
assert(0);
return ret;
}
static inline void StoD (vector4float s, vector4double &a, vector4double &b) {
std::cout << GridLogError << "QPX single to double precision conversion not yet supported." << std::endl;
assert(0);
}
static inline vech DtoH (vector4double a, vector4double b,
vector4double c, vector4double d) {
vech ret;
std::cout << GridLogError << "QPX double to half precision conversion not yet supported." << std::endl;
assert(0);
return ret;
}
static inline void HtoD (vech h, vector4double &a, vector4double &b,
vector4double &c, vector4double &d) {
std::cout << GridLogError << "QPX half to double precision conversion not yet supported." << std::endl;
assert(0);
}
};
//////////////////////////////////////////////
// Exchange support
#define FLOAT_WRAP_EXCHANGE(fn) \
static inline void fn(vector4float &out1, vector4float &out2, \
vector4float in1, vector4float in2) \
{ \
vector4double out1d, out2d, in1d, in2d; \
in1d = Vset()(in1); \
in2d = Vset()(in2); \
fn(out1d, out2d, in1d, in2d); \
Vstore()(out1d, out1); \
Vstore()(out2d, out2); \
}
struct Exchange{
// double precision
static inline void Exchange0(vector4double &out1, vector4double &out2,
vector4double in1, vector4double in2) {
out1 = vec_perm(in1, in2, vec_gpci(0145));
out2 = vec_perm(in1, in2, vec_gpci(02367));
}
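// Illustrative note (not in the original source): vec_gpci takes an octal
// literal whose digits index the concatenation {in1[0..3], in2[0..3]}. Writing
// in1 = {a0..a3}, in2 = {b0..b3}, vec_gpci(0145) above selects elements 0,1,4,5
// giving out1 = {a0,a1,b0,b1}, and vec_gpci(02367) gives out2 = {a2,a3,b2,b3}.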
static inline void Exchange1(vector4double &out1, vector4double &out2,
vector4double in1, vector4double in2) {
out1 = vec_perm(in1, in2, vec_gpci(0426));
out2 = vec_perm(in1, in2, vec_gpci(01537));
}
static inline void Exchange2(vector4double &out1, vector4double &out2,
vector4double in1, vector4double in2) {
assert(0);
}
static inline void Exchange3(vector4double &out1, vector4double &out2,
vector4double in1, vector4double in2) {
assert(0);
}
// single precision
FLOAT_WRAP_EXCHANGE(Exchange0);
FLOAT_WRAP_EXCHANGE(Exchange1);
FLOAT_WRAP_EXCHANGE(Exchange2);
FLOAT_WRAP_EXCHANGE(Exchange3);
};
struct Permute{
//Complex double
static inline vector4double Permute0(vector4double v){ //0123 -> 2301
return vec_perm(v, v, vec_gpci(02301));
};
static inline vector4double Permute1(vector4double v){ //0123 -> 1032
return vec_perm(v, v, vec_gpci(01032));
};
static inline vector4double Permute2(vector4double v){
return v;
};
static inline vector4double Permute3(vector4double v){
return v;
};
// Complex float
FLOAT_WRAP_1(Permute0, static inline)
FLOAT_WRAP_1(Permute1, static inline)
FLOAT_WRAP_1(Permute2, static inline)
FLOAT_WRAP_1(Permute3, static inline)
};
struct Rotate{
template<int n> static inline vector4double tRotate(vector4double v){
if ( n==1 ) return vec_perm(v, v, vec_gpci(01230));
if ( n==2 ) return vec_perm(v, v, vec_gpci(02301));
if ( n==3 ) return vec_perm(v, v, vec_gpci(03012));
return v;
};
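// Illustrative note (not in the original source): vec_gpci(01230) selects
// elements 1,2,3,0 of v, i.e. a left rotation by one; 02301 and 03012 rotate
// by two and three elements respectively.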
template<int n> static inline vector4float tRotate(vector4float a)
{
vector4double ad, rd;
vector4float r;
ad = Vset()(a);
rd = tRotate<n>(ad);
Vstore()(rd, r);
return r;
};
static inline vector4double rotate(vector4double v, int n){
switch(n){
case 0:
return v;
break;
case 1:
return tRotate<1>(v);
break;
case 2:
return tRotate<2>(v);
break;
case 3:
return tRotate<3>(v);
break;
default: assert(0);
}
}
static inline vector4float rotate(vector4float v, int n){
vector4double vd, rd;
vector4float r;
vd = Vset()(v);
rd = rotate(vd, n);
Vstore()(rd, r);
return r;
}
};
//Complex float Reduce
template<>
inline Grid::ComplexF
Reduce<Grid::ComplexF, vector4float>::operator()(vector4float v) { //2 complex
vector4float v1;
v1 = Optimization::Permute::Permute0(v);
v1 = Optimization::Sum()(v1, v);
return Grid::ComplexF(v1.v0, v1.v1);
}
//Real float Reduce
template<>
inline Grid::RealF
Reduce<Grid::RealF, vector4float>::operator()(vector4float v){ //4 floats
vector4float v1,v2;
v1 = Optimization::Permute::Permute0(v);
v1 = Optimization::Sum()(v1, v);
v2 = Optimization::Permute::Permute1(v1);
v1 = Optimization::Sum()(v1, v2);
return v1.v0;
}
//Complex double Reduce
template<>
inline Grid::ComplexD
Reduce<Grid::ComplexD, vector4double>::operator()(vector4double v){ //2 complex
vector4double v1;
v1 = Optimization::Permute::Permute0(v);
v1 = vec_add(v1, v);
return Grid::ComplexD(vec_extract(v1, 0), vec_extract(v1, 1));
}
//Real double Reduce
template<>
inline Grid::RealD
Reduce<Grid::RealD, vector4double>::operator()(vector4double v){ //4 doubles
vector4double v1,v2;
v1 = Optimization::Permute::Permute0(v);
v1 = vec_add(v1, v);
v2 = Optimization::Permute::Permute1(v1);
v1 = vec_add(v1, v2);
return vec_extract(v1, 0);
}
//Integer Reduce
template<>
inline Integer Reduce<Integer, veci>::operator()(veci in){
Integer a = 0;
for (unsigned int i = 0; i < W<Integer>::r; ++i)
{
a += in.v[i];
}
return a;
}
}
////////////////////////////////////////////////////////////////////////////////
// Here assign types
typedef Optimization::vech SIMD_Htype; // Half precision type
typedef Optimization::vector4float SIMD_Ftype; // Single precision type
typedef vector4double SIMD_Dtype; // Double precision type
typedef Optimization::veci SIMD_Itype; // Integer type
// prefetch utilities
inline void v_prefetch0(int size, const char *ptr){};
inline void prefetch_HINT_T0(const char *ptr){};
// Function name aliases
typedef Optimization::Vsplat VsplatSIMD;
typedef Optimization::Vstore VstoreSIMD;
typedef Optimization::Vset VsetSIMD;
typedef Optimization::Vstream VstreamSIMD;
template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
// Arithmetic operations
typedef Optimization::Sum SumSIMD;
typedef Optimization::Sub SubSIMD;
typedef Optimization::Mult MultSIMD;
typedef Optimization::Div DivSIMD;
typedef Optimization::MultComplex MultComplexSIMD;
typedef Optimization::MultRealPart MultRealPartSIMD;
typedef Optimization::MaddRealPart MaddRealPartSIMD;
typedef Optimization::Conj ConjSIMD;
typedef Optimization::TimesMinusI TimesMinusISIMD;
typedef Optimization::TimesI TimesISIMD;
}

617
Grid/simd/Grid_sse4.h Normal file

@ -0,0 +1,617 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/simd/Grid_sse4.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: neo <cossu@post.kek.jp>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
//----------------------------------------------------------------------
/*! @file Grid_sse4.h
@brief Optimization libraries for the SSE4 instruction set
Using intrinsics
*/
// Time-stamp: <2015-06-16 23:27:54 neo>
//----------------------------------------------------------------------
#include <pmmintrin.h>
namespace Grid {
namespace Optimization {
template<class vtype>
union uconv {
__m128 f;
vtype v;
};
union u128f {
__m128 v;
float f[4];
};
union u128d {
__m128d v;
double f[2];
};
struct Vsplat{
//Complex float
inline __m128 operator()(float a, float b){
return _mm_set_ps(b,a,b,a);
}
// Real float
inline __m128 operator()(float a){
return _mm_set_ps(a,a,a,a);
}
//Complex double
inline __m128d operator()(double a, double b){
return _mm_set_pd(b,a);
}
//Real double
inline __m128d operator()(double a){
return _mm_set_pd(a,a);
}
//Integer
inline __m128i operator()(Integer a){
return _mm_set1_epi32(a);
}
};
struct Vstore{
//Float
inline void operator()(__m128 a, float* F){
_mm_store_ps(F,a);
}
//Double
inline void operator()(__m128d a, double* D){
_mm_store_pd(D,a);
}
//Integer
inline void operator()(__m128i a, Integer* I){
_mm_store_si128((__m128i *)I,a);
}
};
struct Vstream{
//Float
inline void operator()(float * a, __m128 b){
_mm_stream_ps(a,b);
}
//Double
inline void operator()(double * a, __m128d b){
_mm_stream_pd(a,b);
}
};
struct Vset{
// Complex float
inline __m128 operator()(Grid::ComplexF *a){
return _mm_set_ps(a[1].imag(), a[1].real(),a[0].imag(),a[0].real());
}
// Complex double
inline __m128d operator()(Grid::ComplexD *a){
return _mm_set_pd(a[0].imag(),a[0].real());
}
// Real float
inline __m128 operator()(float *a){
return _mm_set_ps(a[3],a[2],a[1],a[0]);
}
// Real double
inline __m128d operator()(double *a){
return _mm_set_pd(a[1],a[0]);
}
// Integer
inline __m128i operator()(Integer *a){
return _mm_set_epi32(a[3],a[2],a[1],a[0]);
}
};
template <typename Out_type, typename In_type>
struct Reduce{
//Need templated class to overload output type
//General form must generate error if compiled
inline Out_type operator()(In_type in){
printf("Error, using wrong Reduce function\n");
exit(1);
return 0;
}
};
/////////////////////////////////////////////////////
// Arithmetic operations
/////////////////////////////////////////////////////
struct Sum{
//Complex/Real float
inline __m128 operator()(__m128 a, __m128 b){
return _mm_add_ps(a,b);
}
//Complex/Real double
inline __m128d operator()(__m128d a, __m128d b){
return _mm_add_pd(a,b);
}
//Integer
inline __m128i operator()(__m128i a, __m128i b){
return _mm_add_epi32(a,b);
}
};
struct Sub{
//Complex/Real float
inline __m128 operator()(__m128 a, __m128 b){
return _mm_sub_ps(a,b);
}
//Complex/Real double
inline __m128d operator()(__m128d a, __m128d b){
return _mm_sub_pd(a,b);
}
//Integer
inline __m128i operator()(__m128i a, __m128i b){
return _mm_sub_epi32(a,b);
}
};
struct MultRealPart{
inline __m128 operator()(__m128 a, __m128 b){
__m128 ymm0;
ymm0 = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
return _mm_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br
}
inline __m128d operator()(__m128d a, __m128d b){
__m128d ymm0;
ymm0 = _mm_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, ar,ar b'00,00
return _mm_mul_pd(ymm0,b); // ymm0 <- ar bi, ar br
}
};
struct MaddRealPart{
inline __m128 operator()(__m128 a, __m128 b, __m128 c){
__m128 ymm0 = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
return _mm_add_ps(_mm_mul_ps( ymm0, b),c);
}
inline __m128d operator()(__m128d a, __m128d b, __m128d c){
__m128d ymm0 = _mm_shuffle_pd( a, a, 0x0 );
return _mm_add_pd(_mm_mul_pd( ymm0, b),c);
}
};
struct MultComplex{
// Complex float
inline __m128 operator()(__m128 a, __m128 b){
__m128 ymm0,ymm1,ymm2;
ymm0 = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
ymm0 = _mm_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br
ymm1 = _mm_shuffle_ps(b,b,_MM_SELECT_FOUR_FOUR(2,3,0,1)); // ymm1 <- br,bi
ymm2 = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(3,3,1,1)); // ymm2 <- ai,ai
ymm1 = _mm_mul_ps(ymm1,ymm2); // ymm1 <- br ai, ai bi
return _mm_addsub_ps(ymm0,ymm1);
}
// Complex double
inline __m128d operator()(__m128d a, __m128d b){
__m128d ymm0,ymm1,ymm2;
ymm0 = _mm_shuffle_pd(a,a,0x0); // ymm0 <- ar ar,
ymm0 = _mm_mul_pd(ymm0,b); // ymm0 <- ar bi, ar br
ymm1 = _mm_shuffle_pd(b,b,0x1); // ymm1 <- br,bi b01
ymm2 = _mm_shuffle_pd(a,a,0x3); // ymm2 <- ai,ai b11
ymm1 = _mm_mul_pd(ymm1,ymm2); // ymm1 <- br ai, ai bi
return _mm_addsub_pd(ymm0,ymm1);
}
};
struct Mult{
inline void mac(__m128 &a, __m128 b, __m128 c){
a= _mm_add_ps(_mm_mul_ps(b,c),a);
}
inline void mac(__m128d &a, __m128d b, __m128d c){
a= _mm_add_pd(_mm_mul_pd(b,c),a);
}
// Real float
inline __m128 operator()(__m128 a, __m128 b){
return _mm_mul_ps(a,b);
}
// Real double
inline __m128d operator()(__m128d a, __m128d b){
return _mm_mul_pd(a,b);
}
// Integer
inline __m128i operator()(__m128i a, __m128i b){
return _mm_mullo_epi32(a,b);
}
};
struct Div{
// Real float
inline __m128 operator()(__m128 a, __m128 b){
return _mm_div_ps(a,b);
}
// Real double
inline __m128d operator()(__m128d a, __m128d b){
return _mm_div_pd(a,b);
}
};
struct Conj{
// Complex single
inline __m128 operator()(__m128 in){
return _mm_xor_ps(_mm_addsub_ps(_mm_setzero_ps(),in), _mm_set1_ps(-0.f));
}
// Complex double
inline __m128d operator()(__m128d in){
return _mm_xor_pd(_mm_addsub_pd(_mm_setzero_pd(),in), _mm_set1_pd(-0.f));//untested
}
// do not define for integer input
};
struct TimesMinusI{
//Complex single
inline __m128 operator()(__m128 in, __m128 ret){
__m128 tmp =_mm_addsub_ps(_mm_setzero_ps(),in); // r,-i
return _mm_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(2,3,0,1));
}
//Complex double
inline __m128d operator()(__m128d in, __m128d ret){
__m128d tmp =_mm_addsub_pd(_mm_setzero_pd(),in); // r,-i
return _mm_shuffle_pd(tmp,tmp,0x1);
}
};
struct TimesI{
//Complex single
inline __m128 operator()(__m128 in, __m128 ret){
__m128 tmp =_mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
return _mm_addsub_ps(_mm_setzero_ps(),tmp); // r,-i
}
//Complex double
inline __m128d operator()(__m128d in, __m128d ret){
__m128d tmp = _mm_shuffle_pd(in,in,0x1);
return _mm_addsub_pd(_mm_setzero_pd(),tmp); // r,-i
}
};
struct Permute{
static inline __m128 Permute0(__m128 in){
return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2)); //AB CD -> CD AB
};
static inline __m128 Permute1(__m128 in){
return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); //AB CD -> BA DC
};
static inline __m128 Permute2(__m128 in){
return in;
};
static inline __m128 Permute3(__m128 in){
return in;
};
static inline __m128d Permute0(__m128d in){ //AB -> BA
return _mm_shuffle_pd(in,in,0x1);
};
static inline __m128d Permute1(__m128d in){
return in;
};
static inline __m128d Permute2(__m128d in){
return in;
};
static inline __m128d Permute3(__m128d in){
return in;
};
};
#define _my_alignr_epi32(a,b,n) _mm_alignr_epi8(a,b,(n*4)%16)
#define _my_alignr_epi64(a,b,n) _mm_alignr_epi8(a,b,(n*8)%16)
#ifdef SFW_FP16
struct Grid_half {
Grid_half(){}
Grid_half(uint16_t raw) : x(raw) {}
uint16_t x;
};
union FP32 {
unsigned int u;
float f;
};
// PAB - Lifted and adapted from Eigen, which is GPL V2
inline float sfw_half_to_float(Grid_half h) {
const FP32 magic = { 113 << 23 };
const unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift
FP32 o;
o.u = (h.x & 0x7fff) << 13; // exponent/mantissa bits
unsigned int exp = shifted_exp & o.u; // just the exponent
o.u += (127 - 15) << 23; // exponent adjust
// handle exponent special cases
if (exp == shifted_exp) { // Inf/NaN?
o.u += (128 - 16) << 23; // extra exp adjust
} else if (exp == 0) { // Zero/Denormal?
o.u += 1 << 23; // extra exp adjust
o.f -= magic.f; // renormalize
}
o.u |= (h.x & 0x8000) << 16; // sign bit
return o.f;
}
inline Grid_half sfw_float_to_half(float ff) {
FP32 f; f.f = ff;
const FP32 f32infty = { 255 << 23 };
const FP32 f16max = { (127 + 16) << 23 };
const FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 };
unsigned int sign_mask = 0x80000000u;
Grid_half o;
o.x = static_cast<unsigned short>(0x0u);
unsigned int sign = f.u & sign_mask;
f.u ^= sign;
// NOTE all the integer compares in this function can be safely
// compiled into signed compares since all operands are below
// 0x80000000. Important if you want fast straight SSE2 code
// (since there's no unsigned PCMPGTD).
if (f.u >= f16max.u) { // result is Inf or NaN (all exponent bits set)
o.x = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf
} else { // (De)normalized number or zero
if (f.u < (113 << 23)) { // resulting FP16 is subnormal or zero
// use a magic value to align our 10 mantissa bits at the bottom of
// the float. as long as FP addition is round-to-nearest-even this
// just works.
f.f += denorm_magic.f;
// and one integer subtract of the bias later, we have our final float!
o.x = static_cast<unsigned short>(f.u - denorm_magic.u);
} else {
unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd
// update exponent, rounding bias part 1
f.u += ((unsigned int)(15 - 127) << 23) + 0xfff;
// rounding bias part 2
f.u += mant_odd;
// take the bits!
o.x = static_cast<unsigned short>(f.u >> 13);
}
}
o.x |= static_cast<unsigned short>(sign >> 16);
return o;
}
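// Worked example for the branch above: for ff = 1.0f, f.u = 0x3F800000 is below
// f16max and not subnormal, so mant_odd = 0, the exponent rebias gives
// f.u = 0x07800FFF, and o.x = f.u >> 13 = 0x3C00, the IEEE-754 half encoding of 1.0.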
static inline __m128i Grid_mm_cvtps_ph(__m128 f,int discard) {
__m128i ret=(__m128i)_mm_setzero_ps();
float *fp = (float *)&f;
Grid_half *hp = (Grid_half *)&ret;
hp[0] = sfw_float_to_half(fp[0]);
hp[1] = sfw_float_to_half(fp[1]);
hp[2] = sfw_float_to_half(fp[2]);
hp[3] = sfw_float_to_half(fp[3]);
return ret;
}
static inline __m128 Grid_mm_cvtph_ps(__m128i h,int discard) {
__m128 ret=_mm_setzero_ps();
float *fp = (float *)&ret;
Grid_half *hp = (Grid_half *)&h;
fp[0] = sfw_half_to_float(hp[0]);
fp[1] = sfw_half_to_float(hp[1]);
fp[2] = sfw_half_to_float(hp[2]);
fp[3] = sfw_half_to_float(hp[3]);
return ret;
}
#else
#define Grid_mm_cvtps_ph _mm_cvtps_ph
#define Grid_mm_cvtph_ps _mm_cvtph_ps
#endif
struct PrecisionChange {
static inline __m128i StoH (__m128 a,__m128 b) {
__m128i ha = Grid_mm_cvtps_ph(a,0);
__m128i hb = Grid_mm_cvtps_ph(b,0);
__m128i h =(__m128i) _mm_shuffle_ps((__m128)ha,(__m128)hb,_MM_SELECT_FOUR_FOUR(1,0,1,0));
return h;
}
static inline void HtoS (__m128i h,__m128 &sa,__m128 &sb) {
sa = Grid_mm_cvtph_ps(h,0);
h = (__m128i)_my_alignr_epi32((__m128i)h,(__m128i)h,2);
sb = Grid_mm_cvtph_ps(h,0);
}
static inline __m128 DtoS (__m128d a,__m128d b) {
__m128 sa = _mm_cvtpd_ps(a);
__m128 sb = _mm_cvtpd_ps(b);
__m128 s = _mm_shuffle_ps(sa,sb,_MM_SELECT_FOUR_FOUR(1,0,1,0));
return s;
}
static inline void StoD (__m128 s,__m128d &a,__m128d &b) {
a = _mm_cvtps_pd(s);
s = (__m128)_my_alignr_epi32((__m128i)s,(__m128i)s,2);
b = _mm_cvtps_pd(s);
}
static inline __m128i DtoH (__m128d a,__m128d b,__m128d c,__m128d d) {
__m128 sa,sb;
sa = DtoS(a,b);
sb = DtoS(c,d);
return StoH(sa,sb);
}
static inline void HtoD (__m128i h,__m128d &a,__m128d &b,__m128d &c,__m128d &d) {
__m128 sa,sb;
HtoS(h,sa,sb);
StoD(sa,a,b);
StoD(sb,c,d);
}
};
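// Illustrative sketch (hypothetical variables) of a round trip through the
// converters above:
//   __m128d d0 = _mm_set_pd(1.0, 2.0), d1 = _mm_set_pd(3.0, 4.0);
//   __m128  s  = PrecisionChange::DtoS(d0, d1); // four singles
//   __m128d e0, e1;
//   PrecisionChange::StoD(s, e0, e1);           // back to two double pairs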
struct Exchange{
// 3210 ordering
static inline void Exchange0(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){
out1= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0));
out2= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2));
};
static inline void Exchange1(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){
out1= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0)); /*ACEG*/
out2= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1)); /*BDFH*/
out1= _mm_shuffle_ps(out1,out1,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/
out2= _mm_shuffle_ps(out2,out2,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*BFDH*/
};
static inline void Exchange2(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){
assert(0);
return;
};
static inline void Exchange3(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){
assert(0);
return;
};
static inline void Exchange0(__m128d &out1,__m128d &out2,__m128d in1,__m128d in2){
out1= _mm_shuffle_pd(in1,in2,0x0);
out2= _mm_shuffle_pd(in1,in2,0x3);
};
static inline void Exchange1(__m128d &out1,__m128d &out2,__m128d in1,__m128d in2){
assert(0);
return;
};
static inline void Exchange2(__m128d &out1,__m128d &out2,__m128d in1,__m128d in2){
assert(0);
return;
};
static inline void Exchange3(__m128d &out1,__m128d &out2,__m128d in1,__m128d in2){
assert(0);
return;
};
};
struct Rotate{
static inline __m128 rotate(__m128 in,int n){
switch(n){
case 0: return tRotate<0>(in);break;
case 1: return tRotate<1>(in);break;
case 2: return tRotate<2>(in);break;
case 3: return tRotate<3>(in);break;
default: assert(0);
}
}
static inline __m128d rotate(__m128d in,int n){
switch(n){
case 0: return tRotate<0>(in);break;
case 1: return tRotate<1>(in);break;
default: assert(0);
}
}
template<int n> static inline __m128 tRotate(__m128 in){ return (__m128)_my_alignr_epi32((__m128i)in,(__m128i)in,n); };
template<int n> static inline __m128d tRotate(__m128d in){ return (__m128d)_my_alignr_epi64((__m128i)in,(__m128i)in,n); };
};
//////////////////////////////////////////////
// Some Template specialization
//Complex float Reduce
template<>
inline Grid::ComplexF Reduce<Grid::ComplexF, __m128>::operator()(__m128 in){
__m128 v1; // two complex
v1= Optimization::Permute::Permute0(in);
v1= _mm_add_ps(v1,in);
u128f conv; conv.v=v1;
return Grid::ComplexF(conv.f[0],conv.f[1]);
}
//Real float Reduce
template<>
inline Grid::RealF Reduce<Grid::RealF, __m128>::operator()(__m128 in){
__m128 v1,v2; // quad single
v1= Optimization::Permute::Permute0(in);
v1= _mm_add_ps(v1,in);
v2= Optimization::Permute::Permute1(v1);
v1 = _mm_add_ps(v1,v2);
u128f conv; conv.v=v1;
return conv.f[0];
}
//Complex double Reduce
template<>
inline Grid::ComplexD Reduce<Grid::ComplexD, __m128d>::operator()(__m128d in){
u128d conv; conv.v = in;
return Grid::ComplexD(conv.f[0],conv.f[1]);
}
//Real double Reduce
template<>
inline Grid::RealD Reduce<Grid::RealD, __m128d>::operator()(__m128d in){
__m128d v1;
v1 = Optimization::Permute::Permute0(in);
v1 = _mm_add_pd(v1,in);
u128d conv; conv.v = v1;
return conv.f[0];
}
//Integer Reduce
template<>
inline Integer Reduce<Integer, __m128i>::operator()(__m128i in){
__m128i v1 = _mm_hadd_epi32(in, in);
__m128i v2 = _mm_hadd_epi32(v1, v1);
return _mm_cvtsi128_si32(v2);
}
}
//////////////////////////////////////////////////////////////////////////////////////
// Here assign types
typedef __m128i SIMD_Htype; // Half precision type
typedef __m128 SIMD_Ftype; // Single precision type
typedef __m128d SIMD_Dtype; // Double precision type
typedef __m128i SIMD_Itype; // Integer type
// prefetch utilities
inline void v_prefetch0(int size, const char *ptr){};
inline void prefetch_HINT_T0(const char *ptr){
_mm_prefetch(ptr,_MM_HINT_T0);
}
// Function name aliases
typedef Optimization::Vsplat VsplatSIMD;
typedef Optimization::Vstore VstoreSIMD;
typedef Optimization::Vset VsetSIMD;
typedef Optimization::Vstream VstreamSIMD;
template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
// Arithmetic operations
typedef Optimization::Sum SumSIMD;
typedef Optimization::Sub SubSIMD;
typedef Optimization::Div DivSIMD;
typedef Optimization::Mult MultSIMD;
typedef Optimization::MultComplex MultComplexSIMD;
typedef Optimization::MultRealPart MultRealPartSIMD;
typedef Optimization::MaddRealPart MaddRealPartSIMD;
typedef Optimization::Conj ConjSIMD;
typedef Optimization::TimesMinusI TimesMinusISIMD;
typedef Optimization::TimesI TimesISIMD;
}

868
Grid/simd/Grid_vector_types.h Normal file

@ -0,0 +1,868 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/simd/Grid_vector_type.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Guido Cossu <cossu@iroiro-pc.kek.jp>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: neo <cossu@post.kek.jp>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
//---------------------------------------------------------------------------
/*! @file Grid_vector_types.h
@brief Defines templated class Grid_simd to deal with inner vector types
*/
// Time-stamp: <2015-07-10 17:45:33 neo>
//---------------------------------------------------------------------------
#ifndef GRID_VECTOR_TYPES
#define GRID_VECTOR_TYPES
#ifdef GEN
#include "Grid_generic.h"
#endif
#ifdef SSE4
#include "Grid_sse4.h"
#endif
#if defined(AVX1) || defined (AVXFMA) || defined(AVX2) || defined(AVXFMA4)
#include "Grid_avx.h"
#endif
#if defined AVX512
#include "Grid_avx512.h"
#endif
#if defined IMCI
#include "Grid_imci.h"
#endif
#ifdef NEONV8
#include "Grid_neon.h"
#endif
#if defined QPX
#include "Grid_qpx.h"
#endif
#include "l1p.h"
namespace Grid {
//////////////////////////////////////
// To take the floating point type of real/complex type
//////////////////////////////////////
template <typename T>
struct RealPart {
typedef T type;
};
template <typename T>
struct RealPart<std::complex<T> > {
typedef T type;
};
#include <type_traits>
//////////////////////////////////////
// demote a vector to real type
//////////////////////////////////////
// type alias used to simplify the syntax of std::enable_if
template <typename T> using Invoke = typename T::type;
template <typename Condition, typename ReturnType> using EnableIf = Invoke<std::enable_if<Condition::value, ReturnType> >;
template <typename Condition, typename ReturnType> using NotEnableIf = Invoke<std::enable_if<!Condition::value, ReturnType> >;
////////////////////////////////////////////////////////
// Check for complexity with type traits
template <typename T> struct is_complex : public std::false_type {};
template <> struct is_complex<std::complex<double> > : public std::true_type {};
template <> struct is_complex<std::complex<float> > : public std::true_type {};
template <typename T> using IfReal = Invoke<std::enable_if<std::is_floating_point<T>::value, int> >;
template <typename T> using IfComplex = Invoke<std::enable_if<is_complex<T>::value, int> >;
template <typename T> using IfInteger = Invoke<std::enable_if<std::is_integral<T>::value, int> >;
template <typename T1,typename T2> using IfSame = Invoke<std::enable_if<std::is_same<T1,T2>::value, int> >;
template <typename T> using IfNotReal = Invoke<std::enable_if<!std::is_floating_point<T>::value, int> >;
template <typename T> using IfNotComplex = Invoke<std::enable_if<!is_complex<T>::value, int> >;
template <typename T> using IfNotInteger = Invoke<std::enable_if<!std::is_integral<T>::value, int> >;
template <typename T1,typename T2> using IfNotSame = Invoke<std::enable_if<!std::is_same<T1,T2>::value, int> >;
////////////////////////////////////////////////////////
// Define the operation templates functors
// general forms to allow for vsplat syntax
// need explicit declaration of types when used since
// clang cannot automatically determine the output type sometimes
template <class Out, class Input1, class Input2, class Input3, class Operation>
Out trinary(Input1 src_1, Input2 src_2, Input3 src_3, Operation op) {
return op(src_1, src_2, src_3);
}
template <class Out, class Input1, class Input2, class Operation>
Out binary(Input1 src_1, Input2 src_2, Operation op) {
return op(src_1, src_2);
}
template <class Out, class Input, class Operation>
Out unary(Input src, Operation op) {
return op(src);
}
///////////////////////////////////////////////
/*
@brief Grid_simd class for the SIMD vector type operations
*/
template <class Scalar_type, class Vector_type>
class Grid_simd {
public:
typedef typename RealPart<Scalar_type>::type Real;
typedef Vector_type vector_type;
typedef Scalar_type scalar_type;
typedef union conv_t_union {
Vector_type v;
Scalar_type s[sizeof(Vector_type) / sizeof(Scalar_type)];
conv_t_union(){};
} conv_t;
Vector_type v;
static inline constexpr int Nsimd(void) {
return sizeof(Vector_type) / sizeof(Scalar_type);
}
Grid_simd &operator=(const Grid_simd &&rhs) {
v = rhs.v;
return *this;
};
Grid_simd &operator=(const Grid_simd &rhs) {
v = rhs.v;
return *this;
}; // faster than not declaring it and leaving to the compiler
Grid_simd() = default;
Grid_simd(const Grid_simd &rhs) : v(rhs.v){}; // compiles in movaps
Grid_simd(const Grid_simd &&rhs) : v(rhs.v){};
/////////////////////////////
// Constructors
/////////////////////////////
Grid_simd &operator=(Zero &z) {
vzero(*this);
return (*this);
}
// Enable if complex type
template <typename S = Scalar_type>
Grid_simd(const typename std::enable_if<is_complex<S>::value, S>::type a) {
vsplat(*this, a);
};
Grid_simd(const Real a) { vsplat(*this, Scalar_type(a)); };
///////////////////////////////////////////////
// mac, mult, sub, add, adj
///////////////////////////////////////////////
// FIXME -- alias this to an inline MAC struct.
friend inline void mac(Grid_simd *__restrict__ y,
const Grid_simd *__restrict__ a,
const Grid_simd *__restrict__ x) {
*y = (*a) * (*x) + (*y);
};
friend inline void mult(Grid_simd *__restrict__ y,
const Grid_simd *__restrict__ l,
const Grid_simd *__restrict__ r) {
*y = (*l) * (*r);
}
friend inline void sub(Grid_simd *__restrict__ y,
const Grid_simd *__restrict__ l,
const Grid_simd *__restrict__ r) {
*y = (*l) - (*r);
}
friend inline void add(Grid_simd *__restrict__ y,
const Grid_simd *__restrict__ l,
const Grid_simd *__restrict__ r) {
*y = (*l) + (*r);
}
friend inline void mac(Grid_simd *__restrict__ y,
const Scalar_type *__restrict__ a,
const Grid_simd *__restrict__ x) {
*y = (*a) * (*x) + (*y);
};
friend inline void mult(Grid_simd *__restrict__ y,
const Scalar_type *__restrict__ l,
const Grid_simd *__restrict__ r) {
*y = (*l) * (*r);
}
friend inline void sub(Grid_simd *__restrict__ y,
const Scalar_type *__restrict__ l,
const Grid_simd *__restrict__ r) {
*y = (*l) - (*r);
}
friend inline void add(Grid_simd *__restrict__ y,
const Scalar_type *__restrict__ l,
const Grid_simd *__restrict__ r) {
*y = (*l) + (*r);
}
friend inline void mac(Grid_simd *__restrict__ y,
const Grid_simd *__restrict__ a,
const Scalar_type *__restrict__ x) {
*y = (*a) * (*x) + (*y);
};
friend inline void mult(Grid_simd *__restrict__ y,
const Grid_simd *__restrict__ l,
const Scalar_type *__restrict__ r) {
*y = (*l) * (*r);
}
friend inline void sub(Grid_simd *__restrict__ y,
const Grid_simd *__restrict__ l,
const Scalar_type *__restrict__ r) {
*y = (*l) - (*r);
}
friend inline void add(Grid_simd *__restrict__ y,
const Grid_simd *__restrict__ l,
const Scalar_type *__restrict__ r) {
*y = (*l) + (*r);
}
////////////////////////////////////////////////////////////////////////
// FIXME: gonna remove these load/store, get, set, prefetch
////////////////////////////////////////////////////////////////////////
friend inline void vset(Grid_simd &ret, Scalar_type *a) {
ret.v = unary<Vector_type>(a, VsetSIMD());
}
///////////////////////
// Vstore
///////////////////////
friend inline void vstore(const Grid_simd &ret, Scalar_type *a) {
binary<void>(ret.v, (Real *)a, VstoreSIMD());
}
///////////////////////
// Vprefetch
///////////////////////
friend inline void vprefetch(const Grid_simd &v) {
prefetch_HINT_T0((const char *)&v.v);
}
///////////////////////
// Reduce
///////////////////////
friend inline Scalar_type Reduce(const Grid_simd &in) {
return unary<Scalar_type>(in.v, ReduceSIMD<Scalar_type, Vector_type>());
}
////////////////////////////
// operator scalar * simd
////////////////////////////
friend inline Grid_simd operator*(const Scalar_type &a, Grid_simd b) {
Grid_simd va;
vsplat(va, a);
return va * b;
}
friend inline Grid_simd operator*(Grid_simd b, const Scalar_type &a) {
return a * b;
}
//////////////////////////////////
// Divides
//////////////////////////////////
friend inline Grid_simd operator/(const Scalar_type &a, Grid_simd b) {
Grid_simd va;
vsplat(va, a);
return va / b;
}
friend inline Grid_simd operator/(Grid_simd b, const Scalar_type &a) {
Grid_simd va;
vsplat(va, a);
return b / va;
}
///////////////////////
// Unary negation
///////////////////////
friend inline Grid_simd operator-(const Grid_simd &r) {
Grid_simd ret;
vzero(ret);
ret = ret - r;
return ret;
}
// *=,+=,-= operators
inline Grid_simd &operator*=(const Grid_simd &r) {
*this = (*this) * r;
return *this;
// return (*this)*r; ?
}
inline Grid_simd &operator+=(const Grid_simd &r) {
*this = *this + r;
return *this;
}
inline Grid_simd &operator-=(const Grid_simd &r) {
*this = *this - r;
return *this;
}
///////////////////////////////////////
// Not all functions are supported
// through SIMD and must breakout to
// scalar type and back again. This
// provides support
///////////////////////////////////////
template <class functor>
friend inline Grid_simd SimdApply(const functor &func, const Grid_simd &v) {
Grid_simd ret;
Grid_simd::conv_t conv;
Grid_simd::scalar_type s;
conv.v = v.v;
for (int i = 0; i < Nsimd(); i++) {
s = conv.s[i];
conv.s[i] = func(s);
}
ret.v = conv.v;
return ret;
}
template <class functor>
friend inline Grid_simd SimdApplyBinop(const functor &func,
const Grid_simd &x,
const Grid_simd &y) {
Grid_simd ret;
Grid_simd::conv_t cx;
Grid_simd::conv_t cy;
Grid_simd::scalar_type sx,sy;
cx.v = x.v;
cy.v = y.v;
for (int i = 0; i < Nsimd(); i++) {
sx = cx.s[i];
sy = cy.s[i];
cx.s[i] = func(sx,sy);
}
ret.v = cx.v;
return ret;
}
///////////////////////
// Exchange
// Al Ah , Bl Bh -> Al Bl , Ah Bh
///////////////////////
friend inline void exchange(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2,int n)
{
if (n==3) {
Optimization::Exchange::Exchange3(out1.v,out2.v,in1.v,in2.v);
} else if(n==2) {
Optimization::Exchange::Exchange2(out1.v,out2.v,in1.v,in2.v);
} else if(n==1) {
Optimization::Exchange::Exchange1(out1.v,out2.v,in1.v,in2.v);
} else if(n==0) {
Optimization::Exchange::Exchange0(out1.v,out2.v,in1.v,in2.v);
}
}
friend inline void exchange0(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2){
Optimization::Exchange::Exchange0(out1.v,out2.v,in1.v,in2.v);
}
friend inline void exchange1(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2){
Optimization::Exchange::Exchange1(out1.v,out2.v,in1.v,in2.v);
}
friend inline void exchange2(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2){
Optimization::Exchange::Exchange2(out1.v,out2.v,in1.v,in2.v);
}
friend inline void exchange3(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2){
Optimization::Exchange::Exchange3(out1.v,out2.v,in1.v,in2.v);
}
////////////////////////////////////////////////////////////////////
// General permute; assumes vector length is same across
// all subtypes; may not be a good assumption, but could
// add the vector width as a template param for BG/Q for example
////////////////////////////////////////////////////////////////////
friend inline void permute0(Grid_simd &y, Grid_simd b) {
y.v = Optimization::Permute::Permute0(b.v);
}
friend inline void permute1(Grid_simd &y, Grid_simd b) {
y.v = Optimization::Permute::Permute1(b.v);
}
friend inline void permute2(Grid_simd &y, Grid_simd b) {
y.v = Optimization::Permute::Permute2(b.v);
}
friend inline void permute3(Grid_simd &y, Grid_simd b) {
y.v = Optimization::Permute::Permute3(b.v);
}
friend inline void permute(Grid_simd &y, Grid_simd b, int perm) {
if (perm & RotateBit) {
int dist = perm & 0xF;
y = rotate(b, dist);
return;
}
else if(perm==3) permute3(y, b);
else if(perm==2) permute2(y, b);
else if(perm==1) permute1(y, b);
else if(perm==0) permute0(y, b);
}
///////////////////////////////
// Getting single lanes
///////////////////////////////
inline Scalar_type getlane(int lane) {
return ((Scalar_type*)&v)[lane];
}
inline void putlane(const Scalar_type &S, int lane){
((Scalar_type*)&v)[lane] = S;
}
}; // end of Grid_simd class definition
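// Illustrative sketch (hypothetical variables): once the concrete typedefs such
// as vRealD are declared below, a Grid_simd value is used much like a scalar:
//   vRealD a, b, c;
//   vsplat(a, 2.0); vsplat(b, 3.0);
//   c = a * b + a;            // lane-wise 2*3 + 2 = 8 in every lane
//   RealD s = Reduce(c);      // sums the lanes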
inline void permute(ComplexD &y,ComplexD b, int perm) { y=b; }
inline void permute(ComplexF &y,ComplexF b, int perm) { y=b; }
inline void permute(RealD &y,RealD b, int perm) { y=b; }
inline void permute(RealF &y,RealF b, int perm) { y=b; }
////////////////////////////////////////////////////////////////////
// General rotate
////////////////////////////////////////////////////////////////////
template <class S, class V, IfNotComplex<S> = 0>
inline Grid_simd<S, V> rotate(Grid_simd<S, V> b, int nrot) {
nrot = nrot % Grid_simd<S, V>::Nsimd();
Grid_simd<S, V> ret;
ret.v = Optimization::Rotate::rotate(b.v, nrot);
return ret;
}
template <class S, class V, IfComplex<S> = 0>
inline Grid_simd<S, V> rotate(Grid_simd<S, V> b, int nrot) {
nrot = nrot % Grid_simd<S, V>::Nsimd();
Grid_simd<S, V> ret;
ret.v = Optimization::Rotate::rotate(b.v, 2 * nrot);
return ret;
}
template <class S, class V, IfNotComplex<S> =0>
inline void rotate( Grid_simd<S,V> &ret,Grid_simd<S,V> b,int nrot)
{
nrot = nrot % Grid_simd<S,V>::Nsimd();
ret.v = Optimization::Rotate::rotate(b.v,nrot);
}
template <class S, class V, IfComplex<S> =0>
inline void rotate(Grid_simd<S,V> &ret,Grid_simd<S,V> b,int nrot)
{
nrot = nrot % Grid_simd<S,V>::Nsimd();
ret.v = Optimization::Rotate::rotate(b.v,2*nrot);
}
template <class S, class V>
inline void vbroadcast(Grid_simd<S,V> &ret,const Grid_simd<S,V> &src,int lane){
S* typepun =(S*) &src;
vsplat(ret,typepun[lane]);
}
template <class S, class V, IfComplex<S> =0>
inline void rbroadcast(Grid_simd<S,V> &ret,const Grid_simd<S,V> &src,int lane){
S* typepun =(S*) &src;
ret.v = unary<V>(real(typepun[lane]), VsplatSIMD());
}
///////////////////////
// Splat
///////////////////////
// this is only for the complex version
template <class S, class V, IfComplex<S> = 0, class ABtype>
inline void vsplat(Grid_simd<S, V> &ret, ABtype a, ABtype b) {
ret.v = binary<V>(a, b, VsplatSIMD());
}
// overload if complex
template <class S, class V>
inline void vsplat(Grid_simd<S, V> &ret, EnableIf<is_complex<S>, S> c) {
vsplat(ret, real(c), imag(c));
}
template <class S, class V>
inline void rsplat(Grid_simd<S, V> &ret, EnableIf<is_complex<S>, S> c) {
vsplat(ret, real(c), real(c));
}
// if real fill with a, if complex fill with a in the real part (first function
// above)
template <class S, class V>
inline void vsplat(Grid_simd<S, V> &ret, NotEnableIf<is_complex<S>, S> a) {
ret.v = unary<V>(a, VsplatSIMD());
}
//////////////////////////
///////////////////////////////////////////////
// Initialise to 1,0,i for the correct types
///////////////////////////////////////////////
// For complex types
template <class S, class V, IfComplex<S> = 0>
inline void vone(Grid_simd<S, V> &ret) {
vsplat(ret, S(1.0, 0.0));
}
template <class S, class V, IfComplex<S> = 0>
inline void vzero(Grid_simd<S, V> &ret) {
vsplat(ret, S(0.0, 0.0));
} // use xor?
template <class S, class V, IfComplex<S> = 0>
inline void vcomplex_i(Grid_simd<S, V> &ret) {
vsplat(ret, S(0.0, 1.0));
}
template <class S, class V, IfComplex<S> = 0>
inline void visign(Grid_simd<S, V> &ret) {
vsplat(ret, S(1.0, -1.0));
}
template <class S, class V, IfComplex<S> = 0>
inline void vrsign(Grid_simd<S, V> &ret) {
vsplat(ret, S(-1.0, 1.0));
}
// if not complex overload here
template <class S, class V, IfReal<S> = 0>
inline void vone(Grid_simd<S, V> &ret) {
vsplat(ret, S(1.0));
}
template <class S, class V, IfReal<S> = 0>
inline void vzero(Grid_simd<S, V> &ret) {
vsplat(ret, S(0.0));
}
// For integral types
template <class S, class V, IfInteger<S> = 0>
inline void vone(Grid_simd<S, V> &ret) {
vsplat(ret, 1);
}
template <class S, class V, IfInteger<S> = 0>
inline void vzero(Grid_simd<S, V> &ret) {
vsplat(ret, 0);
}
template <class S, class V, IfInteger<S> = 0>
inline void vtrue(Grid_simd<S, V> &ret) {
vsplat(ret, 0xFFFFFFFF);
}
template <class S, class V, IfInteger<S> = 0>
inline void vfalse(Grid_simd<S, V> &ret) {
vsplat(ret, 0);
}
template <class S, class V>
inline void zeroit(Grid_simd<S, V> &z) {
vzero(z);
}
///////////////////////
// Vstream
///////////////////////
template <class S, class V, IfReal<S> = 0>
inline void vstream(Grid_simd<S, V> &out, const Grid_simd<S, V> &in) {
binary<void>((S *)&out.v, in.v, VstreamSIMD());
}
template <class S, class V, IfComplex<S> = 0>
inline void vstream(Grid_simd<S, V> &out, const Grid_simd<S, V> &in) {
typedef typename S::value_type T;
binary<void>((T *)&out.v, in.v, VstreamSIMD());
}
template <class S, class V, IfInteger<S> = 0>
inline void vstream(Grid_simd<S, V> &out, const Grid_simd<S, V> &in) {
out = in;
}
////////////////////////////////////
// Arithmetic operator overloads +,-,*
////////////////////////////////////
template <class S, class V>
inline Grid_simd<S, V> operator+(Grid_simd<S, V> a, Grid_simd<S, V> b) {
Grid_simd<S, V> ret;
ret.v = binary<V>(a.v, b.v, SumSIMD());
return ret;
};
template <class S, class V>
inline Grid_simd<S, V> operator-(Grid_simd<S, V> a, Grid_simd<S, V> b) {
Grid_simd<S, V> ret;
ret.v = binary<V>(a.v, b.v, SubSIMD());
return ret;
};
// Distinguish between complex types and others
template <class S, class V, IfComplex<S> = 0>
inline Grid_simd<S, V> real_mult(Grid_simd<S, V> a, Grid_simd<S, V> b) {
Grid_simd<S, V> ret;
ret.v = binary<V>(a.v, b.v, MultRealPartSIMD());
return ret;
};
template <class S, class V, IfComplex<S> = 0>
inline Grid_simd<S, V> real_madd(Grid_simd<S, V> a, Grid_simd<S, V> b, Grid_simd<S,V> c) {
Grid_simd<S, V> ret;
ret.v = trinary<V>(a.v, b.v, c.v, MaddRealPartSIMD());
return ret;
};
// Distinguish between complex types and others
template <class S, class V, IfComplex<S> = 0>
inline Grid_simd<S, V> operator*(Grid_simd<S, V> a, Grid_simd<S, V> b) {
Grid_simd<S, V> ret;
ret.v = binary<V>(a.v, b.v, MultComplexSIMD());
return ret;
};
// Real/Integer types
template <class S, class V, IfNotComplex<S> = 0>
inline Grid_simd<S, V> operator*(Grid_simd<S, V> a, Grid_simd<S, V> b) {
Grid_simd<S, V> ret;
ret.v = binary<V>(a.v, b.v, MultSIMD());
return ret;
};
// Distinguish between complex types and others
template <class S, class V, IfComplex<S> = 0>
inline Grid_simd<S, V> operator/(Grid_simd<S, V> a, Grid_simd<S, V> b) {
typedef Grid_simd<S, V> simd;
simd ret;
simd den;
ret = a * conjugate(b) ;
den = b * conjugate(b) ;
auto real_den = toReal(den);
ret.v=binary<V>(ret.v, real_den.v, DivSIMD());
return ret;
};
// Real/Integer types
template <class S, class V, IfNotComplex<S> = 0>
inline Grid_simd<S, V> operator/(Grid_simd<S, V> a, Grid_simd<S, V> b) {
Grid_simd<S, V> ret;
ret.v = binary<V>(a.v, b.v, DivSIMD());
return ret;
};
///////////////////////
// Conjugate
///////////////////////
template <class S, class V, IfComplex<S> = 0>
inline Grid_simd<S, V> conjugate(const Grid_simd<S, V> &in) {
Grid_simd<S, V> ret;
ret.v = unary<V>(in.v, ConjSIMD());
return ret;
}
template <class S, class V, IfNotComplex<S> = 0>
inline Grid_simd<S, V> conjugate(const Grid_simd<S, V> &in) {
return in; // for real objects
}
// Suppress adj for integer types... // odd; why conjugate above but not adj??
template <class S, class V, IfNotInteger<S> = 0>
inline Grid_simd<S, V> adj(const Grid_simd<S, V> &in) {
return conjugate(in);
}
///////////////////////
// timesMinusI
///////////////////////
template <class S, class V, IfComplex<S> = 0>
inline void timesMinusI(Grid_simd<S, V> &ret, const Grid_simd<S, V> &in) {
ret.v = binary<V>(in.v, ret.v, TimesMinusISIMD());
}
template <class S, class V, IfComplex<S> = 0>
inline Grid_simd<S, V> timesMinusI(const Grid_simd<S, V> &in) {
Grid_simd<S, V> ret;
timesMinusI(ret, in);
return ret;
}
template <class S, class V, IfNotComplex<S> = 0>
inline Grid_simd<S, V> timesMinusI(const Grid_simd<S, V> &in) {
return in;
}
///////////////////////
// timesI
///////////////////////
template <class S, class V, IfComplex<S> = 0>
inline void timesI(Grid_simd<S, V> &ret, const Grid_simd<S, V> &in) {
ret.v = binary<V>(in.v, ret.v, TimesISIMD());
}
template <class S, class V, IfComplex<S> = 0>
inline Grid_simd<S, V> timesI(const Grid_simd<S, V> &in) {
Grid_simd<S, V> ret;
timesI(ret, in);
return ret;
}
template <class S, class V, IfNotComplex<S> = 0>
inline Grid_simd<S, V> timesI(const Grid_simd<S, V> &in) {
return in;
}
/////////////////////
// Inner, outer
/////////////////////
template <class S, class V>
inline Grid_simd<S, V> innerProduct(const Grid_simd<S, V> &l,
const Grid_simd<S, V> &r) {
return conjugate(l) * r;
}
template <class S, class V>
inline Grid_simd<S, V> outerProduct(const Grid_simd<S, V> &l,
const Grid_simd<S, V> &r) {
return l * conjugate(r);
}
template <class S, class V>
inline Grid_simd<S, V> trace(const Grid_simd<S, V> &arg) {
return arg;
}
////////////////////////////////////////////////////////////
// copy/splat complex real parts into real;
// insert real into complex and zero imag;
////////////////////////////////////////////////////////////
// real = toReal( complex )
template <class S, class V, IfReal<S> = 0>
inline Grid_simd<S, V> toReal(const Grid_simd<std::complex<S>, V> &in) {
typedef Grid_simd<S, V> simd;
simd ret;
typename simd::conv_t conv;
conv.v = in.v; // copy the vector content (bytewise)
for (int i = 0; i < simd::Nsimd(); i += 2) {
conv.s[i + 1] = conv.s[i]; // duplicate (r,r);(r,r);(r,r); etc...
}
ret.v = conv.v;
return ret;
}
// complex = toComplex( real )
template <class R, class V, IfReal<R> = 0> // must be a real arg
inline Grid_simd<std::complex<R>, V> toComplex(const Grid_simd<R, V> &in) {
typedef Grid_simd<R, V> Rsimd;
typedef Grid_simd<std::complex<R>, V> Csimd;
typename Rsimd::conv_t conv; // address as real
conv.v = in.v;
for (int i = 0; i < Rsimd::Nsimd(); i += 2) {
assert(conv.s[i + 1] == conv.s[i]);
// trap any cases where real was not duplicated
// indicating the SIMD grids of real and imag assignment did not correctly
// match
conv.s[i + 1] = 0.0; // zero imaginary parts
}
Csimd ret;
ret.v = conv.v;
return ret;
}
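// Note on the pair above: on a 4-wide real vector, toReal turns the complex
// lane layout (r0,i0),(r1,i1) into (r0,r0),(r1,r1), and toComplex asserts that
// duplicated pattern before zeroing the copies back to (r0,0),(r1,0).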
///////////////////////////////
// Define available types
///////////////////////////////
typedef Grid_simd<float, SIMD_Ftype> vRealF;
typedef Grid_simd<double, SIMD_Dtype> vRealD;
typedef Grid_simd<std::complex<float>, SIMD_Ftype> vComplexF;
typedef Grid_simd<std::complex<double>, SIMD_Dtype> vComplexD;
typedef Grid_simd<Integer, SIMD_Itype> vInteger;
// Half precision; no arithmetic support
typedef Grid_simd<uint16_t, SIMD_Htype> vRealH;
typedef Grid_simd<std::complex<uint16_t>, SIMD_Htype> vComplexH;
inline void precisionChange(vRealF *out,vRealD *in,int nvec)
{
assert((nvec&0x1)==0);
for(int m=0;m*2<nvec;m++){
int n=m*2;
out[m].v=Optimization::PrecisionChange::DtoS(in[n].v,in[n+1].v);
}
}
inline void precisionChange(vRealH *out,vRealD *in,int nvec)
{
assert((nvec&0x3)==0);
for(int m=0;m*4<nvec;m++){
int n=m*4;
out[m].v=Optimization::PrecisionChange::DtoH(in[n].v,in[n+1].v,in[n+2].v,in[n+3].v);
}
}
inline void precisionChange(vRealH *out,vRealF *in,int nvec)
{
assert((nvec&0x1)==0);
for(int m=0;m*2<nvec;m++){
int n=m*2;
out[m].v=Optimization::PrecisionChange::StoH(in[n].v,in[n+1].v);
}
}
inline void precisionChange(vRealD *out,vRealF *in,int nvec)
{
assert((nvec&0x1)==0);
for(int m=0;m*2<nvec;m++){
int n=m*2;
Optimization::PrecisionChange::StoD(in[m].v,out[n].v,out[n+1].v);
}
}
inline void precisionChange(vRealD *out,vRealH *in,int nvec)
{
assert((nvec&0x3)==0);
for(int m=0;m*4<nvec;m++){
int n=m*4;
Optimization::PrecisionChange::HtoD(in[m].v,out[n].v,out[n+1].v,out[n+2].v,out[n+3].v);
}
}
inline void precisionChange(vRealF *out,vRealH *in,int nvec)
{
assert((nvec&0x1)==0);
for(int m=0;m*2<nvec;m++){
int n=m*2;
Optimization::PrecisionChange::HtoS(in[m].v,out[n].v,out[n+1].v);
}
}
inline void precisionChange(vComplexF *out,vComplexD *in,int nvec){ precisionChange((vRealF *)out,(vRealD *)in,nvec);}
inline void precisionChange(vComplexH *out,vComplexD *in,int nvec){ precisionChange((vRealH *)out,(vRealD *)in,nvec);}
inline void precisionChange(vComplexH *out,vComplexF *in,int nvec){ precisionChange((vRealH *)out,(vRealF *)in,nvec);}
inline void precisionChange(vComplexD *out,vComplexF *in,int nvec){ precisionChange((vRealD *)out,(vRealF *)in,nvec);}
inline void precisionChange(vComplexD *out,vComplexH *in,int nvec){ precisionChange((vRealD *)out,(vRealH *)in,nvec);}
inline void precisionChange(vComplexF *out,vComplexH *in,int nvec){ precisionChange((vRealF *)out,(vRealH *)in,nvec);}
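// Note on the precisionChange family above: nvec counts the vectors on the
// higher-precision side of the conversion, so narrowing packs pairs (or
// quadruples for half precision) into fewer vectors. Illustrative call with
// hypothetical buffers:
//   std::vector<vRealD> dv(8);
//   std::vector<vRealF> fv(4);
//   precisionChange(&fv[0], &dv[0], 8); // 8 double vectors -> 4 single vectors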
// Check our vector types are of an appropriate size.
#if defined QPX
static_assert(2*sizeof(SIMD_Ftype) == sizeof(SIMD_Dtype), "SIMD vector lengths incorrect");
static_assert(2*sizeof(SIMD_Ftype) == sizeof(SIMD_Itype), "SIMD vector lengths incorrect");
#else
static_assert(sizeof(SIMD_Ftype) == sizeof(SIMD_Dtype), "SIMD vector lengths incorrect");
static_assert(sizeof(SIMD_Ftype) == sizeof(SIMD_Itype), "SIMD vector lengths incorrect");
#endif
/////////////////////////////////////////
// Some traits to recognise the types
/////////////////////////////////////////
template <typename T>
struct is_simd : public std::false_type {};
template <> struct is_simd<vRealF> : public std::true_type {};
template <> struct is_simd<vRealD> : public std::true_type {};
template <> struct is_simd<vComplexF> : public std::true_type {};
template <> struct is_simd<vComplexD> : public std::true_type {};
template <> struct is_simd<vInteger> : public std::true_type {};
template <typename T> using IfSimd = Invoke<std::enable_if<is_simd<T>::value, int> >;
template <typename T> using IfNotSimd = Invoke<std::enable_if<!is_simd<T>::value, unsigned> >;
}
#endif

223
Grid/simd/Grid_vector_unops.h Normal file

@ -0,0 +1,223 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/simd/Grid_vector_unops.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: neo <cossu@post.kek.jp>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_VECTOR_UNOPS
#define GRID_VECTOR_UNOPS
#include <cmath>
namespace Grid {
template <class scalar>
struct SqrtRealFunctor {
scalar operator()(const scalar &a) const { return sqrt(real(a)); }
};
template <class scalar>
struct RSqrtRealFunctor {
scalar operator()(const scalar &a) const {
return scalar(1.0 / sqrt(real(a)));
}
};
template <class scalar>
struct CosRealFunctor {
scalar operator()(const scalar &a) const { return cos(real(a)); }
};
template <class scalar>
struct SinRealFunctor {
scalar operator()(const scalar &a) const { return sin(real(a)); }
};
template <class scalar>
struct AcosRealFunctor {
scalar operator()(const scalar &a) const { return acos(real(a)); }
};
template <class scalar>
struct AsinRealFunctor {
scalar operator()(const scalar &a) const { return asin(real(a)); }
};
template <class scalar>
struct LogRealFunctor {
scalar operator()(const scalar &a) const { return log(real(a)); }
};
template <class scalar>
struct ExpFunctor {
scalar operator()(const scalar &a) const { return exp(a); }
};
template <class scalar>
struct NotFunctor {
scalar operator()(const scalar &a) const { return (!a); }
};
template <class scalar>
struct AbsRealFunctor {
scalar operator()(const scalar &a) const { return std::abs(real(a)); }
};
template <class scalar>
struct PowRealFunctor {
double y;
PowRealFunctor(double _y) : y(_y){};
scalar operator()(const scalar &a) const { return pow(real(a), y); }
};
template <class scalar>
struct ModIntFunctor {
Integer y;
ModIntFunctor(Integer _y) : y(_y){};
scalar operator()(const scalar &a) const { return Integer(a) % y; }
};
template <class scalar>
struct DivIntFunctor {
Integer y;
DivIntFunctor(Integer _y) : y(_y){};
scalar operator()(const scalar &a) const { return Integer(a) / y; }
};
template <class scalar>
struct RealFunctor {
scalar operator()(const scalar &a) const { return std::real(a); }
};
template <class scalar>
struct ImagFunctor {
scalar operator()(const scalar &a) const { return std::imag(a); }
};
template <class S, class V>
inline Grid_simd<S, V> real(const Grid_simd<S, V> &r) {
return SimdApply(RealFunctor<S>(), r);
}
template <class S, class V>
inline Grid_simd<S, V> imag(const Grid_simd<S, V> &r) {
return SimdApply(ImagFunctor<S>(), r);
}
template <class S, class V>
inline Grid_simd<S, V> sqrt(const Grid_simd<S, V> &r) {
return SimdApply(SqrtRealFunctor<S>(), r);
}
template <class S, class V>
inline Grid_simd<S, V> rsqrt(const Grid_simd<S, V> &r) {
return SimdApply(RSqrtRealFunctor<S>(), r);
}
template <class Scalar>
inline Scalar rsqrt(const Scalar &r) {
return RSqrtRealFunctor<Scalar>()(r);
}
template <class S, class V>
inline Grid_simd<S, V> cos(const Grid_simd<S, V> &r) {
return SimdApply(CosRealFunctor<S>(), r);
}
template <class S, class V>
inline Grid_simd<S, V> sin(const Grid_simd<S, V> &r) {
return SimdApply(SinRealFunctor<S>(), r);
}
template <class S, class V>
inline Grid_simd<S, V> acos(const Grid_simd<S, V> &r) {
return SimdApply(AcosRealFunctor<S>(), r);
}
template <class S, class V>
inline Grid_simd<S, V> asin(const Grid_simd<S, V> &r) {
return SimdApply(AsinRealFunctor<S>(), r);
}
template <class S, class V>
inline Grid_simd<S, V> log(const Grid_simd<S, V> &r) {
return SimdApply(LogRealFunctor<S>(), r);
}
template <class S, class V>
inline Grid_simd<S, V> abs(const Grid_simd<S, V> &r) {
return SimdApply(AbsRealFunctor<S>(), r);
}
template <class S, class V>
inline Grid_simd<S, V> exp(const Grid_simd<S, V> &r) {
return SimdApply(ExpFunctor<S>(), r);
}
template <class S, class V>
inline Grid_simd<S, V> Not(const Grid_simd<S, V> &r) {
return SimdApply(NotFunctor<S>(), r);
}
template <class S, class V>
inline Grid_simd<S, V> pow(const Grid_simd<S, V> &r, double y) {
return SimdApply(PowRealFunctor<S>(y), r);
}
template <class S, class V>
inline Grid_simd<S, V> mod(const Grid_simd<S, V> &r, Integer y) {
return SimdApply(ModIntFunctor<S>(y), r);
}
template <class S, class V>
inline Grid_simd<S, V> div(const Grid_simd<S, V> &r, Integer y) {
return SimdApply(DivIntFunctor<S>(y), r);
}
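// Note on the wrappers above: each one routes a scalar functor through
// SimdApply, which unpacks the vector, applies the functor lane by lane and
// repacks, e.g. sqrt(x) on a vRealD evaluates the scalar sqrt in every lane.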
////////////////////////////////////////////////////////////////////////////
// Allows us to assign into **conformable** real vectors from complex
////////////////////////////////////////////////////////////////////////////
template <class scalar>
struct AndFunctor {
scalar operator()(const scalar &x, const scalar &y) const { return x & y; }
};
template <class scalar>
struct OrFunctor {
scalar operator()(const scalar &x, const scalar &y) const { return x | y; }
};
template <class scalar>
struct AndAndFunctor {
scalar operator()(const scalar &x, const scalar &y) const { return x && y; }
};
template <class scalar>
struct OrOrFunctor {
scalar operator()(const scalar &x, const scalar &y) const { return x || y; }
};
////////////////////////////////
// Calls to simd binop functors
////////////////////////////////
template <class S, class V>
inline Grid_simd<S, V> operator&(const Grid_simd<S, V> &x,
const Grid_simd<S, V> &y) {
return SimdApplyBinop(AndFunctor<S>(), x, y);
}
template <class S, class V>
inline Grid_simd<S, V> operator&&(const Grid_simd<S, V> &x,
const Grid_simd<S, V> &y) {
return SimdApplyBinop(AndAndFunctor<S>(), x, y);
}
template <class S, class V>
inline Grid_simd<S, V> operator|(const Grid_simd<S, V> &x,
const Grid_simd<S, V> &y) {
return SimdApplyBinop(OrFunctor<S>(), x, y);
}
template <class S, class V>
inline Grid_simd<S, V> operator||(const Grid_simd<S, V> &x,
const Grid_simd<S, V> &y) {
return SimdApplyBinop(OrOrFunctor<S>(), x, y);
}
}
#endif

598
Grid/simd/IBM_qpx.h Normal file

@ -0,0 +1,598 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/simd/BGQQPX.h
Copyright (C) 2015
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_ASM_BGQ_QPX_H
#define GRID_ASM_BGQ_QPX_H
#include <stdint.h>
/*********************************************************
* Register definitions
*********************************************************/
#define psi_00 0
#define psi_01 1
#define psi_02 2
#define psi_10 3
#define psi_11 4
#define psi_12 5
#define psi_20 6
#define psi_21 7
#define psi_22 8
#define psi_30 9
#define psi_31 10
#define psi_32 11
#define Chi_00 12
#define Chi_01 13
#define Chi_02 14
#define Chi_10 15
#define Chi_11 16
#define Chi_12 17
#define UChi_00 18
#define UChi_01 19
#define UChi_02 20
#define UChi_10 21
#define UChi_11 22
#define UChi_12 23
#define U0 24
#define U1 25
#define U2 26
#define one 27
#define perm_reg 28
#define REP %%r16
#define IMM %%r17
#define pREP %r16
#define pIMM %r17
#define PPC_INST_DCBTLS 0x7c00014c
#define PPC_INST_DCBLC 0x7c00030c
#define __PPC_CT(t) (((t) & 0x0f) << 21)
#define ___PPC_RA(a) (((a) & 0x1f) << 16)
#define ___PPC_RB(b) (((b) & 0x1f) << 11)
#define LOCK_SET ".long (" HASH(PPC_INST_DCBTLS) "|" HASH(___PPC_RB(16)) ")\n"
#define LOCK_CLEAR ".long (" HASH(PPC_INST_DCBLC) "|" HASH(___PPC_RB(16)) ")\n"
/*Alias regs for incoming fourspinor on neighbour site*/
#define Chi_20 UChi_00
#define Chi_21 UChi_01
#define Chi_22 UChi_02
#define Chi_30 UChi_10
#define Chi_31 UChi_11
#define Chi_32 UChi_12
/*********************************************************
* Architectural macros
*********************************************************/
#define HASHit(A) #A
#define HASH(A) HASHit(A)
#define LOAD64(A,ptr)
#define MASK_REGS /*NOOP ON BGQ*/
#define PF_GAUGE(A) /*NOOP ON BGQ*/
#define PREFETCH1_CHIMU(base) /*NOOP ON BGQ*/
#define PREFETCH_CHIMU(base) /*NOOP ON BGQ*/
#define VLOADf(OFF,PTR,DEST) "qvlfsx " #DEST "," #PTR "," #OFF " ;\n"
#define VLOADuf(OFF,PTR,DEST) "qvlfsux " #DEST "," #PTR "," #OFF " ;\n"
#define VSTOREf(OFF,PTR,SRC) "qvstfsx " #SRC "," #PTR "," #OFF " ;\n"
#define VSTOREuf(OFF,PTR,SRC) "qvstfsux " #SRC "," #PTR "," #OFF " ;\n"
#define VSPLATf(A,B,DEST) "qvlfcsxa " #DEST "," #A "," #B ";\n"
#define VSIZEf (16)
#define VPERMIi(p) "qvgpci " #p ", 1217;\n"
#define VPERMi(A,p) "qvfperm " #A "," #A "," #A "," #p ";\n"
#define VPERMI(p) VPERMIi(p)
#define VPERM(A,p) VPERMi(A,p)
#define VLOADd(OFF,PTR,DEST) "qvlfdx " #DEST "," #PTR "," #OFF " ;\n"
#define VLOADud(OFF,PTR,DEST) "qvlfdux " #DEST "," #PTR "," #OFF " ;\n"
#define VSTOREd(OFF,PTR,SRC) "qvstfdx " #SRC "," #PTR "," #OFF " ;\n"
#define VSTOREud(OFF,PTR,SRC) "qvstfdux " #SRC "," #PTR "," #OFF " ;\n"
#define VSPLATd(A,B,DEST) "qvlfcdxa " #DEST "," #A "," #B ";\n"
#define VSIZEd (32)
// QPX manual ordering QRT comes first (dest)
#define VZEROi(DEST) "qvfset " #DEST "; \n qvfsub " #DEST "," #DEST "," #DEST ";\n"
#define VONEi(DEST) "qvfset " #DEST "; \n"
#define VMOVi(DEST,A) "qvfmr " #DEST "," #A ";\n"
#define VADDi(DEST,A,B) "qvfadd " #DEST "," #A "," #B ";\n"
#define VSUBi(DEST,A,B) "qvfsub " #DEST "," #A "," #B ";\n"
#define VMULi(DEST,A,B) "qvfmul " #DEST "," #A "," #B ";\n"
#define VMUL_RR_RIi(DEST,A,B) "qvfxmul " #DEST "," #A "," #B ";\n"
#define VMADDi(DEST,A,B,C) "qvfmadd " #DEST "," #A "," #B ","#C ";\n"
#define VMADD_RR_RIi(DEST,A,B,C) "qvfxmadd " #DEST "," #A "," #B ","#C ";\n"
#define VMADD_MII_IRi(DEST,A,B,C) "qvfxxnpmadd " #DEST "," #B "," #A ","#C ";\n"
#define VMADD_II_MIRi(DEST,A,B,C) "qvfxxcpnmadd " #DEST "," #B "," #A ","#C ";\n"
#define VZERO(C) VZEROi(C)
#define VONE(C) VONEi(C)
#define VMOV(C,A) VMOVi(C,A)
#define VADD(A,B,C) VADDi(A,B,C)
#define VSUB(A,B,C) VSUBi(A,B,C)
#define VMUL(A,B,C) VMULi(A,B,C)
#define VMUL_RR_RI(A,B,C) VMUL_RR_RIi(A,B,C)
#define VMADD(A,B,C,D) VMADDi(A,B,C,D)
#define VMADD_RR_RI(A,B,C,D) VMADD_RR_RIi(A,B,C,D)
#define VMADD_MII_IR(A,B,C,D) VMADD_MII_IRi(A,B,C,D)
#define VMADD_II_MIR(A,B,C,D) VMADD_II_MIRi(A,B,C,D)
/*********************************************************
* Macro sequences encoding QCD
*********************************************************/
#define LOCK_GAUGE(dir) \
{ \
uint64_t byte_addr = (uint64_t)&U._odata[sU]; \
int count = (sizeof(U._odata[0])+63)/64; \
asm (" mtctr %0 \n" \
" mr " HASH(REP) ", %1\n" \
" li " HASH(IMM) ", 64\n" \
"0:\n" \
LOCK_SET \
" add " HASH(REP) "," HASH(IMM) "," HASH(REP) "\n" \
" bdnz 0b\n" \
: : "b" (count), "b" (byte_addr) ); \
}
#define UNLOCK_GAUGE(dir) \
{ \
uint64_t byte_addr = (uint64_t)&U._odata[sU]; \
int count = (sizeof(U._odata[0])+63)/64; \
asm (" mtctr %0 \n" \
" mr " HASH(REP) ", %1\n" \
" li " HASH(IMM) ", 64\n" \
"0:\n" \
LOCK_CLEAR \
" add " HASH(REP) "," HASH(IMM) "," HASH(REP) "\n" \
" bdnz 0b\n" \
: : "b" (count), "b" (byte_addr) ); \
}
#define ZERO_PSI \
VZERO(psi_00) \
VZERO(psi_01) \
VZERO(psi_02) \
VZERO(psi_10) \
VZERO(psi_11) \
VZERO(psi_12) \
VZERO(psi_20) \
VZERO(psi_21) \
VZERO(psi_22) \
VZERO(psi_30) \
VZERO(psi_31) \
VZERO(psi_32)
#define MULT_2SPIN_QPX_LSd(ptr,p) MULT_2SPIN_QPX_INTERNAL(ptr,p,VSPLAT,16)
#define MULT_2SPIN_QPX_LSf(ptr,p) MULT_2SPIN_QPX_INTERNAL(ptr,p,VSPLAT,8)
#define MULT_2SPIN_QPXd(ptr,p) MULT_2SPIN_QPX_INTERNAL(ptr,p,VLOAD,32)
#define MULT_2SPIN_QPXf(ptr,p) MULT_2SPIN_QPX_INTERNAL(ptr,p,VLOAD,16)
#define MULT_2SPIN_QPX_INTERNAL(ptr,p,ULOAD,USKIP) { \
uint64_t ub = ((uint64_t)ptr); \
asm ( \
ULOAD(%0,%3,U0) \
ULOAD(%1,%3,U1) \
ULOAD(%2,%3,U2) \
VMUL_RR_RI(UChi_00,U0,Chi_00) \
VMUL_RR_RI(UChi_01,U1,Chi_00) \
VMUL_RR_RI(UChi_02,U2,Chi_00) \
VMUL_RR_RI(UChi_10,U0,Chi_10) \
VMUL_RR_RI(UChi_11,U1,Chi_10) \
VMUL_RR_RI(UChi_12,U2,Chi_10) \
VMADD_MII_IR(UChi_00,U0,Chi_00,UChi_00) \
VMADD_MII_IR(UChi_01,U1,Chi_00,UChi_01) \
VMADD_MII_IR(UChi_02,U2,Chi_00,UChi_02) \
VMADD_MII_IR(UChi_10,U0,Chi_10,UChi_10) \
VMADD_MII_IR(UChi_11,U1,Chi_10,UChi_11) \
VMADD_MII_IR(UChi_12,U2,Chi_10,UChi_12) \
: : "b" (0), "b" (USKIP*3), "b" (USKIP*6), "b" (ub )); \
asm ( \
ULOAD(%0,%3,U0) \
ULOAD(%1,%3,U1) \
ULOAD(%2,%3,U2) \
VMADD_RR_RI(UChi_00,U0,Chi_01,UChi_00) \
VMADD_RR_RI(UChi_01,U1,Chi_01,UChi_01) \
VMADD_RR_RI(UChi_02,U2,Chi_01,UChi_02) \
VMADD_RR_RI(UChi_10,U0,Chi_11,UChi_10) \
VMADD_RR_RI(UChi_11,U1,Chi_11,UChi_11) \
VMADD_RR_RI(UChi_12,U2,Chi_11,UChi_12) \
VMADD_MII_IR(UChi_00,U0,Chi_01,UChi_00) \
VMADD_MII_IR(UChi_01,U1,Chi_01,UChi_01) \
VMADD_MII_IR(UChi_02,U2,Chi_01,UChi_02) \
VMADD_MII_IR(UChi_10,U0,Chi_11,UChi_10) \
VMADD_MII_IR(UChi_11,U1,Chi_11,UChi_11) \
VMADD_MII_IR(UChi_12,U2,Chi_11,UChi_12) \
: : "b" (USKIP*1), "b" (USKIP*4), "b" (USKIP*7), "b" (ub )); \
asm ( \
ULOAD(%0,%3,U0) \
ULOAD(%1,%3,U1) \
ULOAD(%2,%3,U2) \
VMADD_RR_RI(UChi_00,U0,Chi_02,UChi_00) \
VMADD_RR_RI(UChi_01,U1,Chi_02,UChi_01) \
VMADD_RR_RI(UChi_02,U2,Chi_02,UChi_02) \
VMADD_RR_RI(UChi_10,U0,Chi_12,UChi_10) \
VMADD_RR_RI(UChi_11,U1,Chi_12,UChi_11) \
VMADD_RR_RI(UChi_12,U2,Chi_12,UChi_12) \
VMADD_MII_IR(UChi_00,U0,Chi_02,UChi_00) \
VMADD_MII_IR(UChi_01,U1,Chi_02,UChi_01) \
VMADD_MII_IR(UChi_02,U2,Chi_02,UChi_02) \
VMADD_MII_IR(UChi_10,U0,Chi_12,UChi_10) \
VMADD_MII_IR(UChi_11,U1,Chi_12,UChi_11) \
VMADD_MII_IR(UChi_12,U2,Chi_12,UChi_12) \
: : "b" (USKIP*2), "b" (USKIP*5), "b" (USKIP*8), "b" (ub )); \
}
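// The three asm blocks above gather the link elements at offsets {0,3,6},
// {1,4,7} and {2,5,8} (in units of USKIP) and accumulate them against the
// Chi colour components 0, 1 and 2 respectively; in each block the
// VMUL/VMADD_RR_RI forms do the "real" half and the VMADD_MII_IR forms the
// "imaginary" half of the complex multiply-accumulate into UChi.
// Illustrative use (a sketch only -- sU, Xp and pf stand for the site index,
// direction and prefetch pointer of the surrounding kernel):
//   LOAD_CHI(base);
//   MULT_2SPIN_QPXd(&U._odata[sU](Xp), pf);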
#define MULT_2SPIN_DIR_PF(A,p) MULT_2SPIN_PF(&U._odata[sU](A),p)
#define MULT_2SPIN_PF(ptr,pf) MULT_2SPIN(ptr,pf)
#define SAVE_RESULT(base,basep) {\
uint64_t ub = ((uint64_t)base) - (VSIZE); \
asm("mr " HASH(REP) ", %0;\n" \
"li " HASH(IMM) "," HASH(VSIZE)" ;\n" \
VSTOREu(IMM,REP,psi_00) \
VSTOREu(IMM,REP,psi_01) \
VSTOREu(IMM,REP,psi_02) \
VSTOREu(IMM,REP,psi_10) \
VSTOREu(IMM,REP,psi_11) \
VSTOREu(IMM,REP,psi_12) \
VSTOREu(IMM,REP,psi_20) \
VSTOREu(IMM,REP,psi_21) \
VSTOREu(IMM,REP,psi_22) \
VSTOREu(IMM,REP,psi_30) \
VSTOREu(IMM,REP,psi_31) \
VSTOREu(IMM,REP,psi_32) \
: : "b" (ub) : HASH(pIMM), HASH(pREP) ); \
}
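// SAVE_RESULT pre-biases the base address by -VSIZE and then uses the
// update-form stores (qvstf*ux), so the first store lands on `base` and each
// subsequent store advances REP by IMM = VSIZE.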
/*
 *Annoying BG/Q loads with no immediate indexing and a big performance hit
 *when a second miss to an L1 line occurs
*/
#define LOAD_CHI(base) { \
uint64_t ub = ((uint64_t)base) - (2*VSIZE); \
asm("mr " HASH(REP) ",%0 ;\n" \
"li " HASH(IMM) ",(2*" HASH(VSIZE) ");\n" \
VLOADu(IMM,REP,Chi_00) \
VLOADu(IMM,REP,Chi_02) \
VLOADu(IMM,REP,Chi_11) : : "b" (ub) : HASH(pIMM), HASH(pREP) ); \
ub = ((uint64_t)base) - VSIZE; \
asm("mr " HASH(REP) ", %0;\n" \
"li " HASH(IMM) ",(2*" HASH(VSIZE) ");\n" \
VLOADu(IMM,REP,Chi_01) \
VLOADu(IMM,REP,Chi_10) \
VLOADu(IMM,REP,Chi_12) : : "b" (ub) : HASH(pIMM), HASH(pREP) ); \
}
#define LOAD_CHIMU(base) { \
uint64_t ub = ((uint64_t)base) - (2*VSIZE); \
asm("mr " HASH(REP) ",%0;\n" \
"li " HASH(IMM) ",(2*" HASH(VSIZE) ");\n" \
VLOADu(IMM,REP,Chi_00) \
VLOADu(IMM,REP,Chi_02) \
VLOADu(IMM,REP,Chi_11) \
VLOADu(IMM,REP,Chi_20) \
VLOADu(IMM,REP,Chi_22) \
VLOADu(IMM,REP,Chi_31) : : "b" (ub) : HASH(pIMM), HASH(pREP) ); \
ub = ((uint64_t)base) - VSIZE; \
asm("mr " HASH(REP) ", %0;\n" \
"li " HASH(IMM) ", (2*" HASH(VSIZE) ");\n" \
VLOADu(IMM,REP,Chi_01) \
VLOADu(IMM,REP,Chi_10) \
VLOADu(IMM,REP,Chi_12) \
VLOADu(IMM,REP,Chi_21) \
VLOADu(IMM,REP,Chi_30) \
VLOADu(IMM,REP,Chi_32) : : "b" (ub) : HASH(pIMM), HASH(pREP) ); \
}
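// LOAD_CHI/LOAD_CHIMU stride by 2*VSIZE and split the loads across two asm
// blocks offset by VSIZE, so back-to-back loads within a block touch different
// L1 lines; the other half of each line is picked up by the second block,
// sidestepping the second-miss penalty noted above.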
// hspin(0)=fspin(0)+timesI(fspin(3));
// hspin(1)=fspin(1)+timesI(fspin(2));
#define XP_PROJMEM(base) { \
LOAD_CHIMU(base); \
asm ( \
VONE(one) \
VMADD_MII_IR(Chi_00,one,Chi_30,Chi_00) \
VMADD_MII_IR(Chi_01,one,Chi_31,Chi_01) \
VMADD_MII_IR(Chi_02,one,Chi_32,Chi_02) \
VMADD_MII_IR(Chi_10,one,Chi_20,Chi_10) \
VMADD_MII_IR(Chi_11,one,Chi_21,Chi_11) \
VMADD_MII_IR(Chi_12,one,Chi_22,Chi_12) \
); \
}
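// hspin(0)=fspin(0)-timesI(fspin(3));
// hspin(1)=fspin(1)-timesI(fspin(2));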
#define XM_PROJMEM(base) { \
LOAD_CHIMU(base); \
asm ( \
VONE(one) \
VMADD_II_MIR(Chi_00,one,Chi_30,Chi_00) \
VMADD_II_MIR(Chi_01,one,Chi_31,Chi_01) \
VMADD_II_MIR(Chi_02,one,Chi_32,Chi_02) \
VMADD_II_MIR(Chi_10,one,Chi_20,Chi_10) \
VMADD_II_MIR(Chi_11,one,Chi_21,Chi_11) \
VMADD_II_MIR(Chi_12,one,Chi_22,Chi_12) \
); \
}
// hspin(0)=fspin(0)-fspin(3);
// hspin(1)=fspin(1)+fspin(2);
#define YP_PROJMEM(base) { \
LOAD_CHIMU(base); \
asm ( \
VSUB(Chi_00,Chi_00,Chi_30) \
VSUB(Chi_01,Chi_01,Chi_31) \
VSUB(Chi_02,Chi_02,Chi_32) \
VADD(Chi_10,Chi_10,Chi_20) \
VADD(Chi_11,Chi_11,Chi_21) \
VADD(Chi_12,Chi_12,Chi_22) \
); \
}
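// hspin(0)=fspin(0)+fspin(3);
// hspin(1)=fspin(1)-fspin(2);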
#define YM_PROJMEM(base) { \
LOAD_CHIMU(base); \
asm ( \
VADD(Chi_00,Chi_00,Chi_30) \
VADD(Chi_01,Chi_01,Chi_31) \
VADD(Chi_02,Chi_02,Chi_32) \
VSUB(Chi_10,Chi_10,Chi_20) \
VSUB(Chi_11,Chi_11,Chi_21) \
VSUB(Chi_12,Chi_12,Chi_22) ); \
}
/*Gz
* 0 0 i 0 [0]+-i[2]
* 0 0 0 -i [1]-+i[3]
* -i 0 0 0
* 0 i 0 0
*/
#define ZP_PROJMEM(base) { \
LOAD_CHIMU(base); \
asm ( \
VONE(one) \
VMADD_MII_IR(Chi_00,one,Chi_20,Chi_00) \
VMADD_MII_IR(Chi_01,one,Chi_21,Chi_01) \
VMADD_MII_IR(Chi_02,one,Chi_22,Chi_02) \
VMADD_II_MIR(Chi_10,one,Chi_30,Chi_10) \
VMADD_II_MIR(Chi_11,one,Chi_31,Chi_11) \
VMADD_II_MIR(Chi_12,one,Chi_32,Chi_12) \
); \
}
#define ZM_PROJMEM(base) { \
LOAD_CHIMU(base); \
asm ( \
VONE(one) \
VMADD_II_MIR(Chi_00,one,Chi_20,Chi_00) \
VMADD_II_MIR(Chi_01,one,Chi_21,Chi_01) \
VMADD_II_MIR(Chi_02,one,Chi_22,Chi_02) \
VMADD_MII_IR(Chi_10,one,Chi_30,Chi_10) \
VMADD_MII_IR(Chi_11,one,Chi_31,Chi_11) \
VMADD_MII_IR(Chi_12,one,Chi_32,Chi_12) \
); \
}
/*Gt
* 0 0 1 0 [0]+-[2]
* 0 0 0 1 [1]+-[3]
* 1 0 0 0
* 0 1 0 0
*/
#define TP_PROJMEM(base) { \
LOAD_CHIMU(base); \
asm ( \
VADD(Chi_00,Chi_00,Chi_20) \
VADD(Chi_01,Chi_01,Chi_21) \
VADD(Chi_02,Chi_02,Chi_22) \
VADD(Chi_10,Chi_10,Chi_30) \
VADD(Chi_11,Chi_11,Chi_31) \
VADD(Chi_12,Chi_12,Chi_32) \
); \
}
#define TM_PROJMEM(base) { \
LOAD_CHIMU(base); \
asm ( \
VSUB(Chi_00,Chi_00,Chi_20) \
VSUB(Chi_01,Chi_01,Chi_21) \
VSUB(Chi_02,Chi_02,Chi_22) \
VSUB(Chi_10,Chi_10,Chi_30) \
VSUB(Chi_11,Chi_11,Chi_31) \
VSUB(Chi_12,Chi_12,Chi_32) \
); \
}
/*
fspin(0)=hspin(0);
fspin(1)=hspin(1);
fspin(2)=timesMinusI(hspin(1));
fspin(3)=timesMinusI(hspin(0));
fspin(0)+=hspin(0);
fspin(1)+=hspin(1);
fspin(2)-=timesI(hspin(1));
fspin(3)-=timesI(hspin(0));
*/
#define XP_RECON { \
asm(\
VONE(one)\
VMOV(psi_00,UChi_00) VMOV(psi_01,UChi_01) VMOV(psi_02,UChi_02)\
VMOV(psi_10,UChi_10) VMOV(psi_11,UChi_11) VMOV(psi_12,UChi_12)\
VZERO(psi_20) VZERO(psi_21) VZERO(psi_22) \
VZERO(psi_30) VZERO(psi_31) VZERO(psi_32) \
VMADD_II_MIR(psi_20,one,UChi_10,psi_20) \
VMADD_II_MIR(psi_21,one,UChi_11,psi_21) \
VMADD_II_MIR(psi_22,one,UChi_12,psi_22) \
VMADD_II_MIR(psi_30,one,UChi_00,psi_30) \
VMADD_II_MIR(psi_31,one,UChi_01,psi_31) \
VMADD_II_MIR(psi_32,one,UChi_02,psi_32) \
); \
}
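// fspin(0)=hspin(0);
// fspin(1)=hspin(1);
// fspin(2)=timesI(hspin(1));
// fspin(3)=timesI(hspin(0));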
#define XM_RECON { \
asm(\
VONE(one)\
VMOV(psi_00,UChi_00) VMOV(psi_01,UChi_01) VMOV(psi_02,UChi_02)\
VMOV(psi_10,UChi_10) VMOV(psi_11,UChi_11) VMOV(psi_12,UChi_12)\
VZERO(psi_20) VZERO(psi_21) VZERO(psi_22) \
VZERO(psi_30) VZERO(psi_31) VZERO(psi_32) \
VMADD_MII_IR(psi_20,one,UChi_10,psi_20) \
VMADD_MII_IR(psi_21,one,UChi_11,psi_21) \
VMADD_MII_IR(psi_22,one,UChi_12,psi_22) \
VMADD_MII_IR(psi_30,one,UChi_00,psi_30) \
VMADD_MII_IR(psi_31,one,UChi_01,psi_31) \
VMADD_MII_IR(psi_32,one,UChi_02,psi_32) \
); \
}
#define XP_RECON_ACCUM { \
asm(\
VONE(one)\
VADD(psi_00,psi_00,UChi_00) VADD(psi_01,psi_01,UChi_01) VADD(psi_02,psi_02,UChi_02) \
VADD(psi_10,psi_10,UChi_10) VADD(psi_11,psi_11,UChi_11) VADD(psi_12,psi_12,UChi_12) \
VMADD_II_MIR(psi_20,one,UChi_10,psi_20) \
VMADD_II_MIR(psi_21,one,UChi_11,psi_21) \
VMADD_II_MIR(psi_22,one,UChi_12,psi_22) \
VMADD_II_MIR(psi_30,one,UChi_00,psi_30) \
VMADD_II_MIR(psi_31,one,UChi_01,psi_31) \
VMADD_II_MIR(psi_32,one,UChi_02,psi_32) \
); \
}
#define XM_RECON_ACCUM { \
asm(\
VONE(one)\
VADD(psi_00,psi_00,UChi_00) VADD(psi_01,psi_01,UChi_01) VADD(psi_02,psi_02,UChi_02) \
VADD(psi_10,psi_10,UChi_10) VADD(psi_11,psi_11,UChi_11) VADD(psi_12,psi_12,UChi_12) \
VMADD_MII_IR(psi_20,one,UChi_10,psi_20) \
VMADD_MII_IR(psi_21,one,UChi_11,psi_21) \
VMADD_MII_IR(psi_22,one,UChi_12,psi_22) \
VMADD_MII_IR(psi_30,one,UChi_00,psi_30) \
VMADD_MII_IR(psi_31,one,UChi_01,psi_31) \
VMADD_MII_IR(psi_32,one,UChi_02,psi_32) \
); \
}
// fspin(2)+=hspin(1);
// fspin(3)-=hspin(0);
#define YP_RECON_ACCUM {\
asm(\
VADD(psi_00,psi_00,UChi_00) VADD(psi_01,psi_01,UChi_01) VADD(psi_02,psi_02,UChi_02) \
VADD(psi_10,psi_10,UChi_10) VADD(psi_11,psi_11,UChi_11) VADD(psi_12,psi_12,UChi_12) \
VADD(psi_20,psi_20,UChi_10) VADD(psi_21,psi_21,UChi_11) VADD(psi_22,psi_22,UChi_12) \
VSUB(psi_30,psi_30,UChi_00) VSUB(psi_31,psi_31,UChi_01) VSUB(psi_32,psi_32,UChi_02) \
);\
}
#define YM_RECON_ACCUM {\
asm(\
VADD(psi_00,psi_00,UChi_00) VADD(psi_01,psi_01,UChi_01) VADD(psi_02,psi_02,UChi_02) \
VADD(psi_10,psi_10,UChi_10) VADD(psi_11,psi_11,UChi_11) VADD(psi_12,psi_12,UChi_12) \
VSUB(psi_20,psi_20,UChi_10) VSUB(psi_21,psi_21,UChi_11) VSUB(psi_22,psi_22,UChi_12) \
VADD(psi_30,psi_30,UChi_00) VADD(psi_31,psi_31,UChi_01) VADD(psi_32,psi_32,UChi_02) \
);\
}
// fspin(2)-=timesI(hspin(0));
// fspin(3)+=timesI(hspin(1));
#define ZP_RECON_ACCUM {\
asm(\
VONE(one)\
VADD(psi_00,psi_00,UChi_00) VADD(psi_01,psi_01,UChi_01) VADD(psi_02,psi_02,UChi_02) \
VADD(psi_10,psi_10,UChi_10) VADD(psi_11,psi_11,UChi_11) VADD(psi_12,psi_12,UChi_12) \
VMADD_II_MIR(psi_20,one,UChi_00,psi_20) \
VMADD_II_MIR(psi_21,one,UChi_01,psi_21) \
VMADD_II_MIR(psi_22,one,UChi_02,psi_22) \
VMADD_MII_IR(psi_30,one,UChi_10,psi_30) \
VMADD_MII_IR(psi_31,one,UChi_11,psi_31) \
VMADD_MII_IR(psi_32,one,UChi_12,psi_32) \
);\
}
#define ZM_RECON_ACCUM {\
asm(\
VONE(one)\
VADD(psi_00,psi_00,UChi_00) VADD(psi_01,psi_01,UChi_01) VADD(psi_02,psi_02,UChi_02) \
VADD(psi_10,psi_10,UChi_10) VADD(psi_11,psi_11,UChi_11) VADD(psi_12,psi_12,UChi_12) \
VMADD_MII_IR(psi_20,one,UChi_00,psi_20) \
VMADD_MII_IR(psi_21,one,UChi_01,psi_21) \
VMADD_MII_IR(psi_22,one,UChi_02,psi_22) \
VMADD_II_MIR(psi_30,one,UChi_10,psi_30) \
VMADD_II_MIR(psi_31,one,UChi_11,psi_31) \
VMADD_II_MIR(psi_32,one,UChi_12,psi_32) \
);\
}
// fspin(2)+=hspin(0);
// fspin(3)+=hspin(1);
#define TP_RECON_ACCUM {\
asm(\
VADD(psi_00,psi_00,UChi_00) VADD(psi_01,psi_01,UChi_01) VADD(psi_02,psi_02,UChi_02) \
VADD(psi_10,psi_10,UChi_10) VADD(psi_11,psi_11,UChi_11) VADD(psi_12,psi_12,UChi_12) \
VADD(psi_20,psi_20,UChi_00) VADD(psi_21,psi_21,UChi_01) VADD(psi_22,psi_22,UChi_02) \
VADD(psi_30,psi_30,UChi_10) VADD(psi_31,psi_31,UChi_11) VADD(psi_32,psi_32,UChi_12) \
);\
}
#define TM_RECON_ACCUM {\
asm(\
VADD(psi_00,psi_00,UChi_00) VADD(psi_01,psi_01,UChi_01) VADD(psi_02,psi_02,UChi_02) \
VADD(psi_10,psi_10,UChi_10) VADD(psi_11,psi_11,UChi_11) VADD(psi_12,psi_12,UChi_12) \
VSUB(psi_20,psi_20,UChi_00) VSUB(psi_21,psi_21,UChi_01) VSUB(psi_22,psi_22,UChi_02) \
VSUB(psi_30,psi_30,UChi_10) VSUB(psi_31,psi_31,UChi_11) VSUB(psi_32,psi_32,UChi_12) \
);\
}
#define ADD_RESULTi(PTR,pf) \
LOAD_CHIMU(PTR) \
asm( \
  VADD(psi_00,Chi_00,psi_00) VADD(psi_01,Chi_01,psi_01) VADD(psi_02,Chi_02,psi_02) \
  VADD(psi_10,Chi_10,psi_10) VADD(psi_11,Chi_11,psi_11) VADD(psi_12,Chi_12,psi_12) \
  VADD(psi_20,Chi_20,psi_20) VADD(psi_21,Chi_21,psi_21) VADD(psi_22,Chi_22,psi_22) \
  VADD(psi_30,Chi_30,psi_30) VADD(psi_31,Chi_31,psi_31) VADD(psi_32,Chi_32,psi_32) ); \
SAVE_RESULT(PTR,pf);
#define PERMUTE_DIR3
#define PERMUTE_DIR2
#define PERMUTE_DIR1
#define PERMUTE_DIR0 { \
asm( \
VPERMI(perm_reg) \
VPERM(Chi_00,perm_reg) VPERM(Chi_01,perm_reg) VPERM(Chi_02,perm_reg) \
VPERM(Chi_10,perm_reg) VPERM(Chi_11,perm_reg) VPERM(Chi_12,perm_reg) ); \
}
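// qvgpci 1217 = 0b010'011'000'001, i.e. the element selection (2,3,0,1), which
// (presumably) swaps the two complex pairs within the 4-wide QPX vector for the
// innermost-direction permute; PERMUTE_DIR1-DIR3 need no SIMD permute here.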
#endif

@ -0,0 +1,46 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/simd/Avx512Asm.h
Copyright (C) 2015
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
// No guard; ok multi-include
#undef VSIZE
#undef VLOAD
#undef VLOADu
#undef VSPLAT
#undef VSTORE
#undef VSTOREu
#undef MULT_2SPIN_QPX_LS
#undef MULT_2SPIN_QPX
#define VSIZE VSIZEd
#define VLOAD(A,B,C) VLOADd(A,B,C)
#define VLOADu(A,B,C) VLOADud(A,B,C)
#define VSPLAT(A,B,DEST) VSPLATd(A,B,DEST)
#define VSTORE(A,B,C) VSTOREd(A,B,C)
#define VSTOREu(A,B,C) VSTOREud(A,B,C)
#define MULT_2SPIN_QPX_LS(ptr,p) MULT_2SPIN_QPX_LSd(ptr,p)
#define MULT_2SPIN_QPX(ptr,p) MULT_2SPIN_QPXd(ptr,p)

@ -0,0 +1,46 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/simd/Avx512Asm.h
Copyright (C) 2015
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
// No guard; ok multi-include
#undef VSIZE
#undef VLOAD
#undef VLOADu
#undef VSPLAT
#undef VSTORE
#undef VSTOREu
#undef MULT_2SPIN_QPX_LS
#undef MULT_2SPIN_QPX
#define VSIZE VSIZEf
#define VLOAD(A,B,C) VLOADf(A,B,C)
#define VLOADu(A,B,C) VLOADuf(A,B,C)
#define VSPLAT(A,B,DEST) VSPLATf(A,B,DEST)
#define VSTORE(A,B,C) VSTOREf(A,B,C)
#define VSTOREu(A,B,C) VSTOREuf(A,B,C)
#define MULT_2SPIN_QPX_LS(ptr,p) MULT_2SPIN_QPX_LSf(ptr,p)
#define MULT_2SPIN_QPX(ptr,p) MULT_2SPIN_QPXf(ptr,p)

205
Grid/simd/Intel512avx.h Normal file
@ -0,0 +1,205 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/simd/Avx512Asm.h
Copyright (C) 2015
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_ASM_AV512_H
#define GRID_ASM_AV512_H
////////////////////////////////////////////////////////////
// Knights Landing specials
////////////////////////////////////////////////////////////
#define ZLOADf(OFF,PTR,ri,ir) VLOADf(OFF,PTR,ir) VSHUFf(ir,ri)
#define ZLOADd(OFF,PTR,ri,ir) VLOADd(OFF,PTR,ir) VSHUFd(ir,ri)
#define ZMULf(Ari,Air,B,Criir,Ciirr) VMULf(Ari,B,Criir) VMULf(Air,B,Ciirr)
#define ZMULd(Ari,Air,B,Criir,Ciirr) VMULd(Ari,B,Criir) VMULd(Air,B,Ciirr)
#define ZMADDf(Ari,Air,B,Criir,Ciirr) VMADDf(Ari,B,Criir) VMADDf(Air,B,Ciirr)
#define ZMADDd(Ari,Air,B,Criir,Ciirr) VMADDd(Ari,B,Criir) VMADDd(Air,B,Ciirr)
#define ZENDf(Criir,Ciirr, tmp) ZEND1f(Criir,Ciirr, tmp) ZEND2f(Criir,Ciirr, tmp)
#define ZENDd(Criir,Ciirr, tmp) ZEND1d(Criir,Ciirr, tmp) ZEND2d(Criir,Ciirr, tmp)
#define ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
VSHUFMEMf(O,P,tmp) \
VMULMEMf(O,P,B,Biirr) \
VMULMEMf(O,P,C,Ciirr) \
VMULf(tmp,B,Briir) \
VMULf(tmp,C,Criir)
#define ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
VSHUFMEMd(O,P,tmp) \
VMULMEMd(O,P,B,Biirr) \
VMULMEMd(O,P,C,Ciirr) \
VMULd(tmp,B,Briir) \
VMULd(tmp,C,Criir)
#define ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
VSHUFMEMf(O,P,tmp) \
VMADDMEMf(O,P,B,Biirr) \
VMADDMEMf(O,P,C,Ciirr) \
VMADDf(tmp,B,Briir) \
VMADDf(tmp,C,Criir)
#define ZMADDMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) \
VSHUFMEMd(O,P,tmp) \
VMADDMEMd(O,P,B,Biirr) \
VMADDMEMd(O,P,C,Ciirr) \
VMADDd(tmp,B,Briir) \
VMADDd(tmp,C,Criir)
// Merges accumulation for a complex dot-product chain; less efficient under AVX-512
#define ZEND1f(Criir,Ciirr, tmp) "vshufps $0xb1," #Criir "," #Criir "," #tmp ";\n"\
"vaddps " #tmp "," #Criir "," #Criir"{%k6}" ";\n"
#define ZEND2f(Criir,Ciirr, tmp) "vshufps $0xb1," #Ciirr "," #Ciirr "," #tmp ";\n"\
"vsubps " #tmp "," #Ciirr "," #Criir"{%k7}" ";\n"
#define ZEND1d(Criir,Ciirr, tmp) "vshufpd $0x55," #Criir "," #Criir "," #tmp ";\n"\
"vaddps " #tmp "," #Criir "," #Criir"{%k6}" ";\n"
#define ZEND2d(Criir,Ciirr, tmp) "vshufpd $0x55," #Ciirr "," #Ciirr "," #tmp ";\n"\
"vsubpd " #tmp "," #Ciirr "," #Criir"{%k7};\n" // ri+ir ; ri+ir,rr-ii
#define VMOVRDUPd(OFF,A,DEST) "vpshufd $0x44," #OFF "*64(" #A ")," #DEST ";\n" // 32 bit level: 1,0,3,2
#define VMOVIDUPd(OFF,A,DEST) "vpshufd $0xee," #OFF "*64(" #A ")," #DEST ";\n" // 32 bit level: 3,2,3,2
#define VMOVRDUPf(OFF,PTR,DEST) "vmovsldup " #OFF "*64(" #PTR "), " #DEST ";\n"
#define VMOVIDUPf(OFF,PTR,DEST) "vmovshdup " #OFF "*64(" #PTR "), " #DEST ";\n"
#define VRDUPd(SRC,DEST) "vpshufd $0x44," #SRC"," #DEST ";\n" // 32 bit level: 1,0,3,2
#define VRDUPf(SRC,DEST) "vmovsldup " #SRC ", " #DEST ";\n"
#define VIDUPd(SRC,DEST) "vpshufd $0xee," #SRC"," #DEST ";\n" // 32 bit level: 3,2,3,2
#define VIDUPf(SRC,DEST) "vmovshdup " #SRC ", " #DEST ";\n"
#define VBCASTRDUPd(OFF,A,DEST) "vbroadcastsd (" #OFF "*16+0)(" #A ")," #DEST ";\n"
#define VBCASTIDUPd(OFF,A,DEST) "vbroadcastsd (" #OFF "*16+8)(" #A ")," #DEST ";\n"
#define VBCASTRDUPf(OFF,PTR,DEST) "vbroadcastss (" #OFF "*8 +0)(" #PTR "), " #DEST ";\n"
#define VBCASTIDUPf(OFF,PTR,DEST) "vbroadcastss (" #OFF "*8 +4)(" #PTR "), " #DEST ";\n"
#define VBCASTCDUPf(OFF,A,DEST) "vbroadcastsd (" #OFF "*64 )(" #A ")," #DEST ";\n"
#define VBCASTZDUPf(OFF,A,DEST) "vbroadcastf32x4 (" #OFF "*64 )(" #A ")," #DEST ";\n"
#define VBCASTCDUP(OFF,A,DEST) VBCASTCDUPf(OFF,A,DEST)
#define VBCASTZDUP(OFF,A,DEST) VBCASTZDUPf(OFF,A,DEST)
#define VMADDSUBf(A,B,accum) "vfmaddsub231ps " #A "," #B "," #accum ";\n"
#define VMADDSUBd(A,B,accum) "vfmaddsub231pd " #A "," #B "," #accum ";\n"
#define VMADDSUBMEMf(O,P,B,accum) "vfmaddsub231ps " #O"*64("#P "),"#B "," #accum ";\n"
#define VMADDSUBMEMd(O,P,B,accum) "vfmaddsub231pd " #O"*64("#P "),"#B "," #accum ";\n"
#define VMADDRDUPf(O,P,B,accum) "vfmadd231ps (" #O"*8+0)("#P "){1to16},"#B "," #accum ";\n"
#define VMADDIDUPf(O,P,B,accum) "vfmadd231ps (" #O"*8+4)("#P "){1to16},"#B "," #accum ";\n"
#define VMADDSUBRDUPf(O,P,B,accum) "vfmaddsub231ps (" #O"*8+0)("#P "){1to16},"#B "," #accum ";\n"
#define VMADDSUBIDUPf(O,P,B,accum) "vfmaddsub231ps (" #O"*8+4)("#P "){1to16},"#B "," #accum ";\n"
#define VMULRDUPf(O,P,B,accum) "vmulps (" #O"*8+0)("#P "){1to16},"#B "," #accum ";\n"
#define VMULIDUPf(O,P,B,accum) "vmulps (" #O"*8+4)("#P "){1to16},"#B "," #accum ";\n"
#define VMADDRDUPd(O,P,B,accum) "vfmadd231pd (" #O"*16+0)("#P "){1to8},"#B "," #accum ";\n"
#define VMADDIDUPd(O,P,B,accum) "vfmadd231pd (" #O"*16+8)("#P "){1to8},"#B "," #accum ";\n"
#define VMADDSUBRDUPd(O,P,B,accum) "vfmaddsub231pd (" #O"*16+0)("#P "){1to8},"#B "," #accum ";\n"
#define VMADDSUBIDUPd(O,P,B,accum) "vfmaddsub231pd (" #O"*16+8)("#P "){1to8},"#B "," #accum ";\n"
#define VMULRDUPd(O,P,B,accum) "vmulpd (" #O"*16+0)("#P "){1to8},"#B "," #accum ";\n"
#define VMULIDUPd(O,P,B,accum) "vmulpd (" #O"*16+8)("#P "){1to8},"#B "," #accum ";\n"
/*
* TimesI is used only in the XP recon
* Could zero the regs and use RECON_ACCUM
*/
#define VTIMESI0f(A,DEST, Z) VSHUFf(A,DEST)
#define VTIMESI1f(A,DEST, Z) "vaddps " #DEST "," #Z "," #DEST"{%k6}" ";\n"
#define VTIMESI2f(A,DEST, Z) "vsubps " #DEST "," #Z "," #DEST"{%k7}" ";\n"
#define VTIMESI0d(A,DEST, Z) VSHUFd(A,DEST)
#define VTIMESI1d(A,DEST, Z) "vaddpd " #DEST "," #Z "," #DEST"{%k6}" ";\n"
#define VTIMESI2d(A,DEST, Z) "vsubpd " #DEST "," #Z "," #DEST"{%k7}" ";\n"
#define VTIMESMINUSI0f(A,DEST,Z) VSHUFf(A,DEST)
#define VTIMESMINUSI1f(A,DEST,Z) "vsubps " #DEST "," #Z "," #DEST"{%k6}" ";\n"
#define VTIMESMINUSI2f(A,DEST,Z) "vaddps " #DEST "," #Z "," #DEST"{%k7}" ";\n"
#define VTIMESMINUSI0d(A,DEST,Z) VSHUFd(A,DEST)
#define VTIMESMINUSI1d(A,DEST,Z) "vsubpd " #DEST "," #Z "," #DEST"{%k6}" ";\n"
#define VTIMESMINUSI2d(A,DEST,Z) "vaddpd " #DEST "," #Z "," #DEST"{%k7}" ";\n"
#if 0
#define VACCTIMESMINUSI0f(A,ACC,tmp) VSHUFf(A,tmp)
#define VACCTIMESMINUSI1f(A,ACC,tmp) "vsubps " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
#define VACCTIMESMINUSI2f(A,ACC,tmp) "vaddps " #tmp "," #ACC "," #ACC"{%k7}" ";\n"
#define VACCTIMESMINUSI0d(A,ACC,tmp) VSHUFd(A,tmp)
#define VACCTIMESMINUSI1d(A,ACC,tmp) "vsubpd " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
#define VACCTIMESMINUSI2d(A,ACC,tmp) "vaddpd " #tmp "," #ACC "," #ACC"{%k7}" ";\n"
#define VACCTIMESI0f(A,ACC,tmp) VSHUFf(A,tmp)
#define VACCTIMESI1f(A,ACC,tmp) "vaddps " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
#define VACCTIMESI2f(A,ACC,tmp) "vsubps " #tmp "," #ACC "," #ACC"{%k7}" ";\n"
#define VACCTIMESI0d(A,ACC,tmp) VSHUFd(A,tmp)
#define VACCTIMESI1d(A,ACC,tmp) "vaddpd " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
#define VACCTIMESI2d(A,ACC,tmp) "vsubpd " #tmp "," #ACC "," #ACC"{%k7}" ";\n"
#else
// o_p must point to floating 1.0f/d
//
// Ai, Ar -> tmp (r i)
// tmp *1.0
// ACC i - Ar ; ACC r + Ai
#define VACCTIMESMINUSI0f(A,ACC,tmp) VSHUFf(A,tmp)
#define VACCTIMESMINUSI1f(A,ACC,tmp) VMADDMEMf(1,%r10,tmp,ACC)
#define VACCTIMESMINUSI2f(A,ACC,tmp)
#define VACCTIMESMINUSI0d(A,ACC,tmp) VSHUFd(A,tmp)
#define VACCTIMESMINUSI1d(A,ACC,tmp) VMADDMEMd(1,%r10,tmp,ACC)
#define VACCTIMESMINUSI2d(A,ACC,tmp)
// Ai, Ar -> tmp (r i)
// tmp *1.0
// ACC i + Ar ; ACC r - Ai
#define VACCTIMESI0f(A,ACC,tmp) VSHUFf(A,tmp)
#define VACCTIMESI1f(A,ACC,tmp) VMADDMEMf(0,%r10,tmp,ACC)
#define VACCTIMESI2f(A,ACC,tmp)
#define VACCTIMESI0d(A,ACC,tmp) VSHUFd(A,tmp)
#define VACCTIMESI1d(A,ACC,tmp) VMADDMEMd(0,%r10,tmp,ACC)
#define VACCTIMESI2d(A,ACC,tmp)
#endif
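// In the active (#else) variant above, the 0-step VSHUF swaps re/im into tmp
// and the 1-step is a single fused VMADDMEM against constants held at (%r10):
// offset 0 (in 64-byte vector units) is used by VACCTIMESI, offset 1 by
// VACCTIMESMINUSI, so %r10 must point at the +/-1.0 sign vectors referred to
// in the note above.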
#define VPERM0f(A,B) "vshuff32x4 $0x4e," #A "," #B "," #B ";\n"
#define VPERM1f(A,B) "vshuff32x4 $0xb1," #A "," #B "," #B ";\n"
#define VPERM2f(A,B) "vshufps $0x4e," #A "," #B "," #B ";\n"
#define VPERM3f(A,B) "vshufps $0xb1," #A "," #B "," #B ";\n"
#define VPERM0d(A,B) "vshuff64x2 $0x4e," #A "," #B "," #B ";\n"
#define VPERM1d(A,B) "vshuff64x2 $0xb1," #A "," #B "," #B ";\n"
#define VPERM2d(A,B) "vshufpd $0x55," #A "," #B "," #B ";\n"
#define VPERM3d(A,B) VMOVd(A,B)
#endif

159
Grid/simd/Intel512common.h Normal file
@ -0,0 +1,159 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/simd/Avx512Asm.h
Copyright (C) 2015
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_ASM_INTEL_COMMON_512_H
#define GRID_ASM_INTEL_COMMON_512_H
////////////////////////////////////////////////////////////////////////////////////////////////////
// Performance options
////////////////////////////////////////////////////////////////////////////////////////////////////
#undef AVX512_PF_L2_WRITE
////////////////////////////////////////////////////////////////////////////////////////////////////
// Opcodes common
////////////////////////////////////////////////////////////////////////////////////////////////////
#define MASK_REGS \
__asm__ ("mov $0xAAAA, %%eax \n"\
"kmovw %%eax, %%k6 \n"\
"mov $0x5555, %%eax \n"\
"kmovw %%eax, %%k7 \n" : : : "%eax");
//#define label(B) __asm__ ( __func__ _LINE__ #B ":\n" );
#define VZEROf(A) "vpxorq " #A "," #A "," #A ";\n"
#define VZEROd(A) "vpxorq " #A "," #A "," #A ";\n"
#define VTIMESIf(A,DEST, Z) \
VTIMESI0f(A,DEST, Z) \
VTIMESI1f(A,DEST, Z) \
VTIMESI2f(A,DEST, Z)
#define VTIMESId(A,DEST, Z) \
VTIMESI0d(A,DEST, Z) \
VTIMESI1d(A,DEST, Z) \
VTIMESI2d(A,DEST, Z)
#define VTIMESMINUSIf(A,DEST, Z) \
VTIMESMINUSI0f(A,DEST, Z) \
VTIMESMINUSI1f(A,DEST, Z) \
VTIMESMINUSI2f(A,DEST, Z)
#define VTIMESMINUSId(A,DEST, Z) \
VTIMESMINUSI0d(A,DEST, Z) \
VTIMESMINUSI1d(A,DEST, Z) \
VTIMESMINUSI2d(A,DEST, Z)
#define VACCTIMESIf(A,ACC,tmp) \
VACCTIMESI0f(A,ACC,tmp) \
VACCTIMESI1f(A,ACC,tmp) \
VACCTIMESI2f(A,ACC,tmp)
#define VACCTIMESId(A,ACC,tmp) \
VACCTIMESI0d(A,ACC,tmp) \
VACCTIMESI1d(A,ACC,tmp) \
VACCTIMESI2d(A,ACC,tmp)
#define VACCTIMESMINUSIf(A,ACC,tmp) \
VACCTIMESMINUSI0f(A,ACC,tmp) \
VACCTIMESMINUSI1f(A,ACC,tmp) \
VACCTIMESMINUSI2f(A,ACC,tmp)
#define VACCTIMESMINUSId(A,ACC,tmp) \
VACCTIMESMINUSI0d(A,ACC,tmp) \
VACCTIMESMINUSI1d(A,ACC,tmp) \
VACCTIMESMINUSI2d(A,ACC,tmp)
#define LOAD64a(A,ptr) "movq %0, %" #A : : "r"(ptr) : #A
#define LOAD64i(A,ptr) __asm__ ( LOAD64a(A,ptr));
#define LOAD64(A,ptr) LOAD64i(A,ptr)
#define VMOVf(A,DEST) "vmovaps " #A ", " #DEST ";\n"
#define VMOVd(A,DEST) "vmovapd " #A ", " #DEST ";\n"
#define VPREFETCH1(O,A) "prefetcht0 "#O"*64("#A");\n"
#define VPREFETCH2(O,A) "prefetcht1 "#O"*64("#A");\n"
#ifdef AVX512_PF_L2_WRITE
#define VPREFETCHW(O,A) "prefetchwt1 "#O"*64("#A");\n"
#else
#define VPREFETCHW(O,A)
#endif
#define VPREFETCHNTA(O,A)
#define VPREFETCH(O,A)
#define VEVICT(O,A)
//"vprefetche0 "#O"*64("#A");\n" "vprefetche1 ("#O"+12)*64("#A");\n"
// "clevict0 "#O"*64("#A");\n"
#define VLOADf(OFF,PTR,DEST) "vmovups " #OFF "*64(" #PTR "), " #DEST ";\n"
#define VLOADd(OFF,PTR,DEST) "vmovupd " #OFF "*64(" #PTR "), " #DEST ";\n"
#define VADDf(A,B,DEST) "vaddps " #A "," #B "," #DEST ";\n"
#define VADDd(A,B,DEST) "vaddpd " #A "," #B "," #DEST ";\n"
#define VSUBf(A,B,DEST) "vsubps " #A "," #B "," #DEST ";\n"
#define VSUBd(A,B,DEST) "vsubpd " #A "," #B "," #DEST ";\n"
#define VADDMEMf(O,A,B,DEST) "vaddps "#O"*64("#A ")," #B "," #DEST ";\n"
#define VADDMEMd(O,A,B,DEST) "vaddpd "#O"*64("#A ")," #B "," #DEST ";\n"
#define VSUBMEMf(O,A,B,DEST) "vsubps "#O"*64("#A ")," #B "," #DEST ";\n"
#define VSUBMEMd(O,A,B,DEST) "vsubpd "#O"*64("#A ")," #B "," #DEST ";\n"
#define VMULf(A,B,DEST) "vmulps " #A "," #B "," #DEST ";\n"
#define VMULd(A,B,DEST) "vmulpd " #A "," #B "," #DEST ";\n"
#define VMADDf(A,B,DEST) "vfmadd231ps " #A "," #B "," #DEST ";\n"
#define VMADDd(A,B,DEST) "vfmadd231pd " #A "," #B "," #DEST ";\n"
#define VMULMEMf(O,A,B,DEST) "vmulps " #O"*64("#A ")," #B "," #DEST ";\n"
#define VMULMEMd(O,A,B,DEST) "vmulpd " #O"*64("#A ")," #B "," #DEST ";\n"
#define VMADDMEMf(O,A,B,DEST) "vfmadd231ps " #O"*64("#A "),"#B "," #DEST ";\n"
#define VMADDMEMd(O,A,B,DEST) "vfmadd231pd " #O"*64("#A "),"#B "," #DEST ";\n"
#define ZLOADf(OFF,PTR,ri,ir) VLOADf(OFF,PTR,ir) VSHUFf(ir,ri)
#define ZLOADd(OFF,PTR,ri,ir) VLOADd(OFF,PTR,ir) VSHUFd(ir,ri)
#define STREAM_STORE
#ifdef STREAM_STORE
#define VSTOREf(OFF,PTR,SRC) "vmovntps " #SRC "," #OFF "*64(" #PTR ")" ";\n"
#define VSTOREd(OFF,PTR,SRC) "vmovntpd " #SRC "," #OFF "*64(" #PTR ")" ";\n"
#else
#define VSTOREf(OFF,PTR,SRC) "vmovups " #SRC "," #OFF "*64(" #PTR ")" ";\n"
#define VSTOREd(OFF,PTR,SRC) "vmovupd " #SRC "," #OFF "*64(" #PTR ")" ";\n"
#endif
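// With STREAM_STORE defined (the default above) results are written with the
// non-temporal vmovntps/vmovntpd forms, bypassing the cache on the result write.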
// Swaps Re/Im ; could unify this with IMCI
#define VSHUFd(A,DEST) "vpshufd $0x4e," #A "," #DEST ";\n"
#define VSHUFf(A,DEST) "vpshufd $0xb1," #A "," #DEST ";\n"
#define VSHUFMEMd(OFF,A,DEST) "vpshufd $0x4e, " #OFF"*64("#A ")," #DEST ";\n" // 32 bit level: 1,0,3,2
#define VSHUFMEMf(OFF,A,DEST) "vpshufd $0xb1, " #OFF"*64("#A ")," #DEST ";\n" // 32 bit level: 2,3,0,1
#define TRAP " int3 ;\n"
#endif

156
Grid/simd/Intel512double.h Normal file
@ -0,0 +1,156 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/simd/Avx512Asm.h
Copyright (C) 2015
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
// No include guard; this header may be multiply included, as it clears macros via #undef
#undef VZERO
#undef VMOV
#undef VLOAD
#undef VSTORE
#define VZERO(A) VZEROd(A)
#define VMOV(A,B) VMOVd(A,B)
#define VLOAD(OFF,PTR,DEST) VLOADd(OFF,PTR,DEST)
#define VSTORE(OFF,PTR,SRC) VSTOREd(OFF,PTR,SRC)
#undef VADD
#undef VSUB
#undef VMUL
#undef VMADD
#define VADD(A,B,C) VADDd(A,B,C)
#define VSUB(A,B,C) VSUBd(A,B,C)
#define VMUL(Uri,Uir,Chi) VMULd(Uri,Uir,Chi)
#define VMADD(Uri,Uir,Chi) VMADDd(Uri,Uir,Chi)
#undef VTIMESI
#undef VTIMESI0
#undef VTIMESI1
#undef VTIMESI2
#define VTIMESI(A,B,C) VTIMESId(A,B,C)
#define VTIMESI0(A,B,C) VTIMESI0d(A,B,C)
#define VTIMESI1(A,B,C) VTIMESI1d(A,B,C)
#define VTIMESI2(A,B,C) VTIMESI2d(A,B,C)
#undef VTIMESMINUSI
#undef VTIMESMINUSI0
#undef VTIMESMINUSI1
#undef VTIMESMINUSI2
#define VTIMESMINUSI(A,B,C) VTIMESMINUSId(A,B,C)
#define VTIMESMINUSI0(A,B,C) VTIMESMINUSI0d(A,B,C)
#define VTIMESMINUSI1(A,B,C) VTIMESMINUSI1d(A,B,C)
#define VTIMESMINUSI2(A,B,C) VTIMESMINUSI2d(A,B,C)
#undef VACCTIMESI
#undef VACCTIMESI0
#undef VACCTIMESI1
#undef VACCTIMESI2
#define VACCTIMESI(A,B,C) VACCTIMESId(A,B,C)
#define VACCTIMESI0(A,B,C) VACCTIMESI0d(A,B,C)
#define VACCTIMESI1(A,B,C) VACCTIMESI1d(A,B,C)
#define VACCTIMESI2(A,B,C) VACCTIMESI2d(A,B,C)
#undef VACCTIMESMINUSI
#undef VACCTIMESMINUSI0
#undef VACCTIMESMINUSI1
#undef VACCTIMESMINUSI2
#define VACCTIMESMINUSI(A,B,C) VACCTIMESMINUSId(A,B,C)
#define VACCTIMESMINUSI0(A,B,C) VACCTIMESMINUSI0d(A,B,C)
#define VACCTIMESMINUSI1(A,B,C) VACCTIMESMINUSI1d(A,B,C)
#define VACCTIMESMINUSI2(A,B,C) VACCTIMESMINUSI2d(A,B,C)
#undef VACCTIMESI1MEM
#undef VACCTIMESI2MEM
#define VACCTIMESI1MEM(A,ACC,O,P) VACCTIMESI1MEMd(A,ACC,O,P)
#define VACCTIMESI2MEM(A,ACC,O,P) VACCTIMESI2MEMd(A,ACC,O,P)
#undef VACCTIMESMINUSI1MEM
#undef VACCTIMESMINUSI2MEM
#define VACCTIMESMINUSI1MEM(A,ACC,O,P) VACCTIMESMINUSI1MEMd(A,ACC,O,P)
#define VACCTIMESMINUSI2MEM(A,ACC,O,P) VACCTIMESMINUSI2MEMd(A,ACC,O,P)
#undef VPERM0
#undef VPERM1
#undef VPERM2
#undef VPERM3
#define VPERM0(A,B) VPERM0d(A,B)
#define VPERM1(A,B) VPERM1d(A,B)
#define VPERM2(A,B) VPERM2d(A,B)
#define VPERM3(A,B) VPERM3d(A,B)
#undef VSHUFMEM
#undef VADDMEM
#undef VSUBMEM
#define VSHUFMEM(OFF,A,DEST) VSHUFMEMd(OFF,A,DEST)
#define VADDMEM(O,A,B,C) VADDMEMd(O,A,B,C)
#define VSUBMEM(O,A,B,C) VSUBMEMd(O,A,B,C)
#undef VMOVIDUP
#undef VMOVRDUP
#undef VMADDSUB
#undef VSHUF
#define VMOVIDUP(A,B,C) VMOVIDUPd(A,B,C)
#define VMOVRDUP(A,B,C) VMOVRDUPd(A,B,C)
#define VMADDSUB(A,B,accum) VMADDSUBd(A,B,accum)
#define VSHUF(A,B) VSHUFd(A,B)
#undef ZEND1
#undef ZEND2
#undef ZLOAD
#undef ZMUL
#undef ZMADD
#undef ZMULMEM2SP
#undef ZMADDMEM2SP
#define ZEND1(A,B,C) ZEND1d(A,B,C)
#define ZEND2(A,B,C) ZEND2d(A,B,C)
#define ZLOAD(A,B,C,D) ZLOADd(A,B,C,D)
#define ZMUL(A,B,C,D,E) ZMULd(A,B,C,D,E)
#define ZMADD(A,B,C,D,E) ZMADDd(A,B,C,D,E)
#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
#undef VRDUP
#undef VIDUP
#undef VMADDSUBMEM
#undef VMADDMEM
#undef VMULMEM
#define VRDUP(SRC,DEST) VRDUPd(SRC,DEST)
#define VIDUP(SRC,DEST) VIDUPd(SRC,DEST)
#define VMADDSUBMEM(O,P,B,accum) VMADDSUBMEMd(O,P,B,accum)
#define VMADDMEM(O,P,B,accum) VMADDMEMd(O,P,B,accum)
#define VMULMEM(O,P,B,accum) VMULMEMd(O,P,B,accum)
#undef VMADDRDUP
#undef VMADDSUBRDUP
#undef VMADDSUBIDUP
#undef VMULRDUP
#undef VMULIDUP
#define VMADDRDUP(O,P,B,accum) VMADDRDUPd(O,P,B,accum)
#define VMADDSUBRDUP(O,P,B,accum) VMADDSUBRDUPd(O,P,B,accum)
#define VMADDSUBIDUP(O,P,B,accum) VMADDSUBIDUPd(O,P,B,accum)
#define VMULRDUP(O,P,B,accum) VMULRDUPd(O,P,B,accum)
#define VMULIDUP(O,P,B,accum) VMULIDUPd(O,P,B,accum)

127
Grid/simd/Intel512imci.h Normal file
@ -0,0 +1,127 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/simd/Avx512Asm.h
Copyright (C) 2015
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_ASM_AV512_H
#define GRID_ASM_AV512_H
////////////////////////////////////////////////////////////
// Knights Corner specials
////////////////////////////////////////////////////////////
#define ZLOADf(OFF,PTR,ri,ir) VLOADf(OFF,PTR,ir) VSHUFf(ir,ri)
#define ZLOADd(OFF,PTR,ri,ir) VLOADd(OFF,PTR,ir) VSHUFd(ir,ri)
#define ZMULf(Ari,Air,B,Criir,Ciirr) VMULf(Ari,B,Criir) VMULf(Air,B,Ciirr)
#define ZMULd(Ari,Air,B,Criir,Ciirr) VMULd(Ari,B,Criir) VMULd(Air,B,Ciirr)
#define ZMADDf(Ari,Air,B,Criir,Ciirr) VMADDf(Ari,B,Criir) VMADDf(Air,B,Ciirr)
#define ZMADDd(Ari,Air,B,Criir,Ciirr) VMADDd(Ari,B,Criir) VMADDd(Air,B,Ciirr)
#define ZENDf(Criir,Ciirr, tmp) ZEND1f(Criir,Ciirr, tmp) ZEND2f(Criir,Ciirr, tmp)
#define ZENDd(Criir,Ciirr, tmp) ZEND1d(Criir,Ciirr, tmp) ZEND2d(Criir,Ciirr, tmp)
#define ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
VSHUFMEMf(O,P,tmp) \
VMULMEMf(O,P,B,Biirr) \
VMULMEMf(O,P,C,Ciirr) \
VMULf(tmp,B,Briir) \
VMULf(tmp,C,Criir)
#define ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
VSHUFMEMd(O,P,tmp) \
VMULMEMd(O,P,B,Biirr) \
VMULMEMd(O,P,C,Ciirr) \
VMULd(tmp,B,Briir) \
VMULd(tmp,C,Criir)
#define ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
VSHUFMEMf(O,P,tmp) \
VMADDMEMf(O,P,B,Biirr) \
VMADDMEMf(O,P,C,Ciirr) \
VMADDf(tmp,B,Briir) \
VMADDf(tmp,C,Criir)
#define ZMADDMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) \
VSHUFMEMd(O,P,tmp) \
VMADDMEMd(O,P,B,Biirr) \
VMADDMEMd(O,P,C,Ciirr) \
VMADDd(tmp,B,Briir) \
VMADDd(tmp,C,Criir)
#define ZEND1d(Criir,Ciirr, tmp) "vaddpd " #Criir "{cdab} ," #Criir "," #Criir"{%k6}" ";\n"
#define ZEND2d(Criir,Ciirr, tmp) "vsubpd " #Ciirr "{cdab} ," #Ciirr "," #Criir"{%k7}" ";\n"
#define ZEND1f(Criir,Ciirr, tmp) "vaddps " #Criir "{cdab} ," #Criir "," #Criir"{%k6}" ";\n"
#define ZEND2f(Criir,Ciirr, tmp) "vsubps " #Ciirr "{cdab} ," #Ciirr "," #Criir"{%k7}" ";\n"
#define VTIMESI0f(A,DEST, Z)
#define VTIMESI1f(A,DEST, Z) "vaddps " #A "{cdab}," #Z "," #DEST"{%k7}" ";\n"
#define VTIMESI2f(A,DEST, Z) "vsubps " #A "{cdab}," #Z "," #DEST"{%k6}" ";\n"
#define VTIMESI0d(A,DEST, Z)
#define VTIMESI1d(A,DEST, Z) "vaddpd " #A "{cdab}," #Z "," #DEST"{%k7}" ";\n"
#define VTIMESI2d(A,DEST, Z) "vsubpd " #A "{cdab}," #Z "," #DEST"{%k6}" ";\n"
#define VTIMESMINUSI0f(A,DEST,Z)
#define VTIMESMINUSI1f(A,DEST,Z) "vsubps " #A "{cdab}," #Z "," #DEST"{%k7}" ";\n"
#define VTIMESMINUSI2f(A,DEST,Z) "vaddps " #A "{cdab}," #Z "," #DEST"{%k6}" ";\n"
#define VTIMESMINUSI0d(A,DEST,Z)
#define VTIMESMINUSI1d(A,DEST,Z) "vsubpd " #A "{cdab}," #Z "," #DEST"{%k7}" ";\n"
#define VTIMESMINUSI2d(A,DEST,Z) "vaddpd " #A "{cdab}," #Z "," #DEST"{%k6}" ";\n"
#define VACCTIMESI0f(A,ACC,tmp)
#define VACCTIMESI1f(A,ACC,tmp) "vaddps " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n"
#define VACCTIMESI2f(A,ACC,tmp) "vsubps " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n"
#define VACCTIMESI0d(A,ACC,tmp)
#define VACCTIMESI1d(A,ACC,tmp) "vaddpd " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n"
#define VACCTIMESI2d(A,ACC,tmp) "vsubpd " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n"
#define VACCTIMESMINUSI0f(A,ACC,tmp)
#define VACCTIMESMINUSI1f(A,ACC,tmp) "vsubps " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n"
#define VACCTIMESMINUSI2f(A,ACC,tmp) "vaddps " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n"
// Acc = Acc - i A
#define VACCTIMESMINUSI0d(A,ACC,tmp)
#define VACCTIMESMINUSI1d(A,ACC,tmp) "vsubpd " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n"
#define VACCTIMESMINUSI2d(A,ACC,tmp) "vaddpd " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n"
//((1<<6)|(0<<4)|(3<<2)|(2)) == 0100,1110 = 0x4e
//((2<<6)|(3<<4)|(0<<2)|(1)) == 1011,0001 = 0xb1
#define VPERM0f(A,B) "vpermf32x4 $0x4e," #A "," #B ";\n"
#define VPERM1f(A,B) "vpermf32x4 $0xb1," #A "," #B ";\n"
#define VPERM2f(A,B) "vmovaps " #A "{badc}," #B ";\n"
#define VPERM3f(A,B) "vmovaps " #A "{cdab}," #B ";\n"
#define VPERM0d(A,B) "vpermf32x4 $0x4e," #A "," #B ";\n"
#define VPERM1d(A,B) "vmovapd " #A "{badc}," #B ";\n"
#define VPERM2d(A,B) "vmovapd " #A "{cdab}," #B ";\n"
#define VPERM3d(A,B) VMOVd(A,B)
#endif

157
Grid/simd/Intel512single.h Normal file
@ -0,0 +1,157 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/simd/Avx512Asm.h
Copyright (C) 2015
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
// No include guard; this header may be multiply included, as it clears macros via #undef
#undef VZERO
#undef VMOV
#undef VLOAD
#undef VSTORE
#define VZERO(A) VZEROf(A)
#define VMOV(A,B) VMOVf(A,B)
#define VLOAD(OFF,PTR,DEST) VLOADf(OFF,PTR,DEST)
#define VSTORE(OFF,PTR,SRC) VSTOREf(OFF,PTR,SRC)
#undef VADD
#undef VSUB
#undef VMUL
#undef VMADD
#define VADD(A,B,C) VADDf(A,B,C)
#define VSUB(A,B,C) VSUBf(A,B,C)
#define VMUL(Uri,Uir,Chi) VMULf(Uri,Uir,Chi)
#define VMADD(Uri,Uir,Chi) VMADDf(Uri,Uir,Chi)
#undef VTIMESI
#undef VTIMESI0
#undef VTIMESI1
#undef VTIMESI2
#define VTIMESI(A,B,C) VTIMESIf(A,B,C)
#define VTIMESI0(A,B,C) VTIMESI0f(A,B,C)
#define VTIMESI1(A,B,C) VTIMESI1f(A,B,C)
#define VTIMESI2(A,B,C) VTIMESI2f(A,B,C)
#undef VTIMESMINUSI
#undef VTIMESMINUSI0
#undef VTIMESMINUSI1
#undef VTIMESMINUSI2
#define VTIMESMINUSI(A,B,C) VTIMESMINUSIf(A,B,C)
#define VTIMESMINUSI0(A,B,C) VTIMESMINUSI0f(A,B,C)
#define VTIMESMINUSI1(A,B,C) VTIMESMINUSI1f(A,B,C)
#define VTIMESMINUSI2(A,B,C) VTIMESMINUSI2f(A,B,C)
#undef VACCTIMESI
#undef VACCTIMESI0
#undef VACCTIMESI1
#undef VACCTIMESI2
#define VACCTIMESI(A,B,C) VACCTIMESIf(A,B,C)
#define VACCTIMESI0(A,B,C) VACCTIMESI0f(A,B,C)
#define VACCTIMESI1(A,B,C) VACCTIMESI1f(A,B,C)
#define VACCTIMESI2(A,B,C) VACCTIMESI2f(A,B,C)
#undef VACCTIMESMINUSI
#undef VACCTIMESMINUSI0
#undef VACCTIMESMINUSI1
#undef VACCTIMESMINUSI2
#define VACCTIMESMINUSI(A,B,C) VACCTIMESMINUSIf(A,B,C)
#define VACCTIMESMINUSI0(A,B,C) VACCTIMESMINUSI0f(A,B,C)
#define VACCTIMESMINUSI1(A,B,C) VACCTIMESMINUSI1f(A,B,C)
#define VACCTIMESMINUSI2(A,B,C) VACCTIMESMINUSI2f(A,B,C)
#undef VACCTIMESI1MEM
#undef VACCTIMESI2MEM
#define VACCTIMESI1MEM(A,ACC,O,P) VACCTIMESI1MEMf(A,ACC,O,P)
#define VACCTIMESI2MEM(A,ACC,O,P) VACCTIMESI2MEMf(A,ACC,O,P)
#undef VACCTIMESMINUSI1MEM
#undef VACCTIMESMINUSI2MEM
#define VACCTIMESMINUSI1MEM(A,ACC,O,P) VACCTIMESMINUSI1MEMf(A,ACC,O,P)
#define VACCTIMESMINUSI2MEM(A,ACC,O,P) VACCTIMESMINUSI2MEMf(A,ACC,O,P)
#undef VPERM0
#undef VPERM1
#undef VPERM2
#undef VPERM3
#define VPERM0(A,B) VPERM0f(A,B)
#define VPERM1(A,B) VPERM1f(A,B)
#define VPERM2(A,B) VPERM2f(A,B)
#define VPERM3(A,B) VPERM3f(A,B)
#undef VSHUFMEM
#undef VADDMEM
#undef VSUBMEM
#define VSHUFMEM(OFF,A,DEST) VSHUFMEMf(OFF,A,DEST)
#define VADDMEM(O,A,B,C) VADDMEMf(O,A,B,C)
#define VSUBMEM(O,A,B,C) VSUBMEMf(O,A,B,C)
#undef VMOVIDUP
#undef VMOVRDUP
#undef VMADDSUB
#undef VSHUF
#define VMOVIDUP(A,B,C) VMOVIDUPf(A,B,C)
#define VMOVRDUP(A,B,C) VMOVRDUPf(A,B,C)
#define VMADDSUB(A,B,accum) VMADDSUBf(A,B,accum)
#define VSHUF(A,B) VSHUFf(A,B)
#undef ZEND1
#undef ZEND2
#undef ZLOAD
#undef ZMUL
#undef ZMADD
#undef ZMULMEM2SP
#undef ZMADDMEM2SP
#define ZEND1(A,B,C) ZEND1f(A,B,C)
#define ZEND2(A,B,C) ZEND2f(A,B,C)
#define ZLOAD(A,B,C,D) ZLOADf(A,B,C,D)
#define ZMUL(A,B,C,D,E) ZMULf(A,B,C,D,E)
#define ZMADD(A,B,C,D,E) ZMADDf(A,B,C,D,E)
#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
#undef VRDUP
#undef VIDUP
#undef VMADDSUBMEM
#undef VMADDMEM
#undef VMULMEM
#define VRDUP(SRC,DEST) VRDUPf(SRC,DEST)
#define VIDUP(SRC,DEST) VIDUPf(SRC,DEST)
#define VMADDSUBMEM(O,P,B,accum) VMADDSUBMEMf(O,P,B,accum)
#define VMADDMEM(O,P,B,accum) VMADDMEMf(O,P,B,accum)
#define VMULMEM(O,P,B,accum) VMULMEMf(O,P,B,accum)
#undef VMADDRDUP
#undef VMADDSUBRDUP
#undef VMADDSUBIDUP
#undef VMULRDUP
#undef VMULIDUP
#define VMADDRDUP(O,P,B,accum) VMADDRDUPf(O,P,B,accum)
#define VMADDSUBRDUP(O,P,B,accum) VMADDSUBRDUPf(O,P,B,accum)
#define VMADDSUBIDUP(O,P,B,accum) VMADDSUBIDUPf(O,P,B,accum)
#define VMULRDUP(O,P,B,accum) VMULRDUPf(O,P,B,accum)
#define VMULIDUP(O,P,B,accum) VMULIDUPf(O,P,B,accum)

938
Grid/simd/Intel512wilson.h Normal file
@ -0,0 +1,938 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/simd/Avx512Asm.h
Copyright (C) 2015
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_ASM_INTEL_512_QCD_H
#define GRID_ASM_INTEL_512_QCD_H
//////////////////////////////////////////////////////////////////////////////////////////
// Register allocations for the Wilson kernel are precision independent
//////////////////////////////////////////////////////////////////////////////////////////
#define psi_00 %zmm0
#define psi_01 %zmm1
#define psi_02 %zmm2
#define psi_10 %zmm3
#define psi_11 %zmm4
#define psi_12 %zmm5
#define psi_20 %zmm6
#define psi_21 %zmm7
#define psi_22 %zmm8
#define psi_30 %zmm9
#define psi_31 %zmm10
#define psi_32 %zmm11
#define Chi_00 %zmm12
#define Chi_01 %zmm13
#define Chi_02 %zmm14
#define Chi_10 %zmm15
#define Chi_11 %zmm16
#define Chi_12 %zmm17
#define UChi_00 %zmm18
#define UChi_01 %zmm19
#define UChi_02 %zmm20
#define UChi_10 %zmm21
#define UChi_11 %zmm22
#define UChi_12 %zmm23
#define Uir %zmm24
#define Uri %zmm25
#define T1 %zmm24
#define T2 %zmm25
#define Z0 %zmm26
#define Z1 %zmm27
#define Z2 %zmm28
#define Z3 %zmm29
#define Z4 %zmm30
#define Z5 %zmm31
#define TMP Chi_00
#define Chimu_00 Chi_00
#define Chimu_01 Chi_01
#define Chimu_02 Chi_02
#define Chimu_10 Chi_10
#define Chimu_11 Chi_11
#define Chimu_12 Chi_12
#define Chimu_20 UChi_00
#define Chimu_21 UChi_01
#define Chimu_22 UChi_02
#define Chimu_30 UChi_10
#define Chimu_31 UChi_11
#define Chimu_32 UChi_12
#include "Intel512common.h"
#include "Intel512avx.h"
//////////////////////////////////////////////////////////////////
// Macros used to build the Wilson kernel -- could be rationalised and simplified
// a little, as some duplication crept in while trying different
// variants during optimisation. Could be cut back to only those actually used.
//////////////////////////////////////////////////////////////////
#define LOCK_GAUGE(dir)
#define UNLOCK_GAUGE(dir)
// const SiteSpinor * ptr = & in._odata[offset];
#define LOAD_CHIMU(PTR) LOAD64(%r8,PTR) __asm__ ( LOAD_CHIMUi );
#define LOAD_CHI(PTR) LOAD64(%r8,PTR) __asm__ ( LOAD_CHIi );
#define SAVE_UCHI(PTR) SAVE_UCHIi(PTR)
#define SAVE_CHI(PTR) SAVE_CHIi(PTR)
#define SAVE_RESULT(PT,R) SAVE_RESULTi(PT,R)
#define ADD_RESULT(PT,R) ADD_RESULTi(PT,R)
#define ZERO_PSI \
asm( VZERO(psi_00) \
VZERO(psi_01) \
VZERO(psi_02) \
VZERO(psi_10) \
VZERO(psi_11) \
VZERO(psi_12) \
VZERO(psi_20) \
VZERO(psi_21) \
VZERO(psi_22) \
VZERO(psi_30) \
VZERO(psi_31) \
VZERO(psi_32));
#define LOAD_CHIMUi \
LOAD_CHIMU01i \
LOAD_CHIMU23i
#define LOAD_CHIMU01i \
VLOAD(0,%r8,Chimu_00) \
VLOAD(1,%r8,Chimu_01) \
VLOAD(2,%r8,Chimu_02) \
VLOAD(3,%r8,Chimu_10) \
VLOAD(4,%r8,Chimu_11) \
VLOAD(5,%r8,Chimu_12)
#define LOAD_CHIMU23i \
VLOAD(6,%r8,Chimu_20) \
VLOAD(7,%r8,Chimu_21) \
VLOAD(8,%r8,Chimu_22) \
VLOAD(9,%r8,Chimu_30) \
VLOAD(10,%r8,Chimu_31) \
VLOAD(11,%r8,Chimu_32)
#define SHUF_CHIMU23i\
VSHUFMEM(6,%r8,Chimu_20) \
VSHUFMEM(7,%r8,Chimu_21) \
VSHUFMEM(8,%r8,Chimu_22) \
VSHUFMEM(9,%r8,Chimu_30) \
VSHUFMEM(10,%r8,Chimu_31) \
VSHUFMEM(11,%r8,Chimu_32)
#define LOAD_CHIi \
VLOAD(0,%r8,Chi_00) \
VLOAD(1,%r8,Chi_01) \
VLOAD(2,%r8,Chi_02) \
VLOAD(3,%r8,Chi_10) \
VLOAD(4,%r8,Chi_11) \
VLOAD(5,%r8,Chi_12)
#define SAVE_UCHIi(PTR) \
LOAD64(%r8,PTR) \
__asm__ ( \
VSTORE(0,%r8,UChi_00) \
VSTORE(1,%r8,UChi_01) \
VSTORE(2,%r8,UChi_02) \
VSTORE(3,%r8,UChi_10) \
VSTORE(4,%r8,UChi_11) \
VSTORE(5,%r8,UChi_12) );
#define SAVE_CHIi(PTR) \
LOAD64(%r8,PTR) \
__asm__ ( \
VSTORE(0,%r8,Chi_00) \
VSTORE(1,%r8,Chi_01) \
VSTORE(2,%r8,Chi_02) \
VSTORE(3,%r8,Chi_10) \
VSTORE(4,%r8,Chi_11) \
VSTORE(5,%r8,Chi_12) );
#define MULT_2SPIN_DIR_PF(A,p) MULT_2SPIN_PF(&U._odata[sU](A),p)
#define MULT_2SPIN_PF(ptr,pf) MULT_2SPIN(ptr,pf)
//////////////////////////////////////////////////////////////////
// Dirac algebra
//////////////////////////////////////////////////////////////////
// hspin(0)=fspin(0)+timesI(fspin(3));
// hspin(1)=fspin(1)+timesI(fspin(2));
#define XP_PROJMEM(PTR) \
LOAD64(%r8,PTR) \
__asm__ ( \
LOAD_CHIi \
SHUF_CHIMU23i \
VACCTIMESI1(Chi_00,Chi_00,Chimu_30) \
VACCTIMESI1(Chi_01,Chi_01,Chimu_31) \
VACCTIMESI1(Chi_02,Chi_02,Chimu_32) \
VACCTIMESI1(Chi_10,Chi_10,Chimu_20) \
VACCTIMESI1(Chi_11,Chi_11,Chimu_21) \
VACCTIMESI1(Chi_12,Chi_12,Chimu_22) \
VACCTIMESI2(Chi_00,Chi_00,Chimu_30) \
VACCTIMESI2(Chi_01,Chi_01,Chimu_31) \
VACCTIMESI2(Chi_02,Chi_02,Chimu_32) \
VACCTIMESI2(Chi_10,Chi_10,Chimu_20) \
VACCTIMESI2(Chi_11,Chi_11,Chimu_21) \
VACCTIMESI2(Chi_12,Chi_12,Chimu_22) );
#define YP_PROJMEM(ptr) \
LOAD64(%r8,ptr) \
__asm__ ( \
LOAD_CHIMU01i \
VSUBMEM(9,%r8 ,Chimu_00,Chi_00) \
VSUBMEM(10,%r8,Chimu_01,Chi_01) \
VSUBMEM(11,%r8,Chimu_02,Chi_02) \
VADDMEM(6,%r8,Chimu_10,Chi_10) \
VADDMEM(7,%r8,Chimu_11,Chi_11) \
VADDMEM(8,%r8,Chimu_12,Chi_12) );
#define ZP_PROJMEM(PTR) \
LOAD64(%r8,PTR) \
__asm__ ( \
LOAD_CHIi \
SHUF_CHIMU23i \
VACCTIMESI1(Chi_00,Chi_00,Chimu_20) \
VACCTIMESI1(Chi_01,Chi_01,Chimu_21) \
VACCTIMESI1(Chi_02,Chi_02,Chimu_22) \
VACCTIMESMINUSI1(Chi_10,Chi_10,Chimu_30) \
VACCTIMESMINUSI1(Chi_11,Chi_11,Chimu_31) \
VACCTIMESMINUSI1(Chi_12,Chi_12,Chimu_32) \
VACCTIMESI2(Chi_00,Chi_00,Chimu_20) \
VACCTIMESI2(Chi_01,Chi_01,Chimu_21) \
VACCTIMESI2(Chi_02,Chi_02,Chimu_22) \
VACCTIMESMINUSI2(Chi_10,Chi_10,Chimu_30) \
VACCTIMESMINUSI2(Chi_11,Chi_11,Chimu_31) \
VACCTIMESMINUSI2(Chi_12,Chi_12,Chimu_32) );
#define TP_PROJMEM(ptr) \
LOAD64(%r8,ptr) \
__asm__ ( \
LOAD_CHIMU01i \
VADDMEM(6,%r8 ,Chimu_00,Chi_00) \
VADDMEM(7,%r8,Chimu_01,Chi_01) \
VADDMEM(8,%r8,Chimu_02,Chi_02) \
VADDMEM(9,%r8,Chimu_10,Chi_10) \
VADDMEM(10,%r8,Chimu_11,Chi_11) \
VADDMEM(11,%r8,Chimu_12,Chi_12) );
// hspin(0)=fspin(0)-timesI(fspin(3))
// hspin(1)=fspin(1)-timesI(fspin(2))
#define XM_PROJMEM(PTR) \
LOAD64(%r8,PTR)\
__asm__ ( \
LOAD_CHIi \
SHUF_CHIMU23i \
VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_30)\
VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_31)\
VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_32)\
VACCTIMESMINUSI1(Chi_10,Chi_10,Chimu_20)\
VACCTIMESMINUSI1(Chi_11,Chi_11,Chimu_21)\
VACCTIMESMINUSI1(Chi_12,Chi_12,Chimu_22)\
VACCTIMESMINUSI2(Chi_00,Chi_00,Chimu_30)\
VACCTIMESMINUSI2(Chi_01,Chi_01,Chimu_31)\
VACCTIMESMINUSI2(Chi_02,Chi_02,Chimu_32)\
VACCTIMESMINUSI2(Chi_10,Chi_10,Chimu_20)\
VACCTIMESMINUSI2(Chi_11,Chi_11,Chimu_21)\
VACCTIMESMINUSI2(Chi_12,Chi_12,Chimu_22) );
#define YM_PROJMEM(ptr) \
LOAD64(%r8,ptr) \
__asm__ ( \
LOAD_CHIMU01i \
VADDMEM(9,%r8 ,Chimu_00,Chi_00) \
VADDMEM(10,%r8,Chimu_01,Chi_01) \
VADDMEM(11,%r8,Chimu_02,Chi_02) \
VSUBMEM(6,%r8,Chimu_10,Chi_10) \
VSUBMEM(7,%r8,Chimu_11,Chi_11) \
VSUBMEM(8,%r8,Chimu_12,Chi_12) );
#define ZM_PROJMEM(PTR) \
LOAD64(%r8,PTR) \
__asm__ ( \
LOAD_CHIi \
SHUF_CHIMU23i \
VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_20)\
VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_21)\
VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_22)\
VACCTIMESI1(Chi_10,Chi_10,Chimu_30)\
VACCTIMESI1(Chi_11,Chi_11,Chimu_31)\
VACCTIMESI1(Chi_12,Chi_12,Chimu_32)\
VACCTIMESMINUSI2(Chi_00,Chi_00,Chimu_20)\
VACCTIMESMINUSI2(Chi_01,Chi_01,Chimu_21)\
VACCTIMESMINUSI2(Chi_02,Chi_02,Chimu_22)\
VACCTIMESI2(Chi_10,Chi_10,Chimu_30)\
VACCTIMESI2(Chi_11,Chi_11,Chimu_31)\
VACCTIMESI2(Chi_12,Chi_12,Chimu_32) );
#define TM_PROJMEM(ptr) \
LOAD64(%r8,ptr) \
__asm__ ( \
LOAD_CHIMU01i \
VSUBMEM(6,%r8,Chimu_00,Chi_00) \
VSUBMEM(7,%r8,Chimu_01,Chi_01) \
VSUBMEM(8,%r8,Chimu_02,Chi_02) \
VSUBMEM(9,%r8,Chimu_10,Chi_10) \
VSUBMEM(10,%r8,Chimu_11,Chi_11) \
VSUBMEM(11,%r8,Chimu_12,Chi_12) );
// fspin(0)=hspin(0)
// fspin(1)=hspin(1)
// fspin(2)=timesMinusI(hspin(1))
// fspin(3)=timesMinusI(hspin(0))
#define XP_RECON __asm__ ( \
VZERO(TMP) \
VTIMESMINUSI0(UChi_00,psi_30,TMP) \
VTIMESMINUSI0(UChi_10,psi_20,TMP) \
VTIMESMINUSI0(UChi_01,psi_31,TMP) \
VTIMESMINUSI0(UChi_11,psi_21,TMP) \
VTIMESMINUSI0(UChi_02,psi_32,TMP) \
VTIMESMINUSI0(UChi_12,psi_22,TMP) \
VMOV(UChi_00,psi_00) \
VMOV(UChi_10,psi_10) \
VMOV(UChi_01,psi_01) \
VMOV(UChi_11,psi_11) \
VMOV(UChi_02,psi_02) \
VMOV(UChi_12,psi_12) \
VTIMESMINUSI1(UChi_10,psi_20,TMP) \
VTIMESMINUSI1(UChi_11,psi_21,TMP) \
VTIMESMINUSI1(UChi_12,psi_22,TMP) \
VTIMESMINUSI1(UChi_00,psi_30,TMP) \
VTIMESMINUSI1(UChi_01,psi_31,TMP) \
VTIMESMINUSI1(UChi_02,psi_32,TMP) \
VTIMESMINUSI2(UChi_10,psi_20,TMP) \
VTIMESMINUSI2(UChi_11,psi_21,TMP) \
VTIMESMINUSI2(UChi_12,psi_22,TMP) \
VTIMESMINUSI2(UChi_00,psi_30,TMP) \
VTIMESMINUSI2(UChi_01,psi_31,TMP) \
VTIMESMINUSI2(UChi_02,psi_32,TMP) \
);
// NB could save 6 ops using addsub => 12 cycles
#define XP_RECON_ACCUM __asm__ ( \
VZERO(TMP)\
VACCTIMESMINUSI0(UChi_00,psi_30,Z3)\
VACCTIMESMINUSI0(UChi_10,psi_20,Z0)\
VACCTIMESMINUSI0(UChi_01,psi_31,Z4)\
VACCTIMESMINUSI0(UChi_11,psi_21,Z1)\
VACCTIMESMINUSI0(UChi_02,psi_32,Z5)\
VACCTIMESMINUSI0(UChi_12,psi_22,Z2)\
VADD(UChi_00,psi_00,psi_00)\
VADD(UChi_10,psi_10,psi_10)\
VADD(UChi_01,psi_01,psi_01)\
VADD(UChi_11,psi_11,psi_11)\
VADD(UChi_02,psi_02,psi_02)\
VADD(UChi_12,psi_12,psi_12)\
VACCTIMESMINUSI1(UChi_00,psi_30,Z3)\
VACCTIMESMINUSI1(UChi_10,psi_20,Z0)\
VACCTIMESMINUSI1(UChi_01,psi_31,Z4)\
VACCTIMESMINUSI1(UChi_11,psi_21,Z1)\
VACCTIMESMINUSI1(UChi_02,psi_32,Z5)\
VACCTIMESMINUSI1(UChi_12,psi_22,Z2)\
VACCTIMESMINUSI2(UChi_10,psi_20,Z0)\
VACCTIMESMINUSI2(UChi_11,psi_21,Z1)\
VACCTIMESMINUSI2(UChi_12,psi_22,Z2)\
VACCTIMESMINUSI2(UChi_00,psi_30,Z3)\
VACCTIMESMINUSI2(UChi_01,psi_31,Z4)\
VACCTIMESMINUSI2(UChi_02,psi_32,Z5)\
);
#define XM_RECON __asm__ ( \
VZERO(TMP)\
VTIMESI0(UChi_00,psi_30,TMP)\
VTIMESI0(UChi_10,psi_20,TMP)\
VTIMESI0(UChi_01,psi_31,TMP)\
VTIMESI0(UChi_11,psi_21,TMP)\
VTIMESI0(UChi_02,psi_32,TMP)\
VTIMESI0(UChi_12,psi_22,TMP)\
VMOV(UChi_00,psi_00)\
VMOV(UChi_10,psi_10)\
VMOV(UChi_01,psi_01)\
VMOV(UChi_11,psi_11)\
VMOV(UChi_02,psi_02)\
VMOV(UChi_12,psi_12)\
VTIMESI1(UChi_00,psi_30,TMP)\
VTIMESI1(UChi_10,psi_20,TMP)\
VTIMESI1(UChi_01,psi_31,TMP)\
VTIMESI1(UChi_11,psi_21,TMP)\
VTIMESI1(UChi_02,psi_32,TMP)\
VTIMESI1(UChi_12,psi_22,TMP)\
VTIMESI2(UChi_10,psi_20,TMP)\
VTIMESI2(UChi_11,psi_21,TMP)\
VTIMESI2(UChi_12,psi_22,TMP)\
VTIMESI2(UChi_00,psi_30,TMP)\
VTIMESI2(UChi_01,psi_31,TMP)\
VTIMESI2(UChi_02,psi_32,TMP)\
);
#define XM_RECON_ACCUM __asm__ ( \
VACCTIMESI0(UChi_10,psi_20,Z0)\
VACCTIMESI0(UChi_00,psi_30,Z3)\
VACCTIMESI0(UChi_11,psi_21,Z1)\
VACCTIMESI0(UChi_01,psi_31,Z4)\
VACCTIMESI0(UChi_12,psi_22,Z2)\
VACCTIMESI0(UChi_02,psi_32,Z5)\
\
VADD(UChi_10,psi_10,psi_10)\
VADD(UChi_00,psi_00,psi_00)\
VADD(UChi_11,psi_11,psi_11)\
VADD(UChi_01,psi_01,psi_01)\
VADD(UChi_12,psi_12,psi_12)\
VADD(UChi_02,psi_02,psi_02)\
\
VACCTIMESI1(UChi_10,psi_20,Z0)\
VACCTIMESI1(UChi_00,psi_30,Z3)\
VACCTIMESI1(UChi_11,psi_21,Z1)\
VACCTIMESI1(UChi_01,psi_31,Z4)\
VACCTIMESI1(UChi_12,psi_22,Z2)\
VACCTIMESI1(UChi_02,psi_32,Z5)\
VACCTIMESI2(UChi_10,psi_20,Z0)\
VACCTIMESI2(UChi_11,psi_21,Z1)\
VACCTIMESI2(UChi_12,psi_22,Z2)\
VACCTIMESI2(UChi_00,psi_30,Z3)\
VACCTIMESI2(UChi_01,psi_31,Z4)\
VACCTIMESI2(UChi_02,psi_32,Z5)\
);
#define YP_RECON_ACCUM __asm__ ( \
VADD(UChi_00,psi_00,psi_00)\
VADD(UChi_10,psi_10,psi_10)\
VADD(UChi_01,psi_01,psi_01)\
VADD(UChi_11,psi_11,psi_11)\
VADD(UChi_02,psi_02,psi_02)\
VADD(UChi_12,psi_12,psi_12)\
VADD(UChi_10,psi_20,psi_20)\
VADD(UChi_11,psi_21,psi_21)\
VADD(UChi_12,psi_22,psi_22)\
VSUB(UChi_00,psi_30,psi_30)\
VSUB(UChi_01,psi_31,psi_31)\
VSUB(UChi_02,psi_32,psi_32) );
#define YM_RECON_ACCUM __asm__ ( \
VADD(UChi_00,psi_00,psi_00)\
VADD(UChi_10,psi_10,psi_10)\
VADD(UChi_01,psi_01,psi_01)\
VADD(UChi_11,psi_11,psi_11)\
VADD(UChi_02,psi_02,psi_02)\
VADD(UChi_12,psi_12,psi_12)\
VSUB(UChi_10,psi_20,psi_20)\
VSUB(UChi_11,psi_21,psi_21)\
VSUB(UChi_12,psi_22,psi_22)\
VADD(UChi_00,psi_30,psi_30)\
VADD(UChi_01,psi_31,psi_31)\
VADD(UChi_02,psi_32,psi_32) );
#define ZP_RECON_ACCUM __asm__ ( \
VACCTIMESMINUSI0(UChi_00,psi_20,Z0)\
VACCTIMESI0(UChi_10,psi_30,Z3)\
VACCTIMESMINUSI0(UChi_01,psi_21,Z1)\
VACCTIMESI0(UChi_11,psi_31,Z4)\
VACCTIMESMINUSI0(UChi_02,psi_22,Z2)\
VACCTIMESI0(UChi_12,psi_32,Z5)\
VADD(UChi_00,psi_00,psi_00)\
VADD(UChi_10,psi_10,psi_10)\
VADD(UChi_01,psi_01,psi_01)\
VADD(UChi_11,psi_11,psi_11)\
VADD(UChi_02,psi_02,psi_02)\
VADD(UChi_12,psi_12,psi_12)\
VACCTIMESMINUSI1(UChi_00,psi_20,Z0)\
VACCTIMESI1(UChi_10,psi_30,Z3)\
VACCTIMESMINUSI1(UChi_01,psi_21,Z1)\
VACCTIMESI1(UChi_11,psi_31,Z4)\
VACCTIMESMINUSI1(UChi_02,psi_22,Z2)\
VACCTIMESI1(UChi_12,psi_32,Z5)\
VACCTIMESMINUSI2(UChi_00,psi_20,Z0)\
VACCTIMESMINUSI2(UChi_01,psi_21,Z1)\
VACCTIMESMINUSI2(UChi_02,psi_22,Z2)\
VACCTIMESI2(UChi_10,psi_30,Z3)\
VACCTIMESI2(UChi_11,psi_31,Z4)\
VACCTIMESI2(UChi_12,psi_32,Z5)\
);
#define ZM_RECON_ACCUM __asm__ ( \
VACCTIMESI0(UChi_00,psi_20,Z0)\
VACCTIMESMINUSI0(UChi_10,psi_30,Z3)\
VACCTIMESI0(UChi_01,psi_21,Z1)\
VACCTIMESMINUSI0(UChi_11,psi_31,Z4)\
VACCTIMESI0(UChi_02,psi_22,Z2)\
VACCTIMESMINUSI0(UChi_12,psi_32,Z5)\
VADD(UChi_00,psi_00,psi_00)\
VADD(UChi_10,psi_10,psi_10)\
VADD(UChi_01,psi_01,psi_01)\
VADD(UChi_11,psi_11,psi_11)\
VADD(UChi_02,psi_02,psi_02)\
VADD(UChi_12,psi_12,psi_12)\
VACCTIMESI1(UChi_00,psi_20,Z0)\
VACCTIMESMINUSI1(UChi_10,psi_30,Z3)\
VACCTIMESI1(UChi_01,psi_21,Z1)\
VACCTIMESMINUSI1(UChi_11,psi_31,Z4)\
VACCTIMESI1(UChi_02,psi_22,Z2)\
VACCTIMESMINUSI1(UChi_12,psi_32,Z5)\
VACCTIMESI2(UChi_00,psi_20,Z0)\
VACCTIMESI2(UChi_01,psi_21,Z1)\
VACCTIMESI2(UChi_02,psi_22,Z2)\
VACCTIMESMINUSI2(UChi_10,psi_30,Z3)\
VACCTIMESMINUSI2(UChi_11,psi_31,Z4)\
VACCTIMESMINUSI2(UChi_12,psi_32,Z5)\
);
#define TP_RECON_ACCUM __asm__ ( \
VADD(UChi_00,psi_00,psi_00)\
VADD(UChi_10,psi_10,psi_10)\
VADD(UChi_01,psi_01,psi_01)\
VADD(UChi_11,psi_11,psi_11)\
VADD(UChi_02,psi_02,psi_02)\
VADD(UChi_12,psi_12,psi_12)\
VADD(UChi_00,psi_20,psi_20)\
VADD(UChi_10,psi_30,psi_30)\
VADD(UChi_01,psi_21,psi_21)\
VADD(UChi_11,psi_31,psi_31)\
VADD(UChi_02,psi_22,psi_22)\
VADD(UChi_12,psi_32,psi_32) );
#define TM_RECON_ACCUM __asm__ ( \
VADD(UChi_00,psi_00,psi_00)\
VADD(UChi_10,psi_10,psi_10)\
VADD(UChi_01,psi_01,psi_01)\
VADD(UChi_11,psi_11,psi_11)\
VADD(UChi_02,psi_02,psi_02)\
VADD(UChi_12,psi_12,psi_12)\
VSUB(UChi_00,psi_20,psi_20)\
VSUB(UChi_10,psi_30,psi_30)\
VSUB(UChi_01,psi_21,psi_21)\
VSUB(UChi_11,psi_31,psi_31)\
VSUB(UChi_02,psi_22,psi_22)\
VSUB(UChi_12,psi_32,psi_32) );
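// The *_RECON_ACCUM macros for the T direction follow the same pattern; as a
// scalar sketch (illustrative only): the upper two spins always accumulate
// UChi, while the lower two accumulate +UChi for Tp and -UChi for Tm.
#if 0
#include <complex>
typedef std::complex<double> cplx;
inline void T_recon_accum_ref(const cplx uchi[2][3], cplx psi[4][3],
                              int sign /* +1 for Tp, -1 for Tm */) {
  const double s = (sign >= 0) ? 1.0 : -1.0;
  for (int c = 0; c < 3; c++) {
    psi[0][c] += uchi[0][c];
    psi[1][c] += uchi[1][c];
    psi[2][c] += s * uchi[0][c];
    psi[3][c] += s * uchi[1][c];
  }
}
#endif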
#define AVX512_PF_L1
#define AVX512_PF_L2_GAUGE
#define AVX512_PF_L2_TABLE
#undef AVX512_PF_L2_LINEAR
#ifdef AVX512_PF_L2_TABLE
// Table scheme: P1 prefetches the base pointer for the next link into L1;
// M1 prefetches the next site pointer into L2.
#define VPREFETCH_P1(A,B) VPREFETCH1(A,B)
#define VPREFETCH_P2(A,B)
#define VPREFETCH_M1(A,B) VPREFETCH2(A,B)
#define VPREFETCH_M2(A,B)
#endif
#ifdef AVX512_PF_L2_LINEAR
#define VPREFETCH_M1(A,B) VPREFETCH1(A,B)
#define VPREFETCH_M2(A,B) VPREFETCH2(A,B)
#define VPREFETCH_P1(A,B)
#define VPREFETCH_P2(A,B)
#endif
#ifdef AVX512_PF_L2_GAUGE
#define VPREFETCH_G1(A,B) VPREFETCH1(A,B)
#define VPREFETCH_G2(A,B) VPREFETCH2(A,B)
#endif
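// G1/G2 prefetch gauge-link data into L1/L2 respectively; they are used by
// PF_GAUGE below and inside the two-spin multiply kernels.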
#define PF_GAUGE(A) \
LOAD64(%r8,&U._odata[sU](A)) \
__asm__ ( \
VPREFETCH_G1(0,%r8) VPREFETCH_G1(1,%r8) \
VPREFETCH_G1(2,%r8) VPREFETCH_G1(3,%r8) \
);
#define SAVE_RESULTi(PTR,pf) \
LOAD64(%r8,PTR) \
LOAD64(%r9,pf) \
__asm__ ( \
VSTORE(0,%r8,psi_00) VPREFETCH_M1(0,%r9) \
VSTORE(1,%r8,psi_01) VPREFETCH_M1(1,%r9) \
VSTORE(2,%r8,psi_02) VPREFETCH_M1(2,%r9) \
VSTORE(3,%r8,psi_10) VPREFETCH_M1(3,%r9) \
VSTORE(4,%r8,psi_11) VPREFETCH_M1(4,%r9) \
VSTORE(5,%r8,psi_12) VPREFETCH_M1(5,%r9) \
VSTORE(6,%r8,psi_20) VPREFETCH_M1(6,%r9) \
VSTORE(7,%r8,psi_21) VPREFETCH_M1(7,%r9) \
VSTORE(8,%r8,psi_22) VPREFETCH_M1(8,%r9) \
VSTORE(9,%r8,psi_30) VPREFETCH_M1(9,%r9) \
VSTORE(10,%r8,psi_31) VPREFETCH_M1(10,%r9) \
VSTORE(11,%r8,psi_32) VPREFETCH_M1(11,%r9) \
);
#define ADD_RESULTi(PTR,pf) \
LOAD_CHIMU(PTR); \
asm(VADD(psi_00,Chimu_00,psi_00) VADD(psi_01,Chimu_01,psi_01) VADD(psi_02,Chimu_02,psi_02) \
VADD(psi_10,Chimu_10,psi_10) VADD(psi_11,Chimu_11,psi_11) VADD(psi_12,Chimu_12,psi_12) \
VADD(psi_20,Chimu_20,psi_20) VADD(psi_21,Chimu_21,psi_21) VADD(psi_22,Chimu_22,psi_22) \
VADD(psi_30,Chimu_30,psi_30) VADD(psi_31,Chimu_31,psi_31) VADD(psi_32,Chimu_32,psi_32) ); \
SAVE_RESULT(PTR,pf);
#define ADD_RESULTia(PTR,pf) \
LOAD64(%r8,PTR) \
__asm__ ( \
VADDMEM(0,%r8,psi_00,psi_00) \
VADDMEM(1,%r8,psi_01,psi_01) \
VADDMEM(2,%r8,psi_02,psi_02) \
VADDMEM(3,%r8,psi_10,psi_10) \
VADDMEM(4,%r8,psi_11,psi_11) \
VADDMEM(5,%r8,psi_12,psi_12) \
VADDMEM(6,%r8,psi_20,psi_20) \
VADDMEM(7,%r8,psi_21,psi_21) \
VADDMEM(8,%r8,psi_22,psi_22) \
VADDMEM(9,%r8,psi_30,psi_30) \
VADDMEM(10,%r8,psi_31,psi_31) \
VADDMEM(11,%r8,psi_32,psi_32) \
VSTORE(0,%r8,psi_00) \
VSTORE(1,%r8,psi_01) \
VSTORE(2,%r8,psi_02) \
VSTORE(3,%r8,psi_10) \
VSTORE(4,%r8,psi_11) \
VSTORE(5,%r8,psi_12) \
VSTORE(6,%r8,psi_20) \
VSTORE(7,%r8,psi_21) \
VSTORE(8,%r8,psi_22) \
VSTORE(9,%r8,psi_30) \
VSTORE(10,%r8,psi_31) \
VSTORE(11,%r8,psi_32) \
);
#ifdef AVX512_PF_L2_TABLE
#define PREFETCH_CHIMU(A) \
LOAD64(%r9,A) \
__asm__ ( \
VPREFETCH_P1(0,%r9) \
VPREFETCH_P1(1,%r9) \
VPREFETCH_P1(2,%r9) \
VPREFETCH_P1(3,%r9) \
VPREFETCH_P1(4,%r9) \
VPREFETCH_P1(5,%r9) \
VPREFETCH_P1(6,%r9) \
VPREFETCH_P1(7,%r9) \
VPREFETCH_P1(8,%r9) \
VPREFETCH_P1(9,%r9) \
VPREFETCH_P1(10,%r9) \
VPREFETCH_P1(11,%r9));
#else
#define PREFETCH_CHIMU(A)
#endif
#define PREFETCH1_CHIMU(A) \
LOAD64(%r9,A) \
__asm__ ( \
VPREFETCH_P1(0,%r9) \
VPREFETCH_P1(1,%r9) \
VPREFETCH_P1(2,%r9) \
VPREFETCH_P1(3,%r9) \
VPREFETCH_P1(4,%r9) \
VPREFETCH_P1(5,%r9) \
VPREFETCH_P1(6,%r9) \
VPREFETCH_P1(7,%r9) \
VPREFETCH_P1(8,%r9) \
VPREFETCH_P1(9,%r9) \
VPREFETCH_P1(10,%r9) \
VPREFETCH_P1(11,%r9));
#define PERMUTE_DIR0 __asm__ ( \
VPERM0(Chi_00,Chi_00) \
VPERM0(Chi_01,Chi_01) \
VPERM0(Chi_02,Chi_02) \
VPERM0(Chi_10,Chi_10) \
VPERM0(Chi_11,Chi_11) \
VPERM0(Chi_12,Chi_12) );
#define PERMUTE_DIR1 __asm__ ( \
VPERM1(Chi_00,Chi_00) \
VPERM1(Chi_01,Chi_01) \
VPERM1(Chi_02,Chi_02) \
VPERM1(Chi_10,Chi_10) \
VPERM1(Chi_11,Chi_11) \
VPERM1(Chi_12,Chi_12));
#define PERMUTE_DIR2 __asm__ ( \
VPERM2(Chi_00,Chi_00) \
VPERM2(Chi_01,Chi_01) \
VPERM2(Chi_02,Chi_02) \
VPERM2(Chi_10,Chi_10) \
VPERM2(Chi_11,Chi_11) \
VPERM2(Chi_12,Chi_12) );
#define PERMUTE_DIR3 __asm__ ( \
VPERM3(Chi_00,Chi_00) \
VPERM3(Chi_01,Chi_01) \
VPERM3(Chi_02,Chi_02) \
VPERM3(Chi_10,Chi_10) \
VPERM3(Chi_11,Chi_11) \
VPERM3(Chi_12,Chi_12) );
#define MULT_ADDSUB_2SPIN(ptr,pf) \
LOAD64(%r8,ptr) \
LOAD64(%r9,pf) \
__asm__ ( \
VPREFETCH_G2(9,%r8) \
VPREFETCH_G2(10,%r8) \
VPREFETCH_G2(11,%r8) \
VPREFETCH_G2(12,%r8) \
VPREFETCH_G2(13,%r8) \
VPREFETCH_G2(14,%r8) \
VPREFETCH_G2(15,%r8) \
VPREFETCH_G2(16,%r8) \
VPREFETCH_G2(17,%r8) \
VSHUF(Chi_00,T1) \
VMOVIDUP(0,%r8,Z0 ) \
VMOVIDUP(3,%r8,Z1 ) \
VMOVIDUP(6,%r8,Z2 ) VSHUF(Chi_10,T2) \
/*6*/ \
VMUL(Z0,T1,UChi_00) VMOVRDUP(0,%r8,Z3 ) \
VMUL(Z0,T2,UChi_10) VMOVRDUP(3,%r8,Z4 ) \
VMUL(Z1,T1,UChi_01) VMOVRDUP(6,%r8,Z5 ) \
VMUL(Z1,T2,UChi_11) VMOVIDUP(1,%r8,Z0 ) \
VMUL(Z2,T1,UChi_02) VMOVIDUP(4,%r8,Z1 ) \
VMUL(Z2,T2,UChi_12) VMOVIDUP(7,%r8,Z2 ) \
VPREFETCH_M1(0,%r9) \
VPREFETCH_M1(1,%r9) \
VPREFETCH_M1(2,%r9) \
VPREFETCH_M1(3,%r9) \
/*18*/ \
VMADDSUB(Z3,Chi_00,UChi_00) VSHUF(Chi_01,T1) \
VMADDSUB(Z3,Chi_10,UChi_10) \
VMADDSUB(Z4,Chi_00,UChi_01) VMOVRDUP(1,%r8,Z3 ) \
VMADDSUB(Z4,Chi_10,UChi_11) VSHUF(Chi_11,T2) \
VMADDSUB(Z5,Chi_00,UChi_02) VMOVRDUP(4,%r8,Z4 ) \
VMADDSUB(Z5,Chi_10,UChi_12) \
VPREFETCH_M1(4,%r9) \
VPREFETCH_M1(5,%r9) \
VPREFETCH_M1(6,%r9) \
VPREFETCH_M1(7,%r9) \
/*28*/ \
VMADDSUB(Z0,T1,UChi_00) VMOVRDUP(7,%r8,Z5 ) \
VMADDSUB(Z0,T2,UChi_10) \
VMADDSUB(Z1,T1,UChi_01) VMOVIDUP(2,%r8,Z0 ) \
VMADDSUB(Z1,T2,UChi_11) \
VMADDSUB(Z2,T1,UChi_02) VMOVIDUP(5,%r8,Z1 ) \
VMADDSUB(Z2,T2,UChi_12) VMOVIDUP(8,%r8,Z2 ) \
VPREFETCH2(12,%r9) \
VPREFETCH2(13,%r9) \
VPREFETCH2(14,%r9) \
VPREFETCH2(15,%r9) \
VPREFETCH2(16,%r9) \
VPREFETCH2(17,%r9) \
VPREFETCH2(18,%r9) \
VPREFETCH2(19,%r9) \
VPREFETCH2(20,%r9) \
VPREFETCH2(21,%r9) \
VPREFETCH2(22,%r9) \
VPREFETCH2(23,%r9) \
/*38*/ \
VMADDSUB(Z3,Chi_01,UChi_00) VSHUF(Chi_02,T1) \
VMADDSUB(Z3,Chi_11,UChi_10) \
VMADDSUB(Z4,Chi_01,UChi_01) VMOVRDUP(2,%r8,Z3 ) \
VMADDSUB(Z4,Chi_11,UChi_11) VSHUF(Chi_12,T2) \
VMADDSUB(Z5,Chi_01,UChi_02) VMOVRDUP(5,%r8,Z4 ) \
VMADDSUB(Z5,Chi_11,UChi_12) \
VPREFETCH_M1(9,%r8) \
VPREFETCH_M1(10,%r8) \
VPREFETCH_M1(11,%r8) \
VPREFETCH_M1(12,%r8) \
VPREFETCH_M1(13,%r8) \
VPREFETCH_M1(14,%r8) \
VPREFETCH_M1(15,%r8) \
VPREFETCH_M1(16,%r8) \
VPREFETCH_M1(17,%r8) \
/*48*/ \
VMADDSUB(Z0,T1,UChi_00) VMOVRDUP(8,%r8,Z5 ) \
VMADDSUB(Z0,T2,UChi_10) \
VMADDSUB(Z1,T1,UChi_01) \
VMADDSUB(Z1,T2,UChi_11) \
VMADDSUB(Z2,T1,UChi_02) \
VMADDSUB(Z2,T2,UChi_12) \
VPREFETCH_M1(8,%r9) \
VPREFETCH_M1(9,%r9) \
VPREFETCH_M1(10,%r9) \
VPREFETCH_M1(11,%r9) \
/*55*/ \
VMADDSUB(Z3,Chi_02,UChi_00) \
VMADDSUB(Z3,Chi_12,UChi_10) \
VMADDSUB(Z4,Chi_02,UChi_01) \
VMADDSUB(Z4,Chi_12,UChi_11) \
VMADDSUB(Z5,Chi_02,UChi_02) \
VMADDSUB(Z5,Chi_12,UChi_12) \
/*61 insns*/ );
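// What MULT_ADDSUB_2SPIN computes, as a scalar reference (illustrative only):
// a 3x3 complex link matrix stored at the gauge pointer -- the element at
// offset 3*i+j feeds result colour i from source colour j -- multiplies both
// spin components of the projected half spinor; the shuffle/madd-sub sequence
// above is the vectorised complex multiply.
#if 0
#include <complex>
typedef std::complex<double> cplx;
inline void mult_2spin_ref(const cplx U[3][3], const cplx chi[2][3], cplx uchi[2][3]) {
  for (int s = 0; s < 2; s++) {
    for (int i = 0; i < 3; i++) {
      uchi[s][i] = cplx(0.0, 0.0);
      for (int j = 0; j < 3; j++) {
        uchi[s][i] += U[i][j] * chi[s][j];
      }
    }
  }
}
#endif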
#define MULT_ADDSUB_2SPIN_LS(ptr,pf) \
LOAD64(%r8,ptr) \
LOAD64(%r9,pf) \
__asm__ ( \
VSHUF(Chi_00,T1) VSHUF(Chi_10,T2) \
VMULIDUP(0,%r8,T1,UChi_00) VMULIDUP(0,%r8,T2,UChi_10) \
VMULIDUP(3,%r8,T1,UChi_01) VMULIDUP(3,%r8,T2,UChi_11) \
VMULIDUP(6,%r8,T1,UChi_02) VMULIDUP(6,%r8,T2,UChi_12) \
VPREFETCH_M1(0,%r9) \
VPREFETCH_M1(1,%r9) \
VPREFETCH_M1(2,%r9) \
VPREFETCH_M1(3,%r9) \
/*8*/ \
VSHUF(Chi_01,T1) VSHUF(Chi_11,T2) \
VMADDSUBRDUP(0,%r8,Chi_00,UChi_00) VMADDSUBRDUP(0,%r8,Chi_10,UChi_10) \
VMADDSUBRDUP(3,%r8,Chi_00,UChi_01) VMADDSUBRDUP(3,%r8,Chi_10,UChi_11) \
VMADDSUBRDUP(6,%r8,Chi_00,UChi_02) VMADDSUBRDUP(6,%r8,Chi_10,UChi_12) \
VPREFETCH_M1(4,%r9) \
VPREFETCH_M1(5,%r9) \
VPREFETCH_M1(6,%r9) \
VPREFETCH_M1(7,%r9) \
/*16*/ \
VMADDSUBIDUP(1,%r8,T1,UChi_00) VMADDSUBIDUP(1,%r8,T2,UChi_10) \
VMADDSUBIDUP(4,%r8,T1,UChi_01) VMADDSUBIDUP(4,%r8,T2,UChi_11) \
VMADDSUBIDUP(7,%r8,T1,UChi_02) VMADDSUBIDUP(7,%r8,T2,UChi_12) \
VPREFETCH_M1(8,%r9) \
VPREFETCH_M1(9,%r9) \
VPREFETCH_M1(10,%r9) \
VPREFETCH_M1(11,%r9) \
/*22*/ \
VSHUF(Chi_02,T1) VSHUF(Chi_12,T2) \
VMADDSUBRDUP(1,%r8,Chi_01,UChi_00) VMADDSUBRDUP(1,%r8,Chi_11,UChi_10) \
VMADDSUBRDUP(4,%r8,Chi_01,UChi_01) VMADDSUBRDUP(4,%r8,Chi_11,UChi_11) \
VMADDSUBRDUP(7,%r8,Chi_01,UChi_02) VMADDSUBRDUP(7,%r8,Chi_11,UChi_12) \
VPREFETCH_M2(12,%r9) \
VPREFETCH_M2(13,%r9) \
VPREFETCH_M2(14,%r9) \
VPREFETCH_M2(15,%r9) \
/*30*/ \
VMADDSUBIDUP(2,%r8,T1,UChi_00) VMADDSUBIDUP(2,%r8,T2,UChi_10) \
VMADDSUBIDUP(5,%r8,T1,UChi_01) VMADDSUBIDUP(5,%r8,T2,UChi_11) \
VPREFETCH_M2(16,%r9) \
VPREFETCH_M2(17,%r9) \
VPREFETCH_M2(18,%r9) \
VPREFETCH_M2(19,%r9) \
VMADDSUBIDUP(8,%r8,T1,UChi_02) VMADDSUBIDUP(8,%r8,T2,UChi_12) \
/*36*/ \
VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r8,Chi_12,UChi_10) \
VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r8,Chi_12,UChi_11) \
VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r8,Chi_12,UChi_12) \
VPREFETCH_M2(20,%r9) \
VPREFETCH_M2(21,%r9) \
VPREFETCH_M2(22,%r9) \
VPREFETCH_M2(23,%r9) \
VPREFETCH_G1(2,%r8) \
VPREFETCH_G1(3,%r8) \
VPREFETCH_G2(4,%r8) \
VPREFETCH_G2(5,%r8) \
VPREFETCH_G2(6,%r8) \
VPREFETCH_G2(7,%r8) \
/*42 insns*/ );
#define MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf) \
LOAD64(%r8,ptr) \
LOAD64(%r9,pf) \
__asm__ ( \
VSHUF(Chi_00,T1) VSHUF(Chi_10,T2) \
VMULIDUP(0,%r8,T1,UChi_00) VMULIDUP(0,%r8,T2,UChi_10) \
VMULIDUP(3,%r8,T1,UChi_01) VMULIDUP(3,%r8,T2,UChi_11) \
VMULIDUP(6,%r8,T1,UChi_02) VMULIDUP(6,%r8,T2,UChi_12) \
/*8*/ \
VSHUF(Chi_01,T1) VSHUF(Chi_11,T2) \
VMADDSUBRDUP(0,%r8,Chi_00,UChi_00) VMADDSUBRDUP(0,%r8,Chi_10,UChi_10) \
VMADDSUBRDUP(3,%r8,Chi_00,UChi_01) VMADDSUBRDUP(3,%r8,Chi_10,UChi_11) \
VMADDSUBRDUP(6,%r8,Chi_00,UChi_02) VMADDSUBRDUP(6,%r8,Chi_10,UChi_12) \
/*16*/ \
VMADDSUBIDUP(1,%r8,T1,UChi_00) VMADDSUBIDUP(1,%r8,T2,UChi_10) \
VMADDSUBIDUP(4,%r8,T1,UChi_01) VMADDSUBIDUP(4,%r8,T2,UChi_11) \
VMADDSUBIDUP(7,%r8,T1,UChi_02) VMADDSUBIDUP(7,%r8,T2,UChi_12) \
/*22*/ \
VSHUF(Chi_02,T1) VSHUF(Chi_12,T2) \
VMADDSUBRDUP(1,%r8,Chi_01,UChi_00) VMADDSUBRDUP(1,%r8,Chi_11,UChi_10) \
VMADDSUBRDUP(4,%r8,Chi_01,UChi_01) VMADDSUBRDUP(4,%r8,Chi_11,UChi_11) \
VMADDSUBRDUP(7,%r8,Chi_01,UChi_02) VMADDSUBRDUP(7,%r8,Chi_11,UChi_12) \
/*30*/ \
VMADDSUBIDUP(2,%r8,T1,UChi_00) VMADDSUBIDUP(2,%r8,T2,UChi_10) \
VMADDSUBIDUP(5,%r8,T1,UChi_01) VMADDSUBIDUP(5,%r8,T2,UChi_11) \
VMADDSUBIDUP(8,%r8,T1,UChi_02) VMADDSUBIDUP(8,%r8,T2,UChi_12) \
/*36*/ \
VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r8,Chi_12,UChi_10) \
VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r8,Chi_12,UChi_11) \
VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r8,Chi_12,UChi_12) \
/* VPREFETCH1(2,%r8)*/ \
/* VPREFETCH1(3,%r8)*/ \
/*42 insns*/ );
#define Z6 Chi_00
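// Register reuse: inside MULT_ADDSUB_2SPIN_NEW below, Chi_00 has been consumed
// by the time a sixth accumulator is needed, so it is recycled as Z6.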
#define MULT_ADDSUB_2SPIN_NEW(ptr,pf) \
LOAD64(%r8,ptr) \
__asm__ ( \
VSHUFMEM(0,%r8,Z0) \
VRDUP(Chi_00,T1) VIDUP(Chi_00,Chi_00) \
VRDUP(Chi_10,T2) VIDUP(Chi_10,Chi_10) \
VMUL(Z0,Chi_00,Z1) VMUL(Z0,Chi_10,Z2) \
VSHUFMEM(3,%r8,Z0) \
VMUL(Z0,Chi_00,Z3) VMUL(Z0,Chi_10,Z4) \
VSHUFMEM(6,%r8,Z0) \
VMUL(Z0,Chi_00,Z5) VMUL(Z0,Chi_10,Z6) \
VMULMEM(0,%r8,T1,UChi_00) VMULMEM(0,%r8,T2,UChi_10) \
VMULMEM(3,%r8,T1,UChi_01) VMULMEM(3,%r8,T2,UChi_11) \
VMULMEM(6,%r8,T1,UChi_02) VMULMEM(6,%r8,T2,UChi_12) \
/*11 cycles*/ \
VSHUFMEM(1,%r8,Z0) \
VRDUP(Chi_01,T1) VIDUP(Chi_01,Chi_01) \
VRDUP(Chi_11,T2) VIDUP(Chi_11,Chi_11) \
VMADD(Z0,Chi_01,Z1) VMADD(Z0,Chi_11,Z2) \
VSHUFMEM(4,%r8,Z0) \
VMADD(Z0,Chi_01,Z3) VMADD(Z0,Chi_11,Z4) \
VSHUFMEM(7,%r8,Z0) \
VMADD(Z0,Chi_01,Z5) VMADD(Z0,Chi_11,Z6) \
VMADDMEM(1,%r8,T1,UChi_00) VMADDMEM(1,%r8,T2,UChi_10) \
VMADDMEM(4,%r8,T1,UChi_01) VMADDMEM(4,%r8,T2,UChi_11) \
VMADDMEM(7,%r8,T1,UChi_02) VMADDMEM(7,%r8,T2,UChi_12) \
/*22 cycles*/ \
VSHUFMEM(2,%r8,Z0) \
VRDUP(Chi_02,T1) VIDUP(Chi_02,Chi_02) \
VRDUP(Chi_12,T2) VIDUP(Chi_12,Chi_12) \
VMADD(Z0,Chi_02,Z1) VMADD(Z0,Chi_12,Z2) \
VSHUFMEM(5,%r8,Z0) \
VMADD(Z0,Chi_02,Z3) VMADD(Z0,Chi_12,Z4) \
VSHUFMEM(8,%r8,Z0) \
VMADD(Z0,Chi_02,Z5) VMADD(Z0,Chi_12,Z6) \
/*33 cycles*/ \
VMADDSUBMEM(2,%r8,T1,Z1) VMADDSUBMEM(2,%r8,T2,Z2) \
VMADDSUBMEM(5,%r8,T1,Z3) VMADDSUBMEM(5,%r8,T2,Z4) \
VMADDSUBMEM(8,%r8,T1,Z5) VMADDSUBMEM(8,%r8,T2,Z6) \
/*stall*/ \
/*stall*/ \
/*stall*/ \
VADD(Z1,UChi_00,UChi_00) VADD(Z2,UChi_10,UChi_10) \
VADD(Z3,UChi_01,UChi_01) VADD(Z4,UChi_11,UChi_11) \
VADD(Z5,UChi_02,UChi_02) VADD(Z6,UChi_12,UChi_12) )
#endif

255
Grid/simd/Simd.h Normal file
@ -0,0 +1,255 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/Simd.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: neo <cossu@post.kek.jp>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_SIMD_H
#define GRID_SIMD_H
////////////////////////////////////////////////////////////////////////
// Define scalar and vector floating point types
//
// Scalar: RealF, RealD, ComplexF, ComplexD
//
// Vector: vRealF, vRealD, vComplexF, vComplexD
//
// Vector types are arch dependent
////////////////////////////////////////////////////////////////////////
#define _MM_SELECT_FOUR_FOUR(A,B,C,D) ((A<<6)|(B<<4)|(C<<2)|(D))
#define _MM_SELECT_FOUR_FOUR_STRING(A,B,C,D) "((" #A "<<6)|(" #B "<<4)|(" #C "<<2)|(" #D "))"
#define _MM_SELECT_EIGHT_TWO(A,B,C,D,E,F,G,H) ((A<<7)|(B<<6)|(C<<5)|(D<<4)|(E<<3)|(F<<2)|(G<<1)|(H))
#define _MM_SELECT_FOUR_TWO(A,B,C,D) _MM_SELECT_EIGHT_TWO(0,0,0,0,A,B,C,D)
#define _MM_SELECT_TWO_TWO(A,B) _MM_SELECT_FOUR_TWO(0,0,A,B)
#define RotateBit (0x100)
namespace Grid {
typedef uint32_t Integer;
typedef float RealF;
typedef double RealD;
#ifdef GRID_DEFAULT_PRECISION_DOUBLE
typedef RealD Real;
#else
typedef RealF Real;
#endif
typedef std::complex<RealF> ComplexF;
typedef std::complex<RealD> ComplexD;
typedef std::complex<Real> Complex;
inline RealF adj(const RealF & r){ return r; }
inline RealF conjugate(const RealF & r){ return r; }
inline RealF real(const RealF & r){ return r; }
inline RealD adj(const RealD & r){ return r; }
inline RealD conjugate(const RealD & r){ return r; }
inline RealD real(const RealD & r){ return r; }
inline RealD sqrt(const RealD & r){ return std::sqrt(r); }
inline ComplexD conjugate(const ComplexD& r){ return(conj(r)); }
inline ComplexD adj(const ComplexD& r){ return(conjugate(r)); }
inline ComplexF conjugate(const ComplexF& r ){ return(conj(r)); }
inline ComplexF adj(const ComplexF& r ){ return(conjugate(r)); }
inline ComplexD innerProduct(const ComplexD & l, const ComplexD & r) { return conjugate(l)*r; }
inline ComplexF innerProduct(const ComplexF & l, const ComplexF & r) { return conjugate(l)*r; }
inline RealD innerProduct(const RealD & l, const RealD & r) { return l*r; }
inline RealF innerProduct(const RealF & l, const RealF & r) { return l*r; }
inline ComplexD Reduce(const ComplexD& r){ return r; }
inline ComplexF Reduce(const ComplexF& r){ return r; }
inline RealD Reduce(const RealD& r){ return r; }
inline RealF Reduce(const RealF& r){ return r; }
inline RealD toReal(const ComplexD& r){ return real(r); }
inline RealF toReal(const ComplexF& r){ return real(r); }
inline RealD toReal(const RealD& r){ return r; }
inline RealF toReal(const RealF& r){ return r; }
////////////////////////////////////////////////////////////////////////////////
//Provide support functions for basic real and complex data types required by Grid
//Single and double precision versions. Should be able to template this once only.
////////////////////////////////////////////////////////////////////////////////
inline void mac (ComplexD * __restrict__ y,const ComplexD * __restrict__ a,const ComplexD *__restrict__ x){ *y = (*a) * (*x)+(*y); };
inline void mult(ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) * (*r);}
inline void sub (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) - (*r);}
inline void add (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) + (*r);}
// conjugate already supported for complex
inline void mac (ComplexF * __restrict__ y,const ComplexF * __restrict__ a,const ComplexF *__restrict__ x){ *y = (*a) * (*x)+(*y); }
inline void mult(ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) * (*r); }
inline void sub (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) - (*r); }
inline void add (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) + (*r); }
//conjugate already supported for complex
inline ComplexF timesI(const ComplexF &r) { return(r*ComplexF(0.0,1.0));}
inline ComplexD timesI(const ComplexD &r) { return(r*ComplexD(0.0,1.0));}
inline ComplexF timesMinusI(const ComplexF &r){ return(r*ComplexF(0.0,-1.0));}
inline ComplexD timesMinusI(const ComplexD &r){ return(r*ComplexD(0.0,-1.0));}
// define projections to real and imaginary parts
inline ComplexF projReal(const ComplexF &r){return( ComplexF(std::real(r), 0.0));}
inline ComplexD projReal(const ComplexD &r){return( ComplexD(std::real(r), 0.0));}
inline ComplexF projImag(const ComplexF &r){return (ComplexF(std::imag(r), 0.0 ));}
inline ComplexD projImag(const ComplexD &r){return (ComplexD(std::imag(r), 0.0));}
// define auxiliary functions for complex computations
inline void timesI(ComplexF &ret,const ComplexF &r) { ret = timesI(r);}
inline void timesI(ComplexD &ret,const ComplexD &r) { ret = timesI(r);}
inline void timesMinusI(ComplexF &ret,const ComplexF &r){ ret = timesMinusI(r);}
inline void timesMinusI(ComplexD &ret,const ComplexD &r){ ret = timesMinusI(r);}
inline void mac (RealD * __restrict__ y,const RealD * __restrict__ a,const RealD *__restrict__ x){ *y = (*a) * (*x)+(*y);}
inline void mult(RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) * (*r);}
inline void sub (RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) - (*r);}
inline void add (RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) + (*r);}
inline void mac (RealF * __restrict__ y,const RealF * __restrict__ a,const RealF *__restrict__ x){ *y = (*a) * (*x)+(*y); }
inline void mult(RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) * (*r); }
inline void sub (RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) - (*r); }
inline void add (RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) + (*r); }
inline void vstream(ComplexF &l, const ComplexF &r){ l=r;}
inline void vstream(ComplexD &l, const ComplexD &r){ l=r;}
inline void vstream(RealF &l, const RealF &r){ l=r;}
inline void vstream(RealD &l, const RealD &r){ l=r;}
class Zero{};
static Zero zero;
template<class itype> inline void zeroit(itype &arg){ arg=zero;};
template<> inline void zeroit(ComplexF &arg){ arg=0; };
template<> inline void zeroit(ComplexD &arg){ arg=0; };
template<> inline void zeroit(RealF &arg){ arg=0; };
template<> inline void zeroit(RealD &arg){ arg=0; };
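// A minimal usage sketch of the scalar helpers above (mac, innerProduct,
// timesI, toReal, zeroit); illustrative only, not compiled.
#if 0
inline void scalar_helpers_example(void) {
  ComplexD a(1.0, 2.0), x(3.0, -1.0), y(0.5, 0.5);
  mac(&y, &a, &x);                   // y += a*x
  ComplexD ip = innerProduct(a, x);  // conjugate(a)*x = (1-2i)*(3-i) = 1-7i
  ComplexD ia = timesI(a);           // i*(1+2i) = -2+i
  RealD    r  = toReal(ip);          // 1.0
  zeroit(y);                         // y = 0
  (void)ia; (void)r;
}
#endif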
//////////////////////////////////////////////////////////
// Permute
// Permute 0 every ABCDEFGH -> BA DC FE HG
// Permute 1 every ABCDEFGH -> CD AB GH EF
// Permute 2 every ABCDEFGH -> EFGH ABCD
// Permute 3 possible on longer iVector lengths (512bit = 8 double = 16 single)
// Permute 4 possible on half precision @512bit vectors.
//
// Defined inside SIMD specialization files
//////////////////////////////////////////////////////////
template<class VectorSIMD>
inline void Gpermute(VectorSIMD &y,const VectorSIMD &b,int perm);
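// A scalar sketch of the permute numbering documented above, for an
// eight-element vector (e.g. 512-bit double precision): permute n exchanges
// adjacent blocks of 2^n elements. Illustrative only; the real implementations
// live in the per-architecture SIMD specialisations.
#if 0
template <class T>
inline void permute_ref(T y[8], const T b[8], int perm) {
  const int block = 1 << perm;   // 1, 2 or 4 elements
  for (int i = 0; i < 8; i++) {
    y[i] = b[i ^ block];
  }
}
// perm=0: ABCDEFGH -> BADCFEHG ; perm=1: -> CDABGHEF ; perm=2: -> EFGHABCD
#endif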
};
#include <Grid/simd/Grid_vector_types.h>
#include <Grid/simd/Grid_vector_unops.h>
namespace Grid {
// Default precision
#ifdef GRID_DEFAULT_PRECISION_DOUBLE
typedef vRealD vReal;
typedef vComplexD vComplex;
#else
typedef vRealF vReal;
typedef vComplexF vComplex;
#endif
inline std::ostream& operator<< (std::ostream& stream, const vComplexF &o){
int nn=vComplexF::Nsimd();
std::vector<ComplexF,alignedAllocator<ComplexF> > buf(nn);
vstore(o,&buf[0]);
stream<<"<";
for(int i=0;i<nn;i++){
stream<<buf[i];
if(i<nn-1) stream<<",";
}
stream<<">";
return stream;
}
inline std::ostream& operator<< (std::ostream& stream, const vComplexD &o){
int nn=vComplexD::Nsimd();
std::vector<ComplexD,alignedAllocator<ComplexD> > buf(nn);
vstore(o,&buf[0]);
stream<<"<";
for(int i=0;i<nn;i++){
stream<<buf[i];
if(i<nn-1) stream<<",";
}
stream<<">";
return stream;
}
inline std::ostream& operator<< (std::ostream& stream, const vRealF &o){
int nn=vRealF::Nsimd();
std::vector<RealF,alignedAllocator<RealF> > buf(nn);
vstore(o,&buf[0]);
stream<<"<";
for(int i=0;i<nn;i++){
stream<<buf[i];
if(i<nn-1) stream<<",";
}
stream<<">";
return stream;
}
inline std::ostream& operator<< (std::ostream& stream, const vRealD &o){
int nn=vRealD::Nsimd();
std::vector<RealD,alignedAllocator<RealD> > buf(nn);
vstore(o,&buf[0]);
stream<<"<";
for(int i=0;i<nn;i++){
stream<<buf[i];
if(i<nn-1) stream<<",";
}
stream<<">";
return stream;
}
inline std::ostream& operator<< (std::ostream& stream, const vInteger &o){
int nn=vInteger::Nsimd();
std::vector<Integer,alignedAllocator<Integer> > buf(nn);
vstore(o,&buf[0]);
stream<<"<";
for(int i=0;i<nn;i++){
stream<<buf[i];
if(i<nn-1) stream<<",";
}
stream<<">";
return stream;
}
}
#endif

37
Grid/simd/l1p.h Normal file
@ -0,0 +1,37 @@
#pragma once
namespace Grid {
// L1p optimisation
inline void bgq_l1p_optimisation(int mode)
{
#ifdef QPX
#undef L1P_CFG_PF_USR
#define L1P_CFG_PF_USR (0x3fde8000108ll) /* (64 bit reg, 23 bits wide, user/unpriv) */
uint64_t cfg_pf_usr;
if ( mode ) {
cfg_pf_usr =
L1P_CFG_PF_USR_ifetch_depth(0)
| L1P_CFG_PF_USR_ifetch_max_footprint(1)
| L1P_CFG_PF_USR_pf_stream_est_on_dcbt
| L1P_CFG_PF_USR_pf_stream_establish_enable
| L1P_CFG_PF_USR_pf_stream_optimistic
| L1P_CFG_PF_USR_pf_adaptive_throttle(0xF) ;
// if ( sizeof(Float) == sizeof(double) ) {
cfg_pf_usr |= L1P_CFG_PF_USR_dfetch_depth(2)| L1P_CFG_PF_USR_dfetch_max_footprint(3) ;
// } else {
// cfg_pf_usr |= L1P_CFG_PF_USR_dfetch_depth(1)| L1P_CFG_PF_USR_dfetch_max_footprint(2) ;
// }
} else {
cfg_pf_usr = L1P_CFG_PF_USR_dfetch_depth(1)
| L1P_CFG_PF_USR_dfetch_max_footprint(2)
| L1P_CFG_PF_USR_ifetch_depth(0)
| L1P_CFG_PF_USR_ifetch_max_footprint(1)
| L1P_CFG_PF_USR_pf_stream_est_on_dcbt
| L1P_CFG_PF_USR_pf_stream_establish_enable
| L1P_CFG_PF_USR_pf_stream_optimistic
| L1P_CFG_PF_USR_pf_stream_prefetch_enable;
}
*((uint64_t *)L1P_CFG_PF_USR) = cfg_pf_usr;
#endif
}
}
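// Typical use (a sketch, assuming a QPX build; dslash_kernel is a placeholder):
// raise the BG/Q L1p data-prefetch depth and footprint around a memory-bound
// kernel, then restore the more conservative configuration afterwards.
#if 0
void l1p_example(void)
{
  Grid::bgq_l1p_optimisation(1);   // deeper dfetch depth / larger footprint
  // dslash_kernel();              // memory-bound loop goes here
  Grid::bgq_l1p_optimisation(0);   // back to the conservative settings
}
#endif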