Mirror of https://github.com/paboyle/Grid.git, synced 2025-06-17 15:27:06 +01:00

Commit: Hadrons: moving Hadrons to root directory, build system improvements
Grid/simd/BGQQPX.h (new file, 796 lines)
@@ -0,0 +1,796 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/simd/BGQQPX.h

    Copyright (C) 2015

    Author: paboyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory

*************************************************************************************/
/* END LEGAL */
#ifndef GRID_ASM_BGQ_QPX_H
#define GRID_ASM_BGQ_QPX_H

#include <stddint.h>

/*********************************************************
 * Architectural macros
 *********************************************************/
#define VLOADf(OFF,PTR,DEST)  "qvlfsux  " #DEST "," #OFF "," #PTR " ;\n"
#define VLOADd(OFF,PTR,DEST)  "qvlfdux  " #DEST "," #OFF "," #PTR " ;\n"
#define VSTOREf(OFF,PTR,SRC)  "qvstfsux " #SRC  "," #OFF "," #PTR " ;\n"
#define VSTOREd(OFF,PTR,SRC)  "qvstfdux " #SRC  "," #OFF "," #PTR " ;\n"
#define VSPLATf(A,B,DEST)     "qvlfcsxa " #A "," #B "," #DEST ";\n"
#define VSPLATd(A,B,DEST)     "qvlfcdxa " #A "," #B "," #DEST ";\n"

#define LOAD64(A,ptr)
#define VZERO(DEST)           "qvfclr " #DEST "; \n"
#define VONE(DEST)            "qvfset " #DEST "; \n"
#define VNEG(SRC,DEST)        "qvfneg " #DEST "," #SRC "; \n"
#define VMOV(DEST,SRC)        "qvfmr  " #DEST "," #SRC ";\n"

#define VADD(A,B,DEST)           "qvfadd " #DEST "," #A "," #B ";\n"
#define VSUB(A,B,DEST)           "qvfsub " #DEST "," #A "," #B ";\n"
#define VMUL(A,B,DEST)           "qvfmul " #DEST "," #A "," #B ";\n"
#define VMUL_RR_RI(A,B,DEST)     "qvfxmul " #DEST "," #A "," #B ";\n"
#define VMADD(A,B,C,DEST)        "qvfmadd " #DEST "," #A "," #B "," #C ";\n"
#define VMADD_RR_RI(A,B,C,DEST)  "qvfxmadd " #DEST "," #A "," #B "," #C ";\n"
#define VMADD_MII_IR(A,B,C,DEST) "qvfxxnpmadd " #DEST "," #A "," #B "," #C ";\n"
#define VMADD_II_MIR(A,B,C,DEST) "qvfxxcpnmadd " #DEST "," #A "," #B "," #C ";\n"

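/* Note on the complex-arithmetic macro names (editorial sketch, not part of
 * the original header): with each lane pair holding (re,im), a complex
 * product (ar + i*ai)(br + i*bi) = (ar*br - ai*bi) + i*(ar*bi + ai*br) is
 * built in two passes: VMUL_RR_RI / VMADD_RR_RI accumulate the ar*b terms,
 * and the cross macros fold in the ai*b terms with the sign pattern
 * (-ai*bi, +ai*br) or its conjugate. The qvfxxcpnmadd mnemonic used for
 * VMADD_II_MIR is an assumption; check the QPX ISA manual for the exact
 * spelling. */
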
#define CACHE_LOCK(PTR)   asm (" dcbtls %%r0, %0 \n" : : "r" (PTR) );
#define CACHE_UNLOCK(PTR) asm (" dcblc  %%r0, %0 \n" : : "r" (PTR) );
#define CACHE_FLUSH(PTR)  asm (" dcbf   %%r0, %0 \n" : : "r" (PTR) );
#define CACHE_TOUCH(PTR)  asm (" dcbt   %%r0, %0 \n" : : "r" (PTR) );

// Gauge field locking: 2 x 9 complex == 18*8 / 16 bytes per link
// This is 144/288 bytes == 4.5; 9 lines
#define MASK_REGS             /*NOOP ON BGQ*/
#define PF_GAUGE(A)           /*NOOP ON BGQ*/
#define PREFETCH1_CHIMU(base) /*NOOP ON BGQ*/
#define PREFETCH_CHIMU(base)  /*NOOP ON BGQ*/

/*********************************************************
 * Register definitions
 *********************************************************/
#define psi_00  0
#define psi_01  1
#define psi_02  2

#define psi_10  3
#define psi_11  4
#define psi_12  5

#define psi_20  6
#define psi_21  7
#define psi_22  8

#define psi_30  9
#define psi_31 10
#define psi_32 11

#define Chi_00 12
#define Chi_01 13
#define Chi_02 14

#define Chi_10 15
#define Chi_11 16
#define Chi_12 17

#define UChi_00 18
#define UChi_01 19
#define UChi_02 20

#define UChi_10 21
#define UChi_11 22
#define UChi_12 23

#define U0 24
#define U1 25
#define U2 26
#define one 27

#define REP "%%r16"
#define IMM "%%r17"

/*Alias regs*/
#define Chimu_00 Chi_00
#define Chimu_01 Chi_01
#define Chimu_02 Chi_02
#define Chimu_10 Chi_10
#define Chimu_11 Chi_11
#define Chimu_12 Chi_12
#define Chimu_20 UChi_00
#define Chimu_21 UChi_01
#define Chimu_22 UChi_02
#define Chimu_30 UChi_10
#define Chimu_31 UChi_11
#define Chimu_32 UChi_12

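/* Editorial note: the Chimu_2x/3x aliases deliberately reuse the UChi
 * registers. The upper two spin components of Chimu are consumed by the
 * spin projection immediately after the load, so by the time MULT_2SPIN
 * writes UChi the aliased values are already dead. This keeps the whole
 * working set inside the 32 QPX vector registers. */
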
/*********************************************************
 * Macro sequences encoding QCD
 *********************************************************/
#define LOCK_GAUGE(dir) \
  { \
    uint8_t *byte_addr = (uint8_t *)&U._odata[sU](dir); \
    for(int i=0;i< 18*2*BYTES_PER_WORD*8;i+=32){ \
      CACHE_LOCK(&byte_addr[i]); \
    } \
  }

#define UNLOCK_GAUGE(dir) \
  { \
    uint8_t *byte_addr = (uint8_t *)&U._odata[sU](dir); \
    for(int i=0;i< 18*2*BYTES_PER_WORD*8;i+=32){ \
      CACHE_UNLOCK(&byte_addr[i]); \
    } \
  }

#define MAYBEPERM(A,B)

#define PERMUTE_DIR3
#define PERMUTE_DIR2
#define PERMUTE_DIR1
#define PERMUTE_DIR0

#define MULT_2SPIN_DIR_PFXP(A,p) MULT_2SPIN(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFYP(A,p) MULT_2SPIN(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFZP(A,p) MULT_2SPIN(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFTP(A,p) MULT_2SPIN(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFXM(A,p) MULT_2SPIN(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFYM(A,p) MULT_2SPIN(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFZM(A,p) MULT_2SPIN(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFTM(A,p) MULT_2SPIN(&U._odata[sU](A),p)

#define MULT_2SPIN(ptr,p) { \
    uint64_t ub = ((uint64_t)(ptr)); \
    asm ( \
	 VLOAD(%0,%3,U0) \
	 VLOAD(%1,%3,U1) \
	 VLOAD(%2,%3,U2) \
	 VMUL_RR_RI(U0,Chi_00,UChi_00) \
	 VMUL_RR_RI(U1,Chi_00,UChi_01) \
	 VMUL_RR_RI(U2,Chi_00,UChi_02) \
	 VMUL_RR_RI(U0,Chi_10,UChi_10) \
	 VMUL_RR_RI(U1,Chi_10,UChi_11) \
	 VMUL_RR_RI(U2,Chi_10,UChi_12) \
	 VMADD_MII_IR(U0,Chi_00,UChi_00,UChi_00) \
	 VMADD_MII_IR(U1,Chi_00,UChi_01,UChi_01) \
	 VMADD_MII_IR(U2,Chi_00,UChi_02,UChi_02) \
	 VMADD_MII_IR(U0,Chi_10,UChi_10,UChi_10) \
	 VMADD_MII_IR(U1,Chi_10,UChi_11,UChi_11) \
	 VMADD_MII_IR(U2,Chi_10,UChi_12,UChi_12) \
	 : : "r" (0), "r" (32*3), "r" (32*6), "r" (ub) ); \
    asm ( \
	 VLOAD(%0,%3,U0) \
	 VLOAD(%1,%3,U1) \
	 VLOAD(%2,%3,U2) \
	 VMADD_RR_RI(U0,Chi_01,UChi_00,UChi_00) \
	 VMADD_RR_RI(U1,Chi_01,UChi_01,UChi_01) \
	 VMADD_RR_RI(U2,Chi_01,UChi_02,UChi_02) \
	 VMADD_RR_RI(U0,Chi_11,UChi_10,UChi_10) \
	 VMADD_RR_RI(U1,Chi_11,UChi_11,UChi_11) \
	 VMADD_RR_RI(U2,Chi_11,UChi_12,UChi_12) \
	 VMADD_MII_IR(U0,Chi_01,UChi_00,UChi_00) \
	 VMADD_MII_IR(U1,Chi_01,UChi_01,UChi_01) \
	 VMADD_MII_IR(U2,Chi_01,UChi_02,UChi_02) \
	 VMADD_MII_IR(U0,Chi_11,UChi_10,UChi_10) \
	 VMADD_MII_IR(U1,Chi_11,UChi_11,UChi_11) \
	 VMADD_MII_IR(U2,Chi_11,UChi_12,UChi_12) \
	 : : "r" (32), "r" (32*4), "r" (32*7), "r" (ub) ); \
    asm ( \
	 VLOAD(%0,%3,U0) \
	 VLOAD(%1,%3,U1) \
	 VLOAD(%2,%3,U2) \
	 VMADD_RR_RI(U0,Chi_02,UChi_00,UChi_00) \
	 VMADD_RR_RI(U1,Chi_02,UChi_01,UChi_01) \
	 VMADD_RR_RI(U2,Chi_02,UChi_02,UChi_02) \
	 VMADD_RR_RI(U0,Chi_12,UChi_10,UChi_10) \
	 VMADD_RR_RI(U1,Chi_12,UChi_11,UChi_11) \
	 VMADD_RR_RI(U2,Chi_12,UChi_12,UChi_12) \
	 VMADD_MII_IR(U0,Chi_02,UChi_00,UChi_00) \
	 VMADD_MII_IR(U1,Chi_02,UChi_01,UChi_01) \
	 VMADD_MII_IR(U2,Chi_02,UChi_02,UChi_02) \
	 VMADD_MII_IR(U0,Chi_12,UChi_10,UChi_10) \
	 VMADD_MII_IR(U1,Chi_12,UChi_11,UChi_11) \
	 VMADD_MII_IR(U2,Chi_12,UChi_12,UChi_12) \
	 : : "r" (32*2), "r" (32*5), "r" (32*8), "r" (ub) ); \
  }

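/* Editorial sketch of what MULT_2SPIN computes: UChi = U * Chi for the two
 * half-spinor colour vectors, i.e. UChi_{s,i} = sum_j U_{i,j} Chi_{s,j}.
 * Each asm block handles one column j of the 3x3 link: the RR_RI multiply
 * forms the re(a)*b products and the MII_IR multiply-add folds in the
 * im(a)*b cross terms, so three blocks complete the complex mat-vec. The
 * offsets (0,96,192), (32,128,224), (64,160,256) appear to index nine
 * 32-byte complex pairs of the link; VLOAD/VSTORE without an f/d suffix
 * are assumed to be mapped onto VLOADf/VLOADd etc. by the including code. */
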
#define SAVE_RESULT(base,basep) { \
    uint64_t ub = ((uint64_t)base) - 32; \
    asm("mr " REP ",%0;\n\t" \
	"li " IMM ",32;\n\t" \
	VSTORE(IMM,REP,psi_00) \
	VSTORE(IMM,REP,psi_01) \
	VSTORE(IMM,REP,psi_02) \
	VSTORE(IMM,REP,psi_10) \
	VSTORE(IMM,REP,psi_11) \
	VSTORE(IMM,REP,psi_12) \
	VSTORE(IMM,REP,psi_20) \
	VSTORE(IMM,REP,psi_21) \
	VSTORE(IMM,REP,psi_22) \
	VSTORE(IMM,REP,psi_30) \
	VSTORE(IMM,REP,psi_31) \
	VSTORE(IMM,REP,psi_32) \
	: : "r" (ub) : "memory" ); \
  }

/*
 * Annoying BG/Q loads with no immediate indexing, and a big performance hit
 * when a second miss to an L1 line occurs
 */
#define LOAD_CHI(base) { \
    uint64_t ub = ((uint64_t)base) - 64; \
    asm("mr " REP ",%0;\n\t" \
	"li " IMM ",64;\n\t" \
	VLOAD(IMM,REP,Chi_00) \
	VLOAD(IMM,REP,Chi_02) \
	VLOAD(IMM,REP,Chi_11) : : "r" (ub) ); \
    ub = ((uint64_t)base) - 32; \
    asm("mr " REP ",%0;\n\t" \
	"li " IMM ",64;\n\t" \
	VLOAD(IMM,REP,Chimu_01) \
	VLOAD(IMM,REP,Chimu_10) \
	VLOAD(IMM,REP,Chimu_12) : : "r" (ub) ); \
  }

#define LOAD_CHIMU(base) { \
    uint64_t ub = ((uint64_t)base) - 64; \
    asm("mr " REP ",%0;\n\t" \
	"li " IMM ",64;\n\t" \
	VLOAD(IMM,REP,Chimu_00) \
	VLOAD(IMM,REP,Chimu_02) \
	VLOAD(IMM,REP,Chimu_11) \
	VLOAD(IMM,REP,Chimu_20) \
	VLOAD(IMM,REP,Chimu_22) \
	VLOAD(IMM,REP,Chimu_31) : : "r" (ub) ); \
    ub = ((uint64_t)base) - 32; \
    asm("mr " REP ",%0;\n\t" \
	"li " IMM ",64;\n\t" \
	VLOAD(IMM,REP,Chimu_01) \
	VLOAD(IMM,REP,Chimu_10) \
	VLOAD(IMM,REP,Chimu_12) \
	VLOAD(IMM,REP,Chimu_21) \
	VLOAD(IMM,REP,Chimu_30) \
	VLOAD(IMM,REP,Chimu_32) : : "r" (ub) ); \
  }
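
/* Editorial note: qvlfdux is an update-form load -- it adds IMM to REP
 * before loading -- so starting REP at base-64 (or base-32) and stepping by
 * 64 walks two interleaved stride-64 streams. Splitting the even and odd
 * 32-byte quads across two asm blocks avoids taking a second miss on the
 * same L1 line back-to-back, which is the penalty described above. */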

// hspin(0)=fspin(0)+timesI(fspin(3));
// hspin(1)=fspin(1)+timesI(fspin(2));
#define XP_PROJMEM(base) { \
    LOAD_CHIMU(base); \
    asm ( \
	 VONE(one) \
	 VMADD_MII_IR(one,Chimu_30,Chimu_00,Chi_00) \
	 VMADD_MII_IR(one,Chimu_31,Chimu_01,Chi_01) \
	 VMADD_MII_IR(one,Chimu_32,Chimu_02,Chi_02) \
	 VMADD_MII_IR(one,Chimu_20,Chimu_10,Chi_10) \
	 VMADD_MII_IR(one,Chimu_21,Chimu_11,Chi_11) \
	 VMADD_MII_IR(one,Chimu_22,Chimu_12,Chi_12) \
	 ); \
  }

#define XM_PROJMEM(base) { \
    LOAD_CHIMU(base); \
    asm ( \
	 VONE(one) \
	 VMADD_II_MIR(one,Chimu_30,Chimu_00,Chi_00) \
	 VMADD_II_MIR(one,Chimu_31,Chimu_01,Chi_01) \
	 VMADD_II_MIR(one,Chimu_32,Chimu_02,Chi_02) \
	 VMADD_II_MIR(one,Chimu_20,Chimu_10,Chi_10) \
	 VMADD_II_MIR(one,Chimu_21,Chimu_11,Chi_11) \
	 VMADD_II_MIR(one,Chimu_22,Chimu_12,Chi_12) \
	 ); \
  }

// hspin(0)=fspin(0)-fspin(3);
// hspin(1)=fspin(1)+fspin(2);
#define YP_PROJMEM(base) { \
    LOAD_CHIMU(base); \
    asm ( \
	 VSUB(Chimu_00,Chimu_30,Chi_00) \
	 VSUB(Chimu_01,Chimu_31,Chi_01) \
	 VSUB(Chimu_02,Chimu_32,Chi_02) \
	 VADD(Chimu_10,Chimu_20,Chi_10) \
	 VADD(Chimu_11,Chimu_21,Chi_11) \
	 VADD(Chimu_12,Chimu_22,Chi_12) \
	 ); \
  }

#define YM_PROJMEM(base) { \
    LOAD_CHIMU(base); \
    asm ( \
	 VADD(Chimu_00,Chimu_30,Chi_00) \
	 VADD(Chimu_01,Chimu_31,Chi_01) \
	 VADD(Chimu_02,Chimu_32,Chi_02) \
	 VSUB(Chimu_10,Chimu_20,Chi_10) \
	 VSUB(Chimu_11,Chimu_21,Chi_11) \
	 VSUB(Chimu_12,Chimu_22,Chi_12) \
	 ); \
  }

/*Gz
 *  0 0  i  0   [0]+-i[2]
 *  0 0  0 -i   [1]-+i[3]
 * -i 0  0  0
 *  0 i  0  0
 */
#define ZP_PROJMEM(base) { \
    LOAD_CHIMU(base); \
    asm ( \
	 VONE(one) \
	 VMADD_MII_IR(one,Chimu_20,Chimu_00,Chi_00) \
	 VMADD_MII_IR(one,Chimu_21,Chimu_01,Chi_01) \
	 VMADD_MII_IR(one,Chimu_22,Chimu_02,Chi_02) \
	 VMADD_II_MIR(one,Chimu_30,Chimu_10,Chi_10) \
	 VMADD_II_MIR(one,Chimu_31,Chimu_11,Chi_11) \
	 VMADD_II_MIR(one,Chimu_32,Chimu_12,Chi_12) \
	 ); \
  }

#define ZM_PROJMEM(base) { \
    LOAD_CHIMU(base); \
    asm ( \
	 VONE(one) \
	 VMADD_II_MIR(one,Chimu_20,Chimu_00,Chi_00) \
	 VMADD_II_MIR(one,Chimu_21,Chimu_01,Chi_01) \
	 VMADD_II_MIR(one,Chimu_22,Chimu_02,Chi_02) \
	 VMADD_MII_IR(one,Chimu_30,Chimu_10,Chi_10) \
	 VMADD_MII_IR(one,Chimu_31,Chimu_11,Chi_11) \
	 VMADD_MII_IR(one,Chimu_32,Chimu_12,Chi_12) \
	 ); \
  }

/*Gt
 * 0 0 1 0  [0]+-[2]
 * 0 0 0 1  [1]+-[3]
 * 1 0 0 0
 * 0 1 0 0
 */
#define TP_PROJMEM(base) { \
    LOAD_CHIMU(base); \
    asm ( \
	 VADD(Chimu_00,Chimu_20,Chi_00) \
	 VADD(Chimu_01,Chimu_21,Chi_01) \
	 VADD(Chimu_02,Chimu_22,Chi_02) \
	 VADD(Chimu_10,Chimu_30,Chi_10) \
	 VADD(Chimu_11,Chimu_31,Chi_11) \
	 VADD(Chimu_12,Chimu_32,Chi_12) \
	 ); \
  }

#define TM_PROJMEM(base) { \
    LOAD_CHIMU(base); \
    asm ( \
	 VSUB(Chimu_00,Chimu_20,Chi_00) \
	 VSUB(Chimu_01,Chimu_21,Chi_01) \
	 VSUB(Chimu_02,Chimu_22,Chi_02) \
	 VSUB(Chimu_10,Chimu_30,Chi_10) \
	 VSUB(Chimu_11,Chimu_31,Chi_11) \
	 VSUB(Chimu_12,Chimu_32,Chi_12) \
	 ); \
  }

/*
  fspin(0)=hspin(0);
  fspin(1)=hspin(1);
  fspin(2)=timesMinusI(hspin(1));
  fspin(3)=timesMinusI(hspin(0));

  fspin(0)+=hspin(0);
  fspin(1)+=hspin(1);
  fspin(2)-=timesI(hspin(1));
  fspin(3)-=timesI(hspin(0));
*/
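/* Editorial note: the first block above is the plain RECON form (psi is
 * overwritten), the second is the RECON_ACCUM form (psi accumulates this
 * direction's contribution on top of the previous ones). The lower two spin
 * components are rebuilt from the upper two using the gamma structure of
 * the projector, which is why only a half spinor moves through the stencil. */
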
#define XP_RECON { \
    asm( \
	VONE(one) \
	VMOV(psi_00,UChi_00) VMOV(psi_01,UChi_01) VMOV(psi_02,UChi_02) \
	VMOV(psi_10,UChi_10) VMOV(psi_11,UChi_11) VMOV(psi_12,UChi_12) \
	VZERO(psi_20) VZERO(psi_21) VZERO(psi_22) \
	VZERO(psi_30) VZERO(psi_31) VZERO(psi_32) \
	VMADD_II_MIR(one,UChi_10,psi_20,psi_20) \
	VMADD_II_MIR(one,UChi_11,psi_21,psi_21) \
	VMADD_II_MIR(one,UChi_12,psi_22,psi_22) \
	VMADD_II_MIR(one,UChi_00,psi_30,psi_30) \
	VMADD_II_MIR(one,UChi_01,psi_31,psi_31) \
	VMADD_II_MIR(one,UChi_02,psi_32,psi_32) \
	); \
  }

#define XM_RECON { \
    asm( \
	VONE(one) \
	VMOV(psi_00,UChi_00) VMOV(psi_01,UChi_01) VMOV(psi_02,UChi_02) \
	VMOV(psi_10,UChi_10) VMOV(psi_11,UChi_11) VMOV(psi_12,UChi_12) \
	VZERO(psi_20) VZERO(psi_21) VZERO(psi_22) \
	VZERO(psi_30) VZERO(psi_31) VZERO(psi_32) \
	VMADD_MII_IR(one,UChi_10,psi_20,psi_20) \
	VMADD_MII_IR(one,UChi_11,psi_21,psi_21) \
	VMADD_MII_IR(one,UChi_12,psi_22,psi_22) \
	VMADD_MII_IR(one,UChi_00,psi_30,psi_30) \
	VMADD_MII_IR(one,UChi_01,psi_31,psi_31) \
	VMADD_MII_IR(one,UChi_02,psi_32,psi_32) \
	); \
  }

#define XP_RECON_ACCUM { \
    asm( \
	VONE(one) \
	VADD(psi_00,UChi_00,psi_00) VADD(psi_01,UChi_01,psi_01) VADD(psi_02,UChi_02,psi_02) \
	VADD(psi_10,UChi_10,psi_10) VADD(psi_11,UChi_11,psi_11) VADD(psi_12,UChi_12,psi_12) \
	VMADD_II_MIR(one,UChi_10,psi_20,psi_20) \
	VMADD_II_MIR(one,UChi_11,psi_21,psi_21) \
	VMADD_II_MIR(one,UChi_12,psi_22,psi_22) \
	VMADD_II_MIR(one,UChi_00,psi_30,psi_30) \
	VMADD_II_MIR(one,UChi_01,psi_31,psi_31) \
	VMADD_II_MIR(one,UChi_02,psi_32,psi_32) \
	); \
  }

#define XM_RECON_ACCUM { \
    asm( \
	VONE(one) \
	VADD(psi_00,UChi_00,psi_00) VADD(psi_01,UChi_01,psi_01) VADD(psi_02,UChi_02,psi_02) \
	VADD(psi_10,UChi_10,psi_10) VADD(psi_11,UChi_11,psi_11) VADD(psi_12,UChi_12,psi_12) \
	VMADD_MII_IR(one,UChi_10,psi_20,psi_20) \
	VMADD_MII_IR(one,UChi_11,psi_21,psi_21) \
	VMADD_MII_IR(one,UChi_12,psi_22,psi_22) \
	VMADD_MII_IR(one,UChi_00,psi_30,psi_30) \
	VMADD_MII_IR(one,UChi_01,psi_31,psi_31) \
	VMADD_MII_IR(one,UChi_02,psi_32,psi_32) \
	); \
  }

// fspin(2)+=hspin(1);
// fspin(3)-=hspin(0);
#define YP_RECON_ACCUM { \
    asm( \
	VADD(psi_00,UChi_00,psi_00) VADD(psi_01,UChi_01,psi_01) VADD(psi_02,UChi_02,psi_02) \
	VADD(psi_10,UChi_10,psi_10) VADD(psi_11,UChi_11,psi_11) VADD(psi_12,UChi_12,psi_12) \
	VADD(psi_20,UChi_10,psi_20) VADD(psi_21,UChi_11,psi_21) VADD(psi_22,UChi_12,psi_22) \
	VSUB(psi_30,UChi_00,psi_30) VSUB(psi_31,UChi_01,psi_31) VSUB(psi_32,UChi_02,psi_32) \
	); \
  }
#define YM_RECON_ACCUM { \
    asm( \
	VADD(psi_00,UChi_00,psi_00) VADD(psi_01,UChi_01,psi_01) VADD(psi_02,UChi_02,psi_02) \
	VADD(psi_10,UChi_10,psi_10) VADD(psi_11,UChi_11,psi_11) VADD(psi_12,UChi_12,psi_12) \
	VSUB(psi_20,UChi_10,psi_20) VSUB(psi_21,UChi_11,psi_21) VSUB(psi_22,UChi_12,psi_22) \
	VADD(psi_30,UChi_00,psi_30) VADD(psi_31,UChi_01,psi_31) VADD(psi_32,UChi_02,psi_32) \
	); \
  }

// fspin(2)-=timesI(hspin(0));
// fspin(3)+=timesI(hspin(1));
#define ZP_RECON_ACCUM { \
    asm( \
	VONE(one) \
	VADD(psi_00,UChi_00,psi_00) VADD(psi_01,UChi_01,psi_01) VADD(psi_02,UChi_02,psi_02) \
	VADD(psi_10,UChi_10,psi_10) VADD(psi_11,UChi_11,psi_11) VADD(psi_12,UChi_12,psi_12) \
	VMADD_II_MIR(one,UChi_00,psi_20,psi_20) \
	VMADD_II_MIR(one,UChi_01,psi_21,psi_21) \
	VMADD_II_MIR(one,UChi_02,psi_22,psi_22) \
	VMADD_MII_IR(one,UChi_10,psi_30,psi_30) \
	VMADD_MII_IR(one,UChi_11,psi_31,psi_31) \
	VMADD_MII_IR(one,UChi_12,psi_32,psi_32) \
	); \
  }

#define ZM_RECON_ACCUM { \
    asm( \
	VONE(one) \
	VADD(psi_00,UChi_00,psi_00) VADD(psi_01,UChi_01,psi_01) VADD(psi_02,UChi_02,psi_02) \
	VADD(psi_10,UChi_10,psi_10) VADD(psi_11,UChi_11,psi_11) VADD(psi_12,UChi_12,psi_12) \
	VMADD_MII_IR(one,UChi_00,psi_20,psi_20) \
	VMADD_MII_IR(one,UChi_01,psi_21,psi_21) \
	VMADD_MII_IR(one,UChi_02,psi_22,psi_22) \
	VMADD_II_MIR(one,UChi_10,psi_30,psi_30) \
	VMADD_II_MIR(one,UChi_11,psi_31,psi_31) \
	VMADD_II_MIR(one,UChi_12,psi_32,psi_32) \
	); \
  }

// fspin(2)+=hspin(0);
// fspin(3)+=hspin(1);
#define TP_RECON_ACCUM { \
    asm( \
	VADD(psi_00,UChi_00,psi_00) VADD(psi_01,UChi_01,psi_01) VADD(psi_02,UChi_02,psi_02) \
	VADD(psi_10,UChi_10,psi_10) VADD(psi_11,UChi_11,psi_11) VADD(psi_12,UChi_12,psi_12) \
	VADD(psi_20,UChi_00,psi_20) VADD(psi_21,UChi_01,psi_21) VADD(psi_22,UChi_02,psi_22) \
	VADD(psi_30,UChi_10,psi_30) VADD(psi_31,UChi_11,psi_31) VADD(psi_32,UChi_12,psi_32) \
	); \
  }

#define TM_RECON_ACCUM { \
    asm( \
	VONE(one) \
	VADD(psi_00,UChi_00,psi_00) VADD(psi_01,UChi_01,psi_01) VADD(psi_02,UChi_02,psi_02) \
	VADD(psi_10,UChi_10,psi_10) VADD(psi_11,UChi_11,psi_11) VADD(psi_12,UChi_12,psi_12) \
	VSUB(psi_20,UChi_00,psi_20) VSUB(psi_21,UChi_01,psi_21) VSUB(psi_22,UChi_02,psi_22) \
	VSUB(psi_30,UChi_10,psi_30) VSUB(psi_31,UChi_11,psi_31) VSUB(psi_32,UChi_12,psi_32) \
	); \
  }

uint64_t GetPFInfo(int nent,int plocal);
uint64_t GetInfo(int ptype,int local,int perm,int Xp,int ent,int plocal);

#define COMPLEX_TYPE int
int signs[4];

void testme(int osites,int ssU)
{
  int local, perm, ptype;
  uint64_t base;
  uint64_t basep;
  const uint64_t plocal =(uint64_t) & in._odata[0];

  //  vComplexF isigns[2] = { signs[0], signs[1] };
  // COMPLEX_TYPE is vComplexF or vComplexD depending
  // on the chosen precision
  COMPLEX_TYPE *isigns = &signs[0];

  MASK_REGS;
  int ss;
  int nmax=osites;
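  // Editorial note: testme() is a skeleton driver, not production code: it
  // walks each site through all eight stencil directions (Xp..Tm),
  // projecting, multiplying by the gauge link, and reconstructing. The
  // symbols in, out, Ls, Xp..Tm, BYTES_PER_WORD and KERNEL_DAG are expected
  // from the translation unit that includes this header.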
  for(int site=0;site<nmax;site++) {
    int sU =ssU;
    int ssn=ssU+1;
    if(ssn>=nmax) ssn=0;
    int sUn=ssn;
    for(int s=0;s<Ls;s++) {
      ss =sU*Ls+s;
      ssn=sUn*Ls+s;
      ////////////////////////////////
      // Xp
      ////////////////////////////////
      int ent=ss*8;// 2*Ndim
      int nent=ssn*8;

      PF_GAUGE(Xp);
      base = GetInfo(ptype,local,perm,Xp,ent,plocal); ent++;
      PREFETCH1_CHIMU(base);

      basep = GetPFInfo(nent,plocal); nent++;
      if ( local ) {
	LOAD64(%r10,isigns);
#ifdef KERNEL_DAG
	XP_PROJMEM(base);
#else
	XM_PROJMEM(base);
#endif
	MAYBEPERM(PERMUTE_DIR3,perm);
      } else {
	LOAD_CHI(base);
      }
      base = GetInfo(ptype,local,perm,Yp,ent,plocal); ent++;
      PREFETCH_CHIMU(base);
      {
	MULT_2SPIN_DIR_PFXP(Xp,basep);
      }
      LOAD64(%r10,isigns);
#ifdef KERNEL_DAG
      XP_RECON;
#else
      XM_RECON;
#endif
      ////////////////////////////////
      // Yp
      ////////////////////////////////
      basep = GetPFInfo(nent,plocal); nent++;
      if ( local ) {
	LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
	YP_PROJMEM(base);
#else
	YM_PROJMEM(base);
#endif
	MAYBEPERM(PERMUTE_DIR2,perm);
      } else {
	LOAD_CHI(base);
      }
      base = GetInfo(ptype,local,perm,Zp,ent,plocal); ent++;
      PREFETCH_CHIMU(base);
      {
	MULT_2SPIN_DIR_PFYP(Yp,basep);
      }
      LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
      YP_RECON_ACCUM;
#else
      YM_RECON_ACCUM;
#endif

      ////////////////////////////////
      // Zp
      ////////////////////////////////
      basep = GetPFInfo(nent,plocal); nent++;
      if ( local ) {
	LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
	ZP_PROJMEM(base);
#else
	ZM_PROJMEM(base);
#endif
	MAYBEPERM(PERMUTE_DIR1,perm);
      } else {
	LOAD_CHI(base);
      }
      base = GetInfo(ptype,local,perm,Tp,ent,plocal); ent++;
      PREFETCH_CHIMU(base);
      {
	MULT_2SPIN_DIR_PFZP(Zp,basep);
      }
      LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
      ZP_RECON_ACCUM;
#else
      ZM_RECON_ACCUM;
#endif

      ////////////////////////////////
      // Tp
      ////////////////////////////////
      basep = GetPFInfo(nent,plocal); nent++;
      if ( local ) {
	LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
	TP_PROJMEM(base);
#else
	TM_PROJMEM(base);
#endif
	MAYBEPERM(PERMUTE_DIR0,perm);
      } else {
	LOAD_CHI(base);
      }
      base = GetInfo(ptype,local,perm,Xm,ent,plocal); ent++;
      PREFETCH_CHIMU(base);
      {
	MULT_2SPIN_DIR_PFTP(Tp,basep);
      }
      LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
      TP_RECON_ACCUM;
#else
      TM_RECON_ACCUM;
#endif

      ////////////////////////////////
      // Xm
      ////////////////////////////////
#ifndef STREAM_STORE
      basep= (uint64_t) &out._odata[ss];
#endif
      //  basep= GetPFInfo(nent,plocal); nent++;
      if ( local ) {
	LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
	XM_PROJMEM(base);
#else
	XP_PROJMEM(base);
#endif
	MAYBEPERM(PERMUTE_DIR3,perm);
      } else {
	LOAD_CHI(base);
      }
      base = GetInfo(ptype,local,perm,Ym,ent,plocal); ent++;
      PREFETCH_CHIMU(base);
      {
	MULT_2SPIN_DIR_PFXM(Xm,basep);
      }
      LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
      XM_RECON_ACCUM;
#else
      XP_RECON_ACCUM;
#endif

      ////////////////////////////////
      // Ym
      ////////////////////////////////
      basep= GetPFInfo(nent,plocal); nent++;
      if ( local ) {
	LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
	YM_PROJMEM(base);
#else
	YP_PROJMEM(base);
#endif
	MAYBEPERM(PERMUTE_DIR2,perm);
      } else {
	LOAD_CHI(base);
      }
      base = GetInfo(ptype,local,perm,Zm,ent,plocal); ent++;
      PREFETCH_CHIMU(base);
      {
	MULT_2SPIN_DIR_PFYM(Ym,basep);
      }
      LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
      YM_RECON_ACCUM;
#else
      YP_RECON_ACCUM;
#endif

      ////////////////////////////////
      // Zm
      ////////////////////////////////
      basep= GetPFInfo(nent,plocal); nent++;
      if ( local ) {
	LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
	ZM_PROJMEM(base);
#else
	ZP_PROJMEM(base);
#endif
	MAYBEPERM(PERMUTE_DIR1,perm);
      } else {
	LOAD_CHI(base);
      }
      base = GetInfo(ptype,local,perm,Tm,ent,plocal); ent++;
      PREFETCH_CHIMU(base);
      {
	MULT_2SPIN_DIR_PFZM(Zm,basep);
      }
      LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
      ZM_RECON_ACCUM;
#else
      ZP_RECON_ACCUM;
#endif

      ////////////////////////////////
      // Tm
      ////////////////////////////////
      basep= GetPFInfo(nent,plocal); nent++;
      if ( local ) {
	LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
	TM_PROJMEM(base);
#else
	TP_PROJMEM(base);
#endif
	MAYBEPERM(PERMUTE_DIR0,perm);
      } else {
	LOAD_CHI(base);
      }
      base= (uint64_t) &out._odata[ss];
#ifndef STREAM_STORE
      PREFETCH_CHIMU(base);
#endif
      {
	MULT_2SPIN_DIR_PFTM(Tm,basep);
      }
      LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
      TM_RECON_ACCUM;
#else
      TP_RECON_ACCUM;
#endif

      basep= GetPFInfo(nent,plocal); nent++;
      SAVE_RESULT(base,basep);

    }
    ssU++;
  }
}

#endif
Grid/simd/Grid_avx.h (new file, 769 lines)
@@ -0,0 +1,769 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/simd/Grid_avx.h

    Copyright (C) 2015

    Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
    Author: Guido Cossu <cossu@iroiro-pc.kek.jp>
    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    Author: neo <cossu@post.kek.jp>
    Author: paboyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory

*************************************************************************************/
/* END LEGAL */
#include <immintrin.h>
#ifdef AVXFMA4
#include <x86intrin.h>
#endif

// _mm256_set_m128i(hi,lo); // not defined in all versions of immintrin.h
#ifndef _mm256_set_m128i
#define _mm256_set_m128i(hi,lo) _mm256_insertf128_si256(_mm256_castsi128_si256(lo),(hi),1)
#endif

namespace Grid {
namespace Optimization {

  template<class vtype>
  union uconv {
    __m256 f;
    vtype v;
  };

  union u256f {
    __m256 v;
    float f[8];
  };

  union u256d {
    __m256d v;
    double f[4];
  };

  struct Vsplat{
    // Complex float
    inline __m256 operator()(float a, float b) {
      return _mm256_set_ps(b,a,b,a,b,a,b,a);
    }
    // Real float
    inline __m256 operator()(float a){
      return _mm256_set_ps(a,a,a,a,a,a,a,a);
    }
    // Complex double
    inline __m256d operator()(double a, double b){
      return _mm256_set_pd(b,a,b,a);
    }
    // Real double
    inline __m256d operator()(double a){
      return _mm256_set_pd(a,a,a,a);
    }
    // Integer
    inline __m256i operator()(Integer a){
      return _mm256_set1_epi32(a);
    }
  };

  struct Vstore{
    // Float
    inline void operator()(__m256 a, float* F){
      _mm256_store_ps(F,a);
    }
    // Double
    inline void operator()(__m256d a, double* D){
      _mm256_store_pd(D,a);
    }
    // Integer
    inline void operator()(__m256i a, Integer* I){
      _mm256_store_si256((__m256i*)I,a);
    }
  };

  struct Vstream{
    // Float
    inline void operator()(float * a, __m256 b){
      _mm256_stream_ps(a,b);
    }
    // Double
    inline void operator()(double * a, __m256d b){
      _mm256_stream_pd(a,b);
    }
  };

  struct Vset{
    // Complex float
    inline __m256 operator()(Grid::ComplexF *a){
      return _mm256_set_ps(a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
    }
    // Complex double
    inline __m256d operator()(Grid::ComplexD *a){
      return _mm256_set_pd(a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
    }
    // Real float
    inline __m256 operator()(float *a){
      return _mm256_set_ps(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
    }
    // Real double
    inline __m256d operator()(double *a){
      return _mm256_set_pd(a[3],a[2],a[1],a[0]);
    }
    // Integer
    inline __m256i operator()(Integer *a){
      return _mm256_set_epi32(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
    }
  };

  template <typename Out_type, typename In_type>
  struct Reduce{
    // Need templated class to overload output type
    // General form must generate error if compiled
    inline Out_type operator()(In_type in){
      printf("Error, using wrong Reduce function\n");
      exit(1);
      return 0;
    }
  };

  /////////////////////////////////////////////////////
  // Arithmetic operations
  /////////////////////////////////////////////////////
  struct Sum{
    // Complex/Real float
    inline __m256 operator()(__m256 a, __m256 b){
      return _mm256_add_ps(a,b);
    }
    // Complex/Real double
    inline __m256d operator()(__m256d a, __m256d b){
      return _mm256_add_pd(a,b);
    }
    // Integer
    inline __m256i operator()(__m256i a, __m256i b){
#if defined (AVX1) || defined (AVXFMA) || defined (AVXFMA4)
      __m128i a0,a1;
      __m128i b0,b1;
      a0 = _mm256_extractf128_si256(a,0);
      b0 = _mm256_extractf128_si256(b,0);
      a1 = _mm256_extractf128_si256(a,1);
      b1 = _mm256_extractf128_si256(b,1);
      a0 = _mm_add_epi32(a0,b0);
      a1 = _mm_add_epi32(a1,b1);
      return _mm256_set_m128i(a1,a0);
#endif
#if defined (AVX2)
      return _mm256_add_epi32(a,b);
#endif
    }
  };

  struct Sub{
    // Complex/Real float
    inline __m256 operator()(__m256 a, __m256 b){
      return _mm256_sub_ps(a,b);
    }
    // Complex/Real double
    inline __m256d operator()(__m256d a, __m256d b){
      return _mm256_sub_pd(a,b);
    }
    // Integer
    inline __m256i operator()(__m256i a, __m256i b){
#if defined (AVX1) || defined (AVXFMA) || defined (AVXFMA4)
      __m128i a0,a1;
      __m128i b0,b1;
      a0 = _mm256_extractf128_si256(a,0);
      b0 = _mm256_extractf128_si256(b,0);
      a1 = _mm256_extractf128_si256(a,1);
      b1 = _mm256_extractf128_si256(b,1);
      a0 = _mm_sub_epi32(a0,b0);
      a1 = _mm_sub_epi32(a1,b1);
      return _mm256_set_m128i(a1,a0);
#endif
#if defined (AVX2)
      return _mm256_sub_epi32(a,b);
#endif
    }
  };

  struct MultRealPart{
    inline __m256 operator()(__m256 a, __m256 b){
      __m256 ymm0;
      ymm0 = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
      return _mm256_mul_ps(ymm0,b);                                // ymm0 <- ar bi, ar br
    }
    inline __m256d operator()(__m256d a, __m256d b){
      __m256d ymm0;
      ymm0 = _mm256_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, ar,ar  b'00,00
      return _mm256_mul_pd(ymm0,b);      // ymm0 <- ar bi, ar br
    }
  };
  struct MaddRealPart{
    inline __m256 operator()(__m256 a, __m256 b, __m256 c){
      __m256 ymm0 = _mm256_moveldup_ps(a); // ymm0 <- ar ar,
      return _mm256_add_ps(_mm256_mul_ps( ymm0, b),c);
    }
    inline __m256d operator()(__m256d a, __m256d b, __m256d c){
      __m256d ymm0 = _mm256_shuffle_pd( a, a, 0x0 );
      return _mm256_add_pd(_mm256_mul_pd( ymm0, b),c);
    }
  };

  struct MultComplex{
    // Complex float
    inline __m256 operator()(__m256 a, __m256 b){
#if defined (AVX1)
      __m256 ymm0,ymm1,ymm2;
      ymm0 = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
      ymm0 = _mm256_mul_ps(ymm0,b);                                // ymm0 <- ar bi, ar br
      // FIXME AVX2 could MAC
      ymm1 = _mm256_shuffle_ps(b,b,_MM_SELECT_FOUR_FOUR(2,3,0,1)); // ymm1 <- br,bi
      ymm2 = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(3,3,1,1)); // ymm2 <- ai,ai
      ymm1 = _mm256_mul_ps(ymm1,ymm2);                             // ymm1 <- br ai, ai bi
      return _mm256_addsub_ps(ymm0,ymm1);
#endif
#if defined (AVXFMA4)
      __m256 a_real = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ar ar,
      __m256 a_imag = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(3,3,1,1)); // ai ai
      __m256 tmp    = _mm256_shuffle_ps( b,b, _MM_SELECT_FOUR_FOUR(2,3,0,1));
      a_imag = _mm256_mul_ps( a_imag, tmp );         // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
      return _mm256_maddsub_ps( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr
#endif
#if defined (AVX2) || defined (AVXFMA)
      __m256 a_real = _mm256_moveldup_ps( a ); // Ar Ar
      __m256 a_imag = _mm256_movehdup_ps( a ); // Ai Ai
      a_imag = _mm256_mul_ps( a_imag, _mm256_shuffle_ps( b,b, _MM_SELECT_FOUR_FOUR(2,3,0,1) )); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
      return _mm256_fmaddsub_ps( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr
#endif
    }
    // Complex double
    inline __m256d operator()(__m256d a, __m256d b) {
      // Multiplication of (ak+ibk)*(ck+idk); a + i b can be stored as a data structure.
      // From the Intel optimisation reference guide:
      /*
	movsldup xmm0, Src1;     load real parts into the destination: a1, a1, a0, a0
	movaps   xmm1, src2;     load the 2nd pair of complex values:  d1, c1, d0, c0
	mulps    xmm0, xmm1;     temporary results: a1d1, a1c1, a0d0, a0c0
	shufps   xmm1, xmm1, b1; reorder the real and imaginary parts: c1, d1, c0, d0
	movshdup xmm2, Src1;     load the imaginary parts into the destination: b1, b1, b0, b0
	mulps    xmm2, xmm1;     temporary results: b1c1, b1d1, b0c0, b0d0
	addsubps xmm0, xmm2;     b1c1+a1d1, a1c1-b1d1, b0c0+a0d0, a0c0-b0d0

	VSHUFPD (VEX.256 encoded version)
	IF IMM0[0] = 0 THEN DEST[63:0]=SRC1[63:0]       ELSE DEST[63:0]=SRC1[127:64]     FI;
	IF IMM0[1] = 0 THEN DEST[127:64]=SRC2[63:0]     ELSE DEST[127:64]=SRC2[127:64]   FI;
	IF IMM0[2] = 0 THEN DEST[191:128]=SRC1[191:128] ELSE DEST[191:128]=SRC1[255:192] FI;
	IF IMM0[3] = 0 THEN DEST[255:192]=SRC2[191:128] ELSE DEST[255:192]=SRC2[255:192] FI; // 0x5 r<->i ; 0xC unchanged
      */
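      /* Editorial recap: with a = (ar,ai) and b = (br,bi) interleaved per
       * lane, the sequences below compute
       *   (ar + i*ai)(br + i*bi) = (ar*br - ai*bi) + i*(ar*bi + ai*br)
       * as addsub( (ar*br, ar*bi), (ai*bi, ai*br) ): the even lane
       * subtracts, the odd lane adds. E.g. (1+2i)(3+4i): even lane
       * 1*3 - 2*4 = -5, odd lane 1*4 + 2*3 = 10, giving -5+10i. */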
#if defined (AVX1)
      __m256d ymm0,ymm1,ymm2;
      ymm0 = _mm256_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, ar,ar  b'00,00
      ymm0 = _mm256_mul_pd(ymm0,b);      // ymm0 <- ar bi, ar br
      ymm1 = _mm256_shuffle_pd(b,b,0x5); // ymm1 <- br,bi          b'01,01
      ymm2 = _mm256_shuffle_pd(a,a,0xF); // ymm2 <- ai,ai          b'11,11
      ymm1 = _mm256_mul_pd(ymm1,ymm2);   // ymm1 <- br ai, ai bi
      return _mm256_addsub_pd(ymm0,ymm1);
#endif
#if defined (AVXFMA4)
      __m256d a_real = _mm256_shuffle_pd(a,a,0x0); // ar ar
      __m256d a_imag = _mm256_shuffle_pd(a,a,0xF); // ai ai
      a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
      return _mm256_maddsub_pd( a_real, b, a_imag );                 // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr
#endif
#if defined (AVX2) || defined (AVXFMA)
      __m256d a_real = _mm256_movedup_pd( a );     // Ar Ar
      __m256d a_imag = _mm256_shuffle_pd(a,a,0xF); // ai ai
      a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
      return _mm256_fmaddsub_pd( a_real, b, a_imag );                // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr
#endif
    }

  };

#if 0
  struct ComplexDot {

    inline void Prep(__m256 ari,__m256 &air) {
      cdotRIperm(ari,air);
    }
    inline void Mul(__m256 ari,__m256 air,__m256 b,__m256 &riir,__m256 &iirr) {
      riir=air*b;
      iirr=ari*b;
    };
    inline void Madd(__m256 ari,__m256 air,__m256 b,__m256 &riir,__m256 &iirr) {
      mac(riir,air,b);
      mac(iirr,ari,b);
    }
    inline void End(__m256 ari,__m256 &air) {
      // cdotRI
    }

  };
#endif

  struct Mult{

    inline void mac(__m256 &a, __m256 b, __m256 c){
#if defined (AVX1)
      a= _mm256_add_ps(_mm256_mul_ps(b,c),a);
#endif
#if defined (AVXFMA4)
      a= _mm256_macc_ps(b,c,a);
#endif
#if defined (AVX2) || defined (AVXFMA)
      a= _mm256_fmadd_ps( b, c, a);
#endif
    }

    inline void mac(__m256d &a, __m256d b, __m256d c){
#if defined (AVX1)
      a= _mm256_add_pd(_mm256_mul_pd(b,c),a);
#endif
#if defined (AVXFMA4)
      a= _mm256_macc_pd(b,c,a);
#endif
#if defined (AVX2) || defined (AVXFMA)
      a= _mm256_fmadd_pd( b, c, a);
#endif
    }

    // Real float
    inline __m256 operator()(__m256 a, __m256 b){
      return _mm256_mul_ps(a,b);
    }
    // Real double
    inline __m256d operator()(__m256d a, __m256d b){
      return _mm256_mul_pd(a,b);
    }
    // Integer
    inline __m256i operator()(__m256i a, __m256i b){
#if defined (AVX1) || defined (AVXFMA) || defined (AVXFMA4)
      __m128i a0,a1;
      __m128i b0,b1;
      a0 = _mm256_extractf128_si256(a,0);
      b0 = _mm256_extractf128_si256(b,0);
      a1 = _mm256_extractf128_si256(a,1);
      b1 = _mm256_extractf128_si256(b,1);
      a0 = _mm_mullo_epi32(a0,b0);
      a1 = _mm_mullo_epi32(a1,b1);
      return _mm256_set_m128i(a1,a0);
#endif
#if defined (AVX2)
      return _mm256_mullo_epi32(a,b);
#endif
    }
  };

  struct Div {
    // Real float
    inline __m256 operator()(__m256 a, __m256 b) {
      return _mm256_div_ps(a, b);
    }
    // Real double
    inline __m256d operator()(__m256d a, __m256d b){
      return _mm256_div_pd(a,b);
    }
  };

  struct Conj{
    // Complex single
    inline __m256 operator()(__m256 in){
      return _mm256_xor_ps(_mm256_addsub_ps(_mm256_setzero_ps(),in), _mm256_set1_ps(-0.f));
    }
    // Complex double
    inline __m256d operator()(__m256d in){
      return _mm256_xor_pd(_mm256_addsub_pd(_mm256_setzero_pd(),in), _mm256_set1_pd(-0.));
    }
    // do not define for integer input
  };

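  /* Editorial note on the Conj trick: addsub(0,in) yields (-re, im) in each
   * complex lane, and xor-ing with a register of -0.0 flips every sign bit,
   * leaving (re, -im) -- the conjugate -- with no multiplies. */
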
  struct TimesMinusI{
    // Complex single
    inline __m256 operator()(__m256 in, __m256 ret){
      __m256 tmp =_mm256_addsub_ps(_mm256_setzero_ps(),in);            // -r,i
      return _mm256_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(2,3,0,1)); //  i,-r
    }
    // Complex double
    inline __m256d operator()(__m256d in, __m256d ret){
      __m256d tmp = _mm256_addsub_pd(_mm256_setzero_pd(),in); // -r,i
      return _mm256_shuffle_pd(tmp,tmp,0x5);                  //  i,-r
    }
  };

  struct TimesI{
    // Complex single
    inline __m256 operator()(__m256 in, __m256 ret){
      __m256 tmp =_mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); // i,r
      return _mm256_addsub_ps(_mm256_setzero_ps(),tmp);                   // -i,r
    }
    // Complex double
    inline __m256d operator()(__m256d in, __m256d ret){
      __m256d tmp = _mm256_shuffle_pd(in,in,0x5);       // i,r
      return _mm256_addsub_pd(_mm256_setzero_pd(),tmp); // -i,r
    }
  };

  //////////////////////////////////////////////
  // Some Template specialization
  //////////////////////////////////////////////

  struct Permute{

    static inline __m256 Permute0(__m256 in){
      return _mm256_permute2f128_ps(in,in,0x01); // ABCD EFGH -> EFGH ABCD
    };
    static inline __m256 Permute1(__m256 in){
      return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2)); // ABCD EFGH -> CDAB GHEF
    };
    static inline __m256 Permute2(__m256 in){
      return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); // ABCD EFGH -> BADC FEHG
    };
    static inline __m256 Permute3(__m256 in){
      return in;
    };

    static inline __m256d Permute0(__m256d in){
      return _mm256_permute2f128_pd(in,in,0x01); // AB CD -> CD AB
    };
    static inline __m256d Permute1(__m256d in){  // AB CD -> BA DC
      return _mm256_shuffle_pd(in,in,0x5);
    };
    static inline __m256d Permute2(__m256d in){
      return in;
    };
    static inline __m256d Permute3(__m256d in){
      return in;
    };
  };
#define USE_FP16
  struct PrecisionChange {
    static inline __m256i StoH (__m256 a,__m256 b) {
      __m256i h;
#ifdef USE_FP16
      __m128i ha = _mm256_cvtps_ph(a,0);
      __m128i hb = _mm256_cvtps_ph(b,0);
      h = (__m256i) _mm256_castps128_ps256((__m128)ha);
      h = (__m256i) _mm256_insertf128_ps((__m256)h,(__m128)hb,1);
#else
      assert(0);
#endif
      return h;
    }
    static inline void HtoS (__m256i h,__m256 &sa,__m256 &sb) {
#ifdef USE_FP16
      sa = _mm256_cvtph_ps((__m128i)_mm256_extractf128_ps((__m256)h,0));
      sb = _mm256_cvtph_ps((__m128i)_mm256_extractf128_ps((__m256)h,1));
#else
      assert(0);
#endif
    }
    static inline __m256 DtoS (__m256d a,__m256d b) {
      __m128 sa = _mm256_cvtpd_ps(a);
      __m128 sb = _mm256_cvtpd_ps(b);
      __m256 s  = _mm256_castps128_ps256(sa);
      s = _mm256_insertf128_ps(s,sb,1);
      return s;
    }
    static inline void StoD (__m256 s,__m256d &a,__m256d &b) {
      a = _mm256_cvtps_pd(_mm256_extractf128_ps(s,0));
      b = _mm256_cvtps_pd(_mm256_extractf128_ps(s,1));
    }
    static inline __m256i DtoH (__m256d a,__m256d b,__m256d c,__m256d d) {
      __m256 sa,sb;
      sa = DtoS(a,b);
      sb = DtoS(c,d);
      return StoH(sa,sb);
    }
    static inline void HtoD (__m256i h,__m256d &a,__m256d &b,__m256d &c,__m256d &d) {
      __m256 sa,sb;
      HtoS(h,sa,sb);
      StoD(sa,a,b);
      StoD(sb,c,d);
    }
  };
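  /* Editorial note: StoH packs two float vectors into one register of
   * sixteen IEEE half-precision values via _mm256_cvtps_ph (immediate 0
   * selects round-to-nearest-even), and HtoS is the inverse pair of
   * _mm256_cvtph_ps calls; DtoH/HtoD simply chain these conversions
   * through single precision. */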
  struct Exchange{
    // 3210 ordering
    static inline void Exchange0(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){
      // Invertible
      // AB CD -> AC BD
      // AC BD -> AB CD
      out1= _mm256_permute2f128_ps(in1,in2,0x20);
      out2= _mm256_permute2f128_ps(in1,in2,0x31);
    };
    static inline void Exchange1(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){
      // Invertible
      // ABCD EFGH -> ABEF CDGH
      // ABEF CDGH -> ABCD EFGH
      out1= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0));
      out2= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2));
    };
    static inline void Exchange2(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){
      // A single shuffle pair gives ABCD EFGH -> ACEG BDFH, which is not
      // inverted by applying the same operation again; the invertible form
      // needed is AECG BFDH -> ABCD EFGH, so a second in-register shuffle
      // reorders each half:
      out1= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0));   /*ACEG*/
      out2= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1));   /*BDFH*/
      out1= _mm256_shuffle_ps(out1,out1,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/
      out2= _mm256_shuffle_ps(out2,out2,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*BFDH*/
    };
    static inline void Exchange3(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){
      assert(0);
      return;
    };

    static inline void Exchange0(__m256d &out1,__m256d &out2,__m256d in1,__m256d in2){
      out1= _mm256_permute2f128_pd(in1,in2,0x20);
      out2= _mm256_permute2f128_pd(in1,in2,0x31);
      return;
    };
    static inline void Exchange1(__m256d &out1,__m256d &out2,__m256d in1,__m256d in2){
      out1= _mm256_shuffle_pd(in1,in2,0x0);
      out2= _mm256_shuffle_pd(in1,in2,0xF);
    };
    static inline void Exchange2(__m256d &out1,__m256d &out2,__m256d in1,__m256d in2){
      assert(0);
      return;
    };
    static inline void Exchange3(__m256d &out1,__m256d &out2,__m256d in1,__m256d in2){
      assert(0);
      return;
    };
  };

#if defined (AVX2)
#define _mm256_alignr_epi32_grid(ret,a,b,n) ret = (__m256)  _mm256_alignr_epi8((__m256i)a,(__m256i)b,(n*4)%16)
#define _mm256_alignr_epi64_grid(ret,a,b,n) ret = (__m256d) _mm256_alignr_epi8((__m256i)a,(__m256i)b,(n*8)%16)
#endif

#if defined (AVX1) || defined (AVXFMA)
#define _mm256_alignr_epi32_grid(ret,a,b,n) { \
    __m128 aa, bb; \
    \
    aa  = _mm256_extractf128_ps(a,1); \
    bb  = _mm256_extractf128_ps(b,1); \
    aa  = (__m128)_mm_alignr_epi8((__m128i)aa,(__m128i)bb,(n*4)%16); \
    ret = _mm256_insertf128_ps(ret,aa,1); \
    \
    aa  = _mm256_extractf128_ps(a,0); \
    bb  = _mm256_extractf128_ps(b,0); \
    aa  = (__m128)_mm_alignr_epi8((__m128i)aa,(__m128i)bb,(n*4)%16); \
    ret = _mm256_insertf128_ps(ret,aa,0); \
  }

#define _mm256_alignr_epi64_grid(ret,a,b,n) { \
    __m128d aa, bb; \
    \
    aa  = _mm256_extractf128_pd(a,1); \
    bb  = _mm256_extractf128_pd(b,1); \
    aa  = (__m128d)_mm_alignr_epi8((__m128i)aa,(__m128i)bb,(n*8)%16); \
    ret = _mm256_insertf128_pd(ret,aa,1); \
    \
    aa  = _mm256_extractf128_pd(a,0); \
    bb  = _mm256_extractf128_pd(b,0); \
    aa  = (__m128d)_mm_alignr_epi8((__m128i)aa,(__m128i)bb,(n*8)%16); \
    ret = _mm256_insertf128_pd(ret,aa,0); \
  }

#endif

  struct Rotate{

    static inline __m256 rotate(__m256 in,int n){
      switch(n){
      case 0: return tRotate<0>(in); break;
      case 1: return tRotate<1>(in); break;
      case 2: return tRotate<2>(in); break;
      case 3: return tRotate<3>(in); break;
      case 4: return tRotate<4>(in); break;
      case 5: return tRotate<5>(in); break;
      case 6: return tRotate<6>(in); break;
      case 7: return tRotate<7>(in); break;
      default: assert(0);
      }
    }
    static inline __m256d rotate(__m256d in,int n){
      switch(n){
      case 0: return tRotate<0>(in); break;
      case 1: return tRotate<1>(in); break;
      case 2: return tRotate<2>(in); break;
      case 3: return tRotate<3>(in); break;
      default: assert(0);
      }
    }

    template<int n>
    static inline __m256 tRotate(__m256 in){
      __m256 tmp = Permute::Permute0(in);
      __m256 ret;
      if ( n > 3 ) {
	_mm256_alignr_epi32_grid(ret,in,tmp,n);
      } else {
	_mm256_alignr_epi32_grid(ret,tmp,in,n);
      }
      return ret;
    }

    template<int n>
    static inline __m256d tRotate(__m256d in){
      __m256d tmp = Permute::Permute0(in);
      __m256d ret;
      if ( n > 1 ) {
	_mm256_alignr_epi64_grid(ret,in,tmp,n);
      } else {
	_mm256_alignr_epi64_grid(ret,tmp,in,n);
      }
      return ret;
    }

  };

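  /* Editorial example: rotate(in,n) moves each element down by n slots with
   * wraparound, so for in = {a0,a1,...,a7}, rotate(in,3) should yield
   * {a3,a4,a5,a6,a7,a0,a1,a2}. Permute0 swaps the 128-bit halves and the
   * alignr macro then stitches the shifted elements across the lane
   * boundary. */
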
  //Complex float Reduce
  template<>
  inline Grid::ComplexF Reduce<Grid::ComplexF, __m256>::operator()(__m256 in){
    __m256 v1,v2;
    v1 = Optimization::Permute::Permute0(in); // avx 256; quad complex single
    v1 = _mm256_add_ps(v1,in);
    v2 = Optimization::Permute::Permute1(v1);
    v1 = _mm256_add_ps(v1,v2);
    u256f conv; conv.v = v1;
    return Grid::ComplexF(conv.f[0],conv.f[1]);
  }

  //Real float Reduce
  template<>
  inline Grid::RealF Reduce<Grid::RealF, __m256>::operator()(__m256 in){
    __m256 v1,v2;
    v1 = Optimization::Permute::Permute0(in); // avx 256; octo-float
    v1 = _mm256_add_ps(v1,in);
    v2 = Optimization::Permute::Permute1(v1);
    v1 = _mm256_add_ps(v1,v2);
    v2 = Optimization::Permute::Permute2(v1);
    v1 = _mm256_add_ps(v1,v2);
    u256f conv; conv.v=v1;
    return conv.f[0];
  }

  //Complex double Reduce
  template<>
  inline Grid::ComplexD Reduce<Grid::ComplexD, __m256d>::operator()(__m256d in){
    __m256d v1;
    v1 = Optimization::Permute::Permute0(in); // avx 256; paired complex double
    v1 = _mm256_add_pd(v1,in);
    u256d conv; conv.v = v1;
    return Grid::ComplexD(conv.f[0],conv.f[1]);
  }

  //Real double Reduce
  template<>
  inline Grid::RealD Reduce<Grid::RealD, __m256d>::operator()(__m256d in){
    __m256d v1,v2;
    v1 = Optimization::Permute::Permute0(in); // avx 256; quad double
    v1 = _mm256_add_pd(v1,in);
    v2 = Optimization::Permute::Permute1(v1);
    v1 = _mm256_add_pd(v1,v2);
    u256d conv; conv.v = v1;
    return conv.f[0];
  }

  //Integer Reduce
  template<>
  inline Integer Reduce<Integer, __m256i>::operator()(__m256i in){
    __m128i ret;
#if defined (AVX2)
    // AVX2 horizontal adds within upper and lower halves of register; use
    // SSE to add upper and lower halves for result.
    __m256i v1, v2;
    __m128i u1, u2;
    v1  = _mm256_hadd_epi32(in, in);
    v2  = _mm256_hadd_epi32(v1, v1);
    u1  = _mm256_castsi256_si128(v2);      // lower half
    u2  = _mm256_extracti128_si256(v2, 1); // upper half
    ret = _mm_add_epi32(u1, u2);
#else
    // No AVX horizontal add; extract upper and lower halves of register & use
    // SSE intrinsics.
    __m128i u1, u2, u3;
    u1  = _mm256_extractf128_si256(in, 0); // lower half
    u2  = _mm256_extractf128_si256(in, 1); // upper half
    u3  = _mm_add_epi32(u1, u2);
    u1  = _mm_hadd_epi32(u3, u3);
    ret = _mm_hadd_epi32(u1, u1);
#endif
    return _mm_cvtsi128_si32(ret);
  }

}

//////////////////////////////////////////////////////////////////////////////////////
// Here assign types

typedef __m256i SIMD_Htype; // Half precision type
typedef __m256  SIMD_Ftype; // Single precision type
typedef __m256d SIMD_Dtype; // Double precision type
typedef __m256i SIMD_Itype; // Integer type

// prefetching
inline void v_prefetch0(int size, const char *ptr){
  for(int i=0;i<size;i+=64){ // Define L1 linesize above
    _mm_prefetch(ptr+i+4096,_MM_HINT_T1);
    _mm_prefetch(ptr+i+512,_MM_HINT_T0);
  }
}
inline void prefetch_HINT_T0(const char *ptr){
  _mm_prefetch(ptr, _MM_HINT_T0);
}

// Function name aliases
typedef Optimization::Vsplat  VsplatSIMD;
typedef Optimization::Vstore  VstoreSIMD;
typedef Optimization::Vset    VsetSIMD;
typedef Optimization::Vstream VstreamSIMD;

template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S, T>;

// Arithmetic operations
typedef Optimization::Sum          SumSIMD;
typedef Optimization::Sub          SubSIMD;
typedef Optimization::Div          DivSIMD;
typedef Optimization::Mult         MultSIMD;
typedef Optimization::MultComplex  MultComplexSIMD;
typedef Optimization::MultRealPart MultRealPartSIMD;
typedef Optimization::MaddRealPart MaddRealPartSIMD;
typedef Optimization::Conj         ConjSIMD;
typedef Optimization::TimesMinusI  TimesMinusISIMD;
typedef Optimization::TimesI       TimesISIMD;

} // namespace Grid
Grid/simd/Grid_avx512.h (new file, 640 lines)
@@ -0,0 +1,640 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/simd/Grid_avx512.h

    Copyright (C) 2015

    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    Author: neo <cossu@post.kek.jp>
    Author: paboyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory

*************************************************************************************/
/* END LEGAL */
#include <immintrin.h>

namespace Grid{
namespace Optimization {

  union u512f {
    __m512 v;
    float f[16];
  };

  union u512d {
    __m512d v;
    double f[8];
  };

  struct Vsplat{
    // Complex float
    inline __m512 operator()(float a, float b){
      return _mm512_set_ps(b,a,b,a,b,a,b,a,b,a,b,a,b,a,b,a);
    }
    // Real float
    inline __m512 operator()(float a){
      return _mm512_set1_ps(a);
    }
    // Complex double
    inline __m512d operator()(double a, double b){
      return _mm512_set_pd(b,a,b,a,b,a,b,a);
    }
    // Real double
    inline __m512d operator()(double a){
      return _mm512_set1_pd(a);
    }
    // Integer
    inline __m512i operator()(Integer a){
      return _mm512_set1_epi32(a);
    }
  };

  struct Vstore{
    // Float
    inline void operator()(__m512 a, float* F){
      _mm512_store_ps(F,a);
    }
    // Double
    inline void operator()(__m512d a, double* D){
      _mm512_store_pd(D,a);
    }
    // Integer
    inline void operator()(__m512i a, Integer* I){
      _mm512_store_si512((__m512i *)I,a);
    }
  };

  struct Vstream{
    // Float
    inline void operator()(float * a, __m512 b){
      _mm512_stream_ps(a,b);
      // _mm512_store_ps(a,b);
    }
    // Double
    inline void operator()(double * a, __m512d b){
      _mm512_stream_pd(a,b);
      // _mm512_store_pd(a,b);
    }
  };

  struct Vset{
    // Complex float
    inline __m512 operator()(Grid::ComplexF *a){
      return _mm512_set_ps(a[7].imag(),a[7].real(),a[6].imag(),a[6].real(),
			   a[5].imag(),a[5].real(),a[4].imag(),a[4].real(),
			   a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),
			   a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
    }
    // Complex double
    inline __m512d operator()(Grid::ComplexD *a){
      return _mm512_set_pd(a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),
			   a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
    }
    // Real float
    inline __m512 operator()(float *a){
      return _mm512_set_ps( a[15],a[14],a[13],a[12],a[11],a[10],a[9],a[8],
			    a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
    }
    // Real double
    inline __m512d operator()(double *a){
      return _mm512_set_pd(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
    }
    // Integer
    inline __m512i operator()(Integer *a){
      return _mm512_set_epi32( a[15],a[14],a[13],a[12],a[11],a[10],a[9],a[8],
			       a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
    }
  };

template <typename Out_type, typename In_type>
|
||||
struct Reduce{
|
||||
//Need templated class to overload output type
|
||||
//General form must generate error if compiled
|
||||
inline Out_type operator()(In_type in){
|
||||
printf("Error, using wrong Reduce function\n");
|
||||
exit(1);
|
||||
return 0;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
/////////////////////////////////////////////////////
|
||||
// Arithmetic operations
|
||||
/////////////////////////////////////////////////////
|
||||
struct Sum{
|
||||
//Complex/Real float
|
||||
inline __m512 operator()(__m512 a, __m512 b){
|
||||
return _mm512_add_ps(a,b);
|
||||
}
|
||||
//Complex/Real double
|
||||
inline __m512d operator()(__m512d a, __m512d b){
|
||||
return _mm512_add_pd(a,b);
|
||||
}
|
||||
//Integer
|
||||
inline __m512i operator()(__m512i a, __m512i b){
|
||||
return _mm512_add_epi32(a,b);
|
||||
}
|
||||
};
|
||||
|
||||
struct Sub{
|
||||
//Complex/Real float
|
||||
inline __m512 operator()(__m512 a, __m512 b){
|
||||
return _mm512_sub_ps(a,b);
|
||||
}
|
||||
//Complex/Real double
|
||||
inline __m512d operator()(__m512d a, __m512d b){
|
||||
return _mm512_sub_pd(a,b);
|
||||
}
|
||||
//Integer
|
||||
inline __m512i operator()(__m512i a, __m512i b){
|
||||
return _mm512_sub_epi32(a,b);
|
||||
}
|
||||
};
|
||||
|
||||
// Note, we can beat the shuf overhead in chain with two temporaries
|
||||
// Ar Ai , Br Bi, Ai Ar // one shuf
|
||||
//tmpr Ar Br, Ai Bi // Mul/Mac/Mac
|
||||
//tmpi Br Ai, Bi Ar // Mul/Mac/Mac
|
||||
// add tmpi,shuf(tmpi)
|
||||
// sub tmpr,shuf(tmpi)
|
||||
// shuf(tmpr,tmpi). // Could drop/trade for write mask
|
||||
|
||||
// Gives
|
||||
// 2mul,4 mac +add+sub = 8 flop type insns
|
||||
// 3shuf + 2 (+shuf) = 5/6 simd perm and 1/2 the load.
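
  // Illustrative scalar reference (a hypothetical helper, not part of this
  // header): the per-lane arithmetic that the dup/permute/fmaddsub chain in
  // MultComplex below produces, with even lanes holding real parts and odd
  // lanes imaginary parts.
  static inline void cmul_scalar_ref(float Ar, float Ai, float Br, float Bi,
                                     float &re, float &im){
    re = Ar*Br - Ai*Bi; // even lane of fmaddsub: Ar*Br minus the Ai*Bi product
    im = Ar*Bi + Ai*Br; // odd  lane of fmaddsub: Ar*Bi plus  the Ai*Br product
  }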

  struct MultRealPart{
    inline __m512 operator()(__m512 a, __m512 b){
      __m512 ymm0;
      ymm0 = _mm512_moveldup_ps(a);  // ymm0 <- ar ar,
      return _mm512_mul_ps(ymm0,b);  // ymm0 <- ar bi, ar br
    }
    inline __m512d operator()(__m512d a, __m512d b){
      __m512d ymm0;
      ymm0 = _mm512_shuffle_pd(a,a,0x00); // ymm0 <- ar ar, ar,ar  b'00,00
      return _mm512_mul_pd(ymm0,b);       // ymm0 <- ar bi, ar br
    }
  };

  struct MaddRealPart{
    inline __m512 operator()(__m512 a, __m512 b, __m512 c){
      __m512 ymm0 = _mm512_moveldup_ps(a); // ymm0 <- ar ar,
      return _mm512_fmadd_ps( ymm0, b, c);
    }
    inline __m512d operator()(__m512d a, __m512d b, __m512d c){
      __m512d ymm0 = _mm512_shuffle_pd( a, a, 0x00 );
      return _mm512_fmadd_pd( ymm0, b, c);
    }
  };

  struct MultComplex{
    // Complex float
    inline __m512 operator()(__m512 a, __m512 b){
      // dup, dup, perm, mul, madd
      __m512 a_real = _mm512_moveldup_ps( a ); // Ar Ar
      __m512 a_imag = _mm512_movehdup_ps( a ); // Ai Ai
      a_imag = _mm512_mul_ps( a_imag, _mm512_permute_ps( b, 0xB1 ) ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
      return _mm512_fmaddsub_ps( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr
    }
    // Complex double
    inline __m512d operator()(__m512d a, __m512d b){
      __m512d a_real = _mm512_shuffle_pd( a, a, 0x00 );
      __m512d a_imag = _mm512_shuffle_pd( a, a, 0xFF );
      a_imag = _mm512_mul_pd( a_imag, _mm512_permute_pd( b, 0x55 ) );
      return _mm512_fmaddsub_pd( a_real, b, a_imag );
    }
  };

  struct Mult{
    inline void mac(__m512 &a, __m512 b, __m512 c){
      a = _mm512_fmadd_ps( b, c, a);
    }
    inline void mac(__m512d &a, __m512d b, __m512d c){
      a = _mm512_fmadd_pd( b, c, a);
    }
    // Real float
    inline __m512 operator()(__m512 a, __m512 b){
      return _mm512_mul_ps(a,b);
    }
    // Real double
    inline __m512d operator()(__m512d a, __m512d b){
      return _mm512_mul_pd(a,b);
    }
    // Integer
    inline __m512i operator()(__m512i a, __m512i b){
      return _mm512_mullo_epi32(a,b);
    }
  };

  struct Div{
    // Real float
    inline __m512 operator()(__m512 a, __m512 b){
      return _mm512_div_ps(a,b);
    }
    // Real double
    inline __m512d operator()(__m512d a, __m512d b){
      return _mm512_div_pd(a,b);
    }
  };

  struct Conj{
    // Complex single
    inline __m512 operator()(__m512 in){
      return _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // Zero out 0+real 0-imag
    }
    // Complex double
    inline __m512d operator()(__m512d in){
      return _mm512_mask_sub_pd(in, 0xaa,_mm512_setzero_pd(), in);
    }
    // do not define for integer input
  };

  struct TimesMinusI{
    // Complex single
    inline __m512 operator()(__m512 in, __m512 ret){
      //__m512 tmp = _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // real -imag
      //return _mm512_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(2,3,1,0));   // 0x4E??
      __m512 tmp = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
      return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp);
    }
    // Complex double
    inline __m512d operator()(__m512d in, __m512d ret){
      //__m512d tmp = _mm512_mask_sub_pd(in,0xaa,_mm512_setzero_pd(),in); // real -imag
      //return _mm512_shuffle_pd(tmp,tmp,0x55);
      __m512d tmp = _mm512_shuffle_pd(in,in,0x55);
      return _mm512_mask_sub_pd(tmp,0xaa,_mm512_setzero_pd(),tmp);
    }
  };

  struct TimesI{
    // Complex single
    inline __m512 operator()(__m512 in, __m512 ret){
      __m512 tmp = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
      return _mm512_mask_sub_ps(tmp,0x5555,_mm512_setzero_ps(),tmp);
    }
    // Complex double
    inline __m512d operator()(__m512d in, __m512d ret){
      __m512d tmp = _mm512_shuffle_pd(in,in,0x55);
      return _mm512_mask_sub_pd(tmp,0x55,_mm512_setzero_pd(),tmp);
    }
  };

  // Gpermute utilities; consider coalescing into 1 Gpermute
  struct Permute{
    static inline __m512 Permute0(__m512 in){
      return _mm512_shuffle_f32x4(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
    };
    static inline __m512 Permute1(__m512 in){
      return _mm512_shuffle_f32x4(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
    };
    static inline __m512 Permute2(__m512 in){
      return _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
    };
    static inline __m512 Permute3(__m512 in){
      return _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
    };

    static inline __m512d Permute0(__m512d in){
      return _mm512_shuffle_f64x2(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
    };
    static inline __m512d Permute1(__m512d in){
      return _mm512_shuffle_f64x2(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
    };
    static inline __m512d Permute2(__m512d in){
      return _mm512_shuffle_pd(in,in,0x55);
    };
    static inline __m512d Permute3(__m512d in){
      return in;
    };
  };

#define USE_FP16
  struct PrecisionChange {
    static inline __m512i StoH (__m512 a,__m512 b) {
      __m512i h;
#ifdef USE_FP16
      __m256i ha = _mm512_cvtps_ph(a,0);
      __m256i hb = _mm512_cvtps_ph(b,0);
      h = (__m512i) _mm512_castps256_ps512((__m256)ha);
      h = (__m512i) _mm512_insertf64x4((__m512d)h,(__m256d)hb,1);
#else
      assert(0);
#endif
      return h;
    }
    static inline void HtoS (__m512i h,__m512 &sa,__m512 &sb) {
#ifdef USE_FP16
      sa = _mm512_cvtph_ps((__m256i)_mm512_extractf64x4_pd((__m512d)h,0));
      sb = _mm512_cvtph_ps((__m256i)_mm512_extractf64x4_pd((__m512d)h,1));
#else
      assert(0);
#endif
    }
    static inline __m512 DtoS (__m512d a,__m512d b) {
      __m256 sa = _mm512_cvtpd_ps(a);
      __m256 sb = _mm512_cvtpd_ps(b);
      __m512 s  = _mm512_castps256_ps512(sa);
      s = (__m512) _mm512_insertf64x4((__m512d)s,(__m256d)sb,1);
      return s;
    }
    static inline void StoD (__m512 s,__m512d &a,__m512d &b) {
      a = _mm512_cvtps_pd((__m256)_mm512_extractf64x4_pd((__m512d)s,0));
      b = _mm512_cvtps_pd((__m256)_mm512_extractf64x4_pd((__m512d)s,1));
    }
    static inline __m512i DtoH (__m512d a,__m512d b,__m512d c,__m512d d) {
      __m512 sa,sb;
      sa = DtoS(a,b);
      sb = DtoS(c,d);
      return StoH(sa,sb);
    }
    static inline void HtoD (__m512i h,__m512d &a,__m512d &b,__m512d &c,__m512d &d) {
      __m512 sa,sb;
      HtoS(h,sa,sb);
      StoD(sa,a,b);
      StoD(sb,c,d);
    }
  };
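
  // Illustrative usage sketch (not part of the original header; variable
  // names are hypothetical): DtoH funnels four double vectors through two
  // single-precision vectors into one half-precision vector, and HtoD
  // inverts it, lossy only to fp16 precision.
  //
  //   __m512d a,b,c,d;                              // 4 x 8 doubles = 32 values
  //   __m512i h = PrecisionChange::DtoH(a,b,c,d);   // 32 fp16 values, one register
  //   PrecisionChange::HtoD(h,a,b,c,d);             // round trip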

  // On extracting face: Ah Al , Bh Bl -> Ah Bh , Al Bl
  // On merging buffers:  Ah Bh , Al Bl -> Ah Al , Bh Bl
  // The operation is its own inverse
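  // Illustrative example (not in the original source): for __m512d with
  // 128-bit sub-lanes, Exchange0 on in1 = [A3 A2 A1 A0], in2 = [B3 B2 B1 B0]
  // gives
  //   out1 = [B1 B0 A1 A0]   (low halves together)
  //   out2 = [B3 B2 A3 A2]   (high halves together)
  // and applying Exchange0 to (out1,out2) recovers (in1,in2).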
  struct Exchange{
    // 3210 ordering
    static inline void Exchange0(__m512 &out1,__m512 &out2,__m512 in1,__m512 in2){
      out1 = _mm512_shuffle_f32x4(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0));
      out2 = _mm512_shuffle_f32x4(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2));
    };
    static inline void Exchange1(__m512 &out1,__m512 &out2,__m512 in1,__m512 in2){
      out1 = _mm512_shuffle_f32x4(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0));
      out2 = _mm512_shuffle_f32x4(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1));
      out1 = _mm512_shuffle_f32x4(out1,out1,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/
      out2 = _mm512_shuffle_f32x4(out2,out2,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/
    };
    static inline void Exchange2(__m512 &out1,__m512 &out2,__m512 in1,__m512 in2){
      out1 = _mm512_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0));
      out2 = _mm512_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2));
    };
    static inline void Exchange3(__m512 &out1,__m512 &out2,__m512 in1,__m512 in2){
      out1 = _mm512_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0));
      out2 = _mm512_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1));
      out1 = _mm512_shuffle_ps(out1,out1,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/
      out2 = _mm512_shuffle_ps(out2,out2,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/
    };

    static inline void Exchange0(__m512d &out1,__m512d &out2,__m512d in1,__m512d in2){
      out1 = _mm512_shuffle_f64x2(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0));
      out2 = _mm512_shuffle_f64x2(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2));
    };
    static inline void Exchange1(__m512d &out1,__m512d &out2,__m512d in1,__m512d in2){
      out1 = _mm512_shuffle_f64x2(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0));
      out2 = _mm512_shuffle_f64x2(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1));
      out1 = _mm512_shuffle_f64x2(out1,out1,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/
      out2 = _mm512_shuffle_f64x2(out2,out2,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/
    };
    static inline void Exchange2(__m512d &out1,__m512d &out2,__m512d in1,__m512d in2){
      out1 = _mm512_shuffle_pd(in1,in2,0x00);
      out2 = _mm512_shuffle_pd(in1,in2,0xFF);
    };
    static inline void Exchange3(__m512d &out1,__m512d &out2,__m512d in1,__m512d in2){
      assert(0);
      return;
    };
  };

  struct Rotate{

    static inline __m512 rotate(__m512 in,int n){
      switch(n){
      case 0:  return tRotate<0>(in);break;
      case 1:  return tRotate<1>(in);break;
      case 2:  return tRotate<2>(in);break;
      case 3:  return tRotate<3>(in);break;
      case 4:  return tRotate<4>(in);break;
      case 5:  return tRotate<5>(in);break;
      case 6:  return tRotate<6>(in);break;
      case 7:  return tRotate<7>(in);break;

      case 8 : return tRotate<8>(in);break;
      case 9 : return tRotate<9>(in);break;
      case 10: return tRotate<10>(in);break;
      case 11: return tRotate<11>(in);break;
      case 12: return tRotate<12>(in);break;
      case 13: return tRotate<13>(in);break;
      case 14: return tRotate<14>(in);break;
      case 15: return tRotate<15>(in);break;
      default: assert(0);
      }
    }
    static inline __m512d rotate(__m512d in,int n){
      switch(n){
      case 0: return tRotate<0>(in);break;
      case 1: return tRotate<1>(in);break;
      case 2: return tRotate<2>(in);break;
      case 3: return tRotate<3>(in);break;
      case 4: return tRotate<4>(in);break;
      case 5: return tRotate<5>(in);break;
      case 6: return tRotate<6>(in);break;
      case 7: return tRotate<7>(in);break;
      default: assert(0);
      }
    }

    template<int n> static inline __m512 tRotate(__m512 in){
      return (__m512)_mm512_alignr_epi32((__m512i)in,(__m512i)in,n);
    };

    template<int n> static inline __m512d tRotate(__m512d in){
      return (__m512d)_mm512_alignr_epi64((__m512i)in,(__m512i)in,n);
    };
  };

  //////////////////////////////////////////////
  // Some Template specialization

  // Hack for CLANG until mm512_reduce_add_ps etc... are implemented in GCC and Clang releases
#ifndef __INTEL_COMPILER
#warning "Slow reduction due to incomplete reduce intrinsics"
  // Complex float Reduce
  template<>
  inline Grid::ComplexF Reduce<Grid::ComplexF, __m512>::operator()(__m512 in){
    __m512 v1,v2;
    v1 = Optimization::Permute::Permute0(in); // avx 512; octo complex single
    v1 = _mm512_add_ps(v1,in);
    v2 = Optimization::Permute::Permute1(v1);
    v1 = _mm512_add_ps(v1,v2);
    v2 = Optimization::Permute::Permute2(v1);
    v1 = _mm512_add_ps(v1,v2);
    u512f conv; conv.v = v1;
    return Grid::ComplexF(conv.f[0],conv.f[1]);
  }

  // Real float Reduce
  template<>
  inline Grid::RealF Reduce<Grid::RealF, __m512>::operator()(__m512 in){
    __m512 v1,v2;
    v1 = Optimization::Permute::Permute0(in); // avx 512; sixteen floats
    v1 = _mm512_add_ps(v1,in);
    v2 = Optimization::Permute::Permute1(v1);
    v1 = _mm512_add_ps(v1,v2);
    v2 = Optimization::Permute::Permute2(v1);
    v1 = _mm512_add_ps(v1,v2);
    v2 = Optimization::Permute::Permute3(v1);
    v1 = _mm512_add_ps(v1,v2);
    u512f conv; conv.v = v1;
    return conv.f[0];
  }

  // Complex double Reduce
  template<>
  inline Grid::ComplexD Reduce<Grid::ComplexD, __m512d>::operator()(__m512d in){
    __m512d v1,v2;
    v1 = Optimization::Permute::Permute0(in); // avx 512; quad complex double
    v1 = _mm512_add_pd(v1,in);
    v2 = Optimization::Permute::Permute1(v1);
    v1 = _mm512_add_pd(v1,v2);
    u512d conv; conv.v = v1;
    return Grid::ComplexD(conv.f[0],conv.f[1]);
  }

  // Real double Reduce
  template<>
  inline Grid::RealD Reduce<Grid::RealD, __m512d>::operator()(__m512d in){
    __m512d v1,v2;
    v1 = Optimization::Permute::Permute0(in); // avx 512; octo double
    v1 = _mm512_add_pd(v1,in);
    v2 = Optimization::Permute::Permute1(v1);
    v1 = _mm512_add_pd(v1,v2);
    v2 = Optimization::Permute::Permute2(v1);
    v1 = _mm512_add_pd(v1,v2);
    u512d conv; conv.v = v1;
    return conv.f[0];
  }

  // Integer Reduce
  template<>
  inline Integer Reduce<Integer, __m512i>::operator()(__m512i in){
    // No full vector reduce, use AVX to add upper and lower halves of register
    // and perform AVX reduction.
    __m256i v1, v2, v3;
    __m128i u1, u2, ret;
    v1  = _mm512_castsi512_si256(in);       // lower half
    v2  = _mm512_extracti32x8_epi32(in, 1); // upper half
    v3  = _mm256_add_epi32(v1, v2);
    v1  = _mm256_hadd_epi32(v3, v3);
    v2  = _mm256_hadd_epi32(v1, v1);
    u1  = _mm256_castsi256_si128(v2);       // lower half
    u2  = _mm256_extracti128_si256(v2, 1);  // upper half
    ret = _mm_add_epi32(u1, u2);
    return _mm_cvtsi128_si32(ret);
  }
#else
  // Complex float Reduce
  template<>
  inline Grid::ComplexF Reduce<Grid::ComplexF, __m512>::operator()(__m512 in){
    return Grid::ComplexF(_mm512_mask_reduce_add_ps(0x5555, in),_mm512_mask_reduce_add_ps(0xAAAA, in));
  }
  // Real float Reduce
  template<>
  inline Grid::RealF Reduce<Grid::RealF, __m512>::operator()(__m512 in){
    return _mm512_reduce_add_ps(in);
  }

  // Complex double Reduce
  template<>
  inline Grid::ComplexD Reduce<Grid::ComplexD, __m512d>::operator()(__m512d in){
    return Grid::ComplexD(_mm512_mask_reduce_add_pd(0x55, in),_mm512_mask_reduce_add_pd(0xAA, in));
  }

  // Real double Reduce
  template<>
  inline Grid::RealD Reduce<Grid::RealD, __m512d>::operator()(__m512d in){
    return _mm512_reduce_add_pd(in);
  }

  // Integer Reduce
  template<>
  inline Integer Reduce<Integer, __m512i>::operator()(__m512i in){
    return _mm512_reduce_add_epi32(in);
  }
#endif

}

//////////////////////////////////////////////////////////////////////////////////////
// Here assign types

  typedef __m512i SIMD_Htype; // Half precision type
  typedef __m512  SIMD_Ftype; // Single precision type
  typedef __m512d SIMD_Dtype; // Double precision type
  typedef __m512i SIMD_Itype; // Integer type

  // prefetch
  inline void v_prefetch0(int size, const char *ptr){
    for(int i=0;i<size;i+=64){ // Define L1 linesize above
      _mm_prefetch(ptr+i+4096,_MM_HINT_T1);
      _mm_prefetch(ptr+i+512,_MM_HINT_T0);
    }
  }
  inline void prefetch_HINT_T0(const char *ptr){
    _mm_prefetch(ptr,_MM_HINT_T0);
  }

  // Function name aliases
  typedef Optimization::Vsplat   VsplatSIMD;
  typedef Optimization::Vstore   VstoreSIMD;
  typedef Optimization::Vset     VsetSIMD;
  typedef Optimization::Vstream  VstreamSIMD;
  template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;

  // Arithmetic operations
  typedef Optimization::Sum          SumSIMD;
  typedef Optimization::Sub          SubSIMD;
  typedef Optimization::Mult         MultSIMD;
  typedef Optimization::Div          DivSIMD;
  typedef Optimization::MultComplex  MultComplexSIMD;
  typedef Optimization::MultRealPart MultRealPartSIMD;
  typedef Optimization::MaddRealPart MaddRealPartSIMD;
  typedef Optimization::Conj         ConjSIMD;
  typedef Optimization::TimesMinusI  TimesMinusISIMD;
  typedef Optimization::TimesI       TimesISIMD;

}
530
Grid/simd/Grid_generic.h
Normal file
@ -0,0 +1,530 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/simd/Grid_generic.h

    Copyright (C) 2015
    Copyright (C) 2017

Author: Antonin Portelli <antonin.portelli@me.com>
        Andrew Lawson <andrew.lawson1991@gmail.com>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */

#include "Grid_generic_types.h"

namespace Grid {
namespace Optimization {

  struct Vsplat{
    // Complex
    template <typename T>
    inline vec<T> operator()(T a, T b){
      vec<T> out;

      VECTOR_FOR(i, W<T>::r, 2)
      {
        out.v[i]   = a;
        out.v[i+1] = b;
      }

      return out;
    }

    // Real
    template <typename T>
    inline vec<T> operator()(T a){
      vec<T> out;

      VECTOR_FOR(i, W<T>::r, 1)
      {
        out.v[i] = a;
      }

      return out;
    }
  };

  struct Vstore{
    // Real
    template <typename T>
    inline void operator()(vec<T> a, T *D){
      *((vec<T> *)D) = a;
    }
  };

  struct Vstream{
    // Real
    template <typename T>
    inline void operator()(T * a, vec<T> b){
      *((vec<T> *)a) = b;
    }
  };

  struct Vset{
    // Complex
    template <typename T>
    inline vec<T> operator()(std::complex<T> *a){
      vec<T> out;

      VECTOR_FOR(i, W<T>::c, 1)
      {
        out.v[2*i]   = a[i].real();
        out.v[2*i+1] = a[i].imag();
      }

      return out;
    }

    // Real
    template <typename T>
    inline vec<T> operator()(T *a){
      vec<T> out;

      out = *((vec<T> *)a);

      return out;
    }
  };

  /////////////////////////////////////////////////////
  // Arithmetic operations
  /////////////////////////////////////////////////////
  struct Sum{
    // Complex/Real
    template <typename T>
    inline vec<T> operator()(vec<T> a, vec<T> b){
      vec<T> out;

      VECTOR_FOR(i, W<T>::r, 1)
      {
        out.v[i] = a.v[i] + b.v[i];
      }

      return out;
    }
  };

  struct Sub{
    // Complex/Real
    template <typename T>
    inline vec<T> operator()(vec<T> a, vec<T> b){
      vec<T> out;

      VECTOR_FOR(i, W<T>::r, 1)
      {
        out.v[i] = a.v[i] - b.v[i];
      }

      return out;
    }
  };

  struct Mult{
    // Real
    template <typename T>
    inline vec<T> operator()(vec<T> a, vec<T> b){
      vec<T> out;

      VECTOR_FOR(i, W<T>::r, 1)
      {
        out.v[i] = a.v[i]*b.v[i];
      }

      return out;
    }
  };

#define cmul(a, b, c, i)\
  c[i]   = a[i]*b[i]   - a[i+1]*b[i+1];\
  c[i+1] = a[i]*b[i+1] + a[i+1]*b[i];
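
  // Illustrative worked example (not in the original source): cmul is the
  // textbook complex product on interleaved (real, imag) storage. With
  // a = {1,2} (1+2i) and b = {3,4} (3+4i) at i = 0:
  //   c[0] = 1*3 - 2*4 = -5
  //   c[1] = 1*4 + 2*3 = 10
  // i.e. (1+2i)(3+4i) = -5+10i.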

  struct MultRealPart{
    template <typename T>
    inline vec<T> operator()(vec<T> a, vec<T> b){
      vec<T> out;

      VECTOR_FOR(i, W<T>::c, 1)
      {
        out.v[2*i]   = a.v[2*i]*b.v[2*i];
        out.v[2*i+1] = a.v[2*i]*b.v[2*i+1];
      }
      return out;
    }
  };

  struct MaddRealPart{
    template <typename T>
    inline vec<T> operator()(vec<T> a, vec<T> b, vec<T> c){
      vec<T> out;

      VECTOR_FOR(i, W<T>::c, 1)
      {
        out.v[2*i]   = a.v[2*i]*b.v[2*i]   + c.v[2*i];
        out.v[2*i+1] = a.v[2*i]*b.v[2*i+1] + c.v[2*i+1];
      }
      return out;
    }
  };

  struct MultComplex{
    // Complex
    template <typename T>
    inline vec<T> operator()(vec<T> a, vec<T> b){
      vec<T> out;

      VECTOR_FOR(i, W<T>::c, 1)
      {
        cmul(a.v, b.v, out.v, 2*i);
      }

      return out;
    }
  };

#undef cmul

  struct Div{
    // Real
    template <typename T>
    inline vec<T> operator()(vec<T> a, vec<T> b){
      vec<T> out;

      VECTOR_FOR(i, W<T>::r, 1)
      {
        out.v[i] = a.v[i]/b.v[i];
      }

      return out;
    }
  };

#define conj(a, b, i)\
  b[i]   = a[i];\
  b[i+1] = -a[i+1];

  struct Conj{
    // Complex
    template <typename T>
    inline vec<T> operator()(vec<T> a){
      vec<T> out;

      VECTOR_FOR(i, W<T>::c, 1)
      {
        conj(a.v, out.v, 2*i);
      }

      return out;
    }
  };

#undef conj

#define timesmi(a, b, i)\
  b[i]   = a[i+1];\
  b[i+1] = -a[i];

  struct TimesMinusI{
    // Complex
    template <typename T>
    inline vec<T> operator()(vec<T> a, vec<T> b){
      vec<T> out;

      VECTOR_FOR(i, W<T>::c, 1)
      {
        timesmi(a.v, out.v, 2*i);
      }

      return out;
    }
  };

#undef timesmi

#define timesi(a, b, i)\
  b[i]   = -a[i+1];\
  b[i+1] = a[i];

  struct TimesI{
    // Complex
    template <typename T>
    inline vec<T> operator()(vec<T> a, vec<T> b){
      vec<T> out;

      VECTOR_FOR(i, W<T>::c, 1)
      {
        timesi(a.v, out.v, 2*i);
      }

      return out;
    }
  };

#undef timesi

  struct PrecisionChange {
    static inline vech StoH (const vecf &a,const vecf &b) {
      vech ret;
#ifdef USE_FP16
      vech *ha = (vech *)&a;
      vech *hb = (vech *)&b;
      const int nf = W<float>::r;
      //      VECTOR_FOR(i, nf,1){ ret.v[i]    = ( (uint16_t *) &a.v[i])[1] ; }
      //      VECTOR_FOR(i, nf,1){ ret.v[i+nf] = ( (uint16_t *) &b.v[i])[1] ; }
      VECTOR_FOR(i, nf,1){ ret.v[i]    = ha->v[2*i+1]; }
      VECTOR_FOR(i, nf,1){ ret.v[i+nf] = hb->v[2*i+1]; }
#else
      assert(0);
#endif
      return ret;
    }
    static inline void HtoS (vech h,vecf &sa,vecf &sb) {
#ifdef USE_FP16
      const int nf = W<float>::r;
      const int nh = W<uint16_t>::r;
      vech *ha = (vech *)&sa;
      vech *hb = (vech *)&sb;
      VECTOR_FOR(i, nf, 1){ sb.v[i] = sa.v[i] = 0; }
      //      VECTOR_FOR(i, nf, 1){ ( (uint16_t *) (&sa.v[i]))[1] = h.v[i];}
      //      VECTOR_FOR(i, nf, 1){ ( (uint16_t *) (&sb.v[i]))[1] = h.v[i+nf];}
      VECTOR_FOR(i, nf, 1){ ha->v[2*i+1] = h.v[i]; }
      VECTOR_FOR(i, nf, 1){ hb->v[2*i+1] = h.v[i+nf]; }
#else
      assert(0);
#endif
    }
    static inline vecf DtoS (vecd a,vecd b) {
      const int nd = W<double>::r;
      const int nf = W<float>::r;
      vecf ret;
      VECTOR_FOR(i, nd,1){ ret.v[i]    = a.v[i] ; }
      VECTOR_FOR(i, nd,1){ ret.v[i+nd] = b.v[i] ; }
      return ret;
    }
    static inline void StoD (vecf s,vecd &a,vecd &b) {
      const int nd = W<double>::r;
      VECTOR_FOR(i, nd,1){ a.v[i] = s.v[i]    ; }
      VECTOR_FOR(i, nd,1){ b.v[i] = s.v[i+nd] ; }
    }
    static inline vech DtoH (vecd a,vecd b,vecd c,vecd d) {
      vecf sa,sb;
      sa = DtoS(a,b);
      sb = DtoS(c,d);
      return StoH(sa,sb);
    }
    static inline void HtoD (vech h,vecd &a,vecd &b,vecd &c,vecd &d) {
      vecf sa,sb;
      HtoS(h,sa,sb);
      StoD(sa,a,b);
      StoD(sb,c,d);
    }
  };

  //////////////////////////////////////////////
  // Exchange support
  struct Exchange{

    template <typename T,int n>
    static inline void ExchangeN(vec<T> &out1,vec<T> &out2,vec<T> &in1,vec<T> &in2){
      const int w = W<T>::r;
      unsigned int mask = w >> (n + 1);
      //      std::cout << " Exchange "<<n<<" nsimd "<<w<<" mask 0x" <<std::hex<<mask<<std::dec<<std::endl;
      VECTOR_FOR(i, w, 1) {
        int j1 = i&(~mask);
        if ( (i&mask) == 0 ) { out1.v[i]=in1.v[j1];}
        else                 { out1.v[i]=in2.v[j1];}
        int j2 = i|mask;
        if ( (i&mask) == 0 ) { out2.v[i]=in1.v[j2];}
        else                 { out2.v[i]=in2.v[j2];}
      }
    }
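
    // Illustrative example (not in the original source): for w = 4 lanes and
    // n = 0, mask = 2, so out1 collects the low halves and out2 the high
    // halves of the two inputs:
    //   out1 = { in1[0], in1[1], in2[0], in2[1] }
    //   out2 = { in1[2], in1[3], in2[2], in2[3] }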
    template <typename T>
    static inline void Exchange0(vec<T> &out1,vec<T> &out2,vec<T> &in1,vec<T> &in2){
      ExchangeN<T,0>(out1,out2,in1,in2);
    };
    template <typename T>
    static inline void Exchange1(vec<T> &out1,vec<T> &out2,vec<T> &in1,vec<T> &in2){
      ExchangeN<T,1>(out1,out2,in1,in2);
    };
    template <typename T>
    static inline void Exchange2(vec<T> &out1,vec<T> &out2,vec<T> &in1,vec<T> &in2){
      ExchangeN<T,2>(out1,out2,in1,in2);
    };
    template <typename T>
    static inline void Exchange3(vec<T> &out1,vec<T> &out2,vec<T> &in1,vec<T> &in2){
      ExchangeN<T,3>(out1,out2,in1,in2);
    };
  };

  //////////////////////////////////////////////
  // Some Template specialization
#define perm(a, b, n, w)\
  unsigned int _mask = w >> (n + 1);\
  VECTOR_FOR(i, w, 1)\
  {\
    b[i] = a[i^_mask];\
  }
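
  // Illustrative example (not in the original source): perm swaps lanes whose
  // indices differ in one bit. With w = 8 and n = 0, _mask = 4, so the two
  // half-vectors swap: b[0..7] = a[4,5,6,7,0,1,2,3]. With n = 2, _mask = 1,
  // and adjacent pairs swap.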

#define DECL_PERMUTE_N(n)\
  template <typename T>\
  static inline vec<T> Permute##n(vec<T> in) {\
    vec<T> out;\
    perm(in.v, out.v, n, W<T>::r);\
    return out;\
  }

  struct Permute{
    DECL_PERMUTE_N(0);
    DECL_PERMUTE_N(1);
    DECL_PERMUTE_N(2);
    DECL_PERMUTE_N(3);
  };

#undef perm
#undef DECL_PERMUTE_N

#define rot(a, b, n, w)\
  VECTOR_FOR(i, w, 1)\
  {\
    b[i] = a[(i + n)%w];\
  }

  struct Rotate{

    template <int n, typename T> static inline vec<T> tRotate(vec<T> in){
      return rotate(in, n);
    }

    template <typename T>
    static inline vec<T> rotate(vec<T> in, int n){
      vec<T> out;

      rot(in.v, out.v, n, W<T>::r);

      return out;
    }
  };
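
  // Illustrative example (not in the original source): rot rotates the lane
  // array by n, pulling element n down to position 0. For w = 4 and n = 1,
  // b = { a[1], a[2], a[3], a[0] }.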

#undef rot

#define acc(v, a, off, step, n)\
  for (unsigned int i = off; i < n; i += step)\
  {\
    a += v[i];\
  }

  template <typename Out_type, typename In_type>
  struct Reduce{
    // Need templated class to overload output type
    // General form must generate error if compiled
    inline Out_type operator()(In_type in){
      printf("Error, using wrong Reduce function\n");
      exit(1);
      return 0;
    }
  };

  // Complex float Reduce
  template <>
  inline Grid::ComplexF Reduce<Grid::ComplexF, vecf>::operator()(vecf in){
    float a = 0.f, b = 0.f;

    acc(in.v, a, 0, 2, W<float>::r);
    acc(in.v, b, 1, 2, W<float>::r);

    return Grid::ComplexF(a, b);
  }

  // Real float Reduce
  template<>
  inline Grid::RealF Reduce<Grid::RealF, vecf>::operator()(vecf in){
    float a = 0.f;

    acc(in.v, a, 0, 1, W<float>::r);

    return a;
  }

  // Complex double Reduce
  template<>
  inline Grid::ComplexD Reduce<Grid::ComplexD, vecd>::operator()(vecd in){
    double a = 0., b = 0.;

    acc(in.v, a, 0, 2, W<double>::r);
    acc(in.v, b, 1, 2, W<double>::r);

    return Grid::ComplexD(a, b);
  }

  // Real double Reduce
  template<>
  inline Grid::RealD Reduce<Grid::RealD, vecd>::operator()(vecd in){
    double a = 0.;

    acc(in.v, a, 0, 1, W<double>::r);

    return a;
  }

  // Integer Reduce
  template<>
  inline Integer Reduce<Integer, veci>::operator()(veci in){
    Integer a = 0;

    acc(in.v, a, 0, 1, W<Integer>::r);

    return a;
  }

#undef acc  // EIGEN compatibility
}

//////////////////////////////////////////////////////////////////////////////////////
// Here assign types

  typedef Optimization::vech SIMD_Htype; // Reduced precision type
  typedef Optimization::vecf SIMD_Ftype; // Single precision type
  typedef Optimization::vecd SIMD_Dtype; // Double precision type
  typedef Optimization::veci SIMD_Itype; // Integer type

  // prefetch utilities
  inline void v_prefetch0(int size, const char *ptr){};
  inline void prefetch_HINT_T0(const char *ptr){};

  // Function name aliases
  typedef Optimization::Vsplat   VsplatSIMD;
  typedef Optimization::Vstore   VstoreSIMD;
  typedef Optimization::Vset     VsetSIMD;
  typedef Optimization::Vstream  VstreamSIMD;
  template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;

  // Arithmetic operations
  typedef Optimization::Sum          SumSIMD;
  typedef Optimization::Sub          SubSIMD;
  typedef Optimization::Div          DivSIMD;
  typedef Optimization::Mult         MultSIMD;
  typedef Optimization::MultComplex  MultComplexSIMD;
  typedef Optimization::MultRealPart MultRealPartSIMD;
  typedef Optimization::MaddRealPart MaddRealPartSIMD;
  typedef Optimization::Conj         ConjSIMD;
  typedef Optimization::TimesMinusI  TimesMinusISIMD;
  typedef Optimization::TimesI       TimesISIMD;
}
85
Grid/simd/Grid_generic_types.h
Normal file
@ -0,0 +1,85 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/simd/Grid_generic_types.h

    Copyright (C) 2017

Author: Antonin Portelli <antonin.portelli@me.com>
        Andrew Lawson <andrew.lawson1991@gmail.com>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */

static_assert(GEN_SIMD_WIDTH % 16u == 0, "SIMD vector size is not an integer multiple of 16 bytes");

//#define VECTOR_LOOPS

// playing with compiler pragmas
#ifdef VECTOR_LOOPS
#ifdef __clang__
#define VECTOR_FOR(i, w, inc)\
_Pragma("clang loop unroll(full) vectorize(enable) interleave(enable) vectorize_width(w)")\
for (unsigned int i = 0; i < w; i += inc)
#elif defined __INTEL_COMPILER
#define VECTOR_FOR(i, w, inc)\
_Pragma("simd vectorlength(w*8)")\
for (unsigned int i = 0; i < w; i += inc)
#else
#define VECTOR_FOR(i, w, inc)\
for (unsigned int i = 0; i < w; i += inc)
#endif
#else
#define VECTOR_FOR(i, w, inc)\
for (unsigned int i = 0; i < w; i += inc)
#endif
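
// Illustrative note (not in the original source): with VECTOR_LOOPS undefined
// (the default above), VECTOR_FOR(i, w, inc) expands to a plain loop,
//   for (unsigned int i = 0; i < w; i += inc)
// and vectorization is left entirely to the compiler.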

namespace Grid {
namespace Optimization {

  // type traits giving the number of elements for each vector type
  template <typename T> struct W;
  template <> struct W<double> {
    constexpr static unsigned int c = GEN_SIMD_WIDTH/16u;
    constexpr static unsigned int r = GEN_SIMD_WIDTH/8u;
  };
  template <> struct W<float> {
    constexpr static unsigned int c = GEN_SIMD_WIDTH/8u;
    constexpr static unsigned int r = GEN_SIMD_WIDTH/4u;
  };
  template <> struct W<Integer> {
    constexpr static unsigned int r = GEN_SIMD_WIDTH/4u;
  };
  template <> struct W<uint16_t> {
    constexpr static unsigned int c = GEN_SIMD_WIDTH/4u;
    constexpr static unsigned int r = GEN_SIMD_WIDTH/2u;
  };
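
  // Illustrative example (not in the original source): for GEN_SIMD_WIDTH =
  // 16u (16-byte vectors), W<double> gives c = 1 complex / r = 2 reals per
  // vector, W<float> gives c = 2 / r = 4, and W<uint16_t> gives r = 8.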

  // SIMD vector types
  template <typename T>
  struct vec {
    alignas(GEN_SIMD_WIDTH) T v[W<T>::r];
  };

  typedef vec<float>    vecf;
  typedef vec<double>   vecd;
  typedef vec<uint16_t> vech; // half precision comms
  typedef vec<Integer>  veci;

}}
448
Grid/simd/Grid_imci.h
Normal file
@ -0,0 +1,448 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/simd/Grid_imci.h

    Copyright (C) 2015

Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */

#include <immintrin.h>
#include <zmmintrin.h>

namespace Grid{
namespace Optimization {

  struct Vsplat{
    // Complex float
    inline __m512 operator()(float a, float b){
      return _mm512_set_ps(b,a,b,a,b,a,b,a,b,a,b,a,b,a,b,a);
    }
    // Real float
    inline __m512 operator()(float a){
      return _mm512_set1_ps(a);
    }
    // Complex double
    inline __m512d operator()(double a, double b){
      return _mm512_set_pd(b,a,b,a,b,a,b,a);
    }
    // Real double
    inline __m512d operator()(double a){
      return _mm512_set1_pd(a);
    }
    // Integer
    inline __m512i operator()(Integer a){
      return _mm512_set1_epi32(a);
    }
  };

  struct Vstore{
    // Float
    inline void operator()(__m512 a, float* F){
      _mm512_store_ps(F,a);
    }
    // Double
    inline void operator()(__m512d a, double* D){
      _mm512_store_pd(D,a);
    }
    // Integer
    inline void operator()(__m512i a, Integer* I){
      _mm512_store_si512((__m512i *)I,a);
    }
  };

  struct Vstream{
    // Float
    inline void operator()(float * a, __m512 b){
      _mm512_storenrngo_ps(a,b);
    }
    // Double
    inline void operator()(double * a, __m512d b){
      _mm512_storenrngo_pd(a,b);
    }
  };

  struct Vset{
    // Complex float
    inline __m512 operator()(Grid::ComplexF *a){
      return _mm512_set_ps(a[7].imag(),a[7].real(),a[6].imag(),a[6].real(),
                           a[5].imag(),a[5].real(),a[4].imag(),a[4].real(),
                           a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),
                           a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
    }
    // Complex double
    inline __m512d operator()(Grid::ComplexD *a){
      return _mm512_set_pd(a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),
                           a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
    }
    // Real float
    inline __m512 operator()(float *a){
      return _mm512_set_ps(a[15],a[14],a[13],a[12],a[11],a[10],a[9],a[8],
                           a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
    }
    // Real double
    inline __m512d operator()(double *a){
      return _mm512_set_pd(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
    }
    // Integer
    inline __m512i operator()(Integer *a){
      return _mm512_set_epi32(a[15],a[14],a[13],a[12],a[11],a[10],a[9],a[8],
                              a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
    }
  };

  template <typename Out_type, typename In_type>
  struct Reduce{
    // Need templated class to overload output type
    // General form must generate error if compiled
    inline Out_type operator()(In_type in){
      printf("Error, using wrong Reduce function\n");
      exit(1);
      return 0;
    }
  };

  /////////////////////////////////////////////////////
  // Arithmetic operations
  /////////////////////////////////////////////////////
  struct Sum{
    // Complex/Real float
    inline __m512 operator()(__m512 a, __m512 b){
      return _mm512_add_ps(a,b);
    }
    // Complex/Real double
    inline __m512d operator()(__m512d a, __m512d b){
      return _mm512_add_pd(a,b);
    }
    // Integer
    inline __m512i operator()(__m512i a, __m512i b){
      return _mm512_add_epi32(a,b);
    }
  };

  struct Sub{
    // Complex/Real float
    inline __m512 operator()(__m512 a, __m512 b){
      return _mm512_sub_ps(a,b);
    }
    // Complex/Real double
    inline __m512d operator()(__m512d a, __m512d b){
      return _mm512_sub_pd(a,b);
    }
    // Integer
    inline __m512i operator()(__m512i a, __m512i b){
      return _mm512_sub_epi32(a,b);
    }
  };

  struct MultComplex{
    // Complex float
    inline __m512 operator()(__m512 a, __m512 b){
      __m512 vzero,ymm0,ymm1,real,imag;
      vzero = _mm512_setzero_ps();
      ymm0  = _mm512_swizzle_ps(a, _MM_SWIZ_REG_CDAB); //
      real  = (__m512)_mm512_mask_or_epi32((__m512i)a, 0xAAAA,(__m512i)vzero,(__m512i)ymm0);
      imag  = _mm512_mask_sub_ps(a, 0x5555,vzero, ymm0);
      ymm1  = _mm512_mul_ps(real, b);
      ymm0  = _mm512_swizzle_ps(b, _MM_SWIZ_REG_CDAB); // OK
      return _mm512_fmadd_ps(ymm0,imag,ymm1);
    }
    // Complex double
    inline __m512d operator()(__m512d a, __m512d b){
      /* This is from
       * Automatic SIMD Vectorization of Fast Fourier Transforms for the Larrabee and AVX Instruction Sets
       * @inproceedings{McFarlin:2011:ASV:1995896.1995938,
       *   author    = {McFarlin, Daniel S. and Arbatov, Volodymyr and Franchetti, Franz and P\"{u}schel, Markus},
       *   title     = {Automatic SIMD Vectorization of Fast Fourier Transforms for the Larrabee and AVX Instruction Sets},
       *   booktitle = {Proceedings of the International Conference on Supercomputing},
       *   series    = {ICS '11},
       *   year      = {2011},
       *   isbn      = {978-1-4503-0102-2},
       *   location  = {Tucson, Arizona, USA},
       *   pages     = {265--274},
       *   numpages  = {10},
       *   url       = {http://doi.acm.org/10.1145/1995896.1995938},
       *   doi       = {10.1145/1995896.1995938},
       *   acmid     = {1995938},
       *   publisher = {ACM},
       *   address   = {New York, NY, USA},
       *   keywords  = {autovectorization, fourier transform, program generation, simd, super-optimization},
       * }
       */
      __m512d vzero,ymm0,ymm1,real,imag;
      vzero = _mm512_setzero_pd();
      ymm0  = _mm512_swizzle_pd(a, _MM_SWIZ_REG_CDAB); //
      real  = (__m512d)_mm512_mask_or_epi64((__m512i)a, 0xAA,(__m512i)vzero,(__m512i) ymm0);
      imag  = _mm512_mask_sub_pd(a, 0x55,vzero, ymm0);
      ymm1  = _mm512_mul_pd(real, b);
      ymm0  = _mm512_swizzle_pd(b, _MM_SWIZ_REG_CDAB); // OK
      return _mm512_fmadd_pd(ymm0,imag,ymm1);
    }
  };
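
  // Illustrative trace (not in the original source) of the swizzle/mask
  // construction above for one complex pair (ar + i*ai)(br + i*bi):
  //   real = (ar, ar), imag = (-ai, ai)      after the masked or / sub
  //   ymm1 = real * b = (ar*br, ar*bi)
  //   result = swizzle(b) * imag + ymm1
  //          = (ar*br - ai*bi, ar*bi + ai*br)
  // which is the standard complex product.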

  struct Mult{
    inline void mac(__m512 &a, __m512 b, __m512 c){
      a = _mm512_fmadd_ps( b, c, a);
    }
    inline void mac(__m512d &a, __m512d b, __m512d c){
      a = _mm512_fmadd_pd( b, c, a);
    }
    // Real float
    inline __m512 operator()(__m512 a, __m512 b){
      return _mm512_mul_ps(a,b);
    }
    // Real double
    inline __m512d operator()(__m512d a, __m512d b){
      return _mm512_mul_pd(a,b);
    }
    // Integer
    inline __m512i operator()(__m512i a, __m512i b){
      return _mm512_mullo_epi32(a,b);
    }
  };

  struct Div{
    // Real float
    inline __m512 operator()(__m512 a, __m512 b){
      return _mm512_div_ps(a,b);
    }
    // Real double
    inline __m512d operator()(__m512d a, __m512d b){
      return _mm512_div_pd(a,b);
    }
  };

  struct Conj{
    // Complex single
    inline __m512 operator()(__m512 in){
      return _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // Zero out 0+real 0-imag
    }
    // Complex double
    inline __m512d operator()(__m512d in){
      return _mm512_mask_sub_pd(in, 0xaa,_mm512_setzero_pd(), in);
    }
    // do not define for integer input
  };

  struct TimesMinusI{
    // Complex single
    inline __m512 operator()(__m512 in, __m512 ret){
      __m512 tmp = _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // real -imag
      return _mm512_swizzle_ps(tmp, _MM_SWIZ_REG_CDAB); // OK
    }
    // Complex double
    inline __m512d operator()(__m512d in, __m512d ret){
      __m512d tmp = _mm512_mask_sub_pd(in,0xaa,_mm512_setzero_pd(),in); // real -imag
      return _mm512_swizzle_pd(tmp, _MM_SWIZ_REG_CDAB); // OK
    }
  };

  struct TimesI{
    // Complex single
    inline __m512 operator()(__m512 in, __m512 ret){
      __m512 tmp = _mm512_swizzle_ps(in, _MM_SWIZ_REG_CDAB); // OK
      return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp); // real -imag
    }
    // Complex double
    inline __m512d operator()(__m512d in, __m512d ret){
      __m512d tmp = _mm512_swizzle_pd(in, _MM_SWIZ_REG_CDAB); // OK
      return _mm512_mask_sub_pd(tmp,0xaa,_mm512_setzero_pd(),tmp); // real -imag
    }
  };

  struct Permute{
    static inline __m512 Permute0(__m512 in){
      return _mm512_permute4f128_ps(in,(_MM_PERM_ENUM)_MM_SELECT_FOUR_FOUR(1,0,3,2));
    };
    static inline __m512 Permute1(__m512 in){
      return _mm512_permute4f128_ps(in,(_MM_PERM_ENUM)_MM_SELECT_FOUR_FOUR(2,3,0,1));
    };
    static inline __m512 Permute2(__m512 in){
      return _mm512_swizzle_ps(in,_MM_SWIZ_REG_BADC);
    };
    static inline __m512 Permute3(__m512 in){
      return _mm512_swizzle_ps(in,_MM_SWIZ_REG_CDAB);
    };

    static inline __m512d Permute0(__m512d in){ // Hack: no intrinsic for 256-bit swaps of __m512d
      return (__m512d)_mm512_permute4f128_ps((__m512)in,(_MM_PERM_ENUM)_MM_SELECT_FOUR_FOUR(1,0,3,2));
    };
    static inline __m512d Permute1(__m512d in){
      return _mm512_swizzle_pd(in,_MM_SWIZ_REG_BADC);
    };
    static inline __m512d Permute2(__m512d in){
      return _mm512_swizzle_pd(in,_MM_SWIZ_REG_CDAB);
    };
    static inline __m512d Permute3(__m512d in){
      return in;
    };
  };

  struct Rotate{

    static inline __m512 rotate(__m512 in,int n){
      switch(n){
      case 0:  return tRotate<0>(in);break;
      case 1:  return tRotate<1>(in);break;
      case 2:  return tRotate<2>(in);break;
      case 3:  return tRotate<3>(in);break;
      case 4:  return tRotate<4>(in);break;
      case 5:  return tRotate<5>(in);break;
      case 6:  return tRotate<6>(in);break;
      case 7:  return tRotate<7>(in);break;

      case 8 : return tRotate<8>(in);break;
      case 9 : return tRotate<9>(in);break;
      case 10: return tRotate<10>(in);break;
      case 11: return tRotate<11>(in);break;
      case 12: return tRotate<12>(in);break;
      case 13: return tRotate<13>(in);break;
      case 14: return tRotate<14>(in);break;
      case 15: return tRotate<15>(in);break;
      default: assert(0);
      }
    }
    static inline __m512d rotate(__m512d in,int n){
      switch(n){
      case 0: return tRotate<0>(in);break;
      case 1: return tRotate<1>(in);break;
      case 2: return tRotate<2>(in);break;
      case 3: return tRotate<3>(in);break;
      case 4: return tRotate<4>(in);break;
      case 5: return tRotate<5>(in);break;
      case 6: return tRotate<6>(in);break;
      case 7: return tRotate<7>(in);break;
      default: assert(0);
      }
    }

    template<int n> static inline __m512 tRotate(__m512 in){
      return (__m512)_mm512_alignr_epi32((__m512i)in,(__m512i)in,n);
    };

    template<int n> static inline __m512d tRotate(__m512d in){
      return (__m512d)_mm512_alignr_epi32((__m512i)in,(__m512i)in,2*n);
    };
  };

  //////////////////////////////////////////////
  // Some Template specialization

  // Complex float Reduce
  template<>
  inline Grid::ComplexF Reduce<Grid::ComplexF, __m512>::operator()(__m512 in){
    return Grid::ComplexF(_mm512_mask_reduce_add_ps(0x5555, in),_mm512_mask_reduce_add_ps(0xAAAA, in));
  }
  // Real float Reduce
  template<>
  inline Grid::RealF Reduce<Grid::RealF, __m512>::operator()(__m512 in){
    return _mm512_reduce_add_ps(in);
  }

  // Complex double Reduce
  template<>
  inline Grid::ComplexD Reduce<Grid::ComplexD, __m512d>::operator()(__m512d in){
    return Grid::ComplexD(_mm512_mask_reduce_add_pd(0x55, in),_mm512_mask_reduce_add_pd(0xAA, in));
  }

  // Real double Reduce
  template<>
  inline Grid::RealD Reduce<Grid::RealD, __m512d>::operator()(__m512d in){
    return _mm512_reduce_add_pd(in);
  }

  // Integer Reduce
  template<>
  inline Integer Reduce<Integer, __m512i>::operator()(__m512i in){
    return _mm512_reduce_add_epi32(in);
  }

}

//////////////////////////////////////////////////////////////////////////////////////
// Here assign types

  typedef __m512  SIMD_Ftype; // Single precision type
  typedef __m512d SIMD_Dtype; // Double precision type
  typedef __m512i SIMD_Itype; // Integer type

  // prefetch
  inline void v_prefetch0(int size, const char *ptr){
    for(int i=0;i<size;i+=64){ // Define L1 linesize above
      _mm_prefetch(ptr+i+4096,_MM_HINT_T1);
      _mm_prefetch(ptr+i+512,_MM_HINT_T0);
    }
  }
  inline void prefetch_HINT_T0(const char *ptr){
    _mm_prefetch(ptr,_MM_HINT_T0);
  }

  // Function name aliases
  typedef Optimization::Vsplat   VsplatSIMD;
  typedef Optimization::Vstore   VstoreSIMD;
  typedef Optimization::Vset     VsetSIMD;
  typedef Optimization::Vstream  VstreamSIMD;
  template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;

  // Arithmetic operations
  typedef Optimization::Sum         SumSIMD;
  typedef Optimization::Sub         SubSIMD;
  typedef Optimization::Div         DivSIMD;
  typedef Optimization::Mult        MultSIMD;
  typedef Optimization::MultComplex MultComplexSIMD;
  typedef Optimization::Conj        ConjSIMD;
  typedef Optimization::TimesMinusI TimesMinusISIMD;
  typedef Optimization::TimesI      TimesISIMD;

}
599
Grid/simd/Grid_neon.h
Normal file
@ -0,0 +1,599 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/simd/Grid_neon.h

    Copyright (C) 2015

Author: Nils Meyer <nils.meyer@ur.de>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: neo <cossu@post.kek.jp>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */

/*

  ARMv8 NEON intrinsics layer by

  Nils Meyer <nils.meyer@ur.de>,
  University of Regensburg, Germany
  SFB/TRR55

*/

#ifndef GEN_SIMD_WIDTH
#define GEN_SIMD_WIDTH 16u
#endif

#include "Grid_generic_types.h"
#include <arm_neon.h>

namespace Grid {
namespace Optimization {

  template<class vtype>
  union uconv {
    float32x4_t f;
    vtype v;
  };
  union u128f {
    float32x4_t v;
    float f[4];
  };
  union u128d {
    float64x2_t v;
    double f[2];
  };
  // half precision
  union u128h {
    float16x8_t v;
    uint16_t f[8];
  };

  struct Vsplat{
    // Complex float
    inline float32x4_t operator()(float a, float b){
      float tmp[4]={a,b,a,b};
      return vld1q_f32(tmp);
    }
    // Real float
    inline float32x4_t operator()(float a){
      return vdupq_n_f32(a);
    }
    // Complex double
    inline float64x2_t operator()(double a, double b){
      double tmp[2]={a,b};
      return vld1q_f64(tmp);
    }
    // Real double
    inline float64x2_t operator()(double a){
      return vdupq_n_f64(a);
    }
    // Integer
    inline uint32x4_t operator()(Integer a){
      return vdupq_n_u32(a);
    }
  };

  struct Vstore{
    // Float
    inline void operator()(float32x4_t a, float* F){
      vst1q_f32(F, a);
    }
    // Double
    inline void operator()(float64x2_t a, double* D){
      vst1q_f64(D, a);
    }
    // Integer
    inline void operator()(uint32x4_t a, Integer* I){
      vst1q_u32(I, a);
    }
  };

  struct Vstream{ // N:equivalents to _mm_stream_p* in NEON?
    // Float // N:generic
    inline void operator()(float * a, float32x4_t b){
      memcpy(a,&b,4*sizeof(float));
    }
    // Double // N:generic
    inline void operator()(double * a, float64x2_t b){
      memcpy(a,&b,2*sizeof(double));
    }
  };

  // Nils: Vset untested; not used currently in Grid at all;
  //       git commit 4a8c4ccfba1d05159348d21a9698028ea847e77b
  struct Vset{
    // Complex float
    inline float32x4_t operator()(Grid::ComplexF *a){
      float tmp[4]={a[1].imag(),a[1].real(),a[0].imag(),a[0].real()};
      return vld1q_f32(tmp);
    }
    // Complex double
    inline float64x2_t operator()(Grid::ComplexD *a){
      double tmp[2]={a[0].imag(),a[0].real()};
      return vld1q_f64(tmp);
    }
    // Real float
    inline float32x4_t operator()(float *a){
      float tmp[4]={a[3],a[2],a[1],a[0]};
      return vld1q_f32(tmp);
    }
    // Real double
    inline float64x2_t operator()(double *a){
      double tmp[2]={a[1],a[0]};
      return vld1q_f64(tmp);
    }
    // Integer
    inline uint32x4_t operator()(Integer *a){
      return vld1q_dup_u32(a);
    }
  };

  template <typename Out_type, typename In_type>
  struct Reduce{
    // Need templated class to overload output type
    // General form must generate error if compiled
    inline Out_type operator()(In_type in){
      printf("Error, using wrong Reduce function\n");
      exit(1);
      return 0;
    }
  };

  /////////////////////////////////////////////////////
  // Arithmetic operations
  /////////////////////////////////////////////////////
  struct Sum{
    // Complex/Real float
    inline float32x4_t operator()(float32x4_t a, float32x4_t b){
      return vaddq_f32(a,b);
    }
    // Complex/Real double
    inline float64x2_t operator()(float64x2_t a, float64x2_t b){
      return vaddq_f64(a,b);
    }
    // Integer
    inline uint32x4_t operator()(uint32x4_t a, uint32x4_t b){
      return vaddq_u32(a,b);
    }
  };

  struct Sub{
    // Complex/Real float
    inline float32x4_t operator()(float32x4_t a, float32x4_t b){
      return vsubq_f32(a,b);
    }
    // Complex/Real double
    inline float64x2_t operator()(float64x2_t a, float64x2_t b){
      return vsubq_f64(a,b);
    }
    // Integer
    inline uint32x4_t operator()(uint32x4_t a, uint32x4_t b){
      return vsubq_u32(a,b);
    }
  };

  struct MultRealPart{
    inline float32x4_t operator()(float32x4_t a, float32x4_t b){
      float32x4_t re = vtrn1q_f32(a, a);
      return vmulq_f32(re, b);
    }
    inline float64x2_t operator()(float64x2_t a, float64x2_t b){
      float64x2_t re = vzip1q_f64(a, a);
      return vmulq_f64(re, b);
    }
  };
|
||||
|
||||
struct MaddRealPart{
|
||||
inline float32x4_t operator()(float32x4_t a, float32x4_t b, float32x4_t c){
|
||||
float32x4_t re = vtrn1q_f32(a, a);
|
||||
return vfmaq_f32(c, re, b);
|
||||
}
|
||||
inline float64x2_t operator()(float64x2_t a, float64x2_t b, float64x2_t c){
|
||||
float64x2_t re = vzip1q_f64(a, a);
|
||||
return vfmaq_f64(c, re, b);
|
||||
}
|
||||
};
|
||||
|
||||
struct Div{
|
||||
// Real float
|
||||
inline float32x4_t operator()(float32x4_t a, float32x4_t b){
|
||||
return vdivq_f32(a, b);
|
||||
}
|
||||
// Real double
|
||||
inline float64x2_t operator()(float64x2_t a, float64x2_t b){
|
||||
return vdivq_f64(a, b);
|
||||
}
|
||||
};
|
||||
|
||||
struct MultComplex{
|
||||
// Complex float
|
||||
inline float32x4_t operator()(float32x4_t a, float32x4_t b){
|
||||
|
||||
float32x4_t r0, r1, r2, r3, r4;
|
||||
|
||||
// a = ar ai Ar Ai
|
||||
// b = br bi Br Bi
|
||||
// collect real/imag part, negate bi and Bi
|
||||
r0 = vtrn1q_f32(b, b); // br br Br Br
|
||||
r1 = vnegq_f32(b); // -br -bi -Br -Bi
|
||||
r2 = vtrn2q_f32(b, r1); // bi -bi Bi -Bi
|
||||
|
||||
// the fun part
|
||||
r3 = vmulq_f32(r2, a); // bi*ar -bi*ai ...
|
||||
r4 = vrev64q_f32(r3); // -bi*ai bi*ar ...
|
||||
|
||||
// fma(a,b,c) = a+b*c
|
||||
return vfmaq_f32(r4, r0, a); // ar*br-ai*bi ai*br+ar*bi ...
|
||||
|
||||
// no fma, use mul and add
|
||||
// float32x4_t r5;
|
||||
// r5 = vmulq_f32(r0, a);
|
||||
// return vaddq_f32(r4, r5);
|
||||
}
|
||||
// Complex double
|
||||
inline float64x2_t operator()(float64x2_t a, float64x2_t b){
|
||||
|
||||
float64x2_t r0, r1, r2, r3, r4;
|
||||
|
||||
// b = br bi
|
||||
// collect real/imag part, negate bi
|
||||
r0 = vtrn1q_f64(b, b); // br br
|
||||
r1 = vnegq_f64(b); // -br -bi
|
||||
r2 = vtrn2q_f64(b, r1); // bi -bi
|
||||
|
||||
// the fun part
|
||||
r3 = vmulq_f64(r2, a); // bi*ar -bi*ai
|
||||
r4 = vextq_f64(r3,r3,1); // -bi*ai bi*ar
|
||||
|
||||
// fma(a,b,c) = a+b*c
|
||||
return vfmaq_f64(r4, r0, a); // ar*br-ai*bi ai*br+ar*bi
|
||||
|
||||
// no fma, use mul and add
|
||||
// float64x2_t r5;
|
||||
// r5 = vmulq_f64(r0, a);
|
||||
// return vaddq_f64(r4, r5);
|
||||
}
|
||||
};
|
||||
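  // Illustrative sketch (editorial; assumes only the definitions above, and
  // the helper name is hypothetical): MultComplex applies the usual complex
  // product
  //   (ar + i*ai)*(br + i*bi) = (ar*br - ai*bi) + i*(ai*br + ar*bi)
  // to both complex numbers packed in a float32x4_t. A scalar cross-check
  // might look like:
  //
  //   inline bool check_mult_complex(float ar, float ai, float br, float bi){
  //     float av[4] = {ar, ai, ar, ai};
  //     float bv[4] = {br, bi, br, bi};
  //     u128f out; out.v = MultComplex()(vld1q_f32(av), vld1q_f32(bv));
  //     // compare up to rounding: the fused multiply-add rounds once, not twice
  //     return (fabsf(out.f[0] - (ar*br - ai*bi)) < 1e-5f) &&
  //            (fabsf(out.f[1] - (ai*br + ar*bi)) < 1e-5f);
  //   }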
  struct Mult{
    // Real float
    inline float32x4_t mac(float32x4_t a, float32x4_t b, float32x4_t c){
      //return vaddq_f32(vmulq_f32(b,c),a);
      return vfmaq_f32(a, b, c);
    }
    inline float64x2_t mac(float64x2_t a, float64x2_t b, float64x2_t c){
      //return vaddq_f64(vmulq_f64(b,c),a);
      return vfmaq_f64(a, b, c);
    }
    inline float32x4_t operator()(float32x4_t a, float32x4_t b){
      return vmulq_f32(a,b);
    }
    // Real double
    inline float64x2_t operator()(float64x2_t a, float64x2_t b){
      return vmulq_f64(a,b);
    }
    // Integer
    inline uint32x4_t operator()(uint32x4_t a, uint32x4_t b){
      return vmulq_u32(a,b);
    }
  };

  struct Conj{
    // Complex single
    inline float32x4_t operator()(float32x4_t in){
      // ar ai br bi -> ar -ai br -bi
      float32x4_t r0, r1;
      r0 = vnegq_f32(in);          // -ar -ai -br -bi
      r1 = vrev64q_f32(r0);        // -ai -ar -bi -br
      return vtrn1q_f32(in, r1);   // ar -ai br -bi
    }
    // Complex double
    inline float64x2_t operator()(float64x2_t in){

      float64x2_t r0, r1;
      r0 = vextq_f64(in, in, 1);   // ai ar
      r1 = vnegq_f64(r0);          // -ai -ar
      return vextq_f64(r0, r1, 1); // ar -ai
    }
    // do not define for integer input
  };

  struct TimesMinusI{
    //Complex single
    inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
      // ar ai br bi -> ai -ar bi -br
      float32x4_t r0, r1;
      r0 = vnegq_f32(in);          // -ar -ai -br -bi
      r1 = vrev64q_f32(in);        // ai ar bi br
      return vtrn1q_f32(r1, r0);   // ai -ar bi -br
    }
    //Complex double
    inline float64x2_t operator()(float64x2_t in, float64x2_t ret){
      // a ib -> b -ia
      float64x2_t tmp;
      tmp = vnegq_f64(in);
      return vextq_f64(in, tmp, 1);
    }
  };

  struct TimesI{
    //Complex single
    inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
      // ar ai br bi -> -ai ar -bi br
      float32x4_t r0, r1;
      r0 = vnegq_f32(in);          // -ar -ai -br -bi
      r1 = vrev64q_f32(r0);        // -ai -ar -bi -br
      return vtrn1q_f32(r1, in);   // -ai ar -bi br
    }
    //Complex double
    inline float64x2_t operator()(float64x2_t in, float64x2_t ret){
      // a ib -> -b ia
      float64x2_t tmp;
      tmp = vnegq_f64(in);
      return vextq_f64(tmp, in, 1);
    }
  };

  struct Permute{

    static inline float32x4_t Permute0(float32x4_t in){ // N:ok
      // AB CD -> CD AB
      return vextq_f32(in, in, 2);
    };
    static inline float32x4_t Permute1(float32x4_t in){ // N:ok
      // AB CD -> BA DC
      return vrev64q_f32(in);
    };
    static inline float32x4_t Permute2(float32x4_t in){ // N:not used by Boyle
      return in;
    };
    static inline float32x4_t Permute3(float32x4_t in){ // N:not used by Boyle
      return in;
    };

    static inline float64x2_t Permute0(float64x2_t in){ // N:ok
      // AB -> BA
      return vextq_f64(in, in, 1);
    };
    static inline float64x2_t Permute1(float64x2_t in){ // N:not used by Boyle
      return in;
    };
    static inline float64x2_t Permute2(float64x2_t in){ // N:not used by Boyle
      return in;
    };
    static inline float64x2_t Permute3(float64x2_t in){ // N:not used by Boyle
      return in;
    };

  };

  struct Rotate{

    static inline float32x4_t rotate(float32x4_t in,int n){ // N:ok
      switch(n){
      case 0: // AB CD -> AB CD
        return tRotate<0>(in);
        break;
      case 1: // AB CD -> BC DA
        return tRotate<1>(in);
        break;
      case 2: // AB CD -> CD AB
        return tRotate<2>(in);
        break;
      case 3: // AB CD -> DA BC
        return tRotate<3>(in);
        break;
      default: assert(0);
      }
    }
    static inline float64x2_t rotate(float64x2_t in,int n){ // N:ok
      switch(n){
      case 0: // AB -> AB
        return tRotate<0>(in);
        break;
      case 1: // AB -> BA
        return tRotate<1>(in);
        break;
      default: assert(0);
      }
    }

    template<int n> static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n%4); };
    template<int n> static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n%2); };

  };
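  // Illustrative note (editorial, not from the commit itself): rotate(in, n)
  // cycles SIMD lanes left by n with wrap-around, so for lanes {A,B,C,D}, n=1
  // yields {B,C,D,A}, matching the byte rotation vextq_* performs in tRotate:
  //
  //   float d[4] = {0.f, 1.f, 2.f, 3.f};
  //   float32x4_t r = Rotate::rotate(vld1q_f32(d), 1); // lanes now {1,2,3,0}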
  struct PrecisionChange {

    static inline float16x8_t StoH (const float32x4_t &a,const float32x4_t &b) {
      float16x4_t h = vcvt_f16_f32(a);
      return vcvt_high_f16_f32(h, b);
    }
    static inline void HtoS (float16x8_t h,float32x4_t &sa,float32x4_t &sb) {
      sb = vcvt_high_f32_f16(h);
      // there is no direct conversion from the lower half of float16x8_t
      // vextq_f16 not supported by clang 3.8 / 4.0 / arm clang
      // float16x8_t h1 = vextq_f16(h, h, 4); // correct, but not supported by clang
      // workaround for clang
      uint32x4_t h1u = reinterpret_cast<uint32x4_t>(h);
      float16x8_t h1 = reinterpret_cast<float16x8_t>(vextq_u32(h1u, h1u, 2));
      sa = vcvt_high_f32_f16(h1);
    }
    static inline float32x4_t DtoS (float64x2_t a,float64x2_t b) {
      float32x2_t s = vcvt_f32_f64(a);
      return vcvt_high_f32_f64(s, b);
    }
    static inline void StoD (float32x4_t s,float64x2_t &a,float64x2_t &b) {
      b = vcvt_high_f64_f32(s);
      // there is no direct conversion from lower float32x4_t to float64x2_t
      float32x4_t s1 = vextq_f32(s, s, 2);
      a = vcvt_high_f64_f32(s1);
    }
    static inline float16x8_t DtoH (float64x2_t a,float64x2_t b,float64x2_t c,float64x2_t d) {
      float32x4_t s1 = DtoS(a, b);
      float32x4_t s2 = DtoS(c, d);
      return StoH(s1, s2);
    }
    static inline void HtoD (float16x8_t h,float64x2_t &a,float64x2_t &b,float64x2_t &c,float64x2_t &d) {
      float32x4_t s1, s2;
      HtoS(h, s1, s2);
      StoD(s1, a, b);
      StoD(s2, c, d);
    }
  };
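  // Illustrative sketch (editorial; assumes only the struct above): the
  // conversions compose, so a double -> half -> double round trip is exact
  // whenever the values are representable in fp16:
  //
  //   float64x2_t a = vdupq_n_f64(1.5), b = vdupq_n_f64(-2.25), c, d;
  //   float16x8_t h = PrecisionChange::DtoH(a, b, a, b);
  //   PrecisionChange::HtoD(h, a, b, c, d); // 1.5 and -2.25 survive exactly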
  //////////////////////////////////////////////
  // Exchange support

  struct Exchange{
    static inline void Exchange0(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
      // in1: ABCD -> out1: ABEF
      // in2: EFGH -> out2: CDGH

      // z: CDAB
      float32x4_t z = vextq_f32(in1, in1, 2);
      // out1: ABEF
      out1 = vextq_f32(z, in2, 2);

      // z: GHEF
      z = vextq_f32(in2, in2, 2);
      // out2: CDGH
      out2 = vextq_f32(in1, z, 2);
    };

    static inline void Exchange1(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
      // in1: ABCD -> out1: AECG
      // in2: EFGH -> out2: BFDH
      out1 = vtrn1q_f32(in1, in2);
      out2 = vtrn2q_f32(in1, in2);
    };
    static inline void Exchange2(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
      assert(0);
      return;
    };
    static inline void Exchange3(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
      assert(0);
      return;
    };
    // double precision
    static inline void Exchange0(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
      // in1: AB -> out1: AC
      // in2: CD -> out2: BD
      out1 = vzip1q_f64(in1, in2);
      out2 = vzip2q_f64(in1, in2);
    };
    static inline void Exchange1(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
      assert(0);
      return;
    };
    static inline void Exchange2(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
      assert(0);
      return;
    };
    static inline void Exchange3(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
      assert(0);
      return;
    };
  };

  //////////////////////////////////////////////
  // Some Template specialization

  //Complex float Reduce
  template<>
  inline Grid::ComplexF Reduce<Grid::ComplexF, float32x4_t>::operator()(float32x4_t in){
    float32x4_t v1; // two complex
    v1 = Optimization::Permute::Permute0(in);
    v1 = vaddq_f32(v1,in);
    u128f conv; conv.v=v1;
    return Grid::ComplexF(conv.f[0],conv.f[1]);
  }
  //Real float Reduce
  template<>
  inline Grid::RealF Reduce<Grid::RealF, float32x4_t>::operator()(float32x4_t in){
    return vaddvq_f32(in);
  }

  //Complex double Reduce
  template<>
  inline Grid::ComplexD Reduce<Grid::ComplexD, float64x2_t>::operator()(float64x2_t in){
    u128d conv; conv.v = in;
    return Grid::ComplexD(conv.f[0],conv.f[1]);
  }

  //Real double Reduce
  template<>
  inline Grid::RealD Reduce<Grid::RealD, float64x2_t>::operator()(float64x2_t in){
    return vaddvq_f64(in);
  }

  //Integer Reduce
  template<>
  inline Integer Reduce<Integer, uint32x4_t>::operator()(uint32x4_t in){
    return vaddvq_u32(in);
  }
}

//////////////////////////////////////////////////////////////////////////////////////
// Here assign types

// typedef Optimization::vech SIMD_Htype; // Reduced precision type
typedef float16x8_t SIMD_Htype; // Half precision type
typedef float32x4_t SIMD_Ftype; // Single precision type
typedef float64x2_t SIMD_Dtype; // Double precision type
typedef uint32x4_t  SIMD_Itype; // Integer type

inline void v_prefetch0(int size, const char *ptr){}; // prefetch utilities
inline void prefetch_HINT_T0(const char *ptr){};

// Function name aliases
typedef Optimization::Vsplat   VsplatSIMD;
typedef Optimization::Vstore   VstoreSIMD;
typedef Optimization::Vset     VsetSIMD;
typedef Optimization::Vstream  VstreamSIMD;
template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;

// Arithmetic operations
typedef Optimization::Sum          SumSIMD;
typedef Optimization::Sub          SubSIMD;
typedef Optimization::Div          DivSIMD;
typedef Optimization::Mult         MultSIMD;
typedef Optimization::MultComplex  MultComplexSIMD;
typedef Optimization::MultRealPart MultRealPartSIMD;
typedef Optimization::MaddRealPart MaddRealPartSIMD;
typedef Optimization::Conj         ConjSIMD;
typedef Optimization::TimesMinusI  TimesMinusISIMD;
typedef Optimization::TimesI       TimesISIMD;

}
Grid/simd/Grid_qpx.h
Normal file
@ -0,0 +1,619 @@
/*******************************************************************************

  Grid physics library, www.github.com/paboyle/Grid

  Source file: ./lib/simd/Grid_qpx.h

  Copyright (C) 2016
  Copyright (C) 2017

  Author: Antonin Portelli <antonin.portelli@me.com>
          Andrew Lawson <andrew.lawson1991@gmail.com>

  This program is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License along
  with this program; if not, write to the Free Software Foundation, Inc.,
  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

  See the full license in the file "LICENSE" in the top level distribution directory
******************************************************************************/

#ifndef GEN_SIMD_WIDTH
#define GEN_SIMD_WIDTH 32u
#endif
#include "Grid_generic_types.h" // Definitions for simulated integer SIMD.

namespace Grid {

#ifdef QPX
#include <spi/include/kernel/location.h>
#include <spi/include/l1p/types.h>
#include <hwi/include/bqc/l1p_mmio.h>
#include <hwi/include/bqc/A2_inlines.h>
#endif

namespace Optimization {
  typedef struct
  {
    float v0,v1,v2,v3;
  } vector4float;

  inline std::ostream & operator<<(std::ostream& stream, const vector4double a)
  {
    stream << "{"<<vec_extract(a,0)<<","<<vec_extract(a,1)<<","<<vec_extract(a,2)<<","<<vec_extract(a,3)<<"}";
    return stream;
  };

  inline std::ostream & operator<<(std::ostream& stream, const vector4float a)
  {
    stream << "{"<< a.v0 <<","<< a.v1 <<","<< a.v2 <<","<< a.v3 <<"}";
    return stream;
  };

  struct Vsplat{
    //Complex float
    inline vector4float operator()(float a, float b){
      return (vector4float){a, b, a, b};
    }
    // Real float
    inline vector4float operator()(float a){
      return (vector4float){a, a, a, a};
    }
    //Complex double
    inline vector4double operator()(double a, double b){
      return (vector4double){a, b, a, b};
    }
    //Real double
    inline vector4double operator()(double a){
      return (vector4double){a, a, a, a};
    }
    //Integer
    inline veci operator()(Integer a){
      veci out;

      VECTOR_FOR(i, W<Integer>::r, 1)
      {
        out.v[i] = a;
      }

      return out;
    }
  };

  struct Vstore{
    //Float
    inline void operator()(vector4double a, float *f){
      vec_st(a, 0, f);
    }

    inline void operator()(vector4double a, vector4float &f){
      vec_st(a, 0, (float *)(&f));
    }

    inline void operator()(vector4float a, float *f){
      f[0] = a.v0;
      f[1] = a.v1;
      f[2] = a.v2;
      f[3] = a.v3;
    }

    //Double
    inline void operator()(vector4double a, double *d){
      vec_st(a, 0, d);
    }

    //Integer
    inline void operator()(veci a, Integer *i){
      *((veci *)i) = a;
    }
  };

  struct Vstream{
    //Float
    inline void operator()(float *f, vector4double a){
      vec_st(a, 0, f);
    }

    inline void operator()(vector4float f, vector4double a){
      vec_st(a, 0, (float *)(&f));
    }

    inline void operator()(float *f, vector4float a){
      f[0] = a.v0;
      f[1] = a.v1;
      f[2] = a.v2;
      f[3] = a.v3;
    }
    //Double
    inline void operator()(double *d, vector4double a){
      vec_st(a, 0, d);
    }

  };

  struct Vset{
    // Complex float
    inline vector4float operator()(Grid::ComplexF *a){
      return (vector4float){a[0].real(), a[0].imag(), a[1].real(), a[1].imag()};
    }
    // Complex double
    inline vector4double operator()(Grid::ComplexD *a){
      return vec_ld(0, (double *)a);
    }

    // Real float
    inline vector4float operator()(float *a){
      return (vector4float){a[0], a[1], a[2], a[3]};
    }

    inline vector4double operator()(vector4float a){
      return vec_ld(0, (float *)(&a));
    }

    // Real double
    inline vector4double operator()(double *a){
      return vec_ld(0, a);
    }
    // Integer
    inline veci operator()(Integer *a){
      veci out;

      out = *((veci *)a);

      return out;
    }
  };

  template <typename Out_type, typename In_type>
  struct Reduce{
    //Need templated class to overload output type
    //General form must generate error if compiled
    inline Out_type operator()(In_type in){
      printf("Error, using wrong Reduce function\n");
      exit(1);
      return 0;
    }
  };

  /////////////////////////////////////////////////////
  // Arithmetic operations
  /////////////////////////////////////////////////////

  #define FLOAT_WRAP_3(fn, pref)\
  pref vector4float fn(vector4float a, vector4float b, vector4float c) \
  {\
    vector4double ad, bd, rd, cd; \
    vector4float  r;\
    \
    ad = Vset()(a);\
    bd = Vset()(b);\
    cd = Vset()(c);\
    rd = fn(ad, bd, cd); \
    Vstore()(rd, r);\
    \
    return r;\
  }

  #define FLOAT_WRAP_2(fn, pref)\
  pref vector4float fn(vector4float a, vector4float b)\
  {\
    vector4double ad, bd, rd;\
    vector4float  r;\
    \
    ad = Vset()(a);\
    bd = Vset()(b);\
    rd = fn(ad, bd);\
    Vstore()(rd, r);\
    \
    return r;\
  }

  #define FLOAT_WRAP_1(fn, pref)\
  pref vector4float fn(vector4float a)\
  {\
    vector4double ad, rd;\
    vector4float  r;\
    \
    ad = Vset()(a);\
    rd = fn(ad);\
    Vstore()(rd, r);\
    \
    return r;\
  }
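  // Note (editorial, derived from the macros above): QPX computes only in
  // double-precision registers, so the FLOAT_WRAP_* macros implement the
  // single-precision functors by promote/compute/demote. For instance,
  // FLOAT_WRAP_2(operator(), inline) expands, schematically, to:
  //
  //   inline vector4float operator()(vector4float a, vector4float b)
  //   {
  //     vector4double ad = Vset()(a), bd = Vset()(b);
  //     vector4double rd = operator()(ad, bd); // double-precision kernel
  //     vector4float  r;  Vstore()(rd, r);     // demote on store
  //     return r;
  //   }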
  struct Sum{
    //Complex/Real double
    inline vector4double operator()(vector4double a, vector4double b){
      return vec_add(a, b);
    }

    //Complex/Real float
    FLOAT_WRAP_2(operator(), inline)

    //Integer
    inline veci operator()(veci a, veci b){
      veci out;

      VECTOR_FOR(i, W<Integer>::r, 1)
      {
        out.v[i] = a.v[i] + b.v[i];
      }

      return out;
    }
  };

  struct Sub{
    //Complex/Real double
    inline vector4double operator()(vector4double a, vector4double b){
      return vec_sub(a, b);
    }

    //Complex/Real float
    FLOAT_WRAP_2(operator(), inline)

    //Integer
    inline veci operator()(veci a, veci b){
      veci out;

      VECTOR_FOR(i, W<Integer>::r, 1)
      {
        out.v[i] = a.v[i] - b.v[i];
      }

      return out;
    }
  };

  struct MultRealPart{
    // Complex double
    inline vector4double operator()(vector4double a, vector4double b){
      //  return vec_xmul(b, a);
      return vec_xmul(a, b);
    }
    FLOAT_WRAP_2(operator(), inline)
  };
  struct MaddRealPart{
    // Complex double
    inline vector4double operator()(vector4double a, vector4double b,vector4double c){
      return vec_xmadd(a, b, c);
    }
    FLOAT_WRAP_3(operator(), inline)
  };
  struct MultComplex{
    // Complex double
    inline vector4double operator()(vector4double a, vector4double b){
      return vec_xxnpmadd(a, b, vec_xmul(b, a));
    }

    // Complex float
    FLOAT_WRAP_2(operator(), inline)
  };
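  // Editorial sketch (hedged, based on the identity only): vec_xmul(b, a)
  // multiplies everything by the real part of b, giving (br*ar, br*ai, ...);
  // vec_xxnpmadd(a, b, c) then adds the imaginary-part cross terms with the
  // complex-product sign pattern, so the pair realises
  //   (ar + i*ai)*(br + i*bi) = (ar*br - ai*bi) + i*(ar*bi + ai*br)
  // for the two complex numbers held in a vector4double.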
  struct Mult{
    // Real double
    inline vector4double operator()(vector4double a, vector4double b){
      return vec_mul(a, b);
    }

    // Real float
    FLOAT_WRAP_2(operator(), inline)

    // Integer
    inline veci operator()(veci a, veci b){
      veci out;

      VECTOR_FOR(i, W<Integer>::r, 1)
      {
        out.v[i] = a.v[i]*b.v[i];
      }

      return out;
    }
  };

  struct Div{
    // Real double
    inline vector4double operator()(vector4double a, vector4double b){
      return vec_swdiv(a, b);
    }

    // Real float
    FLOAT_WRAP_2(operator(), inline)

    // Integer
    inline veci operator()(veci a, veci b){
      veci out;

      VECTOR_FOR(i, W<Integer>::r, 1)
      {
        out.v[i] = a.v[i]/b.v[i];
      }

      return out;
    }
  };

  struct Conj{
    // Complex double
    inline vector4double operator()(vector4double v){
      return vec_mul(v, (vector4double){1., -1., 1., -1.});
    }

    // Complex float
    FLOAT_WRAP_1(operator(), inline)
  };

  struct TimesMinusI{
    //Complex double
    inline vector4double operator()(vector4double v, vector4double ret){
      return vec_xxcpnmadd(v, (vector4double){1., 1., 1., 1.},
                           (vector4double){0., 0., 0., 0.});
    }

    // Complex float
    FLOAT_WRAP_2(operator(), inline)
  };

  struct TimesI{
    //Complex double
    inline vector4double operator()(vector4double v, vector4double ret){
      return vec_xxcpnmadd(v, (vector4double){-1., -1., -1., -1.},
                           (vector4double){0., 0., 0., 0.});
    }

    // Complex float
    FLOAT_WRAP_2(operator(), inline)
  };
  #define USE_FP16
  struct PrecisionChange {
    static inline vech StoH (const vector4float &a, const vector4float &b) {
      vech ret;
      std::cout << GridLogError << "QPX single to half precision conversion not yet supported." << std::endl;
      assert(0);
      return ret;
    }
    static inline void HtoS (vech h, vector4float &sa, vector4float &sb) {
      std::cout << GridLogError << "QPX half to single precision conversion not yet supported." << std::endl;
      assert(0);
    }
    static inline vector4float DtoS (vector4double a, vector4double b) {
      vector4float ret;
      std::cout << GridLogError << "QPX double to single precision conversion not yet supported." << std::endl;
      assert(0);
      return ret;
    }
    static inline void StoD (vector4float s, vector4double &a, vector4double &b) {
      std::cout << GridLogError << "QPX single to double precision conversion not yet supported." << std::endl;
      assert(0);
    }
    static inline vech DtoH (vector4double a, vector4double b,
                             vector4double c, vector4double d) {
      vech ret;
      std::cout << GridLogError << "QPX double to half precision conversion not yet supported." << std::endl;
      assert(0);
      return ret;
    }
    static inline void HtoD (vech h, vector4double &a, vector4double &b,
                             vector4double &c, vector4double &d) {
      std::cout << GridLogError << "QPX half to double precision conversion not yet supported." << std::endl;
      assert(0);
    }
  };

  //////////////////////////////////////////////
  // Exchange support
  #define FLOAT_WRAP_EXCHANGE(fn) \
  static inline void fn(vector4float &out1, vector4float &out2, \
                        vector4float in1, vector4float in2) \
  { \
    vector4double out1d, out2d, in1d, in2d; \
    in1d = Vset()(in1); \
    in2d = Vset()(in2); \
    fn(out1d, out2d, in1d, in2d); \
    Vstore()(out1d, out1); \
    Vstore()(out2d, out2); \
  }

  struct Exchange{

    // double precision
    static inline void Exchange0(vector4double &out1, vector4double &out2,
                                 vector4double in1, vector4double in2) {
      out1 = vec_perm(in1, in2, vec_gpci(0145));
      out2 = vec_perm(in1, in2, vec_gpci(02367));
    }
    static inline void Exchange1(vector4double &out1, vector4double &out2,
                                 vector4double in1, vector4double in2) {
      out1 = vec_perm(in1, in2, vec_gpci(0426));
      out2 = vec_perm(in1, in2, vec_gpci(01537));
    }
    static inline void Exchange2(vector4double &out1, vector4double &out2,
                                 vector4double in1, vector4double in2) {
      assert(0);
    }
    static inline void Exchange3(vector4double &out1, vector4double &out2,
                                 vector4double in1, vector4double in2) {
      assert(0);
    }

    // single precision
    FLOAT_WRAP_EXCHANGE(Exchange0);
    FLOAT_WRAP_EXCHANGE(Exchange1);
    FLOAT_WRAP_EXCHANGE(Exchange2);
    FLOAT_WRAP_EXCHANGE(Exchange3);
  };

  struct Permute{
    //Complex double
    static inline vector4double Permute0(vector4double v){ //0123 -> 2301
      return vec_perm(v, v, vec_gpci(02301));
    };
    static inline vector4double Permute1(vector4double v){ //0123 -> 1032
      return vec_perm(v, v, vec_gpci(01032));
    };
    static inline vector4double Permute2(vector4double v){
      return v;
    };
    static inline vector4double Permute3(vector4double v){
      return v;
    };

    // Complex float
    FLOAT_WRAP_1(Permute0, static inline)
    FLOAT_WRAP_1(Permute1, static inline)
    FLOAT_WRAP_1(Permute2, static inline)
    FLOAT_WRAP_1(Permute3, static inline)
  };

  struct Rotate{

    template<int n> static inline vector4double tRotate(vector4double v){
      if ( n==1 ) return vec_perm(v, v, vec_gpci(01230));
      if ( n==2 ) return vec_perm(v, v, vec_gpci(02301));
      if ( n==3 ) return vec_perm(v, v, vec_gpci(03012));
      return v;
    };
    template<int n> static inline vector4float tRotate(vector4float a)
    {
      vector4double ad, rd;
      vector4float  r;
      ad = Vset()(a);
      rd = tRotate<n>(ad);
      Vstore()(rd, r);
      return r;
    };

    static inline vector4double rotate(vector4double v, int n){
      switch(n){
      case 0:
        return v;
        break;
      case 1:
        return tRotate<1>(v);
        break;
      case 2:
        return tRotate<2>(v);
        break;
      case 3:
        return tRotate<3>(v);
        break;
      default: assert(0);
      }
    }

    static inline vector4float rotate(vector4float v, int n){
      vector4double vd, rd;
      vector4float  r;
      vd = Vset()(v);
      rd = rotate(vd, n);
      Vstore()(rd, r);
      return r;
    }
  };

  //Complex float Reduce
  template<>
  inline Grid::ComplexF
  Reduce<Grid::ComplexF, vector4float>::operator()(vector4float v) { //2 complex
    vector4float v1,v2;

    v1 = Optimization::Permute::Permute0(v);
    v1 = Optimization::Sum()(v1, v);

    return Grid::ComplexF(v1.v0, v1.v1);
  }
  //Real float Reduce
  template<>
  inline Grid::RealF
  Reduce<Grid::RealF, vector4float>::operator()(vector4float v){ //4 floats
    vector4float v1,v2;

    v1 = Optimization::Permute::Permute0(v);
    v1 = Optimization::Sum()(v1, v);
    v2 = Optimization::Permute::Permute1(v1);
    v1 = Optimization::Sum()(v1, v2);

    return v1.v0;
  }

  //Complex double Reduce
  template<>
  inline Grid::ComplexD
  Reduce<Grid::ComplexD, vector4double>::operator()(vector4double v){ //2 complex
    vector4double v1;

    v1 = Optimization::Permute::Permute0(v);
    v1 = vec_add(v1, v);

    return Grid::ComplexD(vec_extract(v1, 0), vec_extract(v1, 1));
  }

  //Real double Reduce
  template<>
  inline Grid::RealD
  Reduce<Grid::RealD, vector4double>::operator()(vector4double v){ //4 doubles
    vector4double v1,v2;

    v1 = Optimization::Permute::Permute0(v);
    v1 = vec_add(v1, v);
    v2 = Optimization::Permute::Permute1(v1);
    v1 = vec_add(v1, v2);

    return vec_extract(v1, 0);
  }

  //Integer Reduce
  template<>
  inline Integer Reduce<Integer, veci>::operator()(veci in){
    Integer a = 0;
    for (unsigned int i = 0; i < W<Integer>::r; ++i)
    {
      a += in.v[i];
    }
    return a;
  }
}

////////////////////////////////////////////////////////////////////////////////
// Here assign types
typedef Optimization::vech         SIMD_Htype; // Half precision type
typedef Optimization::vector4float SIMD_Ftype; // Single precision type
typedef vector4double              SIMD_Dtype; // Double precision type
typedef Optimization::veci         SIMD_Itype; // Integer type

// prefetch utilities
inline void v_prefetch0(int size, const char *ptr){};
inline void prefetch_HINT_T0(const char *ptr){};

// Function name aliases
typedef Optimization::Vsplat   VsplatSIMD;
typedef Optimization::Vstore   VstoreSIMD;
typedef Optimization::Vset     VsetSIMD;
typedef Optimization::Vstream  VstreamSIMD;
template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;

// Arithmetic operations
typedef Optimization::Sum          SumSIMD;
typedef Optimization::Sub          SubSIMD;
typedef Optimization::Mult         MultSIMD;
typedef Optimization::Div          DivSIMD;
typedef Optimization::MultComplex  MultComplexSIMD;
typedef Optimization::MultRealPart MultRealPartSIMD;
typedef Optimization::MaddRealPart MaddRealPartSIMD;
typedef Optimization::Conj         ConjSIMD;
typedef Optimization::TimesMinusI  TimesMinusISIMD;
typedef Optimization::TimesI       TimesISIMD;

}
Grid/simd/Grid_sse4.h
Normal file
@ -0,0 +1,617 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/simd/Grid_sse4.h

    Copyright (C) 2015

    Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    Author: neo <cossu@post.kek.jp>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
//----------------------------------------------------------------------
/*! @file Grid_sse4.h
  @brief Optimization libraries for the SSE4 instruction set

  Using intrinsics
*/
// Time-stamp: <2015-06-16 23:27:54 neo>
//----------------------------------------------------------------------

#include <pmmintrin.h>

namespace Grid {
namespace Optimization {

  template<class vtype>
  union uconv {
    __m128 f;
    vtype v;
  };

  union u128f {
    __m128 v;
    float f[4];
  };
  union u128d {
    __m128d v;
    double f[2];
  };

  struct Vsplat{
    //Complex float
    inline __m128 operator()(float a, float b){
      return _mm_set_ps(b,a,b,a);
    }
    // Real float
    inline __m128 operator()(float a){
      return _mm_set_ps(a,a,a,a);
    }
    //Complex double
    inline __m128d operator()(double a, double b){
      return _mm_set_pd(b,a);
    }
    //Real double
    inline __m128d operator()(double a){
      return _mm_set_pd(a,a);
    }
    //Integer
    inline __m128i operator()(Integer a){
      return _mm_set1_epi32(a);
    }
  };

  struct Vstore{
    //Float
    inline void operator()(__m128 a, float* F){
      _mm_store_ps(F,a);
    }
    //Double
    inline void operator()(__m128d a, double* D){
      _mm_store_pd(D,a);
    }
    //Integer
    inline void operator()(__m128i a, Integer* I){
      _mm_store_si128((__m128i *)I,a);
    }
  };

  struct Vstream{
    //Float
    inline void operator()(float * a, __m128 b){
      _mm_stream_ps(a,b);
    }
    //Double
    inline void operator()(double * a, __m128d b){
      _mm_stream_pd(a,b);
    }
  };

  struct Vset{
    // Complex float
    inline __m128 operator()(Grid::ComplexF *a){
      return _mm_set_ps(a[1].imag(), a[1].real(),a[0].imag(),a[0].real());
    }
    // Complex double
    inline __m128d operator()(Grid::ComplexD *a){
      return _mm_set_pd(a[0].imag(),a[0].real());
    }
    // Real float
    inline __m128 operator()(float *a){
      return _mm_set_ps(a[3],a[2],a[1],a[0]);
    }
    // Real double
    inline __m128d operator()(double *a){
      return _mm_set_pd(a[1],a[0]);
    }
    // Integer
    inline __m128i operator()(Integer *a){
      return _mm_set_epi32(a[3],a[2],a[1],a[0]);
    }
  };

  template <typename Out_type, typename In_type>
  struct Reduce{
    //Need templated class to overload output type
    //General form must generate error if compiled
    inline Out_type operator()(In_type in){
      printf("Error, using wrong Reduce function\n");
      exit(1);
      return 0;
    }
  };

  /////////////////////////////////////////////////////
  // Arithmetic operations
  /////////////////////////////////////////////////////
  struct Sum{
    //Complex/Real float
    inline __m128 operator()(__m128 a, __m128 b){
      return _mm_add_ps(a,b);
    }
    //Complex/Real double
    inline __m128d operator()(__m128d a, __m128d b){
      return _mm_add_pd(a,b);
    }
    //Integer
    inline __m128i operator()(__m128i a, __m128i b){
      return _mm_add_epi32(a,b);
    }
  };

  struct Sub{
    //Complex/Real float
    inline __m128 operator()(__m128 a, __m128 b){
      return _mm_sub_ps(a,b);
    }
    //Complex/Real double
    inline __m128d operator()(__m128d a, __m128d b){
      return _mm_sub_pd(a,b);
    }
    //Integer
    inline __m128i operator()(__m128i a, __m128i b){
      return _mm_sub_epi32(a,b);
    }
  };

  struct MultRealPart{
    inline __m128 operator()(__m128 a, __m128 b){
      __m128 ymm0;
      ymm0 = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
      return _mm_mul_ps(ymm0,b);                                // ymm0 <- ar bi, ar br
    }
    inline __m128d operator()(__m128d a, __m128d b){
      __m128d ymm0;
      ymm0 = _mm_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, ar,ar b'00,00
      return _mm_mul_pd(ymm0,b);      // ymm0 <- ar bi, ar br
    }
  };
  struct MaddRealPart{
    inline __m128 operator()(__m128 a, __m128 b, __m128 c){
      __m128 ymm0 = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
      return _mm_add_ps(_mm_mul_ps( ymm0, b),c);
    }
    inline __m128d operator()(__m128d a, __m128d b, __m128d c){
      __m128d ymm0 = _mm_shuffle_pd( a, a, 0x0 );
      return _mm_add_pd(_mm_mul_pd( ymm0, b),c);
    }
  };

  struct MultComplex{
    // Complex float
    inline __m128 operator()(__m128 a, __m128 b){
      __m128 ymm0,ymm1,ymm2;
      ymm0 = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
      ymm0 = _mm_mul_ps(ymm0,b);                                // ymm0 <- ar bi, ar br
      ymm1 = _mm_shuffle_ps(b,b,_MM_SELECT_FOUR_FOUR(2,3,0,1)); // ymm1 <- br,bi
      ymm2 = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(3,3,1,1)); // ymm2 <- ai,ai
      ymm1 = _mm_mul_ps(ymm1,ymm2);                             // ymm1 <- br ai, ai bi
      return _mm_addsub_ps(ymm0,ymm1);
    }
    // Complex double
    inline __m128d operator()(__m128d a, __m128d b){
      __m128d ymm0,ymm1,ymm2;
      ymm0 = _mm_shuffle_pd(a,a,0x0); // ymm0 <- ar ar,
      ymm0 = _mm_mul_pd(ymm0,b);      // ymm0 <- ar bi, ar br
      ymm1 = _mm_shuffle_pd(b,b,0x1); // ymm1 <- br,bi  b01
      ymm2 = _mm_shuffle_pd(a,a,0x3); // ymm2 <- ai,ai  b11
      ymm1 = _mm_mul_pd(ymm1,ymm2);   // ymm1 <- br ai, ai bi
      return _mm_addsub_pd(ymm0,ymm1);
    }
  };
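  // Editorial sketch: _mm_addsub_ps(x, y) returns
  //   (x0-y0, x1+y1, x2-y2, x3+y3),
  // which is exactly the sign pattern of a complex product. With
  //   ymm0 = (ar*br, ar*bi, ...) and ymm1 = (ai*bi, ai*br, ...)
  // the add/subtract yields (ar*br - ai*bi, ar*bi + ai*br, ...), i.e. the
  // real and imaginary parts of a*b with no explicit sign-flip instruction.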
  struct Mult{

    inline void mac(__m128 &a, __m128 b, __m128 c){
      a= _mm_add_ps(_mm_mul_ps(b,c),a);
    }

    inline void mac(__m128d &a, __m128d b, __m128d c){
      a= _mm_add_pd(_mm_mul_pd(b,c),a);
    }

    // Real float
    inline __m128 operator()(__m128 a, __m128 b){
      return _mm_mul_ps(a,b);
    }
    // Real double
    inline __m128d operator()(__m128d a, __m128d b){
      return _mm_mul_pd(a,b);
    }
    // Integer
    inline __m128i operator()(__m128i a, __m128i b){
      return _mm_mullo_epi32(a,b);
    }
  };

  struct Div{
    // Real float
    inline __m128 operator()(__m128 a, __m128 b){
      return _mm_div_ps(a,b);
    }
    // Real double
    inline __m128d operator()(__m128d a, __m128d b){
      return _mm_div_pd(a,b);
    }
  };

  struct Conj{
    // Complex single
    inline __m128 operator()(__m128 in){
      return _mm_xor_ps(_mm_addsub_ps(_mm_setzero_ps(),in), _mm_set1_ps(-0.f));
    }
    // Complex double
    inline __m128d operator()(__m128d in){
      return _mm_xor_pd(_mm_addsub_pd(_mm_setzero_pd(),in), _mm_set1_pd(-0.f));//untested
    }
    // do not define for integer input
  };

  struct TimesMinusI{
    //Complex single
    inline __m128 operator()(__m128 in, __m128 ret){
      __m128 tmp =_mm_addsub_ps(_mm_setzero_ps(),in); // -r,i
      return _mm_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(2,3,0,1));
    }
    //Complex double
    inline __m128d operator()(__m128d in, __m128d ret){
      __m128d tmp =_mm_addsub_pd(_mm_setzero_pd(),in); // -r,i
      return _mm_shuffle_pd(tmp,tmp,0x1);
    }
  };

  struct TimesI{
    //Complex single
    inline __m128 operator()(__m128 in, __m128 ret){
      __m128 tmp =_mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
      return _mm_addsub_ps(_mm_setzero_ps(),tmp); // -i,r
    }
    //Complex double
    inline __m128d operator()(__m128d in, __m128d ret){
      __m128d tmp = _mm_shuffle_pd(in,in,0x1);
      return _mm_addsub_pd(_mm_setzero_pd(),tmp); // -i,r
    }
  };

  struct Permute{

    static inline __m128 Permute0(__m128 in){
      return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2)); //AB CD -> CD AB
    };
    static inline __m128 Permute1(__m128 in){
      return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); //AB CD -> BA DC
    };
    static inline __m128 Permute2(__m128 in){
      return in;
    };
    static inline __m128 Permute3(__m128 in){
      return in;
    };

    static inline __m128d Permute0(__m128d in){ //AB -> BA
      return _mm_shuffle_pd(in,in,0x1);
    };
    static inline __m128d Permute1(__m128d in){
      return in;
    };
    static inline __m128d Permute2(__m128d in){
      return in;
    };
    static inline __m128d Permute3(__m128d in){
      return in;
    };
  };

  #define _my_alignr_epi32(a,b,n) _mm_alignr_epi8(a,b,(n*4)%16)
  #define _my_alignr_epi64(a,b,n) _mm_alignr_epi8(a,b,(n*8)%16)

#ifdef SFW_FP16

  struct Grid_half {
    Grid_half(){}
    Grid_half(uint16_t raw) : x(raw) {}
    uint16_t x;
  };
  union FP32 {
    unsigned int u;
    float f;
  };

  // PAB - Lifted and adapted from Eigen, which is GPL V2
  inline float sfw_half_to_float(Grid_half h) {
    const FP32 magic = { 113 << 23 };
    const unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift
    FP32 o;
    o.u = (h.x & 0x7fff) << 13;           // exponent/mantissa bits
    unsigned int exp = shifted_exp & o.u; // just the exponent
    o.u += (127 - 15) << 23;              // exponent adjust
    // handle exponent special cases
    if (exp == shifted_exp) {    // Inf/NaN?
      o.u += (128 - 16) << 23;   // extra exp adjust
    } else if (exp == 0) {       // Zero/Denormal?
      o.u += 1 << 23;            // extra exp adjust
      o.f -= magic.f;            // renormalize
    }
    o.u |= (h.x & 0x8000) << 16; // sign bit
    return o.f;
  }
  inline Grid_half sfw_float_to_half(float ff) {
    FP32 f; f.f = ff;
    const FP32 f32infty = { 255 << 23 };
    const FP32 f16max = { (127 + 16) << 23 };
    const FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 };
    unsigned int sign_mask = 0x80000000u;
    Grid_half o;

    o.x = static_cast<unsigned short>(0x0u);
    unsigned int sign = f.u & sign_mask;
    f.u ^= sign;
    // NOTE all the integer compares in this function can be safely
    // compiled into signed compares since all operands are below
    // 0x80000000. Important if you want fast straight SSE2 code
    // (since there's no unsigned PCMPGTD).
    if (f.u >= f16max.u) { // result is Inf or NaN (all exponent bits set)
      o.x = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf
    } else { // (De)normalized number or zero
      if (f.u < (113 << 23)) { // resulting FP16 is subnormal or zero
        // use a magic value to align our 10 mantissa bits at the bottom of
        // the float. as long as FP addition is round-to-nearest-even this
        // just works.
        f.f += denorm_magic.f;
        // and one integer subtract of the bias later, we have our final float!
        o.x = static_cast<unsigned short>(f.u - denorm_magic.u);
      } else {
        unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd

        // update exponent, rounding bias part 1
        f.u += ((unsigned int)(15 - 127) << 23) + 0xfff;
        // rounding bias part 2
        f.u += mant_odd;
        // take the bits!
        o.x = static_cast<unsigned short>(f.u >> 13);
      }
    }
    o.x |= static_cast<unsigned short>(sign >> 16);
    return o;
  }
  static inline __m128i Grid_mm_cvtps_ph(__m128 f,int discard) {
    __m128i ret=(__m128i)_mm_setzero_ps();
    float *fp = (float *)&f;
    Grid_half *hp = (Grid_half *)&ret;
    hp[0] = sfw_float_to_half(fp[0]);
    hp[1] = sfw_float_to_half(fp[1]);
    hp[2] = sfw_float_to_half(fp[2]);
    hp[3] = sfw_float_to_half(fp[3]);
    return ret;
  }
  static inline __m128 Grid_mm_cvtph_ps(__m128i h,int discard) {
    __m128 ret=_mm_setzero_ps();
    float *fp = (float *)&ret;
    Grid_half *hp = (Grid_half *)&h;
    fp[0] = sfw_half_to_float(hp[0]);
    fp[1] = sfw_half_to_float(hp[1]);
    fp[2] = sfw_half_to_float(hp[2]);
    fp[3] = sfw_half_to_float(hp[3]);
    return ret;
  }
#else
#define Grid_mm_cvtps_ph _mm_cvtps_ph
#define Grid_mm_cvtph_ps _mm_cvtph_ps
#endif
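  // Editorial note (hedged; whether SFW_FP16 is defined is a build-time
  // choice): with SFW_FP16 set, the scalar converters above stand in for the
  // F16C intrinsics under the same names, so callers cannot tell the paths
  // apart:
  //
  //   __m128  s  = _mm_set_ps(3.f, 2.f, 1.f, 0.f);
  //   __m128i h  = Grid_mm_cvtps_ph(s, 0); // four halves in the low 64 bits
  //   __m128  s2 = Grid_mm_cvtph_ps(h, 0); // small integers round-trip exactly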
  struct PrecisionChange {
    static inline __m128i StoH (__m128 a,__m128 b) {
      __m128i ha = Grid_mm_cvtps_ph(a,0);
      __m128i hb = Grid_mm_cvtps_ph(b,0);
      __m128i h  =(__m128i) _mm_shuffle_ps((__m128)ha,(__m128)hb,_MM_SELECT_FOUR_FOUR(1,0,1,0));
      return h;
    }
    static inline void HtoS (__m128i h,__m128 &sa,__m128 &sb) {
      sa = Grid_mm_cvtph_ps(h,0);
      h  = (__m128i)_my_alignr_epi32((__m128i)h,(__m128i)h,2);
      sb = Grid_mm_cvtph_ps(h,0);
    }
    static inline __m128 DtoS (__m128d a,__m128d b) {
      __m128 sa = _mm_cvtpd_ps(a);
      __m128 sb = _mm_cvtpd_ps(b);
      __m128 s  = _mm_shuffle_ps(sa,sb,_MM_SELECT_FOUR_FOUR(1,0,1,0));
      return s;
    }
    static inline void StoD (__m128 s,__m128d &a,__m128d &b) {
      a = _mm_cvtps_pd(s);
      s = (__m128)_my_alignr_epi32((__m128i)s,(__m128i)s,2);
      b = _mm_cvtps_pd(s);
    }
    static inline __m128i DtoH (__m128d a,__m128d b,__m128d c,__m128d d) {
      __m128 sa,sb;
      sa = DtoS(a,b);
      sb = DtoS(c,d);
      return StoH(sa,sb);
    }
    static inline void HtoD (__m128i h,__m128d &a,__m128d &b,__m128d &c,__m128d &d) {
      __m128 sa,sb;
      HtoS(h,sa,sb);
      StoD(sa,a,b);
      StoD(sb,c,d);
    }
  };

  struct Exchange{
    // 3210 ordering
    static inline void Exchange0(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){
      out1= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0));
      out2= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2));
    };
    static inline void Exchange1(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){
      out1= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0));   /*ACEG*/
      out2= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1));   /*BDFH*/
      out1= _mm_shuffle_ps(out1,out1,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/
      out2= _mm_shuffle_ps(out2,out2,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*BFDH*/
    };
    static inline void Exchange2(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){
      assert(0);
      return;
    };
    static inline void Exchange3(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){
      assert(0);
      return;
    };

    static inline void Exchange0(__m128d &out1,__m128d &out2,__m128d in1,__m128d in2){
      out1= _mm_shuffle_pd(in1,in2,0x0);
      out2= _mm_shuffle_pd(in1,in2,0x3);
    };
    static inline void Exchange1(__m128d &out1,__m128d &out2,__m128d in1,__m128d in2){
      assert(0);
      return;
    };
    static inline void Exchange2(__m128d &out1,__m128d &out2,__m128d in1,__m128d in2){
      assert(0);
      return;
    };
    static inline void Exchange3(__m128d &out1,__m128d &out2,__m128d in1,__m128d in2){
      assert(0);
      return;
    };
  };

  struct Rotate{

    static inline __m128 rotate(__m128 in,int n){
      switch(n){
      case 0: return tRotate<0>(in);break;
      case 1: return tRotate<1>(in);break;
      case 2: return tRotate<2>(in);break;
      case 3: return tRotate<3>(in);break;
      default: assert(0);
      }
    }
    static inline __m128d rotate(__m128d in,int n){
      switch(n){
      case 0: return tRotate<0>(in);break;
      case 1: return tRotate<1>(in);break;
      default: assert(0);
      }
    }

    template<int n> static inline __m128  tRotate(__m128  in){ return (__m128)_my_alignr_epi32((__m128i)in,(__m128i)in,n); };
    template<int n> static inline __m128d tRotate(__m128d in){ return (__m128d)_my_alignr_epi64((__m128i)in,(__m128i)in,n); };

  };
  //////////////////////////////////////////////
  // Some Template specialization

  //Complex float Reduce
  template<>
  inline Grid::ComplexF Reduce<Grid::ComplexF, __m128>::operator()(__m128 in){
    __m128 v1; // two complex
    v1= Optimization::Permute::Permute0(in);
    v1= _mm_add_ps(v1,in);
    u128f conv; conv.v=v1;
    return Grid::ComplexF(conv.f[0],conv.f[1]);
  }
  //Real float Reduce
  template<>
  inline Grid::RealF Reduce<Grid::RealF, __m128>::operator()(__m128 in){
    __m128 v1,v2; // quad single
    v1= Optimization::Permute::Permute0(in);
    v1= _mm_add_ps(v1,in);
    v2= Optimization::Permute::Permute1(v1);
    v1 = _mm_add_ps(v1,v2);
    u128f conv; conv.v=v1;
    return conv.f[0];
  }

  //Complex double Reduce
  template<>
  inline Grid::ComplexD Reduce<Grid::ComplexD, __m128d>::operator()(__m128d in){
    u128d conv; conv.v = in;
    return Grid::ComplexD(conv.f[0],conv.f[1]);
  }

  //Real double Reduce
  template<>
  inline Grid::RealD Reduce<Grid::RealD, __m128d>::operator()(__m128d in){
    __m128d v1;
    v1 = Optimization::Permute::Permute0(in);
    v1 = _mm_add_pd(v1,in);
    u128d conv; conv.v = v1;
    return conv.f[0];
  }

  //Integer Reduce
  template<>
  inline Integer Reduce<Integer, __m128i>::operator()(__m128i in){
    __m128i v1 = _mm_hadd_epi32(in, in);
    __m128i v2 = _mm_hadd_epi32(v1, v1);
    return _mm_cvtsi128_si32(v2);
  }
}

//////////////////////////////////////////////////////////////////////////////////////
// Here assign types

typedef __m128i SIMD_Htype; // Half precision type
typedef __m128  SIMD_Ftype; // Single precision type
typedef __m128d SIMD_Dtype; // Double precision type
typedef __m128i SIMD_Itype; // Integer type

// prefetch utilities
inline void v_prefetch0(int size, const char *ptr){};
inline void prefetch_HINT_T0(const char *ptr){
  _mm_prefetch(ptr,_MM_HINT_T0);
}

// Function name aliases
typedef Optimization::Vsplat   VsplatSIMD;
typedef Optimization::Vstore   VstoreSIMD;
typedef Optimization::Vset     VsetSIMD;
typedef Optimization::Vstream  VstreamSIMD;
template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;

// Arithmetic operations
typedef Optimization::Sum          SumSIMD;
typedef Optimization::Sub          SubSIMD;
typedef Optimization::Div          DivSIMD;
typedef Optimization::Mult         MultSIMD;
typedef Optimization::MultComplex  MultComplexSIMD;
typedef Optimization::MultRealPart MultRealPartSIMD;
typedef Optimization::MaddRealPart MaddRealPartSIMD;
typedef Optimization::Conj         ConjSIMD;
typedef Optimization::TimesMinusI  TimesMinusISIMD;
typedef Optimization::TimesI       TimesISIMD;

}
Grid/simd/Grid_vector_types.h
Normal file
@ -0,0 +1,868 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/simd/Grid_vector_types.h

    Copyright (C) 2015

    Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
    Author: Guido Cossu <cossu@iroiro-pc.kek.jp>
    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    Author: neo <cossu@post.kek.jp>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution
    directory
*************************************************************************************/
/* END LEGAL */
//---------------------------------------------------------------------------
/*! @file Grid_vector_types.h
  @brief Defines templated class Grid_simd to deal with inner vector types
*/
// Time-stamp: <2015-07-10 17:45:33 neo>
//---------------------------------------------------------------------------
#ifndef GRID_VECTOR_TYPES
#define GRID_VECTOR_TYPES

#ifdef GEN
#include "Grid_generic.h"
#endif
#ifdef SSE4
#include "Grid_sse4.h"
#endif
#if defined(AVX1) || defined (AVXFMA) || defined(AVX2) || defined(AVXFMA4)
#include "Grid_avx.h"
#endif
#if defined AVX512
#include "Grid_avx512.h"
#endif
#if defined IMCI
#include "Grid_imci.h"
#endif
#ifdef NEONV8
#include "Grid_neon.h"
#endif
#if defined QPX
#include "Grid_qpx.h"
#endif

#include "l1p.h"

namespace Grid {

//////////////////////////////////////
// To take the floating point type of real/complex type
//////////////////////////////////////
template <typename T>
struct RealPart {
  typedef T type;
};
template <typename T>
struct RealPart<std::complex<T> > {
  typedef T type;
};

#include <type_traits>

//////////////////////////////////////
// demote a vector to real type
//////////////////////////////////////
// type alias used to simplify the syntax of std::enable_if
template <typename T> using Invoke = typename T::type;
template <typename Condition, typename ReturnType> using EnableIf    = Invoke<std::enable_if<Condition::value, ReturnType> >;
template <typename Condition, typename ReturnType> using NotEnableIf = Invoke<std::enable_if<!Condition::value, ReturnType> >;

////////////////////////////////////////////////////////
// Check for complexity with type traits
template <typename T> struct is_complex : public std::false_type {};
template <> struct is_complex<std::complex<double> > : public std::true_type {};
template <> struct is_complex<std::complex<float> >  : public std::true_type {};

template <typename T> using IfReal    = Invoke<std::enable_if<std::is_floating_point<T>::value, int> >;
template <typename T> using IfComplex = Invoke<std::enable_if<is_complex<T>::value, int> >;
template <typename T> using IfInteger = Invoke<std::enable_if<std::is_integral<T>::value, int> >;
template <typename T1,typename T2> using IfSame = Invoke<std::enable_if<std::is_same<T1,T2>::value, int> >;

template <typename T> using IfNotReal    = Invoke<std::enable_if<!std::is_floating_point<T>::value, int> >;
template <typename T> using IfNotComplex = Invoke<std::enable_if<!is_complex<T>::value, int> >;
template <typename T> using IfNotInteger = Invoke<std::enable_if<!std::is_integral<T>::value, int> >;
template <typename T1,typename T2> using IfNotSame = Invoke<std::enable_if<!std::is_same<T1,T2>::value, int> >;
|
||||
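// Minimal sketch of how these guards steer overload resolution below
// (hypothetical function, not part of the Grid API):
//
//   template <typename T, IfComplex<T> = 0> int kind(T) { return 1; }
//   template <typename T, IfReal<T>    = 0> int kind(T) { return 0; }
//
//   kind(std::complex<double>(1, 0));  // selects the complex overload -> 1
//   kind(3.14);                        // selects the real overload    -> 0
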
////////////////////////////////////////////////////////
// Define the operation templates functors
// general forms to allow for vsplat syntax
// need explicit declaration of types when used since
// clang cannot automatically determine the output type sometimes
template <class Out, class Input1, class Input2, class Input3, class Operation>
Out trinary(Input1 src_1, Input2 src_2, Input3 src_3, Operation op) {
  return op(src_1, src_2, src_3);
}

template <class Out, class Input1, class Input2, class Operation>
Out binary(Input1 src_1, Input2 src_2, Operation op) {
  return op(src_1, src_2);
}

template <class Out, class Input, class Operation>
Out unary(Input src, Operation op) {
  return op(src);
}
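// Dispatch sketch, assuming an SSE4 build where the SIMD functors resolve to
// the Optimization:: structs from Grid_sse4.h; the output type must be named
// explicitly, e.g.
//
//   __m128 c = binary<__m128>(a, b, SumSIMD());               // c[i] = a[i] + b[i]
//   __m128 d = trinary<__m128>(a, b, c, MaddRealPartSIMD());  // fused madd variant
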
///////////////////////////////////////////////

/*
  @brief Grid_simd class for the SIMD vector type operations
*/
template <class Scalar_type, class Vector_type>
class Grid_simd {
 public:
  typedef typename RealPart<Scalar_type>::type Real;
  typedef Vector_type vector_type;
  typedef Scalar_type scalar_type;

  typedef union conv_t_union {
    Vector_type v;
    Scalar_type s[sizeof(Vector_type) / sizeof(Scalar_type)];
    conv_t_union(){};
  } conv_t;

  Vector_type v;

  static inline constexpr int Nsimd(void) {
    return sizeof(Vector_type) / sizeof(Scalar_type);
  }
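  // e.g. in an AVX single-precision build: sizeof(__m256)/sizeof(float) == 8
  // lanes, and conv_t above views those lanes as an addressable scalar array.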
  Grid_simd &operator=(const Grid_simd &&rhs) {
    v = rhs.v;
    return *this;
  };
  Grid_simd &operator=(const Grid_simd &rhs) {
    v = rhs.v;
    return *this;
  };  // faster than not declaring it and leaving to the compiler
  Grid_simd() = default;
  Grid_simd(const Grid_simd &rhs) : v(rhs.v){};  // compiles in movaps
  Grid_simd(const Grid_simd &&rhs) : v(rhs.v){};

  /////////////////////////////
  // Constructors
  /////////////////////////////
  Grid_simd &operator=(Zero &z) {
    vzero(*this);
    return (*this);
  }

  // Enable if complex type
  template <typename S = Scalar_type>
  Grid_simd(const typename std::enable_if<is_complex<S>::value, S>::type a) {
    vsplat(*this, a);
  };

  Grid_simd(const Real a) { vsplat(*this, Scalar_type(a)); };

  ///////////////////////////////////////////////
  // mac, mult, sub, add, adj
  ///////////////////////////////////////////////

  // FIXME -- alias this to an inline MAC struct.
  friend inline void mac(Grid_simd *__restrict__ y,
                         const Grid_simd *__restrict__ a,
                         const Grid_simd *__restrict__ x) {
    *y = (*a) * (*x) + (*y);
  };

  friend inline void mult(Grid_simd *__restrict__ y,
                          const Grid_simd *__restrict__ l,
                          const Grid_simd *__restrict__ r) {
    *y = (*l) * (*r);
  }

  friend inline void sub(Grid_simd *__restrict__ y,
                         const Grid_simd *__restrict__ l,
                         const Grid_simd *__restrict__ r) {
    *y = (*l) - (*r);
  }
  friend inline void add(Grid_simd *__restrict__ y,
                         const Grid_simd *__restrict__ l,
                         const Grid_simd *__restrict__ r) {
    *y = (*l) + (*r);
  }
  friend inline void mac(Grid_simd *__restrict__ y,
                         const Scalar_type *__restrict__ a,
                         const Grid_simd *__restrict__ x) {
    *y = (*a) * (*x) + (*y);
  };
  friend inline void mult(Grid_simd *__restrict__ y,
                          const Scalar_type *__restrict__ l,
                          const Grid_simd *__restrict__ r) {
    *y = (*l) * (*r);
  }
  friend inline void sub(Grid_simd *__restrict__ y,
                         const Scalar_type *__restrict__ l,
                         const Grid_simd *__restrict__ r) {
    *y = (*l) - (*r);
  }
  friend inline void add(Grid_simd *__restrict__ y,
                         const Scalar_type *__restrict__ l,
                         const Grid_simd *__restrict__ r) {
    *y = (*l) + (*r);
  }

  friend inline void mac(Grid_simd *__restrict__ y,
                         const Grid_simd *__restrict__ a,
                         const Scalar_type *__restrict__ x) {
    *y = (*a) * (*x) + (*y);
  };
  friend inline void mult(Grid_simd *__restrict__ y,
                          const Grid_simd *__restrict__ l,
                          const Scalar_type *__restrict__ r) {
    *y = (*l) * (*r);
  }
  friend inline void sub(Grid_simd *__restrict__ y,
                         const Grid_simd *__restrict__ l,
                         const Scalar_type *__restrict__ r) {
    *y = (*l) - (*r);
  }
  friend inline void add(Grid_simd *__restrict__ y,
                         const Grid_simd *__restrict__ l,
                         const Scalar_type *__restrict__ r) {
    *y = (*l) + (*r);
  }

  ////////////////////////////////////////////////////////////////////////
  // FIXME: gonna remove these load/store, get, set, prefetch
  ////////////////////////////////////////////////////////////////////////
  friend inline void vset(Grid_simd &ret, Scalar_type *a) {
    ret.v = unary<Vector_type>(a, VsetSIMD());
  }

  ///////////////////////
  // Vstore
  ///////////////////////
  friend inline void vstore(const Grid_simd &ret, Scalar_type *a) {
    binary<void>(ret.v, (Real *)a, VstoreSIMD());
  }

  ///////////////////////
  // Vprefetch
  ///////////////////////
  friend inline void vprefetch(const Grid_simd &v) {
    prefetch_HINT_T0((const char *)&v.v);
  }

  ///////////////////////
  // Reduce
  ///////////////////////
  friend inline Scalar_type Reduce(const Grid_simd &in) {
    return unary<Scalar_type>(in.v, ReduceSIMD<Scalar_type, Vector_type>());
  }
  ////////////////////////////
  // operator scalar * simd
  ////////////////////////////
  friend inline Grid_simd operator*(const Scalar_type &a, Grid_simd b) {
    Grid_simd va;
    vsplat(va, a);
    return va * b;
  }
  friend inline Grid_simd operator*(Grid_simd b, const Scalar_type &a) {
    return a * b;
  }

  //////////////////////////////////
  // Divides
  //////////////////////////////////
  friend inline Grid_simd operator/(const Scalar_type &a, Grid_simd b) {
    Grid_simd va;
    vsplat(va, a);
    return va / b;
  }
  friend inline Grid_simd operator/(Grid_simd b, const Scalar_type &a) {
    Grid_simd va;
    vsplat(va, a);
    return b / va;  // divide by the splatted vector, not the scalar, to avoid self-recursion
  }

  ///////////////////////
  // Unary negation
  ///////////////////////
  friend inline Grid_simd operator-(const Grid_simd &r) {
    Grid_simd ret;
    vzero(ret);
    ret = ret - r;
    return ret;
  }
  // *=,+=,-= operators
  inline Grid_simd &operator*=(const Grid_simd &r) {
    *this = (*this) * r;
    return *this;
    // return (*this)*r; ?
  }
  inline Grid_simd &operator+=(const Grid_simd &r) {
    *this = *this + r;
    return *this;
  }
  inline Grid_simd &operator-=(const Grid_simd &r) {
    *this = *this - r;
    return *this;
  }

  ///////////////////////////////////////
  // Not all functions are supported
  // through SIMD and must breakout to
  // scalar type and back again. This
  // provides support
  ///////////////////////////////////////
  template <class functor>
  friend inline Grid_simd SimdApply(const functor &func, const Grid_simd &v) {
    Grid_simd ret;
    Grid_simd::conv_t conv;
    Grid_simd::scalar_type s;

    conv.v = v.v;
    for (int i = 0; i < Nsimd(); i++) {
      s = conv.s[i];
      conv.s[i] = func(s);
    }
    ret.v = conv.v;
    return ret;
  }
  template <class functor>
  friend inline Grid_simd SimdApplyBinop(const functor &func,
                                         const Grid_simd &x,
                                         const Grid_simd &y) {
    Grid_simd ret;
    Grid_simd::conv_t cx;
    Grid_simd::conv_t cy;
    Grid_simd::scalar_type sx, sy;

    cx.v = x.v;
    cy.v = y.v;
    for (int i = 0; i < Nsimd(); i++) {
      sx = cx.s[i];
      sy = cy.s[i];
      cx.s[i] = func(sx, sy);
    }
    ret.v = cx.v;
    return ret;
  }
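  // Breakout sketch: sqrt(v) in Grid_vector_unops.h is implemented as
  //   SimdApply(SqrtRealFunctor<S>(), v);
  // i.e. copy into conv_t, apply the scalar functor lane by lane, copy back.
  // Correct everywhere, but scalar speed; use only where no SIMD op exists.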
  ///////////////////////
  // Exchange
  // (Al Ah),(Bl Bh) -> (Al Bl),(Ah Bh)
  ///////////////////////
  friend inline void exchange(Grid_simd &out1, Grid_simd &out2, Grid_simd in1, Grid_simd in2, int n)
  {
    if (n==3) {
      Optimization::Exchange::Exchange3(out1.v,out2.v,in1.v,in2.v);
    } else if(n==2) {
      Optimization::Exchange::Exchange2(out1.v,out2.v,in1.v,in2.v);
    } else if(n==1) {
      Optimization::Exchange::Exchange1(out1.v,out2.v,in1.v,in2.v);
    } else if(n==0) {
      Optimization::Exchange::Exchange0(out1.v,out2.v,in1.v,in2.v);
    }
  }
  friend inline void exchange0(Grid_simd &out1, Grid_simd &out2, Grid_simd in1, Grid_simd in2){
    Optimization::Exchange::Exchange0(out1.v,out2.v,in1.v,in2.v);
  }
  friend inline void exchange1(Grid_simd &out1, Grid_simd &out2, Grid_simd in1, Grid_simd in2){
    Optimization::Exchange::Exchange1(out1.v,out2.v,in1.v,in2.v);
  }
  friend inline void exchange2(Grid_simd &out1, Grid_simd &out2, Grid_simd in1, Grid_simd in2){
    Optimization::Exchange::Exchange2(out1.v,out2.v,in1.v,in2.v);
  }
  friend inline void exchange3(Grid_simd &out1, Grid_simd &out2, Grid_simd in1, Grid_simd in2){
    Optimization::Exchange::Exchange3(out1.v,out2.v,in1.v,in2.v);
  }
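  // Concrete illustration (a sketch at half-vector granularity on a 4-lane
  // type): in1 = (a0 a1 a2 a3), in2 = (b0 b1 b2 b3)
  //     -> out1 = (a0 a1 b0 b1), out2 = (a2 a3 b2 b3)
  // The level n selects the block size at which the low/high halves are taken.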
  ////////////////////////////////////////////////////////////////////
  // General permute; assumes vector length is same across
  // all subtypes; may not be a good assumption, but could
  // add the vector width as a template param for BG/Q for example
  ////////////////////////////////////////////////////////////////////
  friend inline void permute0(Grid_simd &y, Grid_simd b) {
    y.v = Optimization::Permute::Permute0(b.v);
  }
  friend inline void permute1(Grid_simd &y, Grid_simd b) {
    y.v = Optimization::Permute::Permute1(b.v);
  }
  friend inline void permute2(Grid_simd &y, Grid_simd b) {
    y.v = Optimization::Permute::Permute2(b.v);
  }
  friend inline void permute3(Grid_simd &y, Grid_simd b) {
    y.v = Optimization::Permute::Permute3(b.v);
  }
  friend inline void permute(Grid_simd &y, Grid_simd b, int perm) {
    if (perm & RotateBit) {
      int dist = perm & 0xF;
      y = rotate(b, dist);
      return;
    }
    else if(perm==3) permute3(y, b);
    else if(perm==2) permute2(y, b);
    else if(perm==1) permute1(y, b);
    else if(perm==0) permute0(y, b);
  }
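  // perm word sketch: if RotateBit is set the low nibble is a rotation
  // distance handled by rotate(); otherwise perm in 0..3 picks one of the
  // fixed permutes above, each swapping lanes at a different granularity.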
  ///////////////////////////////
  // Getting single lanes
  ///////////////////////////////
  inline Scalar_type getlane(int lane) {
    return ((Scalar_type*)&v)[lane];
  }

  inline void putlane(const Scalar_type &S, int lane){
    ((Scalar_type*)&v)[lane] = S;
  }

};  // end of Grid_simd class definition

inline void permute(ComplexD &y, ComplexD b, int perm) { y = b; }
inline void permute(ComplexF &y, ComplexF b, int perm) { y = b; }
inline void permute(RealD &y, RealD b, int perm) { y = b; }
inline void permute(RealF &y, RealF b, int perm) { y = b; }

////////////////////////////////////////////////////////////////////
// General rotate
////////////////////////////////////////////////////////////////////
template <class S, class V, IfNotComplex<S> = 0>
inline Grid_simd<S, V> rotate(Grid_simd<S, V> b, int nrot) {
  nrot = nrot % Grid_simd<S, V>::Nsimd();
  Grid_simd<S, V> ret;
  ret.v = Optimization::Rotate::rotate(b.v, nrot);
  return ret;
}
template <class S, class V, IfComplex<S> = 0>
inline Grid_simd<S, V> rotate(Grid_simd<S, V> b, int nrot) {
  nrot = nrot % Grid_simd<S, V>::Nsimd();
  Grid_simd<S, V> ret;
  ret.v = Optimization::Rotate::rotate(b.v, 2 * nrot);
  return ret;
}
template <class S, class V, IfNotComplex<S> = 0>
inline void rotate(Grid_simd<S, V> &ret, Grid_simd<S, V> b, int nrot)
{
  nrot = nrot % Grid_simd<S, V>::Nsimd();
  ret.v = Optimization::Rotate::rotate(b.v, nrot);
}
template <class S, class V, IfComplex<S> = 0>
inline void rotate(Grid_simd<S, V> &ret, Grid_simd<S, V> b, int nrot)
{
  nrot = nrot % Grid_simd<S, V>::Nsimd();
  ret.v = Optimization::Rotate::rotate(b.v, 2 * nrot);
}
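// Usage sketch (hypothetical helper, not part of Grid): for complex scalars
// nrot counts complex lanes; the overload above doubles it before rotating
// the underlying reals, so a complex pair always moves as one unit.
template <class S, class V>
inline Grid_simd<S, V> rotate_by_one(Grid_simd<S, V> b) {
  return rotate(b, 1);  // lane i of the result = lane (i+1) mod Nsimd of b
}
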
template <class S, class V>
inline void vbroadcast(Grid_simd<S, V> &ret, const Grid_simd<S, V> &src, int lane){
  S* typepun = (S*) &src;
  vsplat(ret, typepun[lane]);
}
template <class S, class V, IfComplex<S> = 0>
inline void rbroadcast(Grid_simd<S, V> &ret, const Grid_simd<S, V> &src, int lane){
  S* typepun = (S*) &src;
  ret.v = unary<V>(real(typepun[lane]), VsplatSIMD());
}

///////////////////////
// Splat
///////////////////////

// this is only for the complex version
template <class S, class V, IfComplex<S> = 0, class ABtype>
inline void vsplat(Grid_simd<S, V> &ret, ABtype a, ABtype b) {
  ret.v = binary<V>(a, b, VsplatSIMD());
}

// overload if complex
template <class S, class V>
inline void vsplat(Grid_simd<S, V> &ret, EnableIf<is_complex<S>, S> c) {
  vsplat(ret, real(c), imag(c));
}
template <class S, class V>
inline void rsplat(Grid_simd<S, V> &ret, EnableIf<is_complex<S>, S> c) {
  vsplat(ret, real(c), real(c));
}

// if real fill with a, if complex fill with a in the real part (first function
// above)
template <class S, class V>
inline void vsplat(Grid_simd<S, V> &ret, NotEnableIf<is_complex<S>, S> a) {
  ret.v = unary<V>(a, VsplatSIMD());
}
//////////////////////////
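// Usage sketch (hypothetical helper): splatting a complex scalar fills every
// complex lane with the same (re,im) pair; e.g. c=(1,2) on a two-complex-lane
// vector gives real lanes (1,2,1,2).
template <class S, class V, IfComplex<S> = 0>
inline Grid_simd<S, V> splat_example(const S &c) {
  Grid_simd<S, V> ret;
  vsplat(ret, c);
  return ret;
}
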
///////////////////////////////////////////////
// Initialise to 1,0,i for the correct types
///////////////////////////////////////////////
// For complex types
template <class S, class V, IfComplex<S> = 0>
inline void vone(Grid_simd<S, V> &ret) {
  vsplat(ret, S(1.0, 0.0));
}
template <class S, class V, IfComplex<S> = 0>
inline void vzero(Grid_simd<S, V> &ret) {
  vsplat(ret, S(0.0, 0.0));
}  // use xor?
template <class S, class V, IfComplex<S> = 0>
inline void vcomplex_i(Grid_simd<S, V> &ret) {
  vsplat(ret, S(0.0, 1.0));
}

template <class S, class V, IfComplex<S> = 0>
inline void visign(Grid_simd<S, V> &ret) {
  vsplat(ret, S(1.0, -1.0));
}
template <class S, class V, IfComplex<S> = 0>
inline void vrsign(Grid_simd<S, V> &ret) {
  vsplat(ret, S(-1.0, 1.0));
}

// if not complex overload here
template <class S, class V, IfReal<S> = 0>
inline void vone(Grid_simd<S, V> &ret) {
  vsplat(ret, S(1.0));
}
template <class S, class V, IfReal<S> = 0>
inline void vzero(Grid_simd<S, V> &ret) {
  vsplat(ret, S(0.0));
}

// For integral types
template <class S, class V, IfInteger<S> = 0>
inline void vone(Grid_simd<S, V> &ret) {
  vsplat(ret, 1);
}
template <class S, class V, IfInteger<S> = 0>
inline void vzero(Grid_simd<S, V> &ret) {
  vsplat(ret, 0);
}
template <class S, class V, IfInteger<S> = 0>
inline void vtrue(Grid_simd<S, V> &ret) {
  vsplat(ret, 0xFFFFFFFF);
}
template <class S, class V, IfInteger<S> = 0>
inline void vfalse(Grid_simd<S, V> &ret) {
  vsplat(ret, 0);
}
template <class S, class V>
inline void zeroit(Grid_simd<S, V> &z) {
  vzero(z);
}
///////////////////////
// Vstream
///////////////////////
template <class S, class V, IfReal<S> = 0>
inline void vstream(Grid_simd<S, V> &out, const Grid_simd<S, V> &in) {
  binary<void>((S *)&out.v, in.v, VstreamSIMD());
}
template <class S, class V, IfComplex<S> = 0>
inline void vstream(Grid_simd<S, V> &out, const Grid_simd<S, V> &in) {
  typedef typename S::value_type T;
  binary<void>((T *)&out.v, in.v, VstreamSIMD());
}
template <class S, class V, IfInteger<S> = 0>
inline void vstream(Grid_simd<S, V> &out, const Grid_simd<S, V> &in) {
  out = in;
}

////////////////////////////////////
// Arithmetic operator overloads +,-,*
////////////////////////////////////
template <class S, class V>
inline Grid_simd<S, V> operator+(Grid_simd<S, V> a, Grid_simd<S, V> b) {
  Grid_simd<S, V> ret;
  ret.v = binary<V>(a.v, b.v, SumSIMD());
  return ret;
};

template <class S, class V>
inline Grid_simd<S, V> operator-(Grid_simd<S, V> a, Grid_simd<S, V> b) {
  Grid_simd<S, V> ret;
  ret.v = binary<V>(a.v, b.v, SubSIMD());
  return ret;
};
// Distinguish between complex types and others
template <class S, class V, IfComplex<S> = 0>
inline Grid_simd<S, V> real_mult(Grid_simd<S, V> a, Grid_simd<S, V> b) {
  Grid_simd<S, V> ret;
  ret.v = binary<V>(a.v, b.v, MultRealPartSIMD());
  return ret;
};
template <class S, class V, IfComplex<S> = 0>
inline Grid_simd<S, V> real_madd(Grid_simd<S, V> a, Grid_simd<S, V> b, Grid_simd<S, V> c) {
  Grid_simd<S, V> ret;
  ret.v = trinary<V>(a.v, b.v, c.v, MaddRealPartSIMD());
  return ret;
};

// Distinguish between complex types and others
template <class S, class V, IfComplex<S> = 0>
inline Grid_simd<S, V> operator*(Grid_simd<S, V> a, Grid_simd<S, V> b) {
  Grid_simd<S, V> ret;
  ret.v = binary<V>(a.v, b.v, MultComplexSIMD());
  return ret;
};

// Real/Integer types
template <class S, class V, IfNotComplex<S> = 0>
inline Grid_simd<S, V> operator*(Grid_simd<S, V> a, Grid_simd<S, V> b) {
  Grid_simd<S, V> ret;
  ret.v = binary<V>(a.v, b.v, MultSIMD());
  return ret;
};

// Distinguish between complex types and others
template <class S, class V, IfComplex<S> = 0>
inline Grid_simd<S, V> operator/(Grid_simd<S, V> a, Grid_simd<S, V> b) {
  typedef Grid_simd<S, V> simd;

  simd ret;
  simd den;

  ret = a * conjugate(b);
  den = b * conjugate(b);

  auto real_den = toReal(den);

  ret.v = binary<V>(ret.v, real_den.v, DivSIMD());

  return ret;
};
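// Sketch of the identity used above: a/b = a*conj(b) / (b*conj(b)), where
// b*conj(b) = |b|^2 is purely real, so the final step is a lane-wise real
// divide. e.g. (1+2i)/(3+4i) = (1+2i)(3-4i)/25 = (11+2i)/25.
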
// Real/Integer types
template <class S, class V, IfNotComplex<S> = 0>
inline Grid_simd<S, V> operator/(Grid_simd<S, V> a, Grid_simd<S, V> b) {
  Grid_simd<S, V> ret;
  ret.v = binary<V>(a.v, b.v, DivSIMD());
  return ret;
};

///////////////////////
// Conjugate
///////////////////////
template <class S, class V, IfComplex<S> = 0>
inline Grid_simd<S, V> conjugate(const Grid_simd<S, V> &in) {
  Grid_simd<S, V> ret;
  ret.v = unary<V>(in.v, ConjSIMD());
  return ret;
}
template <class S, class V, IfNotComplex<S> = 0>
inline Grid_simd<S, V> conjugate(const Grid_simd<S, V> &in) {
  return in;  // for real objects
}
// Suppress adj for integer types... // odd; why conjugate above but not adj??
template <class S, class V, IfNotInteger<S> = 0>
inline Grid_simd<S, V> adj(const Grid_simd<S, V> &in) {
  return conjugate(in);
}

///////////////////////
// timesMinusI
///////////////////////
template <class S, class V, IfComplex<S> = 0>
inline void timesMinusI(Grid_simd<S, V> &ret, const Grid_simd<S, V> &in) {
  ret.v = binary<V>(in.v, ret.v, TimesMinusISIMD());
}
template <class S, class V, IfComplex<S> = 0>
inline Grid_simd<S, V> timesMinusI(const Grid_simd<S, V> &in) {
  Grid_simd<S, V> ret;
  timesMinusI(ret, in);
  return ret;
}
template <class S, class V, IfNotComplex<S> = 0>
inline Grid_simd<S, V> timesMinusI(const Grid_simd<S, V> &in) {
  return in;
}

///////////////////////
// timesI
///////////////////////
template <class S, class V, IfComplex<S> = 0>
inline void timesI(Grid_simd<S, V> &ret, const Grid_simd<S, V> &in) {
  ret.v = binary<V>(in.v, ret.v, TimesISIMD());
}
template <class S, class V, IfComplex<S> = 0>
inline Grid_simd<S, V> timesI(const Grid_simd<S, V> &in) {
  Grid_simd<S, V> ret;
  timesI(ret, in);
  return ret;
}
template <class S, class V, IfNotComplex<S> = 0>
inline Grid_simd<S, V> timesI(const Grid_simd<S, V> &in) {
  return in;
}

/////////////////////
// Inner, outer
/////////////////////

template <class S, class V>
inline Grid_simd<S, V> innerProduct(const Grid_simd<S, V> &l,
                                    const Grid_simd<S, V> &r) {
  return conjugate(l) * r;
}
template <class S, class V>
inline Grid_simd<S, V> outerProduct(const Grid_simd<S, V> &l,
                                    const Grid_simd<S, V> &r) {
  return l * conjugate(r);
}

template <class S, class V>
inline Grid_simd<S, V> trace(const Grid_simd<S, V> &arg) {
  return arg;
}

////////////////////////////////////////////////////////////
// copy/splat complex real parts into real;
// insert real into complex and zero imag;
////////////////////////////////////////////////////////////

// real = toReal( complex )
template <class S, class V, IfReal<S> = 0>
inline Grid_simd<S, V> toReal(const Grid_simd<std::complex<S>, V> &in) {
  typedef Grid_simd<S, V> simd;
  simd ret;
  typename simd::conv_t conv;
  conv.v = in.v;  // copy the vector content (bytewise)
  for (int i = 0; i < simd::Nsimd(); i += 2) {
    conv.s[i + 1] = conv.s[i];  // duplicate (r,r);(r,r);(r,r); etc...
  }
  ret.v = conv.v;
  return ret;
}

// complex = toComplex( real )
template <class R, class V, IfReal<R> = 0>  // must be a real arg
inline Grid_simd<std::complex<R>, V> toComplex(const Grid_simd<R, V> &in) {
  typedef Grid_simd<R, V> Rsimd;
  typedef Grid_simd<std::complex<R>, V> Csimd;
  typename Rsimd::conv_t conv;  // address as real

  conv.v = in.v;
  for (int i = 0; i < Rsimd::Nsimd(); i += 2) {
    assert(conv.s[i + 1] == conv.s[i]);
    // trap any cases where real was not duplicated
    // indicating the SIMD grids of real and imag assignment did not correctly
    // match
    conv.s[i + 1] = 0.0;  // zero imaginary parts
  }
  Csimd ret;
  ret.v = conv.v;
  return ret;
}
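// Round-trip sketch (hypothetical helper): toComplex(toReal(c)) keeps the
// real parts and zeroes the imaginaries, since toReal duplicates each real
// part into the adjacent imaginary slot and toComplex asserts on, then
// clears, that duplicate.
template <class S, class V, IfReal<S> = 0>
inline Grid_simd<std::complex<S>, V> real_part_only(const Grid_simd<std::complex<S>, V> &c) {
  return toComplex(toReal(c));
}
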
///////////////////////////////
// Define available types
///////////////////////////////
typedef Grid_simd<float, SIMD_Ftype> vRealF;
typedef Grid_simd<double, SIMD_Dtype> vRealD;
typedef Grid_simd<std::complex<float>, SIMD_Ftype> vComplexF;
typedef Grid_simd<std::complex<double>, SIMD_Dtype> vComplexD;
typedef Grid_simd<Integer, SIMD_Itype> vInteger;

// Half precision; no arithmetic support
typedef Grid_simd<uint16_t, SIMD_Htype> vRealH;
typedef Grid_simd<std::complex<uint16_t>, SIMD_Htype> vComplexH;

inline void precisionChange(vRealF *out, vRealD *in, int nvec)
{
  assert((nvec & 0x1) == 0);
  for(int m=0; m*2<nvec; m++){
    int n = m*2;
    out[m].v = Optimization::PrecisionChange::DtoS(in[n].v, in[n+1].v);
  }
}
inline void precisionChange(vRealH *out, vRealD *in, int nvec)
{
  assert((nvec & 0x3) == 0);
  for(int m=0; m*4<nvec; m++){
    int n = m*4;
    out[m].v = Optimization::PrecisionChange::DtoH(in[n].v, in[n+1].v, in[n+2].v, in[n+3].v);
  }
}
inline void precisionChange(vRealH *out, vRealF *in, int nvec)
{
  assert((nvec & 0x1) == 0);
  for(int m=0; m*2<nvec; m++){
    int n = m*2;
    out[m].v = Optimization::PrecisionChange::StoH(in[n].v, in[n+1].v);
  }
}
inline void precisionChange(vRealD *out, vRealF *in, int nvec)
{
  assert((nvec & 0x1) == 0);
  for(int m=0; m*2<nvec; m++){
    int n = m*2;
    Optimization::PrecisionChange::StoD(in[m].v, out[n].v, out[n+1].v);
  }
}
inline void precisionChange(vRealD *out, vRealH *in, int nvec)
{
  assert((nvec & 0x3) == 0);
  for(int m=0; m*4<nvec; m++){
    int n = m*4;
    Optimization::PrecisionChange::HtoD(in[m].v, out[n].v, out[n+1].v, out[n+2].v, out[n+3].v);
  }
}
inline void precisionChange(vRealF *out, vRealH *in, int nvec)
{
  assert((nvec & 0x1) == 0);
  for(int m=0; m*2<nvec; m++){
    int n = m*2;
    Optimization::PrecisionChange::HtoS(in[m].v, out[n].v, out[n+1].v);
  }
}
inline void precisionChange(vComplexF *out, vComplexD *in, int nvec){ precisionChange((vRealF *)out, (vRealD *)in, nvec); }
inline void precisionChange(vComplexH *out, vComplexD *in, int nvec){ precisionChange((vRealH *)out, (vRealD *)in, nvec); }
inline void precisionChange(vComplexH *out, vComplexF *in, int nvec){ precisionChange((vRealH *)out, (vRealF *)in, nvec); }
inline void precisionChange(vComplexD *out, vComplexF *in, int nvec){ precisionChange((vRealD *)out, (vRealF *)in, nvec); }
inline void precisionChange(vComplexD *out, vComplexH *in, int nvec){ precisionChange((vRealD *)out, (vRealH *)in, nvec); }
inline void precisionChange(vComplexF *out, vComplexH *in, int nvec){ precisionChange((vRealF *)out, (vRealH *)in, nvec); }
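// Usage sketch (hypothetical buffers): packing pairs of double vectors into
// single-precision vectors. In this overload nvec counts the *input* vRealD
// vectors and must be even, since two vRealD fill one vRealF.
inline void halve_precision_example(vRealF *out, vRealD *in, int nvec) {
  precisionChange(out, in, nvec);  // out[m] <- DtoS(in[2m], in[2m+1])
}
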
// Check our vector types are of an appropriate size.
#if defined QPX
static_assert(2*sizeof(SIMD_Ftype) == sizeof(SIMD_Dtype), "SIMD vector lengths incorrect");
static_assert(2*sizeof(SIMD_Ftype) == sizeof(SIMD_Itype), "SIMD vector lengths incorrect");
#else
static_assert(sizeof(SIMD_Ftype) == sizeof(SIMD_Dtype), "SIMD vector lengths incorrect");
static_assert(sizeof(SIMD_Ftype) == sizeof(SIMD_Itype), "SIMD vector lengths incorrect");
#endif

/////////////////////////////////////////
// Some traits to recognise the types
/////////////////////////////////////////
template <typename T>
struct is_simd : public std::false_type {};
template <> struct is_simd<vRealF> : public std::true_type {};
template <> struct is_simd<vRealD> : public std::true_type {};
template <> struct is_simd<vComplexF> : public std::true_type {};
template <> struct is_simd<vComplexD> : public std::true_type {};
template <> struct is_simd<vInteger> : public std::true_type {};

template <typename T> using IfSimd = Invoke<std::enable_if<is_simd<T>::value, int> >;
template <typename T> using IfNotSimd = Invoke<std::enable_if<!is_simd<T>::value, unsigned> >;
}

#endif
223
Grid/simd/Grid_vector_unops.h
Normal file
@@ -0,0 +1,223 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/simd/Grid_vector_unops.h

    Copyright (C) 2015

    Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    Author: neo <cossu@post.kek.jp>
    Author: paboyle <paboyle@ph.ed.ac.uk>

    This program is free software, distributed under the GNU GPL version 2 or
    (at your option) any later version; see the full license in the file
    "LICENSE" in the top level distribution directory.
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_VECTOR_UNOPS
#define GRID_VECTOR_UNOPS

#include <cmath>

namespace Grid {

template <class scalar>
struct SqrtRealFunctor {
  scalar operator()(const scalar &a) const { return sqrt(real(a)); }
};

template <class scalar>
struct RSqrtRealFunctor {
  scalar operator()(const scalar &a) const {
    return scalar(1.0 / sqrt(real(a)));
  }
};

template <class scalar>
struct CosRealFunctor {
  scalar operator()(const scalar &a) const { return cos(real(a)); }
};

template <class scalar>
struct SinRealFunctor {
  scalar operator()(const scalar &a) const { return sin(real(a)); }
};

template <class scalar>
struct AcosRealFunctor {
  scalar operator()(const scalar &a) const { return acos(real(a)); }
};

template <class scalar>
struct AsinRealFunctor {
  scalar operator()(const scalar &a) const { return asin(real(a)); }
};
template <class scalar>
struct LogRealFunctor {
  scalar operator()(const scalar &a) const { return log(real(a)); }
};
template <class scalar>
struct ExpFunctor {
  scalar operator()(const scalar &a) const { return exp(a); }
};
template <class scalar>
struct NotFunctor {
  scalar operator()(const scalar &a) const { return (!a); }
};
template <class scalar>
struct AbsRealFunctor {
  scalar operator()(const scalar &a) const { return std::abs(real(a)); }
};
template <class scalar>
struct PowRealFunctor {
  double y;
  PowRealFunctor(double _y) : y(_y){};
  scalar operator()(const scalar &a) const { return pow(real(a), y); }
};

template <class scalar>
struct ModIntFunctor {
  Integer y;
  ModIntFunctor(Integer _y) : y(_y){};
  scalar operator()(const scalar &a) const { return Integer(a) % y; }
};

template <class scalar>
struct DivIntFunctor {
  Integer y;
  DivIntFunctor(Integer _y) : y(_y){};
  scalar operator()(const scalar &a) const { return Integer(a) / y; }
};

template <class scalar>
struct RealFunctor {
  scalar operator()(const scalar &a) const { return std::real(a); }
};
template <class scalar>
struct ImagFunctor {
  scalar operator()(const scalar &a) const { return std::imag(a); }
};
template <class S, class V>
inline Grid_simd<S, V> real(const Grid_simd<S, V> &r) {
  return SimdApply(RealFunctor<S>(), r);
}
template <class S, class V>
inline Grid_simd<S, V> imag(const Grid_simd<S, V> &r) {
  return SimdApply(ImagFunctor<S>(), r);
}
template <class S, class V>
inline Grid_simd<S, V> sqrt(const Grid_simd<S, V> &r) {
  return SimdApply(SqrtRealFunctor<S>(), r);
}
template <class S, class V>
inline Grid_simd<S, V> rsqrt(const Grid_simd<S, V> &r) {
  return SimdApply(RSqrtRealFunctor<S>(), r);
}
template <class Scalar>
inline Scalar rsqrt(const Scalar &r) {
  return RSqrtRealFunctor<Scalar>()(r);  // apply the functor; the old comma expression returned r unchanged
}
template <class S, class V>
inline Grid_simd<S, V> cos(const Grid_simd<S, V> &r) {
  return SimdApply(CosRealFunctor<S>(), r);
}
template <class S, class V>
inline Grid_simd<S, V> sin(const Grid_simd<S, V> &r) {
  return SimdApply(SinRealFunctor<S>(), r);
}
template <class S, class V>
inline Grid_simd<S, V> acos(const Grid_simd<S, V> &r) {
  return SimdApply(AcosRealFunctor<S>(), r);
}
template <class S, class V>
inline Grid_simd<S, V> asin(const Grid_simd<S, V> &r) {
  return SimdApply(AsinRealFunctor<S>(), r);
}
template <class S, class V>
inline Grid_simd<S, V> log(const Grid_simd<S, V> &r) {
  return SimdApply(LogRealFunctor<S>(), r);
}
template <class S, class V>
inline Grid_simd<S, V> abs(const Grid_simd<S, V> &r) {
  return SimdApply(AbsRealFunctor<S>(), r);
}
template <class S, class V>
inline Grid_simd<S, V> exp(const Grid_simd<S, V> &r) {
  return SimdApply(ExpFunctor<S>(), r);
}
template <class S, class V>
inline Grid_simd<S, V> Not(const Grid_simd<S, V> &r) {
  return SimdApply(NotFunctor<S>(), r);
}
template <class S, class V>
inline Grid_simd<S, V> pow(const Grid_simd<S, V> &r, double y) {
  return SimdApply(PowRealFunctor<S>(y), r);
}
template <class S, class V>
inline Grid_simd<S, V> mod(const Grid_simd<S, V> &r, Integer y) {
  return SimdApply(ModIntFunctor<S>(y), r);
}
template <class S, class V>
inline Grid_simd<S, V> div(const Grid_simd<S, V> &r, Integer y) {
  return SimdApply(DivIntFunctor<S>(y), r);
}
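// Usage sketch (hypothetical helper): all of the unops above fall back to
// scalar code lane by lane via SimdApply; fine for setup, avoid in hot loops.
inline vRealD cube(const vRealD &x) { return pow(x, 3.0); }
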
////////////////////////////////////////////////////////////////////////////
// Allows us to assign into **conformable** real vectors from complex
////////////////////////////////////////////////////////////////////////////
template <class scalar>
struct AndFunctor {
  scalar operator()(const scalar &x, const scalar &y) const { return x & y; }
};
template <class scalar>
struct OrFunctor {
  scalar operator()(const scalar &x, const scalar &y) const { return x | y; }
};
template <class scalar>
struct AndAndFunctor {
  scalar operator()(const scalar &x, const scalar &y) const { return x && y; }
};
template <class scalar>
struct OrOrFunctor {
  scalar operator()(const scalar &x, const scalar &y) const { return x || y; }
};

////////////////////////////////
// Calls to simd binop functors
////////////////////////////////
template <class S, class V>
inline Grid_simd<S, V> operator&(const Grid_simd<S, V> &x,
                                 const Grid_simd<S, V> &y) {
  return SimdApplyBinop(AndFunctor<S>(), x, y);
}
template <class S, class V>
inline Grid_simd<S, V> operator&&(const Grid_simd<S, V> &x,
                                  const Grid_simd<S, V> &y) {
  return SimdApplyBinop(AndAndFunctor<S>(), x, y);
}
template <class S, class V>
inline Grid_simd<S, V> operator|(const Grid_simd<S, V> &x,
                                 const Grid_simd<S, V> &y) {
  return SimdApplyBinop(OrFunctor<S>(), x, y);
}
template <class S, class V>
inline Grid_simd<S, V> operator||(const Grid_simd<S, V> &x,
                                  const Grid_simd<S, V> &y) {
  return SimdApplyBinop(OrOrFunctor<S>(), x, y);
}
}
#endif
598
Grid/simd/IBM_qpx.h
Normal file
@@ -0,0 +1,598 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/simd/IBM_qpx.h

    Copyright (C) 2015

    Author: paboyle <paboyle@ph.ed.ac.uk>

    This program is free software, distributed under the GNU GPL version 2 or
    (at your option) any later version; see the full license in the file
    "LICENSE" in the top level distribution directory.
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_ASM_BGQ_QPX_H
#define GRID_ASM_BGQ_QPX_H

#include <stdint.h>

/*********************************************************
 * Register definitions
 *********************************************************/
#define psi_00 0
#define psi_01 1
#define psi_02 2

#define psi_10 3
#define psi_11 4
#define psi_12 5

#define psi_20 6
#define psi_21 7
#define psi_22 8

#define psi_30 9
#define psi_31 10
#define psi_32 11

#define Chi_00 12
#define Chi_01 13
#define Chi_02 14

#define Chi_10 15
#define Chi_11 16
#define Chi_12 17

#define UChi_00 18
#define UChi_01 19
#define UChi_02 20

#define UChi_10 21
#define UChi_11 22
#define UChi_12 23

#define U0 24
#define U1 25
#define U2 26
#define one 27
#define perm_reg 28

#define REP  %%r16
#define IMM  %%r17
#define pREP %r16
#define pIMM %r17

#define PPC_INST_DCBTLS 0x7c00014c
#define PPC_INST_DCBLC  0x7c00030c
#define __PPC_CT(t)  (((t) & 0x0f) << 21)
#define ___PPC_RA(a) (((a) & 0x1f) << 16)
#define ___PPC_RB(b) (((b) & 0x1f) << 11)

#define LOCK_SET   ".long (" HASH(PPC_INST_DCBTLS) "|" HASH(___PPC_RB(16)) ")\n"
#define LOCK_CLEAR ".long (" HASH(PPC_INST_DCBLC)  "|" HASH(___PPC_RB(16)) ")\n"
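/* Sketch of what the two .long literals above hand-assemble: the base opcode
   (PPC_INST_DCBTLS / PPC_INST_DCBLC) is OR-ed with the RB field for register
   16 via ___PPC_RB(16), emitting the cache lock / unlock instruction on the
   L1 line addressed by r16 as a raw word, without relying on the assembler
   recognising the dcbtls/dcblc mnemonics. */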
/*Alias regs for incoming fourspinor on neighbour site*/
#define Chi_20 UChi_00
#define Chi_21 UChi_01
#define Chi_22 UChi_02
#define Chi_30 UChi_10
#define Chi_31 UChi_11
#define Chi_32 UChi_12

/*********************************************************
 * Architectural macros
 *********************************************************/
#define HASHit(A) #A
#define HASH(A) HASHit(A)
#define LOAD64(A,ptr)

#define MASK_REGS             /*NOOP ON BGQ*/
#define PF_GAUGE(A)           /*NOOP ON BGQ*/
#define PREFETCH1_CHIMU(base) /*NOOP ON BGQ*/
#define PREFETCH_CHIMU(base)  /*NOOP ON BGQ*/

#define VLOADf(OFF,PTR,DEST)  "qvlfsx   " #DEST "," #PTR "," #OFF " ;\n"
#define VLOADuf(OFF,PTR,DEST) "qvlfsux  " #DEST "," #PTR "," #OFF " ;\n"
#define VSTOREf(OFF,PTR,SRC)  "qvstfsx  " #SRC "," #PTR "," #OFF " ;\n"
#define VSTOREuf(OFF,PTR,SRC) "qvstfsux " #SRC "," #PTR "," #OFF " ;\n"
#define VSPLATf(A,B,DEST)     "qvlfcsxa " #DEST "," #A "," #B ";\n"
#define VSIZEf (16)

#define VPERMIi(p)  "qvgpci " #p ", 1217;\n"
#define VPERMi(A,p) "qvfperm " #A "," #A "," #A "," #p ";\n"
#define VPERMI(p)   VPERMIi(p)
#define VPERM(A,p)  VPERMi(A,p)

#define VLOADd(OFF,PTR,DEST)  "qvlfdx   " #DEST "," #PTR "," #OFF " ;\n"
#define VLOADud(OFF,PTR,DEST) "qvlfdux  " #DEST "," #PTR "," #OFF " ;\n"
#define VSTOREd(OFF,PTR,SRC)  "qvstfdx  " #SRC "," #PTR "," #OFF " ;\n"
#define VSTOREud(OFF,PTR,SRC) "qvstfdux " #SRC "," #PTR "," #OFF " ;\n"
#define VSPLATd(A,B,DEST)     "qvlfcdxa " #DEST "," #A "," #B ";\n"
#define VSIZEd (32)

// QPX manual ordering QRT comes first (dest)
#define VZEROi(DEST)              "qvfset " #DEST "; \n qvfsub " #DEST "," #DEST "," #DEST ";\n"
#define VONEi(DEST)               "qvfset " #DEST "; \n"
#define VMOVi(DEST,A)             "qvfmr  " #DEST "," #A ";\n"
#define VADDi(DEST,A,B)           "qvfadd " #DEST "," #A "," #B ";\n"
#define VSUBi(DEST,A,B)           "qvfsub " #DEST "," #A "," #B ";\n"
#define VMULi(DEST,A,B)           "qvfmul " #DEST "," #A "," #B ";\n"
#define VMUL_RR_RIi(DEST,A,B)     "qvfxmul " #DEST "," #A "," #B ";\n"
#define VMADDi(DEST,A,B,C)        "qvfmadd " #DEST "," #A "," #B ","#C ";\n"
#define VMADD_RR_RIi(DEST,A,B,C)  "qvfxmadd " #DEST "," #A "," #B ","#C ";\n"
#define VMADD_MII_IRi(DEST,A,B,C) "qvfxxnpmadd " #DEST "," #B "," #A ","#C ";\n"
#define VMADD_II_MIRi(DEST,A,B,C) "qvfxxcpnmadd " #DEST "," #B "," #A ","#C ";\n"

#define VZERO(C)              VZEROi(C)
#define VONE(C)               VONEi(C)
#define VMOV(C,A)             VMOVi(C,A)
#define VADD(A,B,C)           VADDi(A,B,C)
#define VSUB(A,B,C)           VSUBi(A,B,C)
#define VMUL(A,B,C)           VMULi(A,B,C)
#define VMUL_RR_RI(A,B,C)     VMUL_RR_RIi(A,B,C)
#define VMADD(A,B,C,D)        VMADDi(A,B,C,D)
#define VMADD_RR_RI(A,B,C,D)  VMADD_RR_RIi(A,B,C,D)
#define VMADD_MII_IR(A,B,C,D) VMADD_MII_IRi(A,B,C,D)
#define VMADD_II_MIR(A,B,C,D) VMADD_II_MIRi(A,B,C,D)
/*********************************************************
 * Macro sequences encoding QCD
 *********************************************************/
#define LOCK_GAUGE(dir)                                     \
  {                                                         \
    uint64_t byte_addr = (uint64_t)&U._odata[sU];           \
    int count = (sizeof(U._odata[0])+63)/64;                \
    asm (" mtctr %0 \n"                                     \
         " mr " HASH(REP) ", %1\n"                          \
         " li " HASH(IMM) ", 64\n"                          \
         "0:\n"                                             \
         LOCK_SET                                           \
         " add " HASH(REP) "," HASH(IMM) "," HASH(REP) "\n" \
         " bdnz 0b\n"                                       \
         : : "b" (count), "b" (byte_addr) );                \
  }

#define UNLOCK_GAUGE(dir)                                   \
  {                                                         \
    uint64_t byte_addr = (uint64_t)&U._odata[sU];           \
    int count = (sizeof(U._odata[0])+63)/64;                \
    asm (" mtctr %0 \n"                                     \
         " mr " HASH(REP) ", %1\n"                          \
         " li " HASH(IMM) ", 64\n"                          \
         "0:\n"                                             \
         LOCK_CLEAR                                         \
         " add " HASH(REP) "," HASH(IMM) "," HASH(REP) "\n" \
         " bdnz 0b\n"                                       \
         : : "b" (count), "b" (byte_addr) );                \
  }

#define ZERO_PSI \
  VZERO(psi_00)  \
  VZERO(psi_01)  \
  VZERO(psi_02)  \
  VZERO(psi_10)  \
  VZERO(psi_11)  \
  VZERO(psi_12)  \
  VZERO(psi_20)  \
  VZERO(psi_21)  \
  VZERO(psi_22)  \
  VZERO(psi_30)  \
  VZERO(psi_31)  \
  VZERO(psi_32)

#define MULT_2SPIN_QPX_LSd(ptr,p) MULT_2SPIN_QPX_INTERNAL(ptr,p,VSPLAT,16)
#define MULT_2SPIN_QPX_LSf(ptr,p) MULT_2SPIN_QPX_INTERNAL(ptr,p,VSPLAT,8)
#define MULT_2SPIN_QPXd(ptr,p)    MULT_2SPIN_QPX_INTERNAL(ptr,p,VLOAD,32)
#define MULT_2SPIN_QPXf(ptr,p)    MULT_2SPIN_QPX_INTERNAL(ptr,p,VLOAD,16)

#define MULT_2SPIN_QPX_INTERNAL(ptr,p,ULOAD,USKIP) {            \
    uint64_t ub = ((uint64_t)ptr);                              \
    asm (                                                       \
         ULOAD(%0,%3,U0)                                        \
         ULOAD(%1,%3,U1)                                        \
         ULOAD(%2,%3,U2)                                        \
         VMUL_RR_RI(UChi_00,U0,Chi_00)                          \
         VMUL_RR_RI(UChi_01,U1,Chi_00)                          \
         VMUL_RR_RI(UChi_02,U2,Chi_00)                          \
         VMUL_RR_RI(UChi_10,U0,Chi_10)                          \
         VMUL_RR_RI(UChi_11,U1,Chi_10)                          \
         VMUL_RR_RI(UChi_12,U2,Chi_10)                          \
         VMADD_MII_IR(UChi_00,U0,Chi_00,UChi_00)                \
         VMADD_MII_IR(UChi_01,U1,Chi_00,UChi_01)                \
         VMADD_MII_IR(UChi_02,U2,Chi_00,UChi_02)                \
         VMADD_MII_IR(UChi_10,U0,Chi_10,UChi_10)                \
         VMADD_MII_IR(UChi_11,U1,Chi_10,UChi_11)                \
         VMADD_MII_IR(UChi_12,U2,Chi_10,UChi_12)                \
         : : "b" (0), "b" (USKIP*3), "b" (USKIP*6), "b" (ub )); \
    asm (                                                       \
         ULOAD(%0,%3,U0)                                        \
         ULOAD(%1,%3,U1)                                        \
         ULOAD(%2,%3,U2)                                        \
         VMADD_RR_RI(UChi_00,U0,Chi_01,UChi_00)                 \
         VMADD_RR_RI(UChi_01,U1,Chi_01,UChi_01)                 \
         VMADD_RR_RI(UChi_02,U2,Chi_01,UChi_02)                 \
         VMADD_RR_RI(UChi_10,U0,Chi_11,UChi_10)                 \
         VMADD_RR_RI(UChi_11,U1,Chi_11,UChi_11)                 \
         VMADD_RR_RI(UChi_12,U2,Chi_11,UChi_12)                 \
         VMADD_MII_IR(UChi_00,U0,Chi_01,UChi_00)                \
         VMADD_MII_IR(UChi_01,U1,Chi_01,UChi_01)                \
         VMADD_MII_IR(UChi_02,U2,Chi_01,UChi_02)                \
         VMADD_MII_IR(UChi_10,U0,Chi_11,UChi_10)                \
         VMADD_MII_IR(UChi_11,U1,Chi_11,UChi_11)                \
         VMADD_MII_IR(UChi_12,U2,Chi_11,UChi_12)                \
         : : "b" (USKIP*1), "b" (USKIP*4), "b" (USKIP*7), "b" (ub )); \
    asm (                                                       \
         ULOAD(%0,%3,U0)                                        \
         ULOAD(%1,%3,U1)                                        \
         ULOAD(%2,%3,U2)                                        \
         VMADD_RR_RI(UChi_00,U0,Chi_02,UChi_00)                 \
         VMADD_RR_RI(UChi_01,U1,Chi_02,UChi_01)                 \
         VMADD_RR_RI(UChi_02,U2,Chi_02,UChi_02)                 \
         VMADD_RR_RI(UChi_10,U0,Chi_12,UChi_10)                 \
         VMADD_RR_RI(UChi_11,U1,Chi_12,UChi_11)                 \
         VMADD_RR_RI(UChi_12,U2,Chi_12,UChi_12)                 \
         VMADD_MII_IR(UChi_00,U0,Chi_02,UChi_00)                \
         VMADD_MII_IR(UChi_01,U1,Chi_02,UChi_01)                \
         VMADD_MII_IR(UChi_02,U2,Chi_02,UChi_02)                \
         VMADD_MII_IR(UChi_10,U0,Chi_12,UChi_10)                \
         VMADD_MII_IR(UChi_11,U1,Chi_12,UChi_11)                \
         VMADD_MII_IR(UChi_12,U2,Chi_12,UChi_12)                \
         : : "b" (USKIP*2), "b" (USKIP*5), "b" (USKIP*8), "b" (ub )); \
  }
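/* Arithmetic sketch for the block above: with link element u = ur + i*ui and
   spinor element c = cr + i*ci in each SIMD complex pair,
     qvfxmul     (VMUL_RR_RI)   forms ( ur*cr , ur*ci )
     qvfxxnpmadd (VMADD_MII_IR) adds  (-ui*ci , ui*cr )
   giving u*c = (ur*cr - ui*ci) + i*(ur*ci + ui*cr). The three asm blocks then
   accumulate the three colour columns of the SU(3) link into UChi. */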
#define MULT_2SPIN_DIR_PF(A,p) MULT_2SPIN_PF(&U._odata[sU](A),p)
#define MULT_2SPIN_PF(ptr,pf)  MULT_2SPIN(ptr,pf)

#define SAVE_RESULT(base,basep) {                \
    uint64_t ub = ((uint64_t)base) - (VSIZE);    \
    asm("mr " HASH(REP) ", %0;\n"                \
        "li " HASH(IMM) "," HASH(VSIZE)" ;\n"    \
        VSTOREu(IMM,REP,psi_00)                  \
        VSTOREu(IMM,REP,psi_01)                  \
        VSTOREu(IMM,REP,psi_02)                  \
        VSTOREu(IMM,REP,psi_10)                  \
        VSTOREu(IMM,REP,psi_11)                  \
        VSTOREu(IMM,REP,psi_12)                  \
        VSTOREu(IMM,REP,psi_20)                  \
        VSTOREu(IMM,REP,psi_21)                  \
        VSTOREu(IMM,REP,psi_22)                  \
        VSTOREu(IMM,REP,psi_30)                  \
        VSTOREu(IMM,REP,psi_31)                  \
        VSTOREu(IMM,REP,psi_32)                  \
        : : "b" (ub) : HASH(pIMM), HASH(pREP) ); \
  }

/*
 * Annoying BG/Q loads with no immediate indexing, and a big performance hit
 * when a second miss to an L1 line occurs
 */
#define LOAD_CHI(base) {                                                \
    uint64_t ub = ((uint64_t)base) - (2*VSIZE);                         \
    asm("mr " HASH(REP) ",%0 ;\n"                                       \
        "li " HASH(IMM) ",(2*" HASH(VSIZE) ");\n"                       \
        VLOADu(IMM,REP,Chi_00)                                          \
        VLOADu(IMM,REP,Chi_02)                                          \
        VLOADu(IMM,REP,Chi_11) : : "b" (ub) : HASH(pIMM), HASH(pREP) ); \
    ub = ((uint64_t)base) - VSIZE;                                      \
    asm("mr " HASH(REP) ", %0;\n"                                       \
        "li " HASH(IMM) ",(2*" HASH(VSIZE) ");\n"                       \
        VLOADu(IMM,REP,Chi_01)                                          \
        VLOADu(IMM,REP,Chi_10)                                          \
        VLOADu(IMM,REP,Chi_12) : : "b" (ub) : HASH(pIMM), HASH(pREP) ); \
  }

#define LOAD_CHIMU(base) {                                              \
    uint64_t ub = ((uint64_t)base) - (2*VSIZE);                         \
    asm("mr " HASH(REP) ",%0;\n"                                        \
        "li " HASH(IMM) ",(2*" HASH(VSIZE) ");\n"                       \
        VLOADu(IMM,REP,Chi_00)                                          \
        VLOADu(IMM,REP,Chi_02)                                          \
        VLOADu(IMM,REP,Chi_11)                                          \
        VLOADu(IMM,REP,Chi_20)                                          \
        VLOADu(IMM,REP,Chi_22)                                          \
        VLOADu(IMM,REP,Chi_31) : : "b" (ub) : HASH(pIMM), HASH(pREP) ); \
    ub = ((uint64_t)base) - VSIZE;                                      \
    asm("mr " HASH(REP) ", %0;\n"                                       \
        "li " HASH(IMM) ", (2*" HASH(VSIZE) ");\n"                      \
        VLOADu(IMM,REP,Chi_01)                                          \
        VLOADu(IMM,REP,Chi_10)                                          \
        VLOADu(IMM,REP,Chi_12)                                          \
        VLOADu(IMM,REP,Chi_21)                                          \
        VLOADu(IMM,REP,Chi_30)                                          \
        VLOADu(IMM,REP,Chi_32) : : "b" (ub) : HASH(pIMM), HASH(pREP) ); \
  }
// hspin(0)=fspin(0)+timesI(fspin(3));
|
||||
// hspin(1)=fspin(1)+timesI(fspin(2));
|
||||
#define XP_PROJMEM(base) { \
|
||||
LOAD_CHIMU(base); \
|
||||
asm ( \
|
||||
VONE(one) \
|
||||
VMADD_MII_IR(Chi_00,one,Chi_30,Chi_00) \
|
||||
VMADD_MII_IR(Chi_01,one,Chi_31,Chi_01) \
|
||||
VMADD_MII_IR(Chi_02,one,Chi_32,Chi_02) \
|
||||
VMADD_MII_IR(Chi_10,one,Chi_20,Chi_10) \
|
||||
VMADD_MII_IR(Chi_11,one,Chi_21,Chi_11) \
|
||||
VMADD_MII_IR(Chi_12,one,Chi_22,Chi_12) \
|
||||
); \
|
||||
}
|
||||
|
||||
#define XM_PROJMEM(base) { \
|
||||
LOAD_CHIMU(base); \
|
||||
asm ( \
|
||||
VONE(one) \
|
||||
VMADD_II_MIR(Chi_00,one,Chi_30,Chi_00) \
|
||||
VMADD_II_MIR(Chi_01,one,Chi_31,Chi_01) \
|
||||
VMADD_II_MIR(Chi_02,one,Chi_32,Chi_02) \
|
||||
VMADD_II_MIR(Chi_10,one,Chi_20,Chi_10) \
|
||||
VMADD_II_MIR(Chi_11,one,Chi_21,Chi_11) \
|
||||
VMADD_II_MIR(Chi_12,one,Chi_22,Chi_12) \
|
||||
); \
|
||||
}
|
||||
|
||||
// hspin(0)=fspin(0)-fspin(3);
|
||||
// hspin(1)=fspin(1)+fspin(2);
|
||||
#define YP_PROJMEM(base) { \
|
||||
LOAD_CHIMU(base); \
|
||||
asm ( \
|
||||
VSUB(Chi_00,Chi_00,Chi_30) \
|
||||
VSUB(Chi_01,Chi_01,Chi_31) \
|
||||
VSUB(Chi_02,Chi_02,Chi_32) \
|
||||
VADD(Chi_10,Chi_10,Chi_20) \
|
||||
VADD(Chi_11,Chi_11,Chi_21) \
|
||||
VADD(Chi_12,Chi_12,Chi_22) \
|
||||
); \
|
||||
}
|
||||
|
||||
#define YM_PROJMEM(base) { \
|
||||
LOAD_CHIMU(base); \
|
||||
asm ( \
|
||||
VADD(Chi_00,Chi_00,Chi_30) \
|
||||
VADD(Chi_01,Chi_01,Chi_31) \
|
||||
VADD(Chi_02,Chi_02,Chi_32) \
|
||||
VSUB(Chi_10,Chi_10,Chi_20) \
|
||||
VSUB(Chi_11,Chi_11,Chi_21) \
|
||||
VSUB(Chi_12,Chi_12,Chi_22) ); \
|
||||
}
|
||||
|
||||
/*Gz
|
||||
* 0 0 i 0 [0]+-i[2]
|
||||
* 0 0 0 -i [1]-+i[3]
|
||||
* -i 0 0 0
|
||||
* 0 i 0 0
|
||||
*/
|
||||
#define ZP_PROJMEM(base) { \
|
||||
LOAD_CHIMU(base); \
|
||||
asm ( \
|
||||
VONE(one) \
|
||||
VMADD_MII_IR(Chi_00,one,Chi_20,Chi_00) \
|
||||
VMADD_MII_IR(Chi_01,one,Chi_21,Chi_01) \
|
||||
VMADD_MII_IR(Chi_02,one,Chi_22,Chi_02) \
|
||||
VMADD_II_MIR(Chi_10,one,Chi_30,Chi_10) \
|
||||
VMADD_II_MIR(Chi_11,one,Chi_31,Chi_11) \
|
||||
VMADD_II_MIR(Chi_12,one,Chi_32,Chi_12) \
|
||||
); \
|
||||
}
|
||||
|
||||
#define ZM_PROJMEM(base) { \
|
||||
LOAD_CHIMU(base); \
|
||||
asm ( \
|
||||
VONE(one) \
|
||||
VMADD_II_MIR(Chi_00,one,Chi_20,Chi_00) \
|
||||
VMADD_II_MIR(Chi_01,one,Chi_21,Chi_01) \
|
||||
VMADD_II_MIR(Chi_02,one,Chi_22,Chi_02) \
|
||||
VMADD_MII_IR(Chi_10,one,Chi_30,Chi_10) \
|
||||
VMADD_MII_IR(Chi_11,one,Chi_31,Chi_11) \
|
||||
VMADD_MII_IR(Chi_12,one,Chi_32,Chi_12) \
|
||||
); \
|
||||
}
|
||||
/*Gt
|
||||
* 0 0 1 0 [0]+-[2]
|
||||
* 0 0 0 1 [1]+-[3]
|
||||
* 1 0 0 0
|
||||
* 0 1 0 0
|
||||
*/
|
||||
#define TP_PROJMEM(base) { \
|
||||
LOAD_CHIMU(base); \
|
||||
asm ( \
|
||||
VADD(Chi_00,Chi_00,Chi_20) \
|
||||
VADD(Chi_01,Chi_01,Chi_21) \
|
||||
VADD(Chi_02,Chi_02,Chi_22) \
|
||||
VADD(Chi_10,Chi_10,Chi_30) \
|
||||
VADD(Chi_11,Chi_11,Chi_31) \
|
||||
VADD(Chi_12,Chi_12,Chi_32) \
|
||||
); \
|
||||
}
|
||||
|
||||
#define TM_PROJMEM(base) { \
|
||||
LOAD_CHIMU(base); \
|
||||
asm ( \
|
||||
VSUB(Chi_00,Chi_00,Chi_20) \
|
||||
VSUB(Chi_01,Chi_01,Chi_21) \
|
||||
VSUB(Chi_02,Chi_02,Chi_22) \
|
||||
VSUB(Chi_10,Chi_10,Chi_30) \
|
||||
VSUB(Chi_11,Chi_11,Chi_31) \
|
||||
VSUB(Chi_12,Chi_12,Chi_32) \
|
||||
); \
|
||||
}
|
||||
|
||||
/*
|
||||
fspin(0)=hspin(0);
|
||||
fspin(1)=hspin(1);
|
||||
fspin(2)=timesMinusI(hspin(1));
|
||||
fspin(3)=timesMinusI(hspin(0));
|
||||
|
||||
fspin(0)+=hspin(0);
|
||||
fspin(1)+=hspin(1);
|
||||
fspin(2)-=timesI(hspin(1));
|
||||
fspin(3)-=timesI(hspin(0));
|
||||
*/
|
||||
#define XP_RECON { \
|
||||
asm(\
|
||||
VONE(one)\
|
||||
VMOV(psi_00,UChi_00) VMOV(psi_01,UChi_01) VMOV(psi_02,UChi_02)\
|
||||
VMOV(psi_10,UChi_10) VMOV(psi_11,UChi_11) VMOV(psi_12,UChi_12)\
|
||||
VZERO(psi_20) VZERO(psi_21) VZERO(psi_22) \
|
||||
VZERO(psi_30) VZERO(psi_31) VZERO(psi_32) \
|
||||
VMADD_II_MIR(psi_20,one,UChi_10,psi_20) \
|
||||
VMADD_II_MIR(psi_21,one,UChi_11,psi_21) \
|
||||
VMADD_II_MIR(psi_22,one,UChi_12,psi_22) \
|
||||
VMADD_II_MIR(psi_30,one,UChi_00,psi_30) \
|
||||
VMADD_II_MIR(psi_31,one,UChi_01,psi_31) \
|
||||
VMADD_II_MIR(psi_32,one,UChi_02,psi_32) \
|
||||
); \
|
||||
}

#define XM_RECON { \
    asm( \
        VONE(one) \
        VMOV(psi_00,UChi_00)  VMOV(psi_01,UChi_01)  VMOV(psi_02,UChi_02) \
        VMOV(psi_10,UChi_10)  VMOV(psi_11,UChi_11)  VMOV(psi_12,UChi_12) \
        VZERO(psi_20)  VZERO(psi_21)  VZERO(psi_22) \
        VZERO(psi_30)  VZERO(psi_31)  VZERO(psi_32) \
        VMADD_MII_IR(psi_20,one,UChi_10,psi_20) \
        VMADD_MII_IR(psi_21,one,UChi_11,psi_21) \
        VMADD_MII_IR(psi_22,one,UChi_12,psi_22) \
        VMADD_MII_IR(psi_30,one,UChi_00,psi_30) \
        VMADD_MII_IR(psi_31,one,UChi_01,psi_31) \
        VMADD_MII_IR(psi_32,one,UChi_02,psi_32) \
        ); \
  }

#define XP_RECON_ACCUM { \
    asm( \
        VONE(one) \
        VADD(psi_00,psi_00,UChi_00)  VADD(psi_01,psi_01,UChi_01)  VADD(psi_02,psi_02,UChi_02) \
        VADD(psi_10,psi_10,UChi_10)  VADD(psi_11,psi_11,UChi_11)  VADD(psi_12,psi_12,UChi_12) \
        VMADD_II_MIR(psi_20,one,UChi_10,psi_20) \
        VMADD_II_MIR(psi_21,one,UChi_11,psi_21) \
        VMADD_II_MIR(psi_22,one,UChi_12,psi_22) \
        VMADD_II_MIR(psi_30,one,UChi_00,psi_30) \
        VMADD_II_MIR(psi_31,one,UChi_01,psi_31) \
        VMADD_II_MIR(psi_32,one,UChi_02,psi_32) \
        ); \
  }

#define XM_RECON_ACCUM { \
    asm( \
        VONE(one) \
        VADD(psi_00,psi_00,UChi_00)  VADD(psi_01,psi_01,UChi_01)  VADD(psi_02,psi_02,UChi_02) \
        VADD(psi_10,psi_10,UChi_10)  VADD(psi_11,psi_11,UChi_11)  VADD(psi_12,psi_12,UChi_12) \
        VMADD_MII_IR(psi_20,one,UChi_10,psi_20) \
        VMADD_MII_IR(psi_21,one,UChi_11,psi_21) \
        VMADD_MII_IR(psi_22,one,UChi_12,psi_22) \
        VMADD_MII_IR(psi_30,one,UChi_00,psi_30) \
        VMADD_MII_IR(psi_31,one,UChi_01,psi_31) \
        VMADD_MII_IR(psi_32,one,UChi_02,psi_32) \
        ); \
  }

// fspin(2)+=hspin(1);
// fspin(3)-=hspin(0);
#define YP_RECON_ACCUM { \
    asm( \
        VADD(psi_00,psi_00,UChi_00)  VADD(psi_01,psi_01,UChi_01)  VADD(psi_02,psi_02,UChi_02) \
        VADD(psi_10,psi_10,UChi_10)  VADD(psi_11,psi_11,UChi_11)  VADD(psi_12,psi_12,UChi_12) \
        VADD(psi_20,psi_20,UChi_10)  VADD(psi_21,psi_21,UChi_11)  VADD(psi_22,psi_22,UChi_12) \
        VSUB(psi_30,psi_30,UChi_00)  VSUB(psi_31,psi_31,UChi_01)  VSUB(psi_32,psi_32,UChi_02) \
        ); \
  }
#define YM_RECON_ACCUM { \
    asm( \
        VADD(psi_00,psi_00,UChi_00)  VADD(psi_01,psi_01,UChi_01)  VADD(psi_02,psi_02,UChi_02) \
        VADD(psi_10,psi_10,UChi_10)  VADD(psi_11,psi_11,UChi_11)  VADD(psi_12,psi_12,UChi_12) \
        VSUB(psi_20,psi_20,UChi_10)  VSUB(psi_21,psi_21,UChi_11)  VSUB(psi_22,psi_22,UChi_12) \
        VADD(psi_30,psi_30,UChi_00)  VADD(psi_31,psi_31,UChi_01)  VADD(psi_32,psi_32,UChi_02) \
        ); \
  }

// fspin(2)-=timesI(hspin(0));
// fspin(3)+=timesI(hspin(1));
#define ZP_RECON_ACCUM { \
    asm( \
        VONE(one) \
        VADD(psi_00,psi_00,UChi_00)  VADD(psi_01,psi_01,UChi_01)  VADD(psi_02,psi_02,UChi_02) \
        VADD(psi_10,psi_10,UChi_10)  VADD(psi_11,psi_11,UChi_11)  VADD(psi_12,psi_12,UChi_12) \
        VMADD_II_MIR(psi_20,one,UChi_00,psi_20) \
        VMADD_II_MIR(psi_21,one,UChi_01,psi_21) \
        VMADD_II_MIR(psi_22,one,UChi_02,psi_22) \
        VMADD_MII_IR(psi_30,one,UChi_10,psi_30) \
        VMADD_MII_IR(psi_31,one,UChi_11,psi_31) \
        VMADD_MII_IR(psi_32,one,UChi_12,psi_32) \
        ); \
  }

#define ZM_RECON_ACCUM { \
    asm( \
        VONE(one) \
        VADD(psi_00,psi_00,UChi_00)  VADD(psi_01,psi_01,UChi_01)  VADD(psi_02,psi_02,UChi_02) \
        VADD(psi_10,psi_10,UChi_10)  VADD(psi_11,psi_11,UChi_11)  VADD(psi_12,psi_12,UChi_12) \
        VMADD_MII_IR(psi_20,one,UChi_00,psi_20) \
        VMADD_MII_IR(psi_21,one,UChi_01,psi_21) \
        VMADD_MII_IR(psi_22,one,UChi_02,psi_22) \
        VMADD_II_MIR(psi_30,one,UChi_10,psi_30) \
        VMADD_II_MIR(psi_31,one,UChi_11,psi_31) \
        VMADD_II_MIR(psi_32,one,UChi_12,psi_32) \
        ); \
  }

// fspin(2)+=hspin(0);
// fspin(3)+=hspin(1);
#define TP_RECON_ACCUM { \
    asm( \
        VADD(psi_00,psi_00,UChi_00)  VADD(psi_01,psi_01,UChi_01)  VADD(psi_02,psi_02,UChi_02) \
        VADD(psi_10,psi_10,UChi_10)  VADD(psi_11,psi_11,UChi_11)  VADD(psi_12,psi_12,UChi_12) \
        VADD(psi_20,psi_20,UChi_00)  VADD(psi_21,psi_21,UChi_01)  VADD(psi_22,psi_22,UChi_02) \
        VADD(psi_30,psi_30,UChi_10)  VADD(psi_31,psi_31,UChi_11)  VADD(psi_32,psi_32,UChi_12) \
        ); \
  }

#define TM_RECON_ACCUM { \
    asm( \
        VADD(psi_00,psi_00,UChi_00)  VADD(psi_01,psi_01,UChi_01)  VADD(psi_02,psi_02,UChi_02) \
        VADD(psi_10,psi_10,UChi_10)  VADD(psi_11,psi_11,UChi_11)  VADD(psi_12,psi_12,UChi_12) \
        VSUB(psi_20,psi_20,UChi_00)  VSUB(psi_21,psi_21,UChi_01)  VSUB(psi_22,psi_22,UChi_02) \
        VSUB(psi_30,psi_30,UChi_10)  VSUB(psi_31,psi_31,UChi_11)  VSUB(psi_32,psi_32,UChi_12) \
        ); \
  }

#define ADD_RESULTi(PTR,pf) \
    LOAD_CHIMU(PTR) \
    asm( \
        VADD(psi_00,chi_00,psi_00)  VADD(psi_01,chi_01,psi_01)  VADD(psi_02,chi_02,psi_02) \
        VADD(psi_10,chi_10,psi_10)  VADD(psi_11,chi_11,psi_11)  VADD(psi_12,chi_12,psi_12) \
        VADD(psi_20,chi_20,psi_20)  VADD(psi_21,chi_21,psi_21)  VADD(psi_22,chi_22,psi_22) \
        VADD(psi_30,chi_30,psi_30)  VADD(psi_31,chi_31,psi_31)  VADD(psi_32,chi_32,psi_32) ); \
    SAVE_RESULT(PTR,pf);
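/* How the pieces above compose per site in a Wilson hop term -- a sketch of
 * the intended call sequence, not the actual kernel driver (MULT_2SPIN and
 * the pointer names are assumptions about the caller): */
#if 0
//   XP_PROJMEM(base);          // 4-spinor -> 2-spinor for mu = x, forward
//   MULT_2SPIN(U_ptr, pf);     // SU(3) link acts on both spin components
//   XP_RECON;                  // first direction writes psi
//   YP_PROJMEM(base); MULT_2SPIN(U_ptr, pf); YP_RECON_ACCUM; // accumulate
//   ... ZP/TP, then XM/YM/ZM/TM likewise ...
//   SAVE_RESULT(out_ptr, pf);  // or ADD_RESULTi to add into an output
#endif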

#define PERMUTE_DIR3
#define PERMUTE_DIR2
#define PERMUTE_DIR1

#define PERMUTE_DIR0 { \
    asm( \
        VPERMI(perm_reg) \
        VPERM(Chi_00,perm_reg)  VPERM(Chi_01,perm_reg)  VPERM(Chi_02,perm_reg) \
        VPERM(Chi_10,perm_reg)  VPERM(Chi_11,perm_reg)  VPERM(Chi_12,perm_reg) ); \
  }

#endif
46
Grid/simd/IBM_qpx_double.h
Normal file
@ -0,0 +1,46 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/simd/IBM_qpx_double.h

    Copyright (C) 2015

Author: paboyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
// No guard; ok multi-include
#undef  VSIZE
#undef  VLOAD
#undef  VLOADu
#undef  VSPLAT
#undef  VSTORE
#undef  VSTOREu
#undef  MULT_2SPIN_QPX_LS
#undef  MULT_2SPIN_QPX

#define VSIZE VSIZEd
#define VLOAD(A,B,C)             VLOADd(A,B,C)
#define VLOADu(A,B,C)            VLOADud(A,B,C)
#define VSPLAT(A,B,DEST)         VSPLATd(A,B,DEST)
#define VSTORE(A,B,C)            VSTOREd(A,B,C)
#define VSTOREu(A,B,C)           VSTOREud(A,B,C)
#define MULT_2SPIN_QPX_LS(ptr,p) MULT_2SPIN_QPX_LSd(ptr,p)
#define MULT_2SPIN_QPX(ptr,p)    MULT_2SPIN_QPXd(ptr,p)
46
Grid/simd/IBM_qpx_single.h
Normal file
@ -0,0 +1,46 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/simd/IBM_qpx_single.h

    Copyright (C) 2015

Author: paboyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
// No guard; ok multi-include
#undef  VSIZE
#undef  VLOAD
#undef  VLOADu
#undef  VSPLAT
#undef  VSTORE
#undef  VSTOREu
#undef  MULT_2SPIN_QPX_LS
#undef  MULT_2SPIN_QPX

#define VSIZE VSIZEf
#define VLOAD(A,B,C)             VLOADf(A,B,C)
#define VLOADu(A,B,C)            VLOADuf(A,B,C)
#define VSPLAT(A,B,DEST)         VSPLATf(A,B,DEST)
#define VSTORE(A,B,C)            VSTOREf(A,B,C)
#define VSTOREu(A,B,C)           VSTOREuf(A,B,C)
#define MULT_2SPIN_QPX_LS(ptr,p) MULT_2SPIN_QPX_LSf(ptr,p)
#define MULT_2SPIN_QPX(ptr,p)    MULT_2SPIN_QPXf(ptr,p)
205
Grid/simd/Intel512avx.h
Normal file
@ -0,0 +1,205 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/simd/Intel512avx.h

    Copyright (C) 2015

Author: paboyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_ASM_AV512_H
#define GRID_ASM_AV512_H

////////////////////////////////////////////////////////////
// Knights Landing specials
////////////////////////////////////////////////////////////

#define ZLOADf(OFF,PTR,ri,ir)  VLOADf(OFF,PTR,ir)  VSHUFf(ir,ri)
#define ZLOADd(OFF,PTR,ri,ir)  VLOADd(OFF,PTR,ir)  VSHUFd(ir,ri)

#define ZMULf(Ari,Air,B,Criir,Ciirr) VMULf(Ari,B,Criir)  VMULf(Air,B,Ciirr)
#define ZMULd(Ari,Air,B,Criir,Ciirr) VMULd(Ari,B,Criir)  VMULd(Air,B,Ciirr)

#define ZMADDf(Ari,Air,B,Criir,Ciirr) VMADDf(Ari,B,Criir) VMADDf(Air,B,Ciirr)
#define ZMADDd(Ari,Air,B,Criir,Ciirr) VMADDd(Ari,B,Criir) VMADDd(Air,B,Ciirr)

#define ZENDf(Criir,Ciirr, tmp) ZEND1f(Criir,Ciirr, tmp) ZEND2f(Criir,Ciirr, tmp)
#define ZENDd(Criir,Ciirr, tmp) ZEND1d(Criir,Ciirr, tmp) ZEND2d(Criir,Ciirr, tmp)

#define ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
  VSHUFMEMf(O,P,tmp) \
  VMULMEMf(O,P,B,Biirr) \
  VMULMEMf(O,P,C,Ciirr) \
  VMULf(tmp,B,Briir) \
  VMULf(tmp,C,Criir)

#define ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
  VSHUFMEMd(O,P,tmp) \
  VMULMEMd(O,P,B,Biirr) \
  VMULMEMd(O,P,C,Ciirr) \
  VMULd(tmp,B,Briir) \
  VMULd(tmp,C,Criir)

#define ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
  VSHUFMEMf(O,P,tmp) \
  VMADDMEMf(O,P,B,Biirr) \
  VMADDMEMf(O,P,C,Ciirr) \
  VMADDf(tmp,B,Briir) \
  VMADDf(tmp,C,Criir)

#define ZMADDMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) \
  VSHUFMEMd(O,P,tmp) \
  VMADDMEMd(O,P,B,Biirr) \
  VMADDMEMd(O,P,C,Ciirr) \
  VMADDd(tmp,B,Briir) \
  VMADDd(tmp,C,Criir)

// Merges accumulation for complex dot chain; less efficient under avx512
#define ZEND1f(Criir,Ciirr, tmp) "vshufps $0xb1," #Criir "," #Criir "," #tmp ";\n"\
  "vaddps " #tmp "," #Criir "," #Criir"{%k6}" ";\n"

#define ZEND2f(Criir,Ciirr, tmp) "vshufps $0xb1," #Ciirr "," #Ciirr "," #tmp ";\n"\
  "vsubps " #tmp "," #Ciirr "," #Criir"{%k7}" ";\n"

#define ZEND1d(Criir,Ciirr, tmp) "vshufpd $0x55," #Criir "," #Criir "," #tmp ";\n"\
  "vaddpd " #tmp "," #Criir "," #Criir"{%k6}" ";\n"

#define ZEND2d(Criir,Ciirr, tmp) "vshufpd $0x55," #Ciirr "," #Ciirr "," #tmp ";\n"\
  "vsubpd " #tmp "," #Ciirr "," #Criir"{%k7};\n" // ri+ir ; ri+ir,rr-ii

#define VMOVRDUPd(OFF,A,DEST)   "vpshufd  $0x44," #OFF "*64(" #A ")," #DEST ";\n"  // 32 bit level: 1,0,3,2
#define VMOVIDUPd(OFF,A,DEST)   "vpshufd  $0xee," #OFF "*64(" #A ")," #DEST ";\n"  // 32 bit level: 3,2,3,2
#define VMOVRDUPf(OFF,PTR,DEST) "vmovsldup " #OFF "*64(" #PTR "), " #DEST ";\n"
#define VMOVIDUPf(OFF,PTR,DEST) "vmovshdup " #OFF "*64(" #PTR "), " #DEST ";\n"

#define VRDUPd(SRC,DEST) "vpshufd $0x44," #SRC"," #DEST ";\n"  // 32 bit level: 1,0,3,2
#define VRDUPf(SRC,DEST) "vmovsldup " #SRC ", " #DEST ";\n"
#define VIDUPd(SRC,DEST) "vpshufd $0xee," #SRC"," #DEST ";\n"  // 32 bit level: 3,2,3,2
#define VIDUPf(SRC,DEST) "vmovshdup " #SRC ", " #DEST ";\n"

#define VBCASTRDUPd(OFF,A,DEST)   "vbroadcastsd (" #OFF "*16+0)(" #A ")," #DEST ";\n"
#define VBCASTIDUPd(OFF,A,DEST)   "vbroadcastsd (" #OFF "*16+8)(" #A ")," #DEST ";\n"
#define VBCASTRDUPf(OFF,PTR,DEST) "vbroadcastss (" #OFF "*8 +0)(" #PTR "), " #DEST ";\n"
#define VBCASTIDUPf(OFF,PTR,DEST) "vbroadcastss (" #OFF "*8 +4)(" #PTR "), " #DEST ";\n"
#define VBCASTCDUPf(OFF,A,DEST)   "vbroadcastsd (" #OFF "*64 )(" #A ")," #DEST ";\n"
#define VBCASTZDUPf(OFF,A,DEST)   "vbroadcastf32x4 (" #OFF "*64 )(" #A ")," #DEST ";\n"
#define VBCASTCDUP(OFF,A,DEST) VBCASTCDUPf(OFF,A,DEST)
#define VBCASTZDUP(OFF,A,DEST) VBCASTZDUPf(OFF,A,DEST)

#define VMADDSUBf(A,B,accum)      "vfmaddsub231ps " #A "," #B "," #accum ";\n"
#define VMADDSUBd(A,B,accum)      "vfmaddsub231pd " #A "," #B "," #accum ";\n"
#define VMADDSUBMEMf(O,P,B,accum) "vfmaddsub231ps " #O"*64("#P "),"#B "," #accum ";\n"
#define VMADDSUBMEMd(O,P,B,accum) "vfmaddsub231pd " #O"*64("#P "),"#B "," #accum ";\n"

#define VMADDRDUPf(O,P,B,accum)    "vfmadd231ps    (" #O"*8+0)("#P "){1to16},"#B "," #accum ";\n"
#define VMADDIDUPf(O,P,B,accum)    "vfmadd231ps    (" #O"*8+4)("#P "){1to16},"#B "," #accum ";\n"
#define VMADDSUBRDUPf(O,P,B,accum) "vfmaddsub231ps (" #O"*8+0)("#P "){1to16},"#B "," #accum ";\n"
#define VMADDSUBIDUPf(O,P,B,accum) "vfmaddsub231ps (" #O"*8+4)("#P "){1to16},"#B "," #accum ";\n"
#define VMULRDUPf(O,P,B,accum)     "vmulps         (" #O"*8+0)("#P "){1to16},"#B "," #accum ";\n"
#define VMULIDUPf(O,P,B,accum)     "vmulps         (" #O"*8+4)("#P "){1to16},"#B "," #accum ";\n"

#define VMADDRDUPd(O,P,B,accum)    "vfmadd231pd    (" #O"*16+0)("#P "){1to8},"#B "," #accum ";\n"
#define VMADDIDUPd(O,P,B,accum)    "vfmadd231pd    (" #O"*16+8)("#P "){1to8},"#B "," #accum ";\n"
#define VMADDSUBRDUPd(O,P,B,accum) "vfmaddsub231pd (" #O"*16+0)("#P "){1to8},"#B "," #accum ";\n"
#define VMADDSUBIDUPd(O,P,B,accum) "vfmaddsub231pd (" #O"*16+8)("#P "){1to8},"#B "," #accum ";\n"
#define VMULRDUPd(O,P,B,accum)     "vmulpd         (" #O"*16+0)("#P "){1to8},"#B "," #accum ";\n"
#define VMULIDUPd(O,P,B,accum)     "vmulpd         (" #O"*16+8)("#P "){1to8},"#B "," #accum ";\n"
/*
 * TimesI is used only in the XP recon
 * Could zero the regs and use RECON_ACCUM
 */

#define VTIMESI0f(A,DEST, Z) VSHUFf(A,DEST)
#define VTIMESI1f(A,DEST, Z) "vaddps " #DEST "," #Z "," #DEST"{%k6}" ";\n"
#define VTIMESI2f(A,DEST, Z) "vsubps " #DEST "," #Z "," #DEST"{%k7}" ";\n"

#define VTIMESI0d(A,DEST, Z) VSHUFd(A,DEST)
#define VTIMESI1d(A,DEST, Z) "vaddpd " #DEST "," #Z "," #DEST"{%k6}" ";\n"
#define VTIMESI2d(A,DEST, Z) "vsubpd " #DEST "," #Z "," #DEST"{%k7}" ";\n"

#define VTIMESMINUSI0f(A,DEST,Z) VSHUFf(A,DEST)
#define VTIMESMINUSI1f(A,DEST,Z) "vsubps " #DEST "," #Z "," #DEST"{%k6}" ";\n"
#define VTIMESMINUSI2f(A,DEST,Z) "vaddps " #DEST "," #Z "," #DEST"{%k7}" ";\n"

#define VTIMESMINUSI0d(A,DEST,Z) VSHUFd(A,DEST)
#define VTIMESMINUSI1d(A,DEST,Z) "vsubpd " #DEST "," #Z "," #DEST"{%k6}" ";\n"
#define VTIMESMINUSI2d(A,DEST,Z) "vaddpd " #DEST "," #Z "," #DEST"{%k7}" ";\n"

#if 0

#define VACCTIMESMINUSI0f(A,ACC,tmp) VSHUFf(A,tmp)
#define VACCTIMESMINUSI1f(A,ACC,tmp) "vsubps " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
#define VACCTIMESMINUSI2f(A,ACC,tmp) "vaddps " #tmp "," #ACC "," #ACC"{%k7}" ";\n"

#define VACCTIMESMINUSI0d(A,ACC,tmp) VSHUFd(A,tmp)
#define VACCTIMESMINUSI1d(A,ACC,tmp) "vsubpd " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
#define VACCTIMESMINUSI2d(A,ACC,tmp) "vaddpd " #tmp "," #ACC "," #ACC"{%k7}" ";\n"

#define VACCTIMESI0f(A,ACC,tmp) VSHUFf(A,tmp)
#define VACCTIMESI1f(A,ACC,tmp) "vaddps " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
#define VACCTIMESI2f(A,ACC,tmp) "vsubps " #tmp "," #ACC "," #ACC"{%k7}" ";\n"

#define VACCTIMESI0d(A,ACC,tmp) VSHUFd(A,tmp)
#define VACCTIMESI1d(A,ACC,tmp) "vaddpd " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
#define VACCTIMESI2d(A,ACC,tmp) "vsubpd " #tmp "," #ACC "," #ACC"{%k7}" ";\n"

#else

// o_p must point to floating 1.0f/d
//
// Ai, Ar  -> tmp (r i)
// tmp  *1.0
// ACC i - Ar ; ACC r + Ai
#define VACCTIMESMINUSI0f(A,ACC,tmp) VSHUFf(A,tmp)
#define VACCTIMESMINUSI1f(A,ACC,tmp) VMADDMEMf(1,%r10,tmp,ACC)
#define VACCTIMESMINUSI2f(A,ACC,tmp)

#define VACCTIMESMINUSI0d(A,ACC,tmp) VSHUFd(A,tmp)
#define VACCTIMESMINUSI1d(A,ACC,tmp) VMADDMEMd(1,%r10,tmp,ACC)
#define VACCTIMESMINUSI2d(A,ACC,tmp)

// Ai, Ar  -> tmp (r i)
// tmp  *1.0
// ACC i + Ar ; ACC r - Ai
#define VACCTIMESI0f(A,ACC,tmp) VSHUFf(A,tmp)
#define VACCTIMESI1f(A,ACC,tmp) VMADDMEMf(0,%r10,tmp,ACC)
#define VACCTIMESI2f(A,ACC,tmp)

#define VACCTIMESI0d(A,ACC,tmp) VSHUFd(A,tmp)
#define VACCTIMESI1d(A,ACC,tmp) VMADDMEMd(0,%r10,tmp,ACC)
#define VACCTIMESI2d(A,ACC,tmp)

#endif
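/* The #else branch above trades the masked add/sub pair for one fused
 * multiply-add against a constant vector addressed via %r10: VSHUF first
 * swaps the re/im lanes of A into tmp, and the memory operand (assumed to
 * hold lane-alternating +/-1.0 patterns at offsets 0 and 1) supplies the
 * signs that turn the swapped lanes into +/-i times A. Scalar picture of the
 * two cases (a sketch): */
#if 0
//  i*(a_r + i*a_i) = -a_i + i*a_r  ->  acc += ( -a_i, +a_r )
// -i*(a_r + i*a_i) =  a_i - i*a_r  ->  acc += ( +a_i, -a_r )
#endif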

#define VPERM0f(A,B) "vshuff32x4 $0x4e," #A "," #B "," #B ";\n"
#define VPERM1f(A,B) "vshuff32x4 $0xb1," #A "," #B "," #B ";\n"
#define VPERM2f(A,B) "vshufps    $0x4e," #A "," #B "," #B ";\n"
#define VPERM3f(A,B) "vshufps    $0xb1," #A "," #B "," #B ";\n"

#define VPERM0d(A,B) "vshuff64x2 $0x4e," #A "," #B "," #B ";\n"
#define VPERM1d(A,B) "vshuff64x2 $0xb1," #A "," #B "," #B ";\n"
#define VPERM2d(A,B) "vshufpd    $0x55," #A "," #B "," #B ";\n"
#define VPERM3d(A,B) VMOVd(A,B)

#endif
159
Grid/simd/Intel512common.h
Normal file
@ -0,0 +1,159 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/simd/Intel512common.h

    Copyright (C) 2015

Author: paboyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_ASM_INTEL_COMMON_512_H
#define GRID_ASM_INTEL_COMMON_512_H

////////////////////////////////////////////////////////////////////////////////////////////////////
// Performance options
////////////////////////////////////////////////////////////////////////////////////////////////////
#undef AVX512_PF_L2_WRITE

////////////////////////////////////////////////////////////////////////////////////////////////////
// Opcodes common
////////////////////////////////////////////////////////////////////////////////////////////////////
#define MASK_REGS \
  __asm__ ("mov     $0xAAAA, %%eax \n"\
           "kmovw   %%eax, %%k6 \n"\
           "mov     $0x5555, %%eax \n"\
           "kmovw   %%eax, %%k7 \n" : : : "%eax");

//#define label(B)  __asm__ ( __func__ _LINE__ #B ":\n" );

#define VZEROf(A) "vpxorq " #A "," #A "," #A ";\n"
#define VZEROd(A) "vpxorq " #A "," #A "," #A ";\n"

#define VTIMESIf(A,DEST, Z) \
  VTIMESI0f(A,DEST, Z) \
  VTIMESI1f(A,DEST, Z) \
  VTIMESI2f(A,DEST, Z)

#define VTIMESId(A,DEST, Z) \
  VTIMESI0d(A,DEST, Z) \
  VTIMESI1d(A,DEST, Z) \
  VTIMESI2d(A,DEST, Z)

#define VTIMESMINUSIf(A,DEST, Z) \
  VTIMESMINUSI0f(A,DEST, Z) \
  VTIMESMINUSI1f(A,DEST, Z) \
  VTIMESMINUSI2f(A,DEST, Z)

#define VTIMESMINUSId(A,DEST, Z) \
  VTIMESMINUSI0d(A,DEST, Z) \
  VTIMESMINUSI1d(A,DEST, Z) \
  VTIMESMINUSI2d(A,DEST, Z)

#define VACCTIMESIf(A,ACC,tmp) \
  VACCTIMESI0f(A,ACC,tmp) \
  VACCTIMESI1f(A,ACC,tmp) \
  VACCTIMESI2f(A,ACC,tmp)

#define VACCTIMESId(A,ACC,tmp) \
  VACCTIMESI0d(A,ACC,tmp) \
  VACCTIMESI1d(A,ACC,tmp) \
  VACCTIMESI2d(A,ACC,tmp)

#define VACCTIMESMINUSIf(A,ACC,tmp) \
  VACCTIMESMINUSI0f(A,ACC,tmp) \
  VACCTIMESMINUSI1f(A,ACC,tmp) \
  VACCTIMESMINUSI2f(A,ACC,tmp)

#define VACCTIMESMINUSId(A,ACC,tmp) \
  VACCTIMESMINUSI0d(A,ACC,tmp) \
  VACCTIMESMINUSI1d(A,ACC,tmp) \
  VACCTIMESMINUSI2d(A,ACC,tmp)

#define LOAD64a(A,ptr)  "movq %0, %" #A : : "r"(ptr) : #A
#define LOAD64i(A,ptr)  __asm__ ( LOAD64a(A,ptr));
#define LOAD64(A,ptr)   LOAD64i(A,ptr)

#define VMOVf(A,DEST) "vmovaps  " #A ", " #DEST ";\n"
#define VMOVd(A,DEST) "vmovapd  " #A ", " #DEST ";\n"

#define VPREFETCH1(O,A) "prefetcht0 "#O"*64("#A");\n"
#define VPREFETCH2(O,A) "prefetcht1 "#O"*64("#A");\n"
#ifdef AVX512_PF_L2_WRITE
#define VPREFETCHW(O,A) "prefetchwt1 "#O"*64("#A");\n"
#else
#define VPREFETCHW(O,A)
#endif
#define VPREFETCHNTA(O,A)
#define VPREFETCH(O,A)

#define VEVICT(O,A)

//"vprefetche0 "#O"*64("#A");\n" "vprefetche1 ("#O"+12)*64("#A");\n"
// "clevict0 "#O"*64("#A");\n"

#define VLOADf(OFF,PTR,DEST) "vmovups  " #OFF "*64(" #PTR "), " #DEST ";\n"
#define VLOADd(OFF,PTR,DEST) "vmovupd  " #OFF "*64(" #PTR "), " #DEST ";\n"

#define VADDf(A,B,DEST) "vaddps  " #A "," #B "," #DEST ";\n"
#define VADDd(A,B,DEST) "vaddpd  " #A "," #B "," #DEST ";\n"

#define VSUBf(A,B,DEST) "vsubps  " #A "," #B "," #DEST ";\n"
#define VSUBd(A,B,DEST) "vsubpd  " #A "," #B "," #DEST ";\n"

#define VADDMEMf(O,A,B,DEST) "vaddps  "#O"*64("#A ")," #B "," #DEST ";\n"
#define VADDMEMd(O,A,B,DEST) "vaddpd  "#O"*64("#A ")," #B "," #DEST ";\n"

#define VSUBMEMf(O,A,B,DEST) "vsubps  "#O"*64("#A ")," #B "," #DEST ";\n"
#define VSUBMEMd(O,A,B,DEST) "vsubpd  "#O"*64("#A ")," #B "," #DEST ";\n"

#define VMULf(A,B,DEST) "vmulps  " #A "," #B "," #DEST ";\n"
#define VMULd(A,B,DEST) "vmulpd  " #A "," #B "," #DEST ";\n"

#define VMADDf(A,B,DEST) "vfmadd231ps  " #A "," #B "," #DEST ";\n"
#define VMADDd(A,B,DEST) "vfmadd231pd  " #A "," #B "," #DEST ";\n"

#define VMULMEMf(O,A,B,DEST) "vmulps  " #O"*64("#A ")," #B "," #DEST ";\n"
#define VMULMEMd(O,A,B,DEST) "vmulpd  " #O"*64("#A ")," #B "," #DEST ";\n"

#define VMADDMEMf(O,A,B,DEST) "vfmadd231ps  " #O"*64("#A "),"#B "," #DEST ";\n"
#define VMADDMEMd(O,A,B,DEST) "vfmadd231pd  " #O"*64("#A "),"#B "," #DEST ";\n"

#define ZLOADf(OFF,PTR,ri,ir)  VLOADf(OFF,PTR,ir)  VSHUFf(ir,ri)
#define ZLOADd(OFF,PTR,ri,ir)  VLOADd(OFF,PTR,ir)  VSHUFd(ir,ri)

#define STREAM_STORE
#ifdef STREAM_STORE
#define VSTOREf(OFF,PTR,SRC) "vmovntps " #SRC "," #OFF "*64(" #PTR ")" ";\n"
#define VSTOREd(OFF,PTR,SRC) "vmovntpd " #SRC "," #OFF "*64(" #PTR ")" ";\n"
#else
#define VSTOREf(OFF,PTR,SRC) "vmovups " #SRC "," #OFF "*64(" #PTR ")" ";\n"
#define VSTOREd(OFF,PTR,SRC) "vmovupd " #SRC "," #OFF "*64(" #PTR ")" ";\n"
#endif
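// Design note: with STREAM_STORE enabled the result spinor is written with
// non-temporal moves (vmovntps/vmovntpd), bypassing the cache hierarchy.
// That suits the kernel's write-once output stream, at the cost that the
// data is not cache-resident if it is re-read immediately afterwards.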

// Swaps Re/Im ; could unify this with IMCI
#define VSHUFd(A,DEST)        "vpshufd $0x4e," #A "," #DEST ";\n"
#define VSHUFf(A,DEST)        "vpshufd $0xb1," #A "," #DEST ";\n"
#define VSHUFMEMd(OFF,A,DEST) "vpshufd $0x4e, " #OFF"*64("#A ")," #DEST ";\n" // 32 bit level: 1,0,3,2
#define VSHUFMEMf(OFF,A,DEST) "vpshufd $0xb1, " #OFF"*64("#A ")," #DEST ";\n" // 32 bit level: 2,3,0,1

#define TRAP " int3 ;\n"

#endif
156
Grid/simd/Intel512double.h
Normal file
@ -0,0 +1,156 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/simd/Intel512double.h

    Copyright (C) 2015

Author: paboyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
// No guard; can be multiply included as undef clearance
#undef  VZERO
#undef  VMOV
#undef  VLOAD
#undef  VSTORE
#define VZERO(A)            VZEROd(A)
#define VMOV(A,B)           VMOVd(A,B)
#define VLOAD(OFF,PTR,DEST) VLOADd(OFF,PTR,DEST)
#define VSTORE(OFF,PTR,SRC) VSTOREd(OFF,PTR,SRC)

#undef  VADD
#undef  VSUB
#undef  VMUL
#undef  VMADD
#define VADD(A,B,C)        VADDd(A,B,C)
#define VSUB(A,B,C)        VSUBd(A,B,C)
#define VMUL(Uri,Uir,Chi)  VMULd(Uri,Uir,Chi)
#define VMADD(Uri,Uir,Chi) VMADDd(Uri,Uir,Chi)

#undef  VTIMESI
#undef  VTIMESI0
#undef  VTIMESI1
#undef  VTIMESI2
#define VTIMESI(A,B,C)  VTIMESId(A,B,C)
#define VTIMESI0(A,B,C) VTIMESI0d(A,B,C)
#define VTIMESI1(A,B,C) VTIMESI1d(A,B,C)
#define VTIMESI2(A,B,C) VTIMESI2d(A,B,C)

#undef  VTIMESMINUSI
#undef  VTIMESMINUSI0
#undef  VTIMESMINUSI1
#undef  VTIMESMINUSI2
#define VTIMESMINUSI(A,B,C)  VTIMESMINUSId(A,B,C)
#define VTIMESMINUSI0(A,B,C) VTIMESMINUSI0d(A,B,C)
#define VTIMESMINUSI1(A,B,C) VTIMESMINUSI1d(A,B,C)
#define VTIMESMINUSI2(A,B,C) VTIMESMINUSI2d(A,B,C)

#undef  VACCTIMESI
#undef  VACCTIMESI0
#undef  VACCTIMESI1
#undef  VACCTIMESI2
#define VACCTIMESI(A,B,C)  VACCTIMESId(A,B,C)
#define VACCTIMESI0(A,B,C) VACCTIMESI0d(A,B,C)
#define VACCTIMESI1(A,B,C) VACCTIMESI1d(A,B,C)
#define VACCTIMESI2(A,B,C) VACCTIMESI2d(A,B,C)

#undef  VACCTIMESMINUSI
#undef  VACCTIMESMINUSI0
#undef  VACCTIMESMINUSI1
#undef  VACCTIMESMINUSI2
#define VACCTIMESMINUSI(A,B,C)  VACCTIMESMINUSId(A,B,C)
#define VACCTIMESMINUSI0(A,B,C) VACCTIMESMINUSI0d(A,B,C)
#define VACCTIMESMINUSI1(A,B,C) VACCTIMESMINUSI1d(A,B,C)
#define VACCTIMESMINUSI2(A,B,C) VACCTIMESMINUSI2d(A,B,C)

#undef  VACCTIMESI1MEM
#undef  VACCTIMESI2MEM
#define VACCTIMESI1MEM(A,ACC,O,P) VACCTIMESI1MEMd(A,ACC,O,P)
#define VACCTIMESI2MEM(A,ACC,O,P) VACCTIMESI2MEMd(A,ACC,O,P)

#undef  VACCTIMESMINUSI1MEM
#undef  VACCTIMESMINUSI2MEM
#define VACCTIMESMINUSI1MEM(A,ACC,O,P) VACCTIMESMINUSI1MEMd(A,ACC,O,P)
#define VACCTIMESMINUSI2MEM(A,ACC,O,P) VACCTIMESMINUSI2MEMd(A,ACC,O,P)

#undef  VPERM0
#undef  VPERM1
#undef  VPERM2
#undef  VPERM3
#define VPERM0(A,B) VPERM0d(A,B)
#define VPERM1(A,B) VPERM1d(A,B)
#define VPERM2(A,B) VPERM2d(A,B)
#define VPERM3(A,B) VPERM3d(A,B)

#undef  VSHUFMEM
#undef  VADDMEM
#undef  VSUBMEM
#define VSHUFMEM(OFF,A,DEST) VSHUFMEMd(OFF,A,DEST)
#define VADDMEM(O,A,B,C) VADDMEMd(O,A,B,C)
#define VSUBMEM(O,A,B,C) VSUBMEMd(O,A,B,C)

#undef  VMOVIDUP
#undef  VMOVRDUP
#undef  VMADDSUB
#undef  VSHUF
#define VMOVIDUP(A,B,C)     VMOVIDUPd(A,B,C)
#define VMOVRDUP(A,B,C)     VMOVRDUPd(A,B,C)
#define VMADDSUB(A,B,accum) VMADDSUBd(A,B,accum)
#define VSHUF(A,B)          VSHUFd(A,B)

#undef ZEND1
#undef ZEND2
#undef ZLOAD
#undef ZMUL
#undef ZMADD
#undef ZMULMEM2SP
#undef ZMADDMEM2SP

#define ZEND1(A,B,C) ZEND1d(A,B,C)
#define ZEND2(A,B,C) ZEND2d(A,B,C)
#define ZLOAD(A,B,C,D) ZLOADd(A,B,C,D)
#define ZMUL(A,B,C,D,E) ZMULd(A,B,C,D,E)
#define ZMADD(A,B,C,D,E) ZMADDd(A,B,C,D,E)
#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)  ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)

#undef  VRDUP
#undef  VIDUP
#undef  VMADDSUBMEM
#undef  VMADDMEM
#undef  VMULMEM
#define VRDUP(SRC,DEST)          VRDUPd(SRC,DEST)
#define VIDUP(SRC,DEST)          VIDUPd(SRC,DEST)
#define VMADDSUBMEM(O,P,B,accum) VMADDSUBMEMd(O,P,B,accum)
#define VMADDMEM(O,P,B,accum)    VMADDMEMd(O,P,B,accum)
#define VMULMEM(O,P,B,accum)     VMULMEMd(O,P,B,accum)
#undef  VMADDRDUP
#undef  VMADDSUBRDUP
#undef  VMADDSUBIDUP
#undef  VMULRDUP
#undef  VMULIDUP
#define VMADDRDUP(O,P,B,accum)    VMADDRDUPd(O,P,B,accum)
#define VMADDSUBRDUP(O,P,B,accum) VMADDSUBRDUPd(O,P,B,accum)
#define VMADDSUBIDUP(O,P,B,accum) VMADDSUBIDUPd(O,P,B,accum)
#define VMULRDUP(O,P,B,accum)     VMULRDUPd(O,P,B,accum)
#define VMULIDUP(O,P,B,accum)     VMULIDUPd(O,P,B,accum)
127
Grid/simd/Intel512imci.h
Normal file
@ -0,0 +1,127 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/simd/Intel512imci.h

    Copyright (C) 2015

Author: paboyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_ASM_AV512_H
#define GRID_ASM_AV512_H

////////////////////////////////////////////////////////////
// Knights Corner specials
////////////////////////////////////////////////////////////

#define ZLOADf(OFF,PTR,ri,ir)  VLOADf(OFF,PTR,ir)  VSHUFf(ir,ri)
#define ZLOADd(OFF,PTR,ri,ir)  VLOADd(OFF,PTR,ir)  VSHUFd(ir,ri)

#define ZMULf(Ari,Air,B,Criir,Ciirr) VMULf(Ari,B,Criir)  VMULf(Air,B,Ciirr)
#define ZMULd(Ari,Air,B,Criir,Ciirr) VMULd(Ari,B,Criir)  VMULd(Air,B,Ciirr)

#define ZMADDf(Ari,Air,B,Criir,Ciirr) VMADDf(Ari,B,Criir) VMADDf(Air,B,Ciirr)
#define ZMADDd(Ari,Air,B,Criir,Ciirr) VMADDd(Ari,B,Criir) VMADDd(Air,B,Ciirr)

#define ZENDf(Criir,Ciirr, tmp) ZEND1f(Criir,Ciirr, tmp) ZEND2f(Criir,Ciirr, tmp)
#define ZENDd(Criir,Ciirr, tmp) ZEND1d(Criir,Ciirr, tmp) ZEND2d(Criir,Ciirr, tmp)

#define ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
  VSHUFMEMf(O,P,tmp) \
  VMULMEMf(O,P,B,Biirr) \
  VMULMEMf(O,P,C,Ciirr) \
  VMULf(tmp,B,Briir) \
  VMULf(tmp,C,Criir)

#define ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
  VSHUFMEMd(O,P,tmp) \
  VMULMEMd(O,P,B,Biirr) \
  VMULMEMd(O,P,C,Ciirr) \
  VMULd(tmp,B,Briir) \
  VMULd(tmp,C,Criir)

#define ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
  VSHUFMEMf(O,P,tmp) \
  VMADDMEMf(O,P,B,Biirr) \
  VMADDMEMf(O,P,C,Ciirr) \
  VMADDf(tmp,B,Briir) \
  VMADDf(tmp,C,Criir)

#define ZMADDMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) \
  VSHUFMEMd(O,P,tmp) \
  VMADDMEMd(O,P,B,Biirr) \
  VMADDMEMd(O,P,C,Ciirr) \
  VMADDd(tmp,B,Briir) \
  VMADDd(tmp,C,Criir)

#define ZEND1d(Criir,Ciirr, tmp) "vaddpd " #Criir "{cdab} ," #Criir "," #Criir"{%k6}" ";\n"
#define ZEND2d(Criir,Ciirr, tmp) "vsubpd " #Ciirr "{cdab} ," #Ciirr "," #Criir"{%k7}" ";\n"

#define ZEND1f(Criir,Ciirr, tmp) "vaddps " #Criir "{cdab} ," #Criir "," #Criir"{%k6}" ";\n"
#define ZEND2f(Criir,Ciirr, tmp) "vsubps " #Ciirr "{cdab} ," #Ciirr "," #Criir"{%k7}" ";\n"

#define VTIMESI0f(A,DEST, Z)
#define VTIMESI1f(A,DEST, Z) "vaddps " #A "{cdab}," #Z "," #DEST"{%k7}" ";\n"
#define VTIMESI2f(A,DEST, Z) "vsubps " #A "{cdab}," #Z "," #DEST"{%k6}" ";\n"

#define VTIMESI0d(A,DEST, Z)
#define VTIMESI1d(A,DEST, Z) "vaddpd " #A "{cdab}," #Z "," #DEST"{%k7}" ";\n"
#define VTIMESI2d(A,DEST, Z) "vsubpd " #A "{cdab}," #Z "," #DEST"{%k6}" ";\n"

#define VTIMESMINUSI0f(A,DEST,Z)
#define VTIMESMINUSI1f(A,DEST,Z) "vsubps " #A "{cdab}," #Z "," #DEST"{%k7}" ";\n"
#define VTIMESMINUSI2f(A,DEST,Z) "vaddps " #A "{cdab}," #Z "," #DEST"{%k6}" ";\n"

#define VTIMESMINUSI0d(A,DEST,Z)
#define VTIMESMINUSI1d(A,DEST,Z) "vsubpd " #A "{cdab}," #Z "," #DEST"{%k7}" ";\n"
#define VTIMESMINUSI2d(A,DEST,Z) "vaddpd " #A "{cdab}," #Z "," #DEST"{%k6}" ";\n"

#define VACCTIMESI0f(A,ACC,tmp)
#define VACCTIMESI1f(A,ACC,tmp) "vaddps " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n"
#define VACCTIMESI2f(A,ACC,tmp) "vsubps " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n"

#define VACCTIMESI0d(A,ACC,tmp)
#define VACCTIMESI1d(A,ACC,tmp) "vaddpd " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n"
#define VACCTIMESI2d(A,ACC,tmp) "vsubpd " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n"

#define VACCTIMESMINUSI0f(A,ACC,tmp)
#define VACCTIMESMINUSI1f(A,ACC,tmp) "vsubps " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n"
#define VACCTIMESMINUSI2f(A,ACC,tmp) "vaddps " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n"

// Acc = Acc - i A
#define VACCTIMESMINUSI0d(A,ACC,tmp)
#define VACCTIMESMINUSI1d(A,ACC,tmp) "vsubpd " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n"
#define VACCTIMESMINUSI2d(A,ACC,tmp) "vaddpd " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n"

//((1<<6)|(0<<4)|(3<<2)|(2)) == 0100,1110 = 0x4e
//((2<<6)|(3<<4)|(0<<2)|(1)) == 1011,0001 = 0xb1

#define VPERM0f(A,B) "vpermf32x4  $0x4e," #A "," #B ";\n"
#define VPERM1f(A,B) "vpermf32x4  $0xb1," #A "," #B ";\n"
#define VPERM2f(A,B) "vmovaps " #A "{badc}," #B ";\n"
#define VPERM3f(A,B) "vmovaps " #A "{cdab}," #B ";\n"

#define VPERM0d(A,B) "vpermf32x4  $0x4e," #A "," #B ";\n"
#define VPERM1d(A,B) "vmovapd " #A "{badc}," #B ";\n"
#define VPERM2d(A,B) "vmovapd " #A "{cdab}," #B ";\n"
#define VPERM3d(A,B) VMOVd(A,B)
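// Orientation note (a sketch of intent): the VPERMn macros exchange lanes at
// a stride that halves as n grows -- VPERM0 swaps the two 256-bit halves,
// VPERM1/VPERM2 swap progressively finer blocks, and VPERM3 degenerates to a
// move for doubles. Grid uses these to fetch the neighbour's lanes when a
// lattice direction is folded into the SIMD vector.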

#endif
157
Grid/simd/Intel512single.h
Normal file
@ -0,0 +1,157 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/simd/Intel512single.h

    Copyright (C) 2015

Author: paboyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
// No guard; can be multiply included as undef clearance of macros
#undef  VZERO
#undef  VMOV
#undef  VLOAD
#undef  VSTORE
#define VZERO(A)            VZEROf(A)
#define VMOV(A,B)           VMOVf(A,B)
#define VLOAD(OFF,PTR,DEST) VLOADf(OFF,PTR,DEST)
#define VSTORE(OFF,PTR,SRC) VSTOREf(OFF,PTR,SRC)

#undef  VADD
#undef  VSUB
#undef  VMUL
#undef  VMADD
#define VADD(A,B,C)        VADDf(A,B,C)
#define VSUB(A,B,C)        VSUBf(A,B,C)
#define VMUL(Uri,Uir,Chi)  VMULf(Uri,Uir,Chi)
#define VMADD(Uri,Uir,Chi) VMADDf(Uri,Uir,Chi)

#undef  VTIMESI
#undef  VTIMESI0
#undef  VTIMESI1
#undef  VTIMESI2
#define VTIMESI(A,B,C)  VTIMESIf(A,B,C)
#define VTIMESI0(A,B,C) VTIMESI0f(A,B,C)
#define VTIMESI1(A,B,C) VTIMESI1f(A,B,C)
#define VTIMESI2(A,B,C) VTIMESI2f(A,B,C)

#undef  VTIMESMINUSI
#undef  VTIMESMINUSI0
#undef  VTIMESMINUSI1
#undef  VTIMESMINUSI2
#define VTIMESMINUSI(A,B,C)  VTIMESMINUSIf(A,B,C)
#define VTIMESMINUSI0(A,B,C) VTIMESMINUSI0f(A,B,C)
#define VTIMESMINUSI1(A,B,C) VTIMESMINUSI1f(A,B,C)
#define VTIMESMINUSI2(A,B,C) VTIMESMINUSI2f(A,B,C)

#undef  VACCTIMESI
#undef  VACCTIMESI0
#undef  VACCTIMESI1
#undef  VACCTIMESI2
#define VACCTIMESI(A,B,C)  VACCTIMESIf(A,B,C)
#define VACCTIMESI0(A,B,C) VACCTIMESI0f(A,B,C)
#define VACCTIMESI1(A,B,C) VACCTIMESI1f(A,B,C)
#define VACCTIMESI2(A,B,C) VACCTIMESI2f(A,B,C)

#undef  VACCTIMESMINUSI
#undef  VACCTIMESMINUSI0
#undef  VACCTIMESMINUSI1
#undef  VACCTIMESMINUSI2
#define VACCTIMESMINUSI(A,B,C)  VACCTIMESMINUSIf(A,B,C)
#define VACCTIMESMINUSI0(A,B,C) VACCTIMESMINUSI0f(A,B,C)
#define VACCTIMESMINUSI1(A,B,C) VACCTIMESMINUSI1f(A,B,C)
#define VACCTIMESMINUSI2(A,B,C) VACCTIMESMINUSI2f(A,B,C)

#undef  VACCTIMESI1MEM
#undef  VACCTIMESI2MEM
#define VACCTIMESI1MEM(A,ACC,O,P) VACCTIMESI1MEMf(A,ACC,O,P)
#define VACCTIMESI2MEM(A,ACC,O,P) VACCTIMESI2MEMf(A,ACC,O,P)

#undef  VACCTIMESMINUSI1MEM
#undef  VACCTIMESMINUSI2MEM
#define VACCTIMESMINUSI1MEM(A,ACC,O,P) VACCTIMESMINUSI1MEMf(A,ACC,O,P)
#define VACCTIMESMINUSI2MEM(A,ACC,O,P) VACCTIMESMINUSI2MEMf(A,ACC,O,P)

#undef  VPERM0
#undef  VPERM1
#undef  VPERM2
#undef  VPERM3
#define VPERM0(A,B) VPERM0f(A,B)
#define VPERM1(A,B) VPERM1f(A,B)
#define VPERM2(A,B) VPERM2f(A,B)
#define VPERM3(A,B) VPERM3f(A,B)

#undef  VSHUFMEM
#undef  VADDMEM
#undef  VSUBMEM
#define VSHUFMEM(OFF,A,DEST) VSHUFMEMf(OFF,A,DEST)
#define VADDMEM(O,A,B,C) VADDMEMf(O,A,B,C)
#define VSUBMEM(O,A,B,C) VSUBMEMf(O,A,B,C)

#undef  VMOVIDUP
#undef  VMOVRDUP
#undef  VMADDSUB
#undef  VSHUF
#define VMOVIDUP(A,B,C)     VMOVIDUPf(A,B,C)
#define VMOVRDUP(A,B,C)     VMOVRDUPf(A,B,C)
#define VMADDSUB(A,B,accum) VMADDSUBf(A,B,accum)
#define VSHUF(A,B)          VSHUFf(A,B)

#undef ZEND1
#undef ZEND2
#undef ZLOAD
#undef ZMUL
#undef ZMADD
#undef ZMULMEM2SP
#undef ZMADDMEM2SP

#define ZEND1(A,B,C) ZEND1f(A,B,C)
#define ZEND2(A,B,C) ZEND2f(A,B,C)
#define ZLOAD(A,B,C,D) ZLOADf(A,B,C,D)
#define ZMUL(A,B,C,D,E) ZMULf(A,B,C,D,E)
#define ZMADD(A,B,C,D,E) ZMADDf(A,B,C,D,E)
#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)  ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)

#undef  VRDUP
#undef  VIDUP
#undef  VMADDSUBMEM
#undef  VMADDMEM
#undef  VMULMEM

#define VRDUP(SRC,DEST)          VRDUPf(SRC,DEST)
#define VIDUP(SRC,DEST)          VIDUPf(SRC,DEST)
#define VMADDSUBMEM(O,P,B,accum) VMADDSUBMEMf(O,P,B,accum)
#define VMADDMEM(O,P,B,accum)    VMADDMEMf(O,P,B,accum)
#define VMULMEM(O,P,B,accum)     VMULMEMf(O,P,B,accum)

#undef  VMADDRDUP
#undef  VMADDSUBRDUP
#undef  VMADDSUBIDUP
#undef  VMULRDUP
#undef  VMULIDUP
#define VMADDRDUP(O,P,B,accum)    VMADDRDUPf(O,P,B,accum)
#define VMADDSUBRDUP(O,P,B,accum) VMADDSUBRDUPf(O,P,B,accum)
#define VMADDSUBIDUP(O,P,B,accum) VMADDSUBIDUPf(O,P,B,accum)
#define VMULRDUP(O,P,B,accum)     VMULRDUPf(O,P,B,accum)
#define VMULIDUP(O,P,B,accum)     VMULIDUPf(O,P,B,accum)
938
Grid/simd/Intel512wilson.h
Normal file
@ -0,0 +1,938 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/simd/Intel512wilson.h

    Copyright (C) 2015

Author: paboyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_ASM_INTEL_512_QCD_H
#define GRID_ASM_INTEL_512_QCD_H

//////////////////////////////////////////////////////////////////////////////////////////
// Register allocations for Wilson Kernel are precision independent
//////////////////////////////////////////////////////////////////////////////////////////
#define psi_00 %zmm0
#define psi_01 %zmm1
#define psi_02 %zmm2

#define psi_10 %zmm3
#define psi_11 %zmm4
#define psi_12 %zmm5

#define psi_20 %zmm6
#define psi_21 %zmm7
#define psi_22 %zmm8

#define psi_30 %zmm9
#define psi_31 %zmm10
#define psi_32 %zmm11

#define Chi_00 %zmm12
#define Chi_01 %zmm13
#define Chi_02 %zmm14

#define Chi_10 %zmm15
#define Chi_11 %zmm16
#define Chi_12 %zmm17

#define UChi_00 %zmm18
#define UChi_01 %zmm19
#define UChi_02 %zmm20

#define UChi_10 %zmm21
#define UChi_11 %zmm22
#define UChi_12 %zmm23

#define Uir %zmm24
#define Uri %zmm25
#define T1  %zmm24
#define T2  %zmm25

#define Z0 %zmm26
#define Z1 %zmm27
#define Z2 %zmm28
#define Z3 %zmm29
#define Z4 %zmm30
#define Z5 %zmm31

#define TMP Chi_00

#define Chimu_00 Chi_00
#define Chimu_01 Chi_01
#define Chimu_02 Chi_02
#define Chimu_10 Chi_10
#define Chimu_11 Chi_11
#define Chimu_12 Chi_12
#define Chimu_20 UChi_00
#define Chimu_21 UChi_01
#define Chimu_22 UChi_02
#define Chimu_30 UChi_10
#define Chimu_31 UChi_11
#define Chimu_32 UChi_12
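// Register budget (a tally for orientation): 12 psi + 6 Chi + 6 UChi
// + 2 scratch (Uir/Uri alias T1/T2) + 6 Z accumulators = 32, exactly the
// zmm register file, which is why Chimu_2x/Chimu_3x alias the UChi block.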

#include "Intel512common.h"
#include "Intel512avx.h"

//////////////////////////////////////////////////////////////////
// Macros used to build wilson kernel -- can rationalise and simplify
// a little as some duplication developed during trying different
// variants during optimisation. Could cut back to only those used.
//////////////////////////////////////////////////////////////////
#define LOCK_GAUGE(dir)
#define UNLOCK_GAUGE(dir)

// const SiteSpinor * ptr = & in._odata[offset];
#define LOAD_CHIMU(PTR)   LOAD64(%r8,PTR) __asm__ ( LOAD_CHIMUi );
#define LOAD_CHI(PTR)     LOAD64(%r8,PTR) __asm__ ( LOAD_CHIi );
#define SAVE_UCHI(PTR)    SAVE_UCHIi(PTR)
#define SAVE_CHI(PTR)     SAVE_CHIi(PTR)
#define SAVE_RESULT(PT,R) SAVE_RESULTi(PT,R)
#define ADD_RESULT(PT,R)  ADD_RESULTi(PT,R)

#define ZERO_PSI \
  asm( VZERO(psi_00) \
       VZERO(psi_01) \
       VZERO(psi_02) \
       VZERO(psi_10) \
       VZERO(psi_11) \
       VZERO(psi_12) \
       VZERO(psi_20) \
       VZERO(psi_21) \
       VZERO(psi_22) \
       VZERO(psi_30) \
       VZERO(psi_31) \
       VZERO(psi_32));

#define LOAD_CHIMUi \
  LOAD_CHIMU01i \
  LOAD_CHIMU23i

#define LOAD_CHIMU01i \
  VLOAD(0,%r8,Chimu_00) \
  VLOAD(1,%r8,Chimu_01) \
  VLOAD(2,%r8,Chimu_02) \
  VLOAD(3,%r8,Chimu_10) \
  VLOAD(4,%r8,Chimu_11) \
  VLOAD(5,%r8,Chimu_12)

#define LOAD_CHIMU23i \
  VLOAD(6,%r8,Chimu_20) \
  VLOAD(7,%r8,Chimu_21) \
  VLOAD(8,%r8,Chimu_22) \
  VLOAD(9,%r8,Chimu_30) \
  VLOAD(10,%r8,Chimu_31) \
  VLOAD(11,%r8,Chimu_32)

#define SHUF_CHIMU23i\
  VSHUFMEM(6,%r8,Chimu_20) \
  VSHUFMEM(7,%r8,Chimu_21) \
  VSHUFMEM(8,%r8,Chimu_22) \
  VSHUFMEM(9,%r8,Chimu_30) \
  VSHUFMEM(10,%r8,Chimu_31) \
  VSHUFMEM(11,%r8,Chimu_32)

#define LOAD_CHIi \
  VLOAD(0,%r8,Chi_00) \
  VLOAD(1,%r8,Chi_01) \
  VLOAD(2,%r8,Chi_02) \
  VLOAD(3,%r8,Chi_10) \
  VLOAD(4,%r8,Chi_11) \
  VLOAD(5,%r8,Chi_12)

#define SAVE_UCHIi(PTR) \
  LOAD64(%r8,PTR) \
  __asm__ ( \
    VSTORE(0,%r8,UChi_00) \
    VSTORE(1,%r8,UChi_01) \
    VSTORE(2,%r8,UChi_02) \
    VSTORE(3,%r8,UChi_10) \
    VSTORE(4,%r8,UChi_11) \
    VSTORE(5,%r8,UChi_12) );

#define SAVE_CHIi(PTR) \
  LOAD64(%r8,PTR) \
  __asm__ ( \
    VSTORE(0,%r8,Chi_00) \
    VSTORE(1,%r8,Chi_01) \
    VSTORE(2,%r8,Chi_02) \
    VSTORE(3,%r8,Chi_10) \
    VSTORE(4,%r8,Chi_11) \
    VSTORE(5,%r8,Chi_12) );

#define MULT_2SPIN_DIR_PF(A,p) MULT_2SPIN_PF(&U._odata[sU](A),p)
#define MULT_2SPIN_PF(ptr,pf)  MULT_2SPIN(ptr,pf)

//////////////////////////////////////////////////////////////////
// Dirac algebra
//////////////////////////////////////////////////////////////////
//      hspin(0)=fspin(0)+timesI(fspin(3));
//      hspin(1)=fspin(1)+timesI(fspin(2));
#define XP_PROJMEM(PTR) \
  LOAD64(%r8,PTR) \
  __asm__ ( \
    LOAD_CHIi \
    SHUF_CHIMU23i \
    VACCTIMESI1(Chi_00,Chi_00,Chimu_30) \
    VACCTIMESI1(Chi_01,Chi_01,Chimu_31) \
    VACCTIMESI1(Chi_02,Chi_02,Chimu_32) \
    VACCTIMESI1(Chi_10,Chi_10,Chimu_20) \
    VACCTIMESI1(Chi_11,Chi_11,Chimu_21) \
    VACCTIMESI1(Chi_12,Chi_12,Chimu_22) \
    VACCTIMESI2(Chi_00,Chi_00,Chimu_30) \
    VACCTIMESI2(Chi_01,Chi_01,Chimu_31) \
    VACCTIMESI2(Chi_02,Chi_02,Chimu_32) \
    VACCTIMESI2(Chi_10,Chi_10,Chimu_20) \
    VACCTIMESI2(Chi_11,Chi_11,Chimu_21) \
    VACCTIMESI2(Chi_12,Chi_12,Chimu_22) );

#define YP_PROJMEM(ptr) \
  LOAD64(%r8,ptr) \
  __asm__ ( \
    LOAD_CHIMU01i \
    VSUBMEM(9,%r8 ,Chimu_00,Chi_00) \
    VSUBMEM(10,%r8,Chimu_01,Chi_01) \
    VSUBMEM(11,%r8,Chimu_02,Chi_02) \
    VADDMEM(6,%r8,Chimu_10,Chi_10) \
    VADDMEM(7,%r8,Chimu_11,Chi_11) \
    VADDMEM(8,%r8,Chimu_12,Chi_12) );

#define ZP_PROJMEM(PTR) \
  LOAD64(%r8,PTR) \
  __asm__ ( \
    LOAD_CHIi \
    SHUF_CHIMU23i \
    VACCTIMESI1(Chi_00,Chi_00,Chimu_20) \
    VACCTIMESI1(Chi_01,Chi_01,Chimu_21) \
    VACCTIMESI1(Chi_02,Chi_02,Chimu_22) \
    VACCTIMESMINUSI1(Chi_10,Chi_10,Chimu_30) \
    VACCTIMESMINUSI1(Chi_11,Chi_11,Chimu_31) \
    VACCTIMESMINUSI1(Chi_12,Chi_12,Chimu_32) \
    VACCTIMESI2(Chi_00,Chi_00,Chimu_20) \
    VACCTIMESI2(Chi_01,Chi_01,Chimu_21) \
    VACCTIMESI2(Chi_02,Chi_02,Chimu_22) \
    VACCTIMESMINUSI2(Chi_10,Chi_10,Chimu_30) \
    VACCTIMESMINUSI2(Chi_11,Chi_11,Chimu_31) \
    VACCTIMESMINUSI2(Chi_12,Chi_12,Chimu_32) );

#define TP_PROJMEM(ptr) \
  LOAD64(%r8,ptr) \
  __asm__ ( \
    LOAD_CHIMU01i \
    VADDMEM(6,%r8 ,Chimu_00,Chi_00) \
    VADDMEM(7,%r8,Chimu_01,Chi_01) \
    VADDMEM(8,%r8,Chimu_02,Chi_02) \
    VADDMEM(9,%r8,Chimu_10,Chi_10) \
    VADDMEM(10,%r8,Chimu_11,Chi_11) \
    VADDMEM(11,%r8,Chimu_12,Chi_12) );

//      hspin(0)=fspin(0)-timesI(fspin(3))
//      hspin(1)=fspin(1)-timesI(fspin(2))
#define XM_PROJMEM(PTR) \
  LOAD64(%r8,PTR)\
  __asm__ ( \
    LOAD_CHIi \
    SHUF_CHIMU23i \
    VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_30)\
    VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_31)\
    VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_32)\
    VACCTIMESMINUSI1(Chi_10,Chi_10,Chimu_20)\
    VACCTIMESMINUSI1(Chi_11,Chi_11,Chimu_21)\
    VACCTIMESMINUSI1(Chi_12,Chi_12,Chimu_22)\
    VACCTIMESMINUSI2(Chi_00,Chi_00,Chimu_30)\
    VACCTIMESMINUSI2(Chi_01,Chi_01,Chimu_31)\
    VACCTIMESMINUSI2(Chi_02,Chi_02,Chimu_32)\
    VACCTIMESMINUSI2(Chi_10,Chi_10,Chimu_20)\
    VACCTIMESMINUSI2(Chi_11,Chi_11,Chimu_21)\
    VACCTIMESMINUSI2(Chi_12,Chi_12,Chimu_22) );

#define YM_PROJMEM(ptr) \
  LOAD64(%r8,ptr) \
  __asm__ ( \
    LOAD_CHIMU01i \
    VADDMEM(9,%r8 ,Chimu_00,Chi_00) \
    VADDMEM(10,%r8,Chimu_01,Chi_01) \
    VADDMEM(11,%r8,Chimu_02,Chi_02) \
    VSUBMEM(6,%r8,Chimu_10,Chi_10) \
    VSUBMEM(7,%r8,Chimu_11,Chi_11) \
    VSUBMEM(8,%r8,Chimu_12,Chi_12) );

#define ZM_PROJMEM(PTR) \
  LOAD64(%r8,PTR) \
  __asm__ ( \
    LOAD_CHIi \
    SHUF_CHIMU23i \
    VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_20)\
    VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_21)\
    VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_22)\
    VACCTIMESI1(Chi_10,Chi_10,Chimu_30)\
    VACCTIMESI1(Chi_11,Chi_11,Chimu_31)\
    VACCTIMESI1(Chi_12,Chi_12,Chimu_32)\
    VACCTIMESMINUSI2(Chi_00,Chi_00,Chimu_20)\
    VACCTIMESMINUSI2(Chi_01,Chi_01,Chimu_21)\
    VACCTIMESMINUSI2(Chi_02,Chi_02,Chimu_22)\
    VACCTIMESI2(Chi_10,Chi_10,Chimu_30)\
    VACCTIMESI2(Chi_11,Chi_11,Chimu_31)\
    VACCTIMESI2(Chi_12,Chi_12,Chimu_32) );

#define TM_PROJMEM(ptr) \
  LOAD64(%r8,ptr) \
  __asm__ ( \
    LOAD_CHIMU01i \
    VSUBMEM(6,%r8,Chimu_00,Chi_00) \
    VSUBMEM(7,%r8,Chimu_01,Chi_01) \
    VSUBMEM(8,%r8,Chimu_02,Chi_02) \
    VSUBMEM(9,%r8,Chimu_10,Chi_10) \
    VSUBMEM(10,%r8,Chimu_11,Chi_11) \
    VSUBMEM(11,%r8,Chimu_12,Chi_12) );
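/* Note: the Y and T projectors combine spinor components with real +/-1
 * coefficients, so they fold the upper components straight from memory with
 * VADDMEM/VSUBMEM; X and Z multiply by +/-i and therefore need the
 * SHUF_CHIMU23i re/im swap plus the masked VACCTIMESI sequences above.
 * Scalar picture of TM_PROJMEM (a sketch; offsets 6..11 address the third
 * and fourth spin components in units of 64-byte vectors): */
#if 0
// chi[0][c] = psi[0][c] - psi[2][c];  // VSUBMEM(6..8)
// chi[1][c] = psi[1][c] - psi[3][c];  // VSUBMEM(9..11)
#endif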
|
||||
|
||||
// fspin(0)=hspin(0)
|
||||
// fspin(1)=hspin(1)
|
||||
// fspin(2)=timesMinusI(hspin(1))
|
||||
// fspin(3)=timesMinusI(hspin(0))
|
||||
#define XP_RECON __asm__ ( \
|
||||
VZERO(TMP) \
|
||||
VTIMESMINUSI0(UChi_00,psi_30,TMP) \
|
||||
VTIMESMINUSI0(UChi_10,psi_20,TMP) \
|
||||
VTIMESMINUSI0(UChi_01,psi_31,TMP) \
|
||||
VTIMESMINUSI0(UChi_11,psi_21,TMP) \
|
||||
VTIMESMINUSI0(UChi_02,psi_32,TMP) \
|
||||
VTIMESMINUSI0(UChi_12,psi_22,TMP) \
|
||||
VMOV(UChi_00,psi_00) \
|
||||
VMOV(UChi_10,psi_10) \
|
||||
VMOV(UChi_01,psi_01) \
|
||||
VMOV(UChi_11,psi_11) \
|
||||
VMOV(UChi_02,psi_02) \
|
||||
VMOV(UChi_12,psi_12) \
|
||||
VTIMESMINUSI1(UChi_10,psi_20,TMP) \
|
||||
VTIMESMINUSI1(UChi_11,psi_21,TMP) \
|
||||
VTIMESMINUSI1(UChi_12,psi_22,TMP) \
|
||||
VTIMESMINUSI1(UChi_00,psi_30,TMP) \
|
||||
VTIMESMINUSI1(UChi_01,psi_31,TMP) \
|
||||
VTIMESMINUSI1(UChi_02,psi_32,TMP) \
|
||||
VTIMESMINUSI2(UChi_10,psi_20,TMP) \
|
||||
VTIMESMINUSI2(UChi_11,psi_21,TMP) \
|
||||
VTIMESMINUSI2(UChi_12,psi_22,TMP) \
|
||||
VTIMESMINUSI2(UChi_00,psi_30,TMP) \
|
||||
VTIMESMINUSI2(UChi_01,psi_31,TMP) \
|
||||
VTIMESMINUSI2(UChi_02,psi_32,TMP) \
|
||||
);
|
||||
// NB could save 6 ops using addsub => 12 cycles
|
||||
#define XP_RECON_ACCUM __asm__ ( \
|
||||
VZERO(TMP)\
|
||||
VACCTIMESMINUSI0(UChi_00,psi_30,Z3)\
|
||||
VACCTIMESMINUSI0(UChi_10,psi_20,Z0)\
|
||||
VACCTIMESMINUSI0(UChi_01,psi_31,Z4)\
|
||||
VACCTIMESMINUSI0(UChi_11,psi_21,Z1)\
|
||||
VACCTIMESMINUSI0(UChi_02,psi_32,Z5)\
|
||||
VACCTIMESMINUSI0(UChi_12,psi_22,Z2)\
|
||||
VADD(UChi_00,psi_00,psi_00)\
|
||||
VADD(UChi_10,psi_10,psi_10)\
|
||||
VADD(UChi_01,psi_01,psi_01)\
|
||||
VADD(UChi_11,psi_11,psi_11)\
|
||||
VADD(UChi_02,psi_02,psi_02)\
|
||||
VADD(UChi_12,psi_12,psi_12)\
|
||||
VACCTIMESMINUSI1(UChi_00,psi_30,Z3)\
|
||||
VACCTIMESMINUSI1(UChi_10,psi_20,Z0)\
|
||||
VACCTIMESMINUSI1(UChi_01,psi_31,Z4)\
|
||||
VACCTIMESMINUSI1(UChi_11,psi_21,Z1)\
|
||||
VACCTIMESMINUSI1(UChi_02,psi_32,Z5)\
|
||||
VACCTIMESMINUSI1(UChi_12,psi_22,Z2)\
|
||||
VACCTIMESMINUSI2(UChi_10,psi_20,Z0)\
|
||||
VACCTIMESMINUSI2(UChi_11,psi_21,Z1)\
|
||||
VACCTIMESMINUSI2(UChi_12,psi_22,Z2)\
|
||||
VACCTIMESMINUSI2(UChi_00,psi_30,Z3)\
|
||||
VACCTIMESMINUSI2(UChi_01,psi_31,Z4)\
|
||||
VACCTIMESMINUSI2(UChi_02,psi_32,Z5)\
|
||||
);

#define XM_RECON __asm__ ( \
  VZERO(TMP)\
  VTIMESI0(UChi_00,psi_30,TMP)\
  VTIMESI0(UChi_10,psi_20,TMP)\
  VTIMESI0(UChi_01,psi_31,TMP)\
  VTIMESI0(UChi_11,psi_21,TMP)\
  VTIMESI0(UChi_02,psi_32,TMP)\
  VTIMESI0(UChi_12,psi_22,TMP)\
  VMOV(UChi_00,psi_00)\
  VMOV(UChi_10,psi_10)\
  VMOV(UChi_01,psi_01)\
  VMOV(UChi_11,psi_11)\
  VMOV(UChi_02,psi_02)\
  VMOV(UChi_12,psi_12)\
  VTIMESI1(UChi_00,psi_30,TMP)\
  VTIMESI1(UChi_10,psi_20,TMP)\
  VTIMESI1(UChi_01,psi_31,TMP)\
  VTIMESI1(UChi_11,psi_21,TMP)\
  VTIMESI1(UChi_02,psi_32,TMP)\
  VTIMESI1(UChi_12,psi_22,TMP)\
  VTIMESI2(UChi_10,psi_20,TMP)\
  VTIMESI2(UChi_11,psi_21,TMP)\
  VTIMESI2(UChi_12,psi_22,TMP)\
  VTIMESI2(UChi_00,psi_30,TMP)\
  VTIMESI2(UChi_01,psi_31,TMP)\
  VTIMESI2(UChi_02,psi_32,TMP)\
  );

#define XM_RECON_ACCUM __asm__ ( \
  VACCTIMESI0(UChi_10,psi_20,Z0)\
  VACCTIMESI0(UChi_00,psi_30,Z3)\
  VACCTIMESI0(UChi_11,psi_21,Z1)\
  VACCTIMESI0(UChi_01,psi_31,Z4)\
  VACCTIMESI0(UChi_12,psi_22,Z2)\
  VACCTIMESI0(UChi_02,psi_32,Z5)\
  \
  VADD(UChi_10,psi_10,psi_10)\
  VADD(UChi_00,psi_00,psi_00)\
  VADD(UChi_11,psi_11,psi_11)\
  VADD(UChi_01,psi_01,psi_01)\
  VADD(UChi_12,psi_12,psi_12)\
  VADD(UChi_02,psi_02,psi_02)\
  \
  VACCTIMESI1(UChi_10,psi_20,Z0)\
  VACCTIMESI1(UChi_00,psi_30,Z3)\
  VACCTIMESI1(UChi_11,psi_21,Z1)\
  VACCTIMESI1(UChi_01,psi_31,Z4)\
  VACCTIMESI1(UChi_12,psi_22,Z2)\
  VACCTIMESI1(UChi_02,psi_32,Z5)\
  VACCTIMESI2(UChi_10,psi_20,Z0)\
  VACCTIMESI2(UChi_11,psi_21,Z1)\
  VACCTIMESI2(UChi_12,psi_22,Z2)\
  VACCTIMESI2(UChi_00,psi_30,Z3)\
  VACCTIMESI2(UChi_01,psi_31,Z4)\
  VACCTIMESI2(UChi_02,psi_32,Z5)\
  );

#define YP_RECON_ACCUM __asm__ ( \
  VADD(UChi_00,psi_00,psi_00)\
  VADD(UChi_10,psi_10,psi_10)\
  VADD(UChi_01,psi_01,psi_01)\
  VADD(UChi_11,psi_11,psi_11)\
  VADD(UChi_02,psi_02,psi_02)\
  VADD(UChi_12,psi_12,psi_12)\
  VADD(UChi_10,psi_20,psi_20)\
  VADD(UChi_11,psi_21,psi_21)\
  VADD(UChi_12,psi_22,psi_22)\
  VSUB(UChi_00,psi_30,psi_30)\
  VSUB(UChi_01,psi_31,psi_31)\
  VSUB(UChi_02,psi_32,psi_32) );

#define YM_RECON_ACCUM __asm__ ( \
  VADD(UChi_00,psi_00,psi_00)\
  VADD(UChi_10,psi_10,psi_10)\
  VADD(UChi_01,psi_01,psi_01)\
  VADD(UChi_11,psi_11,psi_11)\
  VADD(UChi_02,psi_02,psi_02)\
  VADD(UChi_12,psi_12,psi_12)\
  VSUB(UChi_10,psi_20,psi_20)\
  VSUB(UChi_11,psi_21,psi_21)\
  VSUB(UChi_12,psi_22,psi_22)\
  VADD(UChi_00,psi_30,psi_30)\
  VADD(UChi_01,psi_31,psi_31)\
  VADD(UChi_02,psi_32,psi_32) );

#define ZP_RECON_ACCUM __asm__ ( \
  VACCTIMESMINUSI0(UChi_00,psi_20,Z0)\
  VACCTIMESI0(UChi_10,psi_30,Z3)\
  VACCTIMESMINUSI0(UChi_01,psi_21,Z1)\
  VACCTIMESI0(UChi_11,psi_31,Z4)\
  VACCTIMESMINUSI0(UChi_02,psi_22,Z2)\
  VACCTIMESI0(UChi_12,psi_32,Z5)\
  VADD(UChi_00,psi_00,psi_00)\
  VADD(UChi_10,psi_10,psi_10)\
  VADD(UChi_01,psi_01,psi_01)\
  VADD(UChi_11,psi_11,psi_11)\
  VADD(UChi_02,psi_02,psi_02)\
  VADD(UChi_12,psi_12,psi_12)\
  VACCTIMESMINUSI1(UChi_00,psi_20,Z0)\
  VACCTIMESI1(UChi_10,psi_30,Z3)\
  VACCTIMESMINUSI1(UChi_01,psi_21,Z1)\
  VACCTIMESI1(UChi_11,psi_31,Z4)\
  VACCTIMESMINUSI1(UChi_02,psi_22,Z2)\
  VACCTIMESI1(UChi_12,psi_32,Z5)\
  VACCTIMESMINUSI2(UChi_00,psi_20,Z0)\
  VACCTIMESMINUSI2(UChi_01,psi_21,Z1)\
  VACCTIMESMINUSI2(UChi_02,psi_22,Z2)\
  VACCTIMESI2(UChi_10,psi_30,Z3)\
  VACCTIMESI2(UChi_11,psi_31,Z4)\
  VACCTIMESI2(UChi_12,psi_32,Z5)\
  );

#define ZM_RECON_ACCUM __asm__ ( \
  VACCTIMESI0(UChi_00,psi_20,Z0)\
  VACCTIMESMINUSI0(UChi_10,psi_30,Z3)\
  VACCTIMESI0(UChi_01,psi_21,Z1)\
  VACCTIMESMINUSI0(UChi_11,psi_31,Z4)\
  VACCTIMESI0(UChi_02,psi_22,Z2)\
  VACCTIMESMINUSI0(UChi_12,psi_32,Z5)\
  VADD(UChi_00,psi_00,psi_00)\
  VADD(UChi_10,psi_10,psi_10)\
  VADD(UChi_01,psi_01,psi_01)\
  VADD(UChi_11,psi_11,psi_11)\
  VADD(UChi_02,psi_02,psi_02)\
  VADD(UChi_12,psi_12,psi_12)\
  VACCTIMESI1(UChi_00,psi_20,Z0)\
  VACCTIMESMINUSI1(UChi_10,psi_30,Z3)\
  VACCTIMESI1(UChi_01,psi_21,Z1)\
  VACCTIMESMINUSI1(UChi_11,psi_31,Z4)\
  VACCTIMESI1(UChi_02,psi_22,Z2)\
  VACCTIMESMINUSI1(UChi_12,psi_32,Z5)\
  VACCTIMESI2(UChi_00,psi_20,Z0)\
  VACCTIMESI2(UChi_01,psi_21,Z1)\
  VACCTIMESI2(UChi_02,psi_22,Z2)\
  VACCTIMESMINUSI2(UChi_10,psi_30,Z3)\
  VACCTIMESMINUSI2(UChi_11,psi_31,Z4)\
  VACCTIMESMINUSI2(UChi_12,psi_32,Z5)\
  );

#define TP_RECON_ACCUM __asm__ ( \
  VADD(UChi_00,psi_00,psi_00)\
  VADD(UChi_10,psi_10,psi_10)\
  VADD(UChi_01,psi_01,psi_01)\
  VADD(UChi_11,psi_11,psi_11)\
  VADD(UChi_02,psi_02,psi_02)\
  VADD(UChi_12,psi_12,psi_12)\
  VADD(UChi_00,psi_20,psi_20)\
  VADD(UChi_10,psi_30,psi_30)\
  VADD(UChi_01,psi_21,psi_21)\
  VADD(UChi_11,psi_31,psi_31)\
  VADD(UChi_02,psi_22,psi_22)\
  VADD(UChi_12,psi_32,psi_32) );

#define TM_RECON_ACCUM __asm__ ( \
  VADD(UChi_00,psi_00,psi_00)\
  VADD(UChi_10,psi_10,psi_10)\
  VADD(UChi_01,psi_01,psi_01)\
  VADD(UChi_11,psi_11,psi_11)\
  VADD(UChi_02,psi_02,psi_02)\
  VADD(UChi_12,psi_12,psi_12)\
  VSUB(UChi_00,psi_20,psi_20)\
  VSUB(UChi_10,psi_30,psi_30)\
  VSUB(UChi_01,psi_21,psi_21)\
  VSUB(UChi_11,psi_31,psi_31)\
  VSUB(UChi_02,psi_22,psi_22)\
  VSUB(UChi_12,psi_32,psi_32) );
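
// TP/TM reconstruction needs no phases: the T-direction projectors are built
// from identity blocks, so the lower two spin components are accumulated with
// a plain VADD (TP) or VSUB (TM) of the same half spinor, no factors of i.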

#define AVX512_PF_L1
#define AVX512_PF_L2_GAUGE
#define AVX512_PF_L2_TABLE
#undef  AVX512_PF_L2_LINEAR

#ifdef AVX512_PF_L2_TABLE
// P1 fetches the base pointer for the next link into L1
// M1 fetches the next site pointer into L2
#define VPREFETCH_P1(A,B) VPREFETCH1(A,B)
#define VPREFETCH_P2(A,B)
#define VPREFETCH_M1(A,B) VPREFETCH2(A,B)
#define VPREFETCH_M2(A,B)
#endif

#ifdef AVX512_PF_L2_LINEAR
#define VPREFETCH_M1(A,B) VPREFETCH1(A,B)
#define VPREFETCH_M2(A,B) VPREFETCH2(A,B)
#define VPREFETCH_P1(A,B)
#define VPREFETCH_P2(A,B)
#endif

#ifdef AVX512_PF_L2_GAUGE
#define VPREFETCH_G1(A,B) VPREFETCH1(A,B)
#define VPREFETCH_G2(A,B) VPREFETCH2(A,B)
#endif
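
// Summary of the prefetch macro selection above:
//   AVX512_PF_L2_TABLE  : _P1 -> VPREFETCH1, _M1 -> VPREFETCH2, _P2/_M2 empty
//   AVX512_PF_L2_LINEAR : _M1 -> VPREFETCH1, _M2 -> VPREFETCH2, _P1/_P2 empty
//   AVX512_PF_L2_GAUGE  : _G1 -> VPREFETCH1, _G2 -> VPREFETCH2
// Exactly one of TABLE/LINEAR should be live; TABLE is selected above.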

#define PF_GAUGE(A) \
  LOAD64(%r8,&U._odata[sU](A)) \
  __asm__ ( \
  VPREFETCH_G1(0,%r8) VPREFETCH_G1(1,%r8) \
  VPREFETCH_G1(2,%r8) VPREFETCH_G1(3,%r8) \
  );

#define SAVE_RESULTi(PTR,pf) \
  LOAD64(%r8,PTR) \
  LOAD64(%r9,pf) \
  __asm__ ( \
  VSTORE(0,%r8,psi_00) VPREFETCH_M1(0,%r9) \
  VSTORE(1,%r8,psi_01) VPREFETCH_M1(1,%r9) \
  VSTORE(2,%r8,psi_02) VPREFETCH_M1(2,%r9) \
  VSTORE(3,%r8,psi_10) VPREFETCH_M1(3,%r9) \
  VSTORE(4,%r8,psi_11) VPREFETCH_M1(4,%r9) \
  VSTORE(5,%r8,psi_12) VPREFETCH_M1(5,%r9) \
  VSTORE(6,%r8,psi_20) VPREFETCH_M1(6,%r9) \
  VSTORE(7,%r8,psi_21) VPREFETCH_M1(7,%r9) \
  VSTORE(8,%r8,psi_22) VPREFETCH_M1(8,%r9) \
  VSTORE(9,%r8,psi_30) VPREFETCH_M1(9,%r9) \
  VSTORE(10,%r8,psi_31) VPREFETCH_M1(10,%r9) \
  VSTORE(11,%r8,psi_32) VPREFETCH_M1(11,%r9) \
  );

#define ADD_RESULTi(PTR,pf) \
  LOAD_CHIMU(PTR); \
  asm(VADD(psi_00,Chimu_00,psi_00) VADD(psi_01,Chimu_01,psi_01) VADD(psi_02,Chimu_02,psi_02) \
      VADD(psi_10,Chimu_10,psi_10) VADD(psi_11,Chimu_11,psi_11) VADD(psi_12,Chimu_12,psi_12) \
      VADD(psi_20,Chimu_20,psi_20) VADD(psi_21,Chimu_21,psi_21) VADD(psi_22,Chimu_22,psi_22) \
      VADD(psi_30,Chimu_30,psi_30) VADD(psi_31,Chimu_31,psi_31) VADD(psi_32,Chimu_32,psi_32) ); \
  SAVE_RESULT(PTR,pf);

#define ADD_RESULTia(PTR,pf) \
  LOAD64(%r8,PTR) \
  __asm__ ( \
  VADDMEM(0,%r8,psi_00,psi_00) \
  VADDMEM(1,%r8,psi_01,psi_01) \
  VADDMEM(2,%r8,psi_02,psi_02) \
  VADDMEM(3,%r8,psi_10,psi_10) \
  VADDMEM(4,%r8,psi_11,psi_11) \
  VADDMEM(5,%r8,psi_12,psi_12) \
  VADDMEM(6,%r8,psi_20,psi_20) \
  VADDMEM(7,%r8,psi_21,psi_21) \
  VADDMEM(8,%r8,psi_22,psi_22) \
  VADDMEM(9,%r8,psi_30,psi_30) \
  VADDMEM(10,%r8,psi_31,psi_31) \
  VADDMEM(11,%r8,psi_32,psi_32) \
  VSTORE(0,%r8,psi_00) \
  VSTORE(1,%r8,psi_01) \
  VSTORE(2,%r8,psi_02) \
  VSTORE(3,%r8,psi_10) \
  VSTORE(4,%r8,psi_11) \
  VSTORE(5,%r8,psi_12) \
  VSTORE(6,%r8,psi_20) \
  VSTORE(7,%r8,psi_21) \
  VSTORE(8,%r8,psi_22) \
  VSTORE(9,%r8,psi_30) \
  VSTORE(10,%r8,psi_31) \
  VSTORE(11,%r8,psi_32) \
  );

#ifdef AVX512_PF_L2_TABLE
#define PREFETCH_CHIMU(A) \
  LOAD64(%r9,A) \
  __asm__ ( \
  VPREFETCH_P1(0,%r9) \
  VPREFETCH_P1(1,%r9) \
  VPREFETCH_P1(2,%r9) \
  VPREFETCH_P1(3,%r9) \
  VPREFETCH_P1(4,%r9) \
  VPREFETCH_P1(5,%r9) \
  VPREFETCH_P1(6,%r9) \
  VPREFETCH_P1(7,%r9) \
  VPREFETCH_P1(8,%r9) \
  VPREFETCH_P1(9,%r9) \
  VPREFETCH_P1(10,%r9) \
  VPREFETCH_P1(11,%r9));

#else
#define PREFETCH_CHIMU(A)
#endif

#define PREFETCH1_CHIMU(A) \
  LOAD64(%r9,A) \
  __asm__ ( \
  VPREFETCH_P1(0,%r9) \
  VPREFETCH_P1(1,%r9) \
  VPREFETCH_P1(2,%r9) \
  VPREFETCH_P1(3,%r9) \
  VPREFETCH_P1(4,%r9) \
  VPREFETCH_P1(5,%r9) \
  VPREFETCH_P1(6,%r9) \
  VPREFETCH_P1(7,%r9) \
  VPREFETCH_P1(8,%r9) \
  VPREFETCH_P1(9,%r9) \
  VPREFETCH_P1(10,%r9) \
  VPREFETCH_P1(11,%r9));

#define PERMUTE_DIR0 __asm__ ( \
  VPERM0(Chi_00,Chi_00) \
  VPERM0(Chi_01,Chi_01) \
  VPERM0(Chi_02,Chi_02) \
  VPERM0(Chi_10,Chi_10) \
  VPERM0(Chi_11,Chi_11) \
  VPERM0(Chi_12,Chi_12) );

#define PERMUTE_DIR1 __asm__ ( \
  VPERM1(Chi_00,Chi_00) \
  VPERM1(Chi_01,Chi_01) \
  VPERM1(Chi_02,Chi_02) \
  VPERM1(Chi_10,Chi_10) \
  VPERM1(Chi_11,Chi_11) \
  VPERM1(Chi_12,Chi_12));

#define PERMUTE_DIR2 __asm__ ( \
  VPERM2(Chi_00,Chi_00) \
  VPERM2(Chi_01,Chi_01) \
  VPERM2(Chi_02,Chi_02) \
  VPERM2(Chi_10,Chi_10) \
  VPERM2(Chi_11,Chi_11) \
  VPERM2(Chi_12,Chi_12) );

#define PERMUTE_DIR3 __asm__ ( \
  VPERM3(Chi_00,Chi_00) \
  VPERM3(Chi_01,Chi_01) \
  VPERM3(Chi_02,Chi_02) \
  VPERM3(Chi_10,Chi_10) \
  VPERM3(Chi_11,Chi_11) \
  VPERM3(Chi_12,Chi_12) );

#define MULT_ADDSUB_2SPIN(ptr,pf) \
  LOAD64(%r8,ptr) \
  LOAD64(%r9,pf) \
  __asm__ ( \
  VPREFETCH_G2(9,%r8) \
  VPREFETCH_G2(10,%r8) \
  VPREFETCH_G2(11,%r8) \
  VPREFETCH_G2(12,%r8) \
  VPREFETCH_G2(13,%r8) \
  VPREFETCH_G2(14,%r8) \
  VPREFETCH_G2(15,%r8) \
  VPREFETCH_G2(16,%r8) \
  VPREFETCH_G2(17,%r8) \
  VSHUF(Chi_00,T1) \
  VMOVIDUP(0,%r8,Z0 ) \
  VMOVIDUP(3,%r8,Z1 ) \
  VMOVIDUP(6,%r8,Z2 ) VSHUF(Chi_10,T2) \
  /*6*/ \
  VMUL(Z0,T1,UChi_00) VMOVRDUP(0,%r8,Z3 ) \
  VMUL(Z0,T2,UChi_10) VMOVRDUP(3,%r8,Z4 ) \
  VMUL(Z1,T1,UChi_01) VMOVRDUP(6,%r8,Z5 ) \
  VMUL(Z1,T2,UChi_11) VMOVIDUP(1,%r8,Z0 ) \
  VMUL(Z2,T1,UChi_02) VMOVIDUP(4,%r8,Z1 ) \
  VMUL(Z2,T2,UChi_12) VMOVIDUP(7,%r8,Z2 ) \
  VPREFETCH_M1(0,%r9) \
  VPREFETCH_M1(1,%r9) \
  VPREFETCH_M1(2,%r9) \
  VPREFETCH_M1(3,%r9) \
  /*18*/ \
  VMADDSUB(Z3,Chi_00,UChi_00) VSHUF(Chi_01,T1) \
  VMADDSUB(Z3,Chi_10,UChi_10) \
  VMADDSUB(Z4,Chi_00,UChi_01) VMOVRDUP(1,%r8,Z3 ) \
  VMADDSUB(Z4,Chi_10,UChi_11) VSHUF(Chi_11,T2) \
  VMADDSUB(Z5,Chi_00,UChi_02) VMOVRDUP(4,%r8,Z4 ) \
  VMADDSUB(Z5,Chi_10,UChi_12) \
  VPREFETCH_M1(4,%r9) \
  VPREFETCH_M1(5,%r9) \
  VPREFETCH_M1(6,%r9) \
  VPREFETCH_M1(7,%r9) \
  /*28*/ \
  VMADDSUB(Z0,T1,UChi_00) VMOVRDUP(7,%r8,Z5 ) \
  VMADDSUB(Z0,T2,UChi_10) \
  VMADDSUB(Z1,T1,UChi_01) VMOVIDUP(2,%r8,Z0 ) \
  VMADDSUB(Z1,T2,UChi_11) \
  VMADDSUB(Z2,T1,UChi_02) VMOVIDUP(5,%r8,Z1 ) \
  VMADDSUB(Z2,T2,UChi_12) VMOVIDUP(8,%r8,Z2 ) \
  VPREFETCH2(12,%r9) \
  VPREFETCH2(13,%r9) \
  VPREFETCH2(14,%r9) \
  VPREFETCH2(15,%r9) \
  VPREFETCH2(16,%r9) \
  VPREFETCH2(17,%r9) \
  VPREFETCH2(18,%r9) \
  VPREFETCH2(19,%r9) \
  VPREFETCH2(20,%r9) \
  VPREFETCH2(21,%r9) \
  VPREFETCH2(22,%r9) \
  VPREFETCH2(23,%r9) \
  /*38*/ \
  VMADDSUB(Z3,Chi_01,UChi_00) VSHUF(Chi_02,T1) \
  VMADDSUB(Z3,Chi_11,UChi_10) \
  VMADDSUB(Z4,Chi_01,UChi_01) VMOVRDUP(2,%r8,Z3 ) \
  VMADDSUB(Z4,Chi_11,UChi_11) VSHUF(Chi_12,T2) \
  VMADDSUB(Z5,Chi_01,UChi_02) VMOVRDUP(5,%r8,Z4 ) \
  VMADDSUB(Z5,Chi_11,UChi_12) \
  VPREFETCH_M1(9,%r8) \
  VPREFETCH_M1(10,%r8) \
  VPREFETCH_M1(11,%r8) \
  VPREFETCH_M1(12,%r8) \
  VPREFETCH_M1(13,%r8) \
  VPREFETCH_M1(14,%r8) \
  VPREFETCH_M1(15,%r8) \
  VPREFETCH_M1(16,%r8) \
  VPREFETCH_M1(17,%r8) \
  /*48*/ \
  VMADDSUB(Z0,T1,UChi_00) VMOVRDUP(8,%r8,Z5 ) \
  VMADDSUB(Z0,T2,UChi_10) \
  VMADDSUB(Z1,T1,UChi_01) \
  VMADDSUB(Z1,T2,UChi_11) \
  VMADDSUB(Z2,T1,UChi_02) \
  VMADDSUB(Z2,T2,UChi_12) \
  VPREFETCH_M1(8,%r9) \
  VPREFETCH_M1(9,%r9) \
  VPREFETCH_M1(10,%r9) \
  VPREFETCH_M1(11,%r9) \
  /*55*/ \
  VMADDSUB(Z3,Chi_02,UChi_00) \
  VMADDSUB(Z3,Chi_12,UChi_10) \
  VMADDSUB(Z4,Chi_02,UChi_01) \
  VMADDSUB(Z4,Chi_12,UChi_11) \
  VMADDSUB(Z5,Chi_02,UChi_02) \
  VMADDSUB(Z5,Chi_12,UChi_12) \
  /*61 insns*/ );
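
// Hedged scalar reference (illustration only): the arithmetic performed by
// MULT_ADDSUB_2SPIN is a 3x3 complex matrix U applied to both spin components
// of the half spinor Chi; the interleaved VSHUF/VMOVIDUP/VMADDSUB schedule
// above is a software-pipelined form of the loop below. The row/column
// convention of U here is illustrative, not taken from the macro.
#include <complex>
inline void mult_2spin_ref(const std::complex<double> U[3][3],
                           const std::complex<double> Chi[2][3],
                           std::complex<double> UChi[2][3])
{
  for(int s=0;s<2;s++){
    for(int i=0;i<3;i++){
      UChi[s][i] = U[i][0]*Chi[s][0];                       // VMUL step
      for(int j=1;j<3;j++) UChi[s][i] += U[i][j]*Chi[s][j]; // VMADDSUB steps
    }
  }
}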

#define MULT_ADDSUB_2SPIN_LS(ptr,pf) \
  LOAD64(%r8,ptr) \
  LOAD64(%r9,pf) \
  __asm__ ( \
  VSHUF(Chi_00,T1) VSHUF(Chi_10,T2) \
  VMULIDUP(0,%r8,T1,UChi_00) VMULIDUP(0,%r8,T2,UChi_10) \
  VMULIDUP(3,%r8,T1,UChi_01) VMULIDUP(3,%r8,T2,UChi_11) \
  VMULIDUP(6,%r8,T1,UChi_02) VMULIDUP(6,%r8,T2,UChi_12) \
  VPREFETCH_M1(0,%r9) \
  VPREFETCH_M1(1,%r9) \
  VPREFETCH_M1(2,%r9) \
  VPREFETCH_M1(3,%r9) \
  /*8*/ \
  VSHUF(Chi_01,T1) VSHUF(Chi_11,T2) \
  VMADDSUBRDUP(0,%r8,Chi_00,UChi_00) VMADDSUBRDUP(0,%r8,Chi_10,UChi_10) \
  VMADDSUBRDUP(3,%r8,Chi_00,UChi_01) VMADDSUBRDUP(3,%r8,Chi_10,UChi_11) \
  VMADDSUBRDUP(6,%r8,Chi_00,UChi_02) VMADDSUBRDUP(6,%r8,Chi_10,UChi_12) \
  VPREFETCH_M1(4,%r9) \
  VPREFETCH_M1(5,%r9) \
  VPREFETCH_M1(6,%r9) \
  VPREFETCH_M1(7,%r9) \
  /*16*/ \
  VMADDSUBIDUP(1,%r8,T1,UChi_00) VMADDSUBIDUP(1,%r8,T2,UChi_10) \
  VMADDSUBIDUP(4,%r8,T1,UChi_01) VMADDSUBIDUP(4,%r8,T2,UChi_11) \
  VMADDSUBIDUP(7,%r8,T1,UChi_02) VMADDSUBIDUP(7,%r8,T2,UChi_12) \
  VPREFETCH_M1(8,%r9) \
  VPREFETCH_M1(9,%r9) \
  VPREFETCH_M1(10,%r9) \
  VPREFETCH_M1(11,%r9) \
  /*22*/ \
  VSHUF(Chi_02,T1) VSHUF(Chi_12,T2) \
  VMADDSUBRDUP(1,%r8,Chi_01,UChi_00) VMADDSUBRDUP(1,%r8,Chi_11,UChi_10) \
  VMADDSUBRDUP(4,%r8,Chi_01,UChi_01) VMADDSUBRDUP(4,%r8,Chi_11,UChi_11) \
  VMADDSUBRDUP(7,%r8,Chi_01,UChi_02) VMADDSUBRDUP(7,%r8,Chi_11,UChi_12) \
  VPREFETCH_M2(12,%r9) \
  VPREFETCH_M2(13,%r9) \
  VPREFETCH_M2(14,%r9) \
  VPREFETCH_M2(15,%r9) \
  /*30*/ \
  VMADDSUBIDUP(2,%r8,T1,UChi_00) VMADDSUBIDUP(2,%r8,T2,UChi_10) \
  VMADDSUBIDUP(5,%r8,T1,UChi_01) VMADDSUBIDUP(5,%r8,T2,UChi_11) \
  VPREFETCH_M2(16,%r9) \
  VPREFETCH_M2(17,%r9) \
  VPREFETCH_M2(18,%r9) \
  VPREFETCH_M2(19,%r9) \
  VMADDSUBIDUP(8,%r8,T1,UChi_02) VMADDSUBIDUP(8,%r8,T2,UChi_12) \
  /*36*/ \
  VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r8,Chi_12,UChi_10) \
  VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r8,Chi_12,UChi_11) \
  VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r8,Chi_12,UChi_12) \
  VPREFETCH_M2(20,%r9) \
  VPREFETCH_M2(21,%r9) \
  VPREFETCH_M2(22,%r9) \
  VPREFETCH_M2(23,%r9) \
  VPREFETCH_G1(2,%r8) \
  VPREFETCH_G1(3,%r8) \
  VPREFETCH_G2(4,%r8) \
  VPREFETCH_G2(5,%r8) \
  VPREFETCH_G2(6,%r8) \
  VPREFETCH_G2(7,%r8) \
  /*42 insns*/ );

#define MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf) \
  LOAD64(%r8,ptr) \
  LOAD64(%r9,pf) \
  __asm__ ( \
  VSHUF(Chi_00,T1) VSHUF(Chi_10,T2) \
  VMULIDUP(0,%r8,T1,UChi_00) VMULIDUP(0,%r8,T2,UChi_10) \
  VMULIDUP(3,%r8,T1,UChi_01) VMULIDUP(3,%r8,T2,UChi_11) \
  VMULIDUP(6,%r8,T1,UChi_02) VMULIDUP(6,%r8,T2,UChi_12) \
  /*8*/ \
  VSHUF(Chi_01,T1) VSHUF(Chi_11,T2) \
  VMADDSUBRDUP(0,%r8,Chi_00,UChi_00) VMADDSUBRDUP(0,%r8,Chi_10,UChi_10) \
  VMADDSUBRDUP(3,%r8,Chi_00,UChi_01) VMADDSUBRDUP(3,%r8,Chi_10,UChi_11) \
  VMADDSUBRDUP(6,%r8,Chi_00,UChi_02) VMADDSUBRDUP(6,%r8,Chi_10,UChi_12) \
  /*16*/ \
  VMADDSUBIDUP(1,%r8,T1,UChi_00) VMADDSUBIDUP(1,%r8,T2,UChi_10) \
  VMADDSUBIDUP(4,%r8,T1,UChi_01) VMADDSUBIDUP(4,%r8,T2,UChi_11) \
  VMADDSUBIDUP(7,%r8,T1,UChi_02) VMADDSUBIDUP(7,%r8,T2,UChi_12) \
  /*22*/ \
  VSHUF(Chi_02,T1) VSHUF(Chi_12,T2) \
  VMADDSUBRDUP(1,%r8,Chi_01,UChi_00) VMADDSUBRDUP(1,%r8,Chi_11,UChi_10) \
  VMADDSUBRDUP(4,%r8,Chi_01,UChi_01) VMADDSUBRDUP(4,%r8,Chi_11,UChi_11) \
  VMADDSUBRDUP(7,%r8,Chi_01,UChi_02) VMADDSUBRDUP(7,%r8,Chi_11,UChi_12) \
  /*30*/ \
  VMADDSUBIDUP(2,%r8,T1,UChi_00) VMADDSUBIDUP(2,%r8,T2,UChi_10) \
  VMADDSUBIDUP(5,%r8,T1,UChi_01) VMADDSUBIDUP(5,%r8,T2,UChi_11) \
  VMADDSUBIDUP(8,%r8,T1,UChi_02) VMADDSUBIDUP(8,%r8,T2,UChi_12) \
  /*36*/ \
  VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r8,Chi_12,UChi_10) \
  VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r8,Chi_12,UChi_11) \
  VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r8,Chi_12,UChi_12) \
  /* VPREFETCH1(2,%r8)*/ \
  /* VPREFETCH1(3,%r8)*/ \
  /*42 insns*/ );

#define Z6 Chi_00
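// Z6 aliases Chi_00: MULT_ADDSUB_2SPIN_NEW needs a sixth accumulator, and
// Chi_00 is dead once VIDUP(Chi_00,Chi_00) and the following VMULs in the
// first block have consumed it, so its register can be reused safely.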
#define MULT_ADDSUB_2SPIN_NEW(ptr,pf) \
  LOAD64(%r8,ptr) \
  __asm__ ( \
  VSHUFMEM(0,%r8,Z0) \
  VRDUP(Chi_00,T1) VIDUP(Chi_00,Chi_00) \
  VRDUP(Chi_10,T2) VIDUP(Chi_10,Chi_10) \
  VMUL(Z0,Chi_00,Z1) VMUL(Z0,Chi_10,Z2) \
  VSHUFMEM(3,%r8,Z0) \
  VMUL(Z0,Chi_00,Z3) VMUL(Z0,Chi_10,Z4) \
  VSHUFMEM(6,%r8,Z0) \
  VMUL(Z0,Chi_00,Z5) VMUL(Z0,Chi_10,Z6) \
  VMULMEM(0,%r8,T1,UChi_00) VMULMEM(0,%r8,T2,UChi_10) \
  VMULMEM(3,%r8,T1,UChi_01) VMULMEM(3,%r8,T2,UChi_11) \
  VMULMEM(6,%r8,T1,UChi_02) VMULMEM(6,%r8,T2,UChi_12) \
  /*11 cycles*/ \
  VSHUFMEM(1,%r8,Z0) \
  VRDUP(Chi_01,T1) VIDUP(Chi_01,Chi_01) \
  VRDUP(Chi_11,T2) VIDUP(Chi_11,Chi_11) \
  VMADD(Z0,Chi_01,Z1) VMADD(Z0,Chi_11,Z2) \
  VSHUFMEM(4,%r8,Z0) \
  VMADD(Z0,Chi_01,Z3) VMADD(Z0,Chi_11,Z4) \
  VSHUFMEM(7,%r8,Z0) \
  VMADD(Z0,Chi_01,Z5) VMADD(Z0,Chi_11,Z6) \
  VMADDMEM(1,%r8,T1,UChi_00) VMADDMEM(1,%r8,T2,UChi_10) \
  VMADDMEM(4,%r8,T1,UChi_01) VMADDMEM(4,%r8,T2,UChi_11) \
  VMADDMEM(7,%r8,T1,UChi_02) VMADDMEM(7,%r8,T2,UChi_12) \
  /*22 cycles*/ \
  VSHUFMEM(2,%r8,Z0) \
  VRDUP(Chi_02,T1) VIDUP(Chi_02,Chi_02) \
  VRDUP(Chi_12,T2) VIDUP(Chi_12,Chi_12) \
  VMADD(Z0,Chi_02,Z1) VMADD(Z0,Chi_12,Z2) \
  VSHUFMEM(5,%r8,Z0) \
  VMADD(Z0,Chi_02,Z3) VMADD(Z0,Chi_12,Z4) \
  VSHUFMEM(8,%r8,Z0) \
  VMADD(Z0,Chi_02,Z5) VMADD(Z0,Chi_12,Z6) \
  /*33 cycles*/ \
  VMADDSUBMEM(2,%r8,T1,Z1) VMADDSUBMEM(2,%r8,T2,Z2) \
  VMADDSUBMEM(5,%r8,T1,Z3) VMADDSUBMEM(5,%r8,T2,Z4) \
  VMADDSUBMEM(8,%r8,T1,Z5) VMADDSUBMEM(8,%r8,T2,Z6) \
  /*stall*/ \
  /*stall*/ \
  /*stall*/ \
  VADD(Z1,UChi_00,UChi_00) VADD(Z2,UChi_10,UChi_10) \
  VADD(Z3,UChi_01,UChi_01) VADD(Z4,UChi_11,UChi_11) \
  VADD(Z5,UChi_02,UChi_02) VADD(Z6,UChi_12,UChi_12) )

#endif
255
Grid/simd/Simd.h
Normal file
@ -0,0 +1,255 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/Simd.h

    Copyright (C) 2015

Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: neo <cossu@post.kek.jp>
Author: paboyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_SIMD_H
#define GRID_SIMD_H

////////////////////////////////////////////////////////////////////////
// Define scalar and vector floating point types
//
// Scalar: RealF, RealD, ComplexF, ComplexD
//
// Vector: vRealF, vRealD, vComplexF, vComplexD
//
// Vector types are arch dependent
////////////////////////////////////////////////////////////////////////

#define _MM_SELECT_FOUR_FOUR(A,B,C,D) ((A<<6)|(B<<4)|(C<<2)|(D))
#define _MM_SELECT_FOUR_FOUR_STRING(A,B,C,D) "((" #A "<<6)|(" #B "<<4)|(" #C "<<2)|(" #D "))"
#define _MM_SELECT_EIGHT_TWO(A,B,C,D,E,F,G,H) ((A<<7)|(B<<6)|(C<<5)|(D<<4)|(E<<3)|(F<<2)|(G<<1)|(H))
#define _MM_SELECT_FOUR_TWO(A,B,C,D) _MM_SELECT_EIGHT_TWO(0,0,0,0,A,B,C,D)
#define _MM_SELECT_TWO_TWO(A,B) _MM_SELECT_FOUR_TWO(0,0,A,B)
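
// Worked example of the selector packing: choosing lanes (3,2,1,0) is the
// identity shuffle, 0b11100100 == 0xE4, the familiar _MM_SHUFFLE value.
static_assert(_MM_SELECT_FOUR_FOUR(3,2,1,0) == 0xE4,
              "identity four-lane selector packs to 0xE4");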

#define RotateBit (0x100)

namespace Grid {

  typedef uint32_t Integer;

  typedef float  RealF;
  typedef double RealD;
#ifdef GRID_DEFAULT_PRECISION_DOUBLE
  typedef RealD  Real;
#else
  typedef RealF  Real;
#endif

  typedef std::complex<RealF> ComplexF;
  typedef std::complex<RealD> ComplexD;
  typedef std::complex<Real>  Complex;

  inline RealF adj(const RealF & r){ return r; }
  inline RealF conjugate(const RealF & r){ return r; }
  inline RealF real(const RealF & r){ return r; }

  inline RealD adj(const RealD & r){ return r; }
  inline RealD conjugate(const RealD & r){ return r; }
  inline RealD real(const RealD & r){ return r; }

  inline RealD sqrt(const RealD & r){ return std::sqrt(r); }

  inline ComplexD conjugate(const ComplexD& r){ return(conj(r)); }
  inline ComplexD adj(const ComplexD& r){ return(conjugate(r)); }
  inline ComplexF conjugate(const ComplexF& r ){ return(conj(r)); }
  inline ComplexF adj(const ComplexF& r ){ return(conjugate(r)); }

  inline ComplexD innerProduct(const ComplexD & l, const ComplexD & r) { return conjugate(l)*r; }
  inline ComplexF innerProduct(const ComplexF & l, const ComplexF & r) { return conjugate(l)*r; }
  inline RealD innerProduct(const RealD & l, const RealD & r) { return l*r; }
  inline RealF innerProduct(const RealF & l, const RealF & r) { return l*r; }

  inline ComplexD Reduce(const ComplexD& r){ return r; }
  inline ComplexF Reduce(const ComplexF& r){ return r; }
  inline RealD Reduce(const RealD& r){ return r; }
  inline RealF Reduce(const RealF& r){ return r; }

  inline RealD toReal(const ComplexD& r){ return real(r); }
  inline RealF toReal(const ComplexF& r){ return real(r); }
  inline RealD toReal(const RealD& r){ return r; }
  inline RealF toReal(const RealF& r){ return r; }

  ////////////////////////////////////////////////////////////////////////////////
  //Provide support functions for basic real and complex data types required by Grid
  //Single and double precision versions. Should be able to template this once only.
  ////////////////////////////////////////////////////////////////////////////////
  inline void mac (ComplexD * __restrict__ y,const ComplexD * __restrict__ a,const ComplexD *__restrict__ x){ *y = (*a) * (*x)+(*y); }
  inline void mult(ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) * (*r);}
  inline void sub (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) - (*r);}
  inline void add (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) + (*r);}
  // conjugate already supported for complex

  inline void mac (ComplexF * __restrict__ y,const ComplexF * __restrict__ a,const ComplexF *__restrict__ x){ *y = (*a) * (*x)+(*y); }
  inline void mult(ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) * (*r); }
  inline void sub (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) - (*r); }
  inline void add (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) + (*r); }

  //conjugate already supported for complex

  inline ComplexF timesI(const ComplexF &r)     { return(r*ComplexF(0.0,1.0));}
  inline ComplexD timesI(const ComplexD &r)     { return(r*ComplexD(0.0,1.0));}
  inline ComplexF timesMinusI(const ComplexF &r){ return(r*ComplexF(0.0,-1.0));}
  inline ComplexD timesMinusI(const ComplexD &r){ return(r*ComplexD(0.0,-1.0));}

  // define projections to real and imaginary parts
  inline ComplexF projReal(const ComplexF &r){return( ComplexF(std::real(r), 0.0));}
  inline ComplexD projReal(const ComplexD &r){return( ComplexD(std::real(r), 0.0));}
  inline ComplexF projImag(const ComplexF &r){return (ComplexF(std::imag(r), 0.0 ));}
  inline ComplexD projImag(const ComplexD &r){return (ComplexD(std::imag(r), 0.0));}

  // define auxiliary functions for complex computations
  inline void timesI(ComplexF &ret,const ComplexF &r)     { ret = timesI(r);}
  inline void timesI(ComplexD &ret,const ComplexD &r)     { ret = timesI(r);}
  inline void timesMinusI(ComplexF &ret,const ComplexF &r){ ret = timesMinusI(r);}
  inline void timesMinusI(ComplexD &ret,const ComplexD &r){ ret = timesMinusI(r);}

  inline void mac (RealD * __restrict__ y,const RealD * __restrict__ a,const RealD *__restrict__ x){ *y = (*a) * (*x)+(*y);}
  inline void mult(RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) * (*r);}
  inline void sub (RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) - (*r);}
  inline void add (RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) + (*r);}

  inline void mac (RealF * __restrict__ y,const RealF * __restrict__ a,const RealF *__restrict__ x){ *y = (*a) * (*x)+(*y); }
  inline void mult(RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) * (*r); }
  inline void sub (RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) - (*r); }
  inline void add (RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) + (*r); }

  inline void vstream(ComplexF &l, const ComplexF &r){ l=r;}
  inline void vstream(ComplexD &l, const ComplexD &r){ l=r;}
  inline void vstream(RealF &l, const RealF &r){ l=r;}
  inline void vstream(RealD &l, const RealD &r){ l=r;}

  class Zero{};
  static Zero zero;
  template<class itype> inline void zeroit(itype &arg){ arg=zero;};
  template<>            inline void zeroit(ComplexF &arg){ arg=0; };
  template<>            inline void zeroit(ComplexD &arg){ arg=0; };
  template<>            inline void zeroit(RealF &arg){ arg=0; };
  template<>            inline void zeroit(RealD &arg){ arg=0; };

  //////////////////////////////////////////////////////////
  // Permute
  // Permute 0 every ABCDEFGH -> BA DC FE HG
  // Permute 1 every ABCDEFGH -> CD AB GH EF
  // Permute 2 every ABCDEFGH -> EFGH ABCD
  // Permute 3 possible on longer iVector lengths (512bit = 8 double = 16 single)
  // Permute 4 possible on half precision @512bit vectors.
  //
  // Defined inside SIMD specialization files
  //////////////////////////////////////////////////////////
  template<class VectorSIMD>
  inline void Gpermute(VectorSIMD &y,const VectorSIMD &b,int perm);
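
  // Hedged reference sketch (not the SIMD implementation, which lives in the
  // specialization files): the lane motion Gpermute performs, modelled on a
  // plain 8-lane array per the comment table above. Name and lane count are
  // illustrative only.
  inline void GpermuteRef(double y[8], const double b[8], int perm)
  {
    int motion = 1 << perm;   // perm 0 -> xor 1, perm 1 -> xor 2, perm 2 -> xor 4
    for(int i=0;i<8;i++) y[i] = b[i ^ motion];
  }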

};

#include <Grid/simd/Grid_vector_types.h>
#include <Grid/simd/Grid_vector_unops.h>

namespace Grid {
  // Default precision
#ifdef GRID_DEFAULT_PRECISION_DOUBLE
  typedef vRealD vReal;
  typedef vComplexD vComplex;
#else
  typedef vRealF vReal;
  typedef vComplexF vComplex;
#endif

  inline std::ostream& operator<< (std::ostream& stream, const vComplexF &o){
    int nn=vComplexF::Nsimd();
    std::vector<ComplexF,alignedAllocator<ComplexF> > buf(nn);
    vstore(o,&buf[0]);
    stream<<"<";
    for(int i=0;i<nn;i++){
      stream<<buf[i];
      if(i<nn-1) stream<<",";
    }
    stream<<">";
    return stream;
  }

  inline std::ostream& operator<< (std::ostream& stream, const vComplexD &o){
    int nn=vComplexD::Nsimd();
    std::vector<ComplexD,alignedAllocator<ComplexD> > buf(nn);
    vstore(o,&buf[0]);
    stream<<"<";
    for(int i=0;i<nn;i++){
      stream<<buf[i];
      if(i<nn-1) stream<<",";
    }
    stream<<">";
    return stream;
  }

  inline std::ostream& operator<< (std::ostream& stream, const vRealF &o){
    int nn=vRealF::Nsimd();
    std::vector<RealF,alignedAllocator<RealF> > buf(nn);
    vstore(o,&buf[0]);
    stream<<"<";
    for(int i=0;i<nn;i++){
      stream<<buf[i];
      if(i<nn-1) stream<<",";
    }
    stream<<">";
    return stream;
  }

  inline std::ostream& operator<< (std::ostream& stream, const vRealD &o){
    int nn=vRealD::Nsimd();
    std::vector<RealD,alignedAllocator<RealD> > buf(nn);
    vstore(o,&buf[0]);
    stream<<"<";
    for(int i=0;i<nn;i++){
      stream<<buf[i];
      if(i<nn-1) stream<<",";
    }
    stream<<">";
    return stream;
  }

  inline std::ostream& operator<< (std::ostream& stream, const vInteger &o){
    int nn=vInteger::Nsimd();
    std::vector<Integer,alignedAllocator<Integer> > buf(nn);
    vstore(o,&buf[0]);
    stream<<"<";
    for(int i=0;i<nn;i++){
      stream<<buf[i];
      if(i<nn-1) stream<<",";
    }
    stream<<">";
    return stream;
  }

}
#endif
37
Grid/simd/l1p.h
Normal file
@ -0,0 +1,37 @@
#pragma once
namespace Grid {
  // L1p optimisation
  inline void bgq_l1p_optimisation(int mode)
  {
#ifdef QPX
#undef  L1P_CFG_PF_USR
#define L1P_CFG_PF_USR (0x3fde8000108ll) /* (64 bit reg, 23 bits wide, user/unpriv) */

    uint64_t cfg_pf_usr;
    if ( mode ) {
      cfg_pf_usr =
          L1P_CFG_PF_USR_ifetch_depth(0)
        | L1P_CFG_PF_USR_ifetch_max_footprint(1)
        | L1P_CFG_PF_USR_pf_stream_est_on_dcbt
        | L1P_CFG_PF_USR_pf_stream_establish_enable
        | L1P_CFG_PF_USR_pf_stream_optimistic
        | L1P_CFG_PF_USR_pf_adaptive_throttle(0xF) ;
      // if ( sizeof(Float) == sizeof(double) ) {
      cfg_pf_usr |= L1P_CFG_PF_USR_dfetch_depth(2)| L1P_CFG_PF_USR_dfetch_max_footprint(3) ;
      // } else {
      //   cfg_pf_usr |= L1P_CFG_PF_USR_dfetch_depth(1)| L1P_CFG_PF_USR_dfetch_max_footprint(2) ;
      // }
    } else {
      cfg_pf_usr = L1P_CFG_PF_USR_dfetch_depth(1)
        | L1P_CFG_PF_USR_dfetch_max_footprint(2)
        | L1P_CFG_PF_USR_ifetch_depth(0)
        | L1P_CFG_PF_USR_ifetch_max_footprint(1)
        | L1P_CFG_PF_USR_pf_stream_est_on_dcbt
        | L1P_CFG_PF_USR_pf_stream_establish_enable
        | L1P_CFG_PF_USR_pf_stream_optimistic
        | L1P_CFG_PF_USR_pf_stream_prefetch_enable;
    }
    *((uint64_t *)L1P_CFG_PF_USR) = cfg_pf_usr;
#endif
  }
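
  // Hedged usage sketch (an assumption, not from this commit): enable the
  // aggressive prefetch profile around a hot stencil kernel, then restore
  // the default L1p configuration afterwards.
  //
  //   bgq_l1p_optimisation(1);   // deeper dfetch depth/footprint for streaming
  //   /* ... run the Dslash / stencil kernel ... */
  //   bgq_l1p_optimisation(0);   // restore default prefetch settings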
}