
Merge branch 'develop' into feature/hmc_generalise

Guido Cossu · 2017-04-05 14:41:04 +01:00
205 changed files with 27899 additions and 3601 deletions

lib/simd/BGQQPX.h Normal file

@ -0,0 +1,796 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/simd/BGQQPX.h
Copyright (C) 2015
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_ASM_BGQ_QPX_H
#define GRID_ASM_BGQ_QPX_H
#include <stddint.h>
/*********************************************************
* Architectural macros
*********************************************************/
#define VLOADf(OFF,PTR,DEST)   "qvlfsux  " #DEST "," #OFF "," #PTR " ;\n"
#define VLOADd(OFF,PTR,DEST)   "qvlfdux  " #DEST "," #OFF "," #PTR " ;\n"
#define VSTOREf(OFF,PTR,SRC)   "qvstfsux " #SRC "," #OFF "," #PTR " ;\n"
#define VSTOREd(OFF,PTR,SRC)   "qvstfdux " #SRC "," #OFF "," #PTR " ;\n"
#define VSPLATf(A,B,DEST)      "qvlfcsxa " #A "," #B "," #DEST ";\n"
#define VSPLATd(A,B,DEST)      "qvlfcdxa " #A "," #B "," #DEST ";\n"
#define LOAD64(A,ptr)
#define VZERO(DEST) "qvfclr " #DEST "; \n"
#define VONE(DEST)      "qvfset " #DEST "; \n"
#define VNEG(SRC,DEST)  "qvfneg " #DEST "," #SRC "; \n"
#define VMOV(A,DEST)    "qvfmr  " #DEST "," #A ";\n"
#define VADD(A,B,DEST) "qvfadd " #DEST "," #A "," #B ";\n"
#define VSUB(A,B,DEST) "qvfsub " #DEST "," #A "," #B ";\n"
#define VMUL(A,B,DEST) "qvfmul " #DEST "," #A "," #B ";\n"
#define VMUL_RR_RI(A,B,DEST) "qvfxmul " #DEST "," #A "," #B ";\n"
#define VMADD(A,B,C,DEST) "qvfmadd " #DEST "," #A "," #B ","#C ";\n"
#define VMADD_RR_RI(A,B,C,DEST) "qvfxmadd " #DEST "," #A "," #B ","#C ";\n"
#define VMADD_MII_IR(A,B,C,DEST) "qvfxxnpmadd " #DEST "," #A "," #B ","#C ";\n"
#define VMADD_II_MIR(A,B,C,DEST) "qvfmadd " #DEST "," #A "," #B ","#C ";\n"
#define CACHE_LOCK(PTR)   asm (" dcbtls %%r0, %0 \n" : : "r" (PTR) );
#define CACHE_UNLOCK(PTR) asm (" dcblc  %%r0, %0 \n" : : "r" (PTR) );
#define CACHE_FLUSH(PTR)  asm (" dcbf   %%r0, %0 \n" : : "r" (PTR) );
#define CACHE_TOUCH(PTR)  asm (" dcbt   %%r0, %0 \n" : : "r" (PTR) );
// Gauge field locking: 2 x 9 complex numbers at 8/16 bytes each per link,
// i.e. 144/288 bytes == 4.5/9 (32-byte) lines per link
#define MASK_REGS /*NOOP ON BGQ*/
#define PF_GAUGE(A) /*NOOP ON BGQ*/
#define PREFETCH1_CHIMU(base) /*NOOP ON BGQ*/
#define PREFETCH_CHIMU(base) /*NOOP ON BGQ*/
/*********************************************************
* Register definitions
*********************************************************/
#define psi_00 0
#define psi_01 1
#define psi_02 2
#define psi_10 3
#define psi_11 4
#define psi_12 5
#define psi_20 6
#define psi_21 7
#define psi_22 8
#define psi_30 9
#define psi_31 10
#define psi_32 11
#define Chi_00 12
#define Chi_01 13
#define Chi_02 14
#define Chi_10 15
#define Chi_11 16
#define Chi_12 17
#define UChi_00 18
#define UChi_01 19
#define UChi_02 20
#define UChi_10 21
#define UChi_11 22
#define UChi_12 23
#define U0 24
#define U1 25
#define U2 26
#define one 27
#define REP %%r16
#define IMM %%r17
/*Alias regs*/
#define Chimu_00 Chi_00
#define Chimu_01 Chi_01
#define Chimu_02 Chi_02
#define Chimu_10 Chi_10
#define Chimu_11 Chi_11
#define Chimu_12 Chi_12
#define Chimu_20 UChi_00
#define Chimu_21 UChi_01
#define Chimu_22 UChi_02
#define Chimu_30 UChi_10
#define Chimu_31 UChi_11
#define Chimu_32 UChi_12
/*********************************************************
* Macro sequences encoding QCD
*********************************************************/
#define LOCK_GAUGE(dir) \
{ \
uint8_t *byte_addr = (uint8_t *)&U._odata[sU](dir); \
for(int i=0;i< 18*2*BYTES_PER_WORD*8;i+=32){ \
CACHE_LOCK(&byte_addr[i]); \
} \
}
#define UNLOCK_GAUGE(dir) \
{ \
uint8_t *byte_addr = (uint8_t *)&U._odata[sU](dir); \
for(int i=0;i< 18*2*BYTES_PER_WORD*8;i+=32){ \
CACHE_UNLOCK(&byte_addr[i]); \
} \
}
#define MAYBEPERM(A,B)
#define PERMUTE_DIR3
#define PERMUTE_DIR2
#define PERMUTE_DIR1
#define PERMUTE_DIR0
#define MULT_2SPIN_DIR_PFXP(A,p) MULT_2SPIN(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFYP(A,p) MULT_2SPIN(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFZP(A,p) MULT_2SPIN(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFTP(A,p) MULT_2SPIN(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFXM(A,p) MULT_2SPIN(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFYM(A,p) MULT_2SPIN(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFZM(A,p) MULT_2SPIN(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFTM(A,p) MULT_2SPIN(&U._odata[sU](A),p)
#define MULT_SPIN(ptr,p) { \
    uint64_t ub = ((uint64_t)ptr);			\
asm ( \
VLOAD(%0,%3,U0) \
VLOAD(%1,%3,U1) \
VLOAD(%2,%3,U2) \
VMUL_RR_RI(U0,Chi_00,UChi_00) \
VMUL_RR_RI(U1,Chi_00,UChi_01) \
VMUL_RR_RI(U2,Chi_00,UChi_02) \
VMUL_RR_RI(U0,Chi_10,UChi_10) \
VMUL_RR_RI(U1,Chi_10,UChi_11) \
VMUL_RR_RI(U2,Chi_10,UChi_12) \
VMADD_MII_IR(U0,Chi_00,UChi_00,UChi_00) \
VMADD_MII_IR(U1,Chi_00,UChi_01,UChi_01) \
VMADD_MII_IR(U2,Chi_00,UChi_02,UChi_02) \
VMADD_MII_IR(U0,Chi_10,UChi_10,UChi_10) \
VMADD_MII_IR(U1,Chi_10,UChi_11,UChi_11) \
VMADD_MII_IR(U2,Chi_10,UChi_12,UChi_12) \
: : "r" (0), "r" (32*3), "r" (32*6), "r" (ub )); \
asm ( \
VLOAD(%0,%3,U0) \
VLOAD(%1,%3,U1) \
VLOAD(%2,%3,U2) \
VMADD_RR_RI(U0,Chi_01,UChi_00,UChi_00) \
VMADD_RR_RI(U1,Chi_01,UChi_01,UChi_01) \
VMADD_RR_RI(U2,Chi_01,UChi_02,UChi_02) \
VMADD_RR_RI(U0,Chi_11,UChi_10,UChi_10) \
VMADD_RR_RI(U1,Chi_11,UChi_11,UChi_11) \
VMADD_RR_RI(U2,Chi_11,UChi_12,UChi_12) \
VMADD_MII_IR(U0,Chi_01,UChi_00,UChi_00) \
VMADD_MII_IR(U1,Chi_01,UChi_01,UChi_01) \
VMADD_MII_IR(U2,Chi_01,UChi_02,UChi_02) \
VMADD_MII_IR(U0,Chi_11,UChi_10,UChi_10) \
VMADD_MII_IR(U1,Chi_11,UChi_11,UChi_11) \
VMADD_MII_IR(U2,Chi_11,UChi_12,UChi_12) \
: : "r" (32), "r" (32*4), "r" (32*7), "r" (ub )); \
asm ( \
VLOAD(%0,%3,U0) \
VLOAD(%1,%3,U1) \
VLOAD(%2,%3,U2) \
VMADD_RR_RI(U0,Chi_02,UChi_00,UChi_00) \
VMADD_RR_RI(U1,Chi_02,UChi_01,UChi_01) \
VMADD_RR_RI(U2,Chi_02,UChi_02,UChi_02) \
VMADD_RR_RI(U0,Chi_12,UChi_10,UChi_10) \
VMADD_RR_RI(U1,Chi_12,UChi_11,UChi_11) \
VMADD_RR_RI(U2,Chi_12,UChi_12,UChi_12) \
VMADD_MII_IR(U0,Chi_02,UChi_00,UChi_00) \
VMADD_MII_IR(U1,Chi_02,UChi_01,UChi_01) \
VMADD_MII_IR(U2,Chi_02,UChi_02,UChi_02) \
VMADD_MII_IR(U0,Chi_12,UChi_10,UChi_10) \
VMADD_MII_IR(U1,Chi_12,UChi_11,UChi_11) \
VMADD_MII_IR(U2,Chi_12,UChi_12,UChi_12) \
: : "r" (32*2), "r" (32*5), "r" (32*8), "r" (ub )); \
}
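// Editorial reference (illustration only, not part of the kernel): each of the three
// asm blocks in MULT_SPIN above loads one colour column of the 3x3 link (offsets
// {0,3,6}, {1,4,7}, {2,5,8} in 32-byte units, i.e. row-major storage assumed) and
// applies it to both spin components of the half spinor; the qvfxmul / qvfxxnpmadd
// pair is the real/imaginary split of the complex multiply.  A scalar equivalent:
#include <complex>
static inline void mult_2spin_ref(const std::complex<double> U[3][3],
                                  const std::complex<double> Chi[2][3],
                                  std::complex<double>       UChi[2][3])
{
  for (int s = 0; s < 2; s++) {     // the two spin components of the half spinor
    for (int i = 0; i < 3; i++) {   // colour row of the link
      UChi[s][i] = U[i][0] * Chi[s][0];
      for (int j = 1; j < 3; j++) UChi[s][i] += U[i][j] * Chi[s][j];
    }
  }
}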
#define SAVE_RESULT(base,basep) {\
uint64_t ub = ((uint64_t)base) - 32; \
asm("mr %0,"REP";\n\t" \
"li " IMM ",32;\n\t" \
VSTORE(IMM,REP,psi_00) \
VSTORE(IMM,REP,psi_01) \
VSTORE(IMM,REP,psi_02) \
VSTORE(IMM,REP,psi_10) \
VSTORE(IMM,REP,psi_11) \
VSTORE(IMM,REP,psi_12) \
VSTORE(IMM,REP,psi_20) \
VSTORE(IMM,REP,psi_21) \
VSTORE(IMM,REP,psi_22) \
VSTORE(IMM,REP,psi_30) \
VSTORE(IMM,REP,psi_31) \
VSTORE(IMM,REP,psi_32) \
     : : "r" (ub) );			\
}
/*
 *Annoying BG/Q loads with no immediate indexing, and a big performance hit
 *when a second miss to the same L1 line occurs.  Hence the two update-form load
 *passes below stride by 64 bytes (starting from base-64 and base-32), so no two
 *consecutive loads touch the same L1 line.
 */
#define LOAD_CHI(base) { \
uint64_t ub = ((uint64_t)base) - 64; \
asm("mr %0,"REP";\n\t" \
"li " IMM ",64;\n\t" \
VLOAD(IMM,REP,Chi_00) \
VLOAD(IMM,REP,Chi_02) \
VLOAD(IMM,REP,Chi_11) : : "r" (ub) ); \
ub = ((uint64_t)base) - 32; \
asm("mr %0,"REP";\n\t" \
"li IMM,64;\n\t" \
VLOAD(IMM,REP,Chimu_01) \
VLOAD(IMM,REP,Chimu_10) \
VLOAD(IMM,REP,Chimu_12) : : "r" (ub) ); \
}
#define LOAD_CHIMU(base) { \
uint64_t ub = ((uint64_t)base) - 64; \
asm("mr %0,"REP";\n\t" \
"li IMM,64;\n\t" \
VLOAD(IMM,REP,Chimu_00) \
VLOAD(IMM,REP,Chimu_02) \
VLOAD(IMM,REP,Chimu_11) \
VLOAD(IMM,REP,Chimu_20) \
VLOAD(IMM,REP,Chimu_22) \
VLOAD(IMM,REP,Chimu_31) : : "r" (ub) ); \
ub = ((uint64_t)base) - 32; \
asm("mr %0,"REP";\n\t" \
"li IMM,64;\n\t" \
VLOAD(IMM,REP,Chimu_01) \
VLOAD(IMM,REP,Chimu_10) \
VLOAD(IMM,REP,Chimu_12) \
VLOAD(IMM,REP,Chimu_21) \
VLOAD(IMM,REP,Chimu_30) \
VLOAD(IMM,REP,Chimu_32) : : "r" (ub) ); \
}
// hspin(0)=fspin(0)+timesI(fspin(3));
// hspin(1)=fspin(1)+timesI(fspin(2));
#define XP_PROJMEM(base) { \
LOAD_CHIMU(base); \
asm ( \
VONE(one) \
VMADD_MII_IR(one,Chimu_30,Chimu_00,Chi_00) \
VMADD_MII_IR(one,Chimu_31,Chimu_01,Chi_01) \
VMADD_MII_IR(one,Chimu_32,Chimu_02,Chi_02) \
VMADD_MII_IR(one,Chimu_20,Chimu_10,Chi_10) \
VMADD_MII_IR(one,Chimu_21,Chimu_11,Chi_11) \
VMADD_MII_IR(one,Chimu_22,Chimu_12,Chi_12) \
); \
}
#define XM_PROJMEM(base) { \
LOAD_CHIMU(base); \
asm ( \
VONE(one) \
VMADD_II_MIR(one,Chimu_30,Chimu_00,Chi_00) \
VMADD_II_MIR(one,Chimu_31,Chimu_01,Chi_01) \
VMADD_II_MIR(one,Chimu_32,Chimu_02,Chi_02) \
VMADD_II_MIR(one,Chimu_20,Chimu_10,Chi_10) \
VMADD_II_MIR(one,Chimu_21,Chimu_11,Chi_11) \
VMADD_II_MIR(one,Chimu_22,Chimu_12,Chi_12) \
); \
}
// hspin(0)=fspin(0)-fspin(3);
// hspin(1)=fspin(1)+fspin(2);
#define YP_PROJMEM(base) { \
LOAD_CHIMU(base); \
asm ( \
VSUB(Chimu_00,Chimu_00,Chi_30) \
VSUB(Chimu_01,Chimu_01,Chi_31) \
VSUB(Chimu_02,Chimu_02,Chi_32) \
VADD(Chimu_10,Chimu_10,Chi_20) \
VADD(Chimu_11,Chimu_11,Chi_21) \
VADD(Chimu_12,Chimu_12,Chi_22) \
); \
}
#define YM_PROJMEM(base) { \
LOAD_CHIMU(base); \
asm ( \
VADD(Chimu_00,Chimu_00,Chi_30) \
VADD(Chimu_01,Chimu_01,Chi_31) \
VADD(Chimu_02,Chimu_02,Chi_32) \
VSUB(Chimu_10,Chimu_10,Chi_20) \
VSUB(Chimu_11,Chimu_11,Chi_21) \
VSUB(Chimu_12,Chimu_12,Chi_22) \
); \
}
/*Gz
* 0 0 i 0 [0]+-i[2]
* 0 0 0 -i [1]-+i[3]
* -i 0 0 0
* 0 i 0 0
*/
#define ZP_PROJMEM(base) { \
LOAD_CHIMU(base); \
asm ( \
VONE(one) \
VMADD_MII_IR(one,Chimu_20,Chimu_00,Chi_00) \
VMADD_MII_IR(one,Chimu_21,Chimu_01,Chi_01) \
VMADD_MII_IR(one,Chimu_22,Chimu_02,Chi_02) \
VMADD_II_MIR(one,Chimu_30,Chimu_10,Chi_10) \
VMADD_II_MIR(one,Chimu_31,Chimu_11,Chi_11) \
VMADD_II_MIR(one,Chimu_32,Chimu_12,Chi_12) \
); \
}
#define ZM_PROJMEM(base) { \
LOAD_CHIMU(base); \
asm ( \
VONE(one) \
VMADD_II_MIR(one,Chimu_20,Chimu_00,Chi_00) \
VMADD_II_MIR(one,Chimu_21,Chimu_01,Chi_01) \
VMADD_II_MIR(one,Chimu_22,Chimu_02,Chi_02) \
VMADD_MII_IR(one,Chimu_30,Chimu_10,Chi_10) \
VMADD_MII_IR(one,Chimu_31,Chimu_11,Chi_11) \
VMADD_MII_IR(one,Chimu_32,Chimu_12,Chi_12) \
); \
}
/*Gt
* 0 0 1 0 [0]+-[2]
* 0 0 0 1 [1]+-[3]
* 1 0 0 0
* 0 1 0 0
*/
#define TP_PROJMEM(base) { \
LOAD_CHIMU(base); \
asm ( \
VADD(Chimu_00,Chimu_00,Chi_20) \
VADD(Chimu_01,Chimu_01,Chi_21) \
VADD(Chimu_02,Chimu_02,Chi_22) \
VADD(Chimu_10,Chimu_10,Chi_30) \
VADD(Chimu_11,Chimu_11,Chi_31) \
VADD(Chimu_12,Chimu_12,Chi_32) \
); \
}
#define TM_PROJMEM(base) { \
LOAD_CHIMU(base); \
asm ( \
VSUB(Chimu_00,Chimu_00,Chi_20) \
VSUB(Chimu_01,Chimu_01,Chi_21) \
VSUB(Chimu_02,Chimu_02,Chi_22) \
VSUB(Chimu_10,Chimu_10,Chi_30) \
VSUB(Chimu_11,Chimu_11,Chi_31) \
VSUB(Chimu_12,Chimu_12,Chi_32) \
); \
}
/*
fspin(0)=hspin(0);
fspin(1)=hspin(1);
fspin(2)=timesMinusI(hspin(1));
fspin(3)=timesMinusI(hspin(0));
fspin(0)+=hspin(0);
fspin(1)+=hspin(1);
fspin(2)-=timesI(hspin(1));
fspin(3)-=timesI(hspin(0));
*/
#define XP_RECON { \
asm(\
VONE(one)\
VMOV(psi_00,UChi_00) VMOV(psi_01,UChi_01) VMOV(psi_02,UChi_02)\
VMOV(psi_10,UChi_10) VMOV(psi_11,UChi_11) VMOV(psi_12,UChi_12)\
VZERO(psi_20) VZERO(psi_21) VZERO(psi_22) \
VZERO(psi_30) VZERO(psi_31) VZERO(psi_32) \
VMADD_II_MIR(one,UChi_10,psi_20,psi_20) \
VMADD_II_MIR(one,UChi_11,psi_21,psi_21) \
VMADD_II_MIR(one,UChi_12,psi_22,psi_22) \
VMADD_II_MIR(one,UChi_00,psi_30,psi_30) \
VMADD_II_MIR(one,UChi_01,psi_31,psi_31) \
VMADD_II_MIR(one,UChi_02,psi_32,psi_32) \
); \
}
#define XM_RECON { \
asm(\
VONE(one)\
VMOV(psi_00,UChi_00) VMOV(psi_01,UChi_01) VMOV(psi_02,UChi_02)\
VMOV(psi_10,UChi_10) VMOV(psi_11,UChi_11) VMOV(psi_12,UChi_12)\
VZERO(psi_20) VZERO(psi_21) VZERO(psi_22) \
VZERO(psi_30) VZERO(psi_31) VZERO(psi_32) \
VMADD_MII_IR(one,UChi_10,psi_20,psi_20) \
VMADD_MII_IR(one,UChi_11,psi_21,psi_21) \
VMADD_MII_IR(one,UChi_12,psi_22,psi_22) \
VMADD_MII_IR(one,UChi_00,psi_30,psi_30) \
VMADD_MII_IR(one,UChi_01,psi_31,psi_31) \
VMADD_MII_IR(one,UChi_02,psi_32,psi_32) \
); \
}
#define XP_RECON_ACCUM { \
asm(\
VONE(one)\
VADD(psi_00,UChi_00,psi_00) VADD(psi_01,UChi_01,psi_01) VADD(psi_02,UChi_02,psi_02) \
VADD(psi_10,UChi_10,psi_10) VADD(psi_11,UChi_11,psi_11) VADD(psi_12,UChi_12,psi_12) \
VMADD_II_MIR(one,UChi_10,psi_20,psi_20) \
VMADD_II_MIR(one,UChi_11,psi_21,psi_21) \
VMADD_II_MIR(one,UChi_12,psi_22,psi_22) \
VMADD_II_MIR(one,UChi_00,psi_30,psi_30) \
VMADD_II_MIR(one,UChi_01,psi_31,psi_31) \
VMADD_II_MIR(one,UChi_02,psi_32,psi_32) \
); \
}
#define XM_RECON_ACCUM { \
asm(\
VONE(one)\
VADD(psi_00,UChi_00,psi_00) VADD(psi_01,UChi_01,psi_01) VADD(psi_02,UChi_02,psi_02) \
VADD(psi_10,UChi_10,psi_10) VADD(psi_11,UChi_11,psi_11) VADD(psi_12,UChi_12,psi_12) \
VMADD_MII_IR(one,UChi_10,psi_20,psi_20) \
VMADD_MII_IR(one,UChi_11,psi_21,psi_21) \
VMADD_MII_IR(one,UChi_12,psi_22,psi_22) \
VMADD_MII_IR(one,UChi_00,psi_30,psi_30) \
VMADD_MII_IR(one,UChi_01,psi_31,psi_31) \
VMADD_MII_IR(one,UChi_02,psi_32,psi_32) \
); \
}
// fspin(2)+=hspin(1);
// fspin(3)-=hspin(0);
#define YP_RECON_ACCUM {\
asm(\
VADD(psi_00,UChi_00,psi_00) VADD(psi_01,UChi_01,psi_01) VADD(psi_02,UChi_02,psi_02) \
VADD(psi_10,UChi_10,psi_10) VADD(psi_11,UChi_11,psi_11) VADD(psi_12,UChi_12,psi_12) \
VADD(psi_20,UChi_10,psi_20) VADD(psi_21,UChi_11,psi_21) VADD(psi_22,UChi_12,psi_22) \
VSUB(psi_30,UChi_00,psi_30) VSUB(psi_31,UChi_01,psi_31) VSUB(psi_32,UChi_02,psi_32) \
);\
}
#define YM_RECON_ACCUM {\
asm(\
VADD(psi_00,UChi_00,psi_00) VADD(psi_01,UChi_01,psi_01) VADD(psi_02,UChi_02,psi_02) \
VADD(psi_10,UChi_10,psi_10) VADD(psi_11,UChi_11,psi_11) VADD(psi_12,UChi_12,psi_12) \
VSUB(psi_20,UChi_10,psi_20) VSUB(psi_21,UChi_11,psi_21) VSUB(psi_22,UChi_12,psi_22) \
VADD(psi_30,UChi_00,psi_30) VADD(psi_31,UChi_01,psi_31) VADD(psi_32,UChi_02,psi_32) \
);\
}
// fspin(2)-=timesI(hspin(0));
// fspin(3)+=timesI(hspin(1));
#define ZP_RECON_ACCUM {\
asm(\
VONE(one)\
VADD(psi_00,UChi_00,psi_00) VADD(psi_01,UChi_01,psi_01) VADD(psi_02,UChi_02,psi_02) \
VADD(psi_10,UChi_10,psi_10) VADD(psi_11,UChi_11,psi_11) VADD(psi_12,UChi_12,psi_12) \
VMADD_II_MIR(one,UChi_00,psi_20,psi_20) \
VMADD_II_MIR(one,UChi_01,psi_21,psi_21) \
VMADD_II_MIR(one,UChi_02,psi_22,psi_22) \
VMADD_MII_IR(one,UChi_10,psi_30,psi_30) \
VMADD_MII_IR(one,UChi_11,psi_31,psi_31) \
VMADD_MII_IR(one,UChi_12,psi_32,psi_32) \
);\
}
#define ZM_RECON_ACCUM {\
asm(\
VONE(one)\
VADD(psi_00,UChi_00,psi_00) VADD(psi_01,UChi_01,psi_01) VADD(psi_02,UChi_02,psi_02) \
VADD(psi_10,UChi_10,psi_10) VADD(psi_11,UChi_11,psi_11) VADD(psi_12,UChi_12,psi_12) \
VMADD_MII_IR(one,UChi_00,psi_20,psi_20) \
VMADD_MII_IR(one,UChi_01,psi_21,psi_21) \
VMADD_MII_IR(one,UChi_02,psi_22,psi_22) \
VMADD_II_MIR(one,UChi_10,psi_30,psi_30) \
VMADD_II_MIR(one,UChi_11,psi_31,psi_31) \
VMADD_II_MIR(one,UChi_12,psi_32,psi_32) \
);\
}
// fspin(2)+=hspin(0);
// fspin(3)+=hspin(1);
#define TP_RECON_ACCUM {\
asm(\
VADD(psi_00,UChi_00,psi_00) VADD(psi_01,UChi_01,psi_01) VADD(psi_02,UChi_02,psi_02) \
VADD(psi_10,UChi_10,psi_10) VADD(psi_11,UChi_11,psi_11) VADD(psi_12,UChi_12,psi_12) \
VADD(psi_20,UChi_00,psi_20) VADD(psi_21,UChi_01,psi_21) VADD(psi_22,UChi_02,psi_22) \
VADD(psi_30,UChi_10,psi_30) VADD(psi_31,UChi_11,psi_31) VADD(psi_32,UChi_12,psi_32) \
);\
}
#define TM_RECON_ACCUM {\
asm(\
VONE(one)\
VADD(psi_00,UChi_00,psi_00) VADD(psi_01,UChi_01,psi_01) VADD(psi_02,UChi_02,psi_02) \
VADD(psi_10,UChi_10,psi_10) VADD(psi_11,UChi_11,psi_11) VADD(psi_12,UChi_12,psi_12) \
VSUB(psi_20,UChi_00,psi_20) VSUB(psi_21,UChi_01,psi_21) VSUB(psi_22,UChi_02,psi_22) \
VSUB(psi_30,UChi_10,psi_30) VSUB(psi_31,UChi_11,psi_31) VSUB(psi_32,UChi_12,psi_32) \
);\
}
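// Editorial reference (illustration only): scalar model of the x+ projection
// (XP_PROJMEM) and reconstruction (XP_RECON) above, following the hspin/fspin
// comments; in the kernel the link multiply (MULT_SPIN) acts on the half spinor
// between the two steps.  Uses std::complex as in the sketch above.
static inline void xp_proj_recon_ref(const std::complex<double> fin[4][3],
                                     std::complex<double>       fout[4][3])
{
  const std::complex<double> cI(0.0, 1.0);
  std::complex<double> h[2][3];
  for (int c = 0; c < 3; c++) {               // project: 4 spins -> 2 spins
    h[0][c] = fin[0][c] + cI * fin[3][c];     // hspin(0)=fspin(0)+timesI(fspin(3))
    h[1][c] = fin[1][c] + cI * fin[2][c];     // hspin(1)=fspin(1)+timesI(fspin(2))
  }
  for (int c = 0; c < 3; c++) {               // reconstruct the four-spinor
    fout[0][c] =  h[0][c];
    fout[1][c] =  h[1][c];
    fout[2][c] = -cI * h[1][c];               // fspin(2)=timesMinusI(hspin(1))
    fout[3][c] = -cI * h[0][c];               // fspin(3)=timesMinusI(hspin(0))
  }
}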
uint64_t GetPFInfo(int nent,int plocal);
uint64_t GetInfo(int ptype,int local,int perm,int Xp,int ent,int plocal);
#define COMPLEX_TYPE int
int signs[4];
void testme(int osites,int ssU)
{
int local,perm, ptype;
uint64_t base;
uint64_t basep;
const uint64_t plocal =(uint64_t) & in._odata[0];
// vComplexF isigns[2] = { signs[0], signs[1] };
//COMPLEX_TYPE is vComplexF or vComplexD depending
//on the chosen precision
COMPLEX_TYPE *isigns = &signs[0];
MASK_REGS;
int nmax=osites;
for(int site=0;site<Ns;site++) {
int sU =ssU;
int ssn=ssU+1;
if(ssn>=nmax) ssn=0;
int sUn=ssn;
for(int s=0;s<Ls;s++) {
ss =sU*Ls+s;
ssn=sUn*Ls+s;
////////////////////////////////
// Xp
////////////////////////////////
int ent=ss*8;// 2*Ndim
int nent=ssn*8;
PF_GAUGE(Xp);
base = GetInfo(ptype,local,perm,Xp,ent,plocal); ent++;
PREFETCH1_CHIMU(base);
basep = GetPFInfo(nent,plocal); nent++;
if ( local ) {
LOAD64(%r10,isigns);
#ifdef KERNEL_DAG
XP_PROJMEM(base);
#else
XM_PROJMEM(base);
#endif
MAYBEPERM(PERMUTE_DIR3,perm);
} else {
LOAD_CHI(base);
}
base = GetInfo(ptype,local,perm,Yp,ent,plocal); ent++;
PREFETCH_CHIMU(base);
{
MULT_2SPIN_DIR_PFXP(Xp,basep);
}
LOAD64(%r10,isigns);
#ifdef KERNEL_DAG
XP_RECON;
#else
XM_RECON;
#endif
////////////////////////////////
// Yp
////////////////////////////////
basep = GetPFInfo(nent,plocal); nent++;
if ( local ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
YP_PROJMEM(base);
#else
YM_PROJMEM(base);
#endif
MAYBEPERM(PERMUTE_DIR2,perm);
} else {
LOAD_CHI(base);
}
base = GetInfo(ptype,local,perm,Zp,ent,plocal); ent++;
PREFETCH_CHIMU(base);
{
MULT_2SPIN_DIR_PFYP(Yp,basep);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
YP_RECON_ACCUM;
#else
YM_RECON_ACCUM;
#endif
////////////////////////////////
// Zp
////////////////////////////////
basep = GetPFInfo(nent,plocal); nent++;
if ( local ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
ZP_PROJMEM(base);
#else
ZM_PROJMEM(base);
#endif
MAYBEPERM(PERMUTE_DIR1,perm);
} else {
LOAD_CHI(base);
}
base = GetInfo(ptype,local,perm,Tp,ent,plocal); ent++;
PREFETCH_CHIMU(base);
{
MULT_2SPIN_DIR_PFZP(Zp,basep);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
ZP_RECON_ACCUM;
#else
ZM_RECON_ACCUM;
#endif
////////////////////////////////
// Tp
////////////////////////////////
basep = GetPFInfo(nent,plocal); nent++;
if ( local ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
TP_PROJMEM(base);
#else
TM_PROJMEM(base);
#endif
MAYBEPERM(PERMUTE_DIR0,perm);
} else {
LOAD_CHI(base);
}
base = GetInfo(ptype,local,perm,Xm,ent,plocal); ent++;
PREFETCH_CHIMU(base);
{
MULT_2SPIN_DIR_PFTP(Tp,basep);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
TP_RECON_ACCUM;
#else
TM_RECON_ACCUM;
#endif
////////////////////////////////
// Xm
////////////////////////////////
#ifndef STREAM_STORE
basep= (uint64_t) &out._odata[ss];
#endif
// basep= GetPFInfo(nent,plocal); nent++;
if ( local ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
XM_PROJMEM(base);
#else
XP_PROJMEM(base);
#endif
MAYBEPERM(PERMUTE_DIR3,perm);
} else {
LOAD_CHI(base);
}
base = GetInfo(ptype,local,perm,Ym,ent,plocal); ent++;
PREFETCH_CHIMU(base);
{
MULT_2SPIN_DIR_PFXM(Xm,basep);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
XM_RECON_ACCUM;
#else
XP_RECON_ACCUM;
#endif
////////////////////////////////
// Ym
////////////////////////////////
basep= GetPFInfo(nent,plocal); nent++;
if ( local ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
YM_PROJMEM(base);
#else
YP_PROJMEM(base);
#endif
MAYBEPERM(PERMUTE_DIR2,perm);
} else {
LOAD_CHI(base);
}
base = GetInfo(ptype,local,perm,Zm,ent,plocal); ent++;
PREFETCH_CHIMU(base);
{
MULT_2SPIN_DIR_PFYM(Ym,basep);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
YM_RECON_ACCUM;
#else
YP_RECON_ACCUM;
#endif
////////////////////////////////
// Zm
////////////////////////////////
basep= GetPFInfo(nent,plocal); nent++;
if ( local ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
ZM_PROJMEM(base);
#else
ZP_PROJMEM(base);
#endif
MAYBEPERM(PERMUTE_DIR1,perm);
} else {
LOAD_CHI(base);
}
base = GetInfo(ptype,local,perm,Tm,ent,plocal); ent++;
PREFETCH_CHIMU(base);
{
MULT_2SPIN_DIR_PFZM(Zm,basep);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
ZM_RECON_ACCUM;
#else
ZP_RECON_ACCUM;
#endif
////////////////////////////////
// Tm
////////////////////////////////
basep= GetPFInfo(nent,plocal); nent++;
if ( local ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
TM_PROJMEM(base);
#else
TP_PROJMEM(base);
#endif
MAYBEPERM(PERMUTE_DIR0,perm);
} else {
LOAD_CHI(base);
}
base= (uint64_t) &out._odata[ss];
#ifndef STREAM_STORE
PREFETCH_CHIMU(base);
#endif
{
MULT_2SPIN_DIR_PFTM(Tm,basep);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
TM_RECON_ACCUM;
#else
TP_RECON_ACCUM;
#endif
basep= GetPFInfo(nent,plocal); nent++;
SAVE_RESULT(base,basep);
}
ssU++;
}
}
#endif


@ -460,9 +460,62 @@ namespace Optimization {
static inline __m256d Permute3(__m256d in){
return in;
};
};
struct Exchange{
// 3210 ordering
static inline void Exchange0(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){
//Invertible
//AB CD -> AC BD
//AC BD -> AB CD
out1= _mm256_permute2f128_ps(in1,in2,0x20);
out2= _mm256_permute2f128_ps(in1,in2,0x31);
};
static inline void Exchange1(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){
//Invertible
// ABCD EFGH ->ABEF CDGH
// ABEF CDGH ->ABCD EFGH
out1= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0));
out2= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2));
};
static inline void Exchange2(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){
// Invertible ?
// ABCD EFGH -> ACEG BDFH
// ACEG BDFH -> AEBF CGDH
// out1= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0));
// out2= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1));
// Bollocks; need
// AECG BFDH -> ABCD EFGH
out1= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0)); /*ACEG*/
out2= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1)); /*BDFH*/
out1= _mm256_shuffle_ps(out1,out1,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/
out2= _mm256_shuffle_ps(out2,out2,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*BFDH*/
};
static inline void Exchange3(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){
assert(0);
return;
};
static inline void Exchange0(__m256d &out1,__m256d &out2,__m256d in1,__m256d in2){
out1= _mm256_permute2f128_pd(in1,in2,0x20);
out2= _mm256_permute2f128_pd(in1,in2,0x31);
return;
};
static inline void Exchange1(__m256d &out1,__m256d &out2,__m256d in1,__m256d in2){
out1= _mm256_shuffle_pd(in1,in2,0x0);
out2= _mm256_shuffle_pd(in1,in2,0xF);
};
static inline void Exchange2(__m256d &out1,__m256d &out2,__m256d in1,__m256d in2){
assert(0);
return;
};
static inline void Exchange3(__m256d &out1,__m256d &out2,__m256d in1,__m256d in2){
assert(0);
return;
};
};
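  // Editorial illustration (not part of the original struct): concrete lane values
  // for the single-precision Exchange0 above.  With in1 = {0..7} and in2 = {10..17}
  // the outputs are the matching 128-bit halves of the two inputs.  Helper name is
  // illustrative; AVX is assumed available exactly as in the surrounding code.
  static inline void Exchange0_demo(float r1[8], float r2[8]) {
    alignas(32) float a[8] = {0,1,2,3,4,5,6,7};
    alignas(32) float b[8] = {10,11,12,13,14,15,16,17};
    __m256 in1 = _mm256_load_ps(a), in2 = _mm256_load_ps(b);
    __m256 o1  = _mm256_permute2f128_ps(in1,in2,0x20); // low halves : 0,1,2,3,10,11,12,13
    __m256 o2  = _mm256_permute2f128_ps(in1,in2,0x31); // high halves: 4,5,6,7,14,15,16,17
    _mm256_storeu_ps(r1,o1);
    _mm256_storeu_ps(r2,o2);
  }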
#if defined (AVX2)
#define _mm256_alignr_epi32_grid(ret,a,b,n) ret=(__m256) _mm256_alignr_epi8((__m256i)a,(__m256i)b,(n*4)%16)
#define _mm256_alignr_epi64_grid(ret,a,b,n) ret=(__m256d) _mm256_alignr_epi8((__m256i)a,(__m256i)b,(n*8)%16)


@ -343,6 +343,52 @@ namespace Optimization {
};
// On extracting face:  Ah Al, Bh Bl -> Ah Bh, Al Bl
// On merging buffers:  Ah Bh, Al Bl -> Ah Al, Bh Bl
// The operation is its own inverse
struct Exchange{
// 3210 ordering
static inline void Exchange0(__m512 &out1,__m512 &out2,__m512 in1,__m512 in2){
out1= _mm512_shuffle_f32x4(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0));
out2= _mm512_shuffle_f32x4(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2));
};
static inline void Exchange1(__m512 &out1,__m512 &out2,__m512 in1,__m512 in2){
out1= _mm512_shuffle_f32x4(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0));
out2= _mm512_shuffle_f32x4(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1));
out1= _mm512_shuffle_f32x4(out1,out1,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/
out2= _mm512_shuffle_f32x4(out2,out2,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*BFDH*/
};
static inline void Exchange2(__m512 &out1,__m512 &out2,__m512 in1,__m512 in2){
out1= _mm512_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0));
out2= _mm512_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2));
};
static inline void Exchange3(__m512 &out1,__m512 &out2,__m512 in1,__m512 in2){
out1= _mm512_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0));
out2= _mm512_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1));
out1= _mm512_shuffle_ps(out1,out1,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/
out2= _mm512_shuffle_ps(out2,out2,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*BFDH*/
};
static inline void Exchange0(__m512d &out1,__m512d &out2,__m512d in1,__m512d in2){
out1= _mm512_shuffle_f64x2(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0));
out2= _mm512_shuffle_f64x2(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2));
};
static inline void Exchange1(__m512d &out1,__m512d &out2,__m512d in1,__m512d in2){
out1= _mm512_shuffle_f64x2(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0));
out2= _mm512_shuffle_f64x2(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1));
out1= _mm512_shuffle_f64x2(out1,out1,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/
out2= _mm512_shuffle_f64x2(out2,out2,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*BFDH*/
};
static inline void Exchange2(__m512d &out1,__m512d &out2,__m512d in1,__m512d in2){
out1 = _mm512_shuffle_pd(in1,in2,0x00);
out2 = _mm512_shuffle_pd(in1,in2,0xFF);
};
static inline void Exchange3(__m512d &out1,__m512d &out2,__m512d in1,__m512d in2){
assert(0);
return;
};
};
struct Rotate{


@ -180,6 +180,22 @@ namespace Optimization {
/////////////////////////////////////////////////////
// Arithmetic operations
/////////////////////////////////////////////////////
#define FLOAT_WRAP_3(fn, pref)\
pref vector4float fn(vector4float a, vector4float b, vector4float c) \
{\
vector4double ad, bd, rd, cd; \
vector4float r;\
\
ad = Vset()(a);\
bd = Vset()(b);\
cd = Vset()(c);\
rd = fn(ad, bd, cd); \
Vstore()(rd, r);\
\
return r;\
}
#define FLOAT_WRAP_2(fn, pref)\
pref vector4float fn(vector4float a, vector4float b)\
{\
@ -259,6 +275,13 @@ namespace Optimization {
}
FLOAT_WRAP_2(operator(), inline)
};
struct MaddRealPart{
// Complex double
inline vector4double operator()(vector4double a, vector4double b,vector4double c){
return vec_xmadd(a, b, c);
}
FLOAT_WRAP_3(operator(), inline)
};
struct MultComplex{
// Complex double
inline vector4double operator()(vector4double a, vector4double b){
@ -368,19 +391,36 @@ namespace Optimization {
};
struct Rotate{
template<int n> static inline vector4double tRotate(vector4double v){
if ( n==1 ) return vec_perm(v, v, vec_gpci(01230));
if ( n==2 ) return vec_perm(v, v, vec_gpci(02301));
if ( n==3 ) return vec_perm(v, v, vec_gpci(03012));
return v;
};
template<int n> static inline vector4float tRotate(vector4float a)
{
vector4double ad, rd;
vector4float r;
ad = Vset()(a);
rd = tRotate<n>(ad);
Vstore()(rd, r);
return r;
};
static inline vector4double rotate(vector4double v, int n){
switch(n){
case 0:
return v;
break;
case 1:
return vec_perm(v, v, vec_gpci(01230));
return tRotate<1>(v);
break;
case 2:
return vec_perm(v, v, vec_gpci(02301));
return tRotate<2>(v);
break;
case 3:
return vec_perm(v, v, vec_gpci(03012));
return tRotate<3>(v);
break;
default: assert(0);
}
@ -389,11 +429,9 @@ namespace Optimization {
static inline vector4float rotate(vector4float v, int n){
vector4double vd, rd;
vector4float r;
vd = Vset()(v);
rd = rotate(vd, n);
Vstore()(rd, r);
return r;
}
};
@ -484,6 +522,7 @@ typedef Optimization::Mult MultSIMD;
typedef Optimization::Div DivSIMD;
typedef Optimization::MultComplex MultComplexSIMD;
typedef Optimization::MultRealPart MultRealPartSIMD;
typedef Optimization::MaddRealPart MaddRealPartSIMD;
typedef Optimization::Conj ConjSIMD;
typedef Optimization::TimesMinusI TimesMinusISIMD;
typedef Optimization::TimesI TimesISIMD;


@ -326,7 +326,43 @@ namespace Optimization {
static inline __m128d Permute3(__m128d in){
return in;
};
};
struct Exchange{
// 3210 ordering
static inline void Exchange0(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){
out1= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0));
out2= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2));
};
static inline void Exchange1(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){
out1= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0));
out2= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1));
};
static inline void Exchange2(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){
assert(0);
return;
};
static inline void Exchange3(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){
assert(0);
return;
};
static inline void Exchange0(__m128d &out1,__m128d &out2,__m128d in1,__m128d in2){
out1= _mm_shuffle_pd(in1,in2,0x0);
out2= _mm_shuffle_pd(in1,in2,0x3);
};
static inline void Exchange1(__m128d &out1,__m128d &out2,__m128d in1,__m128d in2){
assert(0);
return;
};
static inline void Exchange2(__m128d &out1,__m128d &out2,__m128d in1,__m128d in2){
assert(0);
return;
};
static inline void Exchange3(__m128d &out1,__m128d &out2,__m128d in1,__m128d in2){
assert(0);
return;
};
};
struct Rotate{


@ -350,6 +350,27 @@ class Grid_simd {
return ret;
}
///////////////////////
// Exchange
// Al Ah, Bl Bh -> Al Bl, Ah Bh
///////////////////////
friend inline void exchange(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2,int n)
{
if (n==3) {
Optimization::Exchange::Exchange3(out1.v,out2.v,in1.v,in2.v);
// std::cout << " Exchange3 "<< out1<<" "<< out2<<" <- " << in1 << " "<<in2<<std::endl;
} else if(n==2) {
Optimization::Exchange::Exchange2(out1.v,out2.v,in1.v,in2.v);
// std::cout << " Exchange2 "<< out1<<" "<< out2<<" <- " << in1 << " "<<in2<<std::endl;
} else if(n==1) {
Optimization::Exchange::Exchange1(out1.v,out2.v,in1.v,in2.v);
// std::cout << " Exchange1 "<< out1<<" "<< out2<<" <- " << in1 << " "<<in2<<std::endl;
} else if(n==0) {
Optimization::Exchange::Exchange0(out1.v,out2.v,in1.v,in2.v);
// std::cout << " Exchange0 "<< out1<<" "<< out2<<" <- " << in1 << " "<<in2<<std::endl;
}
}
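  ///////////////////////
  // Editorial reference (illustration only, not used by the library): scalar model
  // of the exchange levels above for a SIMD vector of L scalar words.  The sub-block
  // size at level n is bs = L >> (n+1); out1 gathers the low sub-block of every
  // 2*bs group from both inputs, out2 the high sub-blocks, so applying the same
  // level twice returns the original pair.
  ///////////////////////
  template <class W>
  static void exchange_scalar_ref(W *out1, W *out2, const W *in1, const W *in2,
                                  int L, int n) {
    int bs = L >> (n + 1);                     // sub-block size at this level
    for (int b = 0; b < L; b += 2 * bs) {
      for (int j = 0; j < bs; j++) {
        out1[b + j]      = in1[b + j];         // low sub-blocks of the pair
        out1[b + bs + j] = in2[b + j];
        out2[b + j]      = in1[b + bs + j];    // high sub-blocks of the pair
        out2[b + bs + j] = in2[b + bs + j];
      }
    }
  }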
////////////////////////////////////////////////////////////////////
// General permute; assumes vector length is same across
// all subtypes; may not be a good assumption, but could
@ -372,23 +393,11 @@ class Grid_simd {
int dist = perm & 0xF;
y = rotate(b, dist);
return;
}
switch (perm) {
case 3:
permute3(y, b);
break;
case 2:
permute2(y, b);
break;
case 1:
permute1(y, b);
break;
case 0:
permute0(y, b);
break;
default:
assert(0);
}
}
else if(perm==3) permute3(y, b);
else if(perm==2) permute2(y, b);
else if(perm==1) permute1(y, b);
else if(perm==0) permute0(y, b);
}
///////////////////////////////
@ -457,6 +466,8 @@ inline void rbroadcast(Grid_simd<S,V> &ret,const Grid_simd<S,V> &src,int lane){
ret.v = unary<V>(real(typepun[lane]), VsplatSIMD());
}
///////////////////////
// Splat
///////////////////////

lib/simd/IBM_qpx.h Normal file

@ -0,0 +1,598 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/simd/IBM_qpx.h
Copyright (C) 2015
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_ASM_BGQ_QPX_H
#define GRID_ASM_BGQ_QPX_H
#include <stdint.h>
/*********************************************************
* Register definitions
*********************************************************/
#define psi_00 0
#define psi_01 1
#define psi_02 2
#define psi_10 3
#define psi_11 4
#define psi_12 5
#define psi_20 6
#define psi_21 7
#define psi_22 8
#define psi_30 9
#define psi_31 10
#define psi_32 11
#define Chi_00 12
#define Chi_01 13
#define Chi_02 14
#define Chi_10 15
#define Chi_11 16
#define Chi_12 17
#define UChi_00 18
#define UChi_01 19
#define UChi_02 20
#define UChi_10 21
#define UChi_11 22
#define UChi_12 23
#define U0 24
#define U1 25
#define U2 26
#define one 27
#define perm_reg 28
#define REP %%r16
#define IMM %%r17
#define pREP %r16
#define pIMM %r17
#define PPC_INST_DCBTLS 0x7c00014c
#define PPC_INST_DCBLC 0x7c00030c
#define __PPC_CT(t) (((t) & 0x0f) << 21)
#define ___PPC_RA(a) (((a) & 0x1f) << 16)
#define ___PPC_RB(b) (((b) & 0x1f) << 11)
#define LOCK_SET ".long (" HASH(PPC_INST_DCBTLS) "|" HASH(___PPC_RB(16)) ")\n"
#define LOCK_CLEAR ".long (" HASH(PPC_INST_DCBLC) "|" HASH(___PPC_RB(16)) ")\n"
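// LOCK_SET / LOCK_CLEAR hand-encode "dcbtls 0,0,r16" / "dcblc 0,0,r16" (data cache
// block touch-and-lock set / lock clear) as raw opcodes, presumably to avoid relying
// on assembler support for the cache-locking mnemonics.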
/*Alias regs for incoming fourspinor on neighbour site*/
#define Chi_20 UChi_00
#define Chi_21 UChi_01
#define Chi_22 UChi_02
#define Chi_30 UChi_10
#define Chi_31 UChi_11
#define Chi_32 UChi_12
/*********************************************************
* Architectural macros
*********************************************************/
#define HASHit(A) #A
#define HASH(A) HASHit(A)
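// Two-level stringisation: HASH(REP) expands REP to %%r16 before stringising,
// so the asm templates below see "%%r16" rather than the literal "REP".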
#define LOAD64(A,ptr)
#define MASK_REGS /*NOOP ON BGQ*/
#define PF_GAUGE(A) /*NOOP ON BGQ*/
#define PREFETCH1_CHIMU(base) /*NOOP ON BGQ*/
#define PREFETCH_CHIMU(base) /*NOOP ON BGQ*/
#define VLOADf(OFF,PTR,DEST) "qvlfsx " #DEST "," #PTR "," #OFF " ;\n"
#define VLOADuf(OFF,PTR,DEST) "qvlfsux " #DEST "," #PTR "," #OFF " ;\n"
#define VSTOREf(OFF,PTR,SRC) "qvstfsx " #SRC "," #PTR "," #OFF " ;\n"
#define VSTOREuf(OFF,PTR,SRC) "qvstfsux " #SRC "," #PTR "," #OFF " ;\n"
#define VSPLATf(A,B,DEST) "qvlfcsxa " #DEST "," #A "," #B ";\n"
#define VSIZEf (16)
#define VPERMIi(p) "qvgpci " #p ", 1217;\n"
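// 1217 == octal 02301, i.e. the (2,3,0,1) element selection: swap the two halves of
// the 4-wide QPX vector (the same control vec_gpci(02301) builds for rotate-by-2).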
#define VPERMi(A,p) "qvfperm " #A "," #A "," #A "," #p ";\n"
#define VPERMI(p) VPERMIi(p)
#define VPERM(A,p) VPERMi(A,p)
#define VLOADd(OFF,PTR,DEST) "qvlfdx " #DEST "," #PTR "," #OFF " ;\n"
#define VLOADud(OFF,PTR,DEST) "qvlfdux " #DEST "," #PTR "," #OFF " ;\n"
#define VSTOREd(OFF,PTR,SRC) "qvstfdx " #SRC "," #PTR "," #OFF " ;\n"
#define VSTOREud(OFF,PTR,SRC) "qvstfdux " #SRC "," #PTR "," #OFF " ;\n"
#define VSPLATd(A,B,DEST) "qvlfcdxa " #DEST "," #A "," #B ";\n"
#define VSIZEd (32)
// QPX manual ordering QRT comes first (dest)
#define VZEROi(DEST) "qvfset " #DEST "; \n qvfsub " #DEST "," #DEST "," #DEST ";\n"
#define VONEi(DEST) "qvfset " #DEST "; \n"
#define VMOVi(DEST,A) "qvfmr " #DEST "," #A ";\n"
#define VADDi(DEST,A,B) "qvfadd " #DEST "," #A "," #B ";\n"
#define VSUBi(DEST,A,B) "qvfsub " #DEST "," #A "," #B ";\n"
#define VMULi(DEST,A,B) "qvfmul " #DEST "," #A "," #B ";\n"
#define VMUL_RR_RIi(DEST,A,B) "qvfxmul " #DEST "," #A "," #B ";\n"
#define VMADDi(DEST,A,B,C) "qvfmadd " #DEST "," #A "," #B ","#C ";\n"
#define VMADD_RR_RIi(DEST,A,B,C) "qvfxmadd " #DEST "," #A "," #B ","#C ";\n"
#define VMADD_MII_IRi(DEST,A,B,C) "qvfxxnpmadd " #DEST "," #B "," #A ","#C ";\n"
#define VMADD_II_MIRi(DEST,A,B,C) "qvfxxcpnmadd " #DEST "," #B "," #A ","#C ";\n"
#define VZERO(C) VZEROi(C)
#define VONE(C) VONEi(C)
#define VMOV(C,A) VMOVi(C,A)
#define VADD(A,B,C) VADDi(A,B,C)
#define VSUB(A,B,C) VSUBi(A,B,C)
#define VMUL(A,B,C) VMULi(A,B,C)
#define VMUL_RR_RI(A,B,C) VMUL_RR_RIi(A,B,C)
#define VMADD(A,B,C,D) VMADDi(A,B,C,D)
#define VMADD_RR_RI(A,B,C,D) VMADD_RR_RIi(A,B,C,D)
#define VMADD_MII_IR(A,B,C,D) VMADD_MII_IRi(A,B,C,D)
#define VMADD_II_MIR(A,B,C,D) VMADD_II_MIRi(A,B,C,D)
/*********************************************************
* Macro sequences encoding QCD
*********************************************************/
#define LOCK_GAUGE(dir) \
{ \
uint64_t byte_addr = (uint64_t)&U._odata[sU]; \
int count = (sizeof(U._odata[0])+63)/64; \
asm (" mtctr %0 \n" \
" mr " HASH(REP) ", %1\n" \
" li " HASH(IMM) ", 64\n" \
"0:\n" \
LOCK_SET \
" add " HASH(REP) "," HASH(IMM) "," HASH(REP) "\n" \
" bdnz 0b\n" \
: : "b" (count), "b" (byte_addr) ); \
}
#define UNLOCK_GAUGE(dir) \
{ \
uint64_t byte_addr = (uint64_t)&U._odata[sU]; \
int count = (sizeof(U._odata[0])+63)/64; \
asm (" mtctr %0 \n" \
" mr " HASH(REP) ", %1\n" \
" li " HASH(IMM) ", 64\n" \
"0:\n" \
LOCK_CLEAR \
" add " HASH(REP) "," HASH(IMM) "," HASH(REP) "\n" \
" bdnz 0b\n" \
: : "b" (count), "b" (byte_addr) ); \
}
#define ZERO_PSI \
VZERO(psi_00) \
VZERO(psi_01) \
VZERO(psi_02) \
VZERO(psi_10) \
VZERO(psi_11) \
VZERO(psi_12) \
VZERO(psi_20) \
VZERO(psi_21) \
VZERO(psi_22) \
VZERO(psi_30) \
VZERO(psi_31) \
VZERO(psi_32)
#define MULT_2SPIN_QPX_LSd(ptr,p) MULT_2SPIN_QPX_INTERNAL(ptr,p,VSPLAT,16)
#define MULT_2SPIN_QPX_LSf(ptr,p) MULT_2SPIN_QPX_INTERNAL(ptr,p,VSPLAT,8)
#define MULT_2SPIN_QPXd(ptr,p) MULT_2SPIN_QPX_INTERNAL(ptr,p,VLOAD,32)
#define MULT_2SPIN_QPXf(ptr,p) MULT_2SPIN_QPX_INTERNAL(ptr,p,VLOAD,16)
#define MULT_2SPIN_QPX_INTERNAL(ptr,p,ULOAD,USKIP) { \
uint64_t ub = ((uint64_t)ptr); \
asm ( \
ULOAD(%0,%3,U0) \
ULOAD(%1,%3,U1) \
ULOAD(%2,%3,U2) \
VMUL_RR_RI(UChi_00,U0,Chi_00) \
VMUL_RR_RI(UChi_01,U1,Chi_00) \
VMUL_RR_RI(UChi_02,U2,Chi_00) \
VMUL_RR_RI(UChi_10,U0,Chi_10) \
VMUL_RR_RI(UChi_11,U1,Chi_10) \
VMUL_RR_RI(UChi_12,U2,Chi_10) \
VMADD_MII_IR(UChi_00,U0,Chi_00,UChi_00) \
VMADD_MII_IR(UChi_01,U1,Chi_00,UChi_01) \
VMADD_MII_IR(UChi_02,U2,Chi_00,UChi_02) \
VMADD_MII_IR(UChi_10,U0,Chi_10,UChi_10) \
VMADD_MII_IR(UChi_11,U1,Chi_10,UChi_11) \
VMADD_MII_IR(UChi_12,U2,Chi_10,UChi_12) \
: : "b" (0), "b" (USKIP*3), "b" (USKIP*6), "b" (ub )); \
asm ( \
ULOAD(%0,%3,U0) \
ULOAD(%1,%3,U1) \
ULOAD(%2,%3,U2) \
VMADD_RR_RI(UChi_00,U0,Chi_01,UChi_00) \
VMADD_RR_RI(UChi_01,U1,Chi_01,UChi_01) \
VMADD_RR_RI(UChi_02,U2,Chi_01,UChi_02) \
VMADD_RR_RI(UChi_10,U0,Chi_11,UChi_10) \
VMADD_RR_RI(UChi_11,U1,Chi_11,UChi_11) \
VMADD_RR_RI(UChi_12,U2,Chi_11,UChi_12) \
VMADD_MII_IR(UChi_00,U0,Chi_01,UChi_00) \
VMADD_MII_IR(UChi_01,U1,Chi_01,UChi_01) \
VMADD_MII_IR(UChi_02,U2,Chi_01,UChi_02) \
VMADD_MII_IR(UChi_10,U0,Chi_11,UChi_10) \
VMADD_MII_IR(UChi_11,U1,Chi_11,UChi_11) \
VMADD_MII_IR(UChi_12,U2,Chi_11,UChi_12) \
: : "b" (USKIP*1), "b" (USKIP*4), "b" (USKIP*7), "b" (ub )); \
asm ( \
ULOAD(%0,%3,U0) \
ULOAD(%1,%3,U1) \
ULOAD(%2,%3,U2) \
VMADD_RR_RI(UChi_00,U0,Chi_02,UChi_00) \
VMADD_RR_RI(UChi_01,U1,Chi_02,UChi_01) \
VMADD_RR_RI(UChi_02,U2,Chi_02,UChi_02) \
VMADD_RR_RI(UChi_10,U0,Chi_12,UChi_10) \
VMADD_RR_RI(UChi_11,U1,Chi_12,UChi_11) \
VMADD_RR_RI(UChi_12,U2,Chi_12,UChi_12) \
VMADD_MII_IR(UChi_00,U0,Chi_02,UChi_00) \
VMADD_MII_IR(UChi_01,U1,Chi_02,UChi_01) \
VMADD_MII_IR(UChi_02,U2,Chi_02,UChi_02) \
VMADD_MII_IR(UChi_10,U0,Chi_12,UChi_10) \
VMADD_MII_IR(UChi_11,U1,Chi_12,UChi_11) \
VMADD_MII_IR(UChi_12,U2,Chi_12,UChi_12) \
: : "b" (USKIP*2), "b" (USKIP*5), "b" (USKIP*8), "b" (ub )); \
}
#define MULT_2SPIN_DIR_PF(A,p) MULT_2SPIN_PF(&U._odata[sU](A),p)
#define MULT_2SPIN_PF(ptr,pf) MULT_2SPIN(ptr,pf)
#define SAVE_RESULT(base,basep) {\
uint64_t ub = ((uint64_t)base) - (VSIZE); \
asm("mr " HASH(REP) ", %0;\n" \
"li " HASH(IMM) "," HASH(VSIZE)" ;\n" \
VSTOREu(IMM,REP,psi_00) \
VSTOREu(IMM,REP,psi_01) \
VSTOREu(IMM,REP,psi_02) \
VSTOREu(IMM,REP,psi_10) \
VSTOREu(IMM,REP,psi_11) \
VSTOREu(IMM,REP,psi_12) \
VSTOREu(IMM,REP,psi_20) \
VSTOREu(IMM,REP,psi_21) \
VSTOREu(IMM,REP,psi_22) \
VSTOREu(IMM,REP,psi_30) \
VSTOREu(IMM,REP,psi_31) \
VSTOREu(IMM,REP,psi_32) \
: : "b" (ub) : HASH(pIMM), HASH(pREP) ); \
}
/*
 *Annoying BG/Q loads with no immediate indexing, and a big performance hit
 *when a second miss to the same L1 line occurs
 */
#define LOAD_CHI(base) { \
uint64_t ub = ((uint64_t)base) - (2*VSIZE); \
asm("mr " HASH(REP) ",%0 ;\n" \
"li " HASH(IMM) ",(2*" HASH(VSIZE) ");\n" \
VLOADu(IMM,REP,Chi_00) \
VLOADu(IMM,REP,Chi_02) \
VLOADu(IMM,REP,Chi_11) : : "b" (ub) : HASH(pIMM), HASH(pREP) ); \
ub = ((uint64_t)base) - VSIZE; \
asm("mr " HASH(REP) ", %0;\n" \
"li " HASH(IMM) ",(2*" HASH(VSIZE) ");\n" \
VLOADu(IMM,REP,Chi_01) \
VLOADu(IMM,REP,Chi_10) \
VLOADu(IMM,REP,Chi_12) : : "b" (ub) : HASH(pIMM), HASH(pREP) ); \
}
#define LOAD_CHIMU(base) { \
uint64_t ub = ((uint64_t)base) - (2*VSIZE); \
asm("mr " HASH(REP) ",%0;\n" \
"li " HASH(IMM) ",(2*" HASH(VSIZE) ");\n" \
VLOADu(IMM,REP,Chi_00) \
VLOADu(IMM,REP,Chi_02) \
VLOADu(IMM,REP,Chi_11) \
VLOADu(IMM,REP,Chi_20) \
VLOADu(IMM,REP,Chi_22) \
VLOADu(IMM,REP,Chi_31) : : "b" (ub) : HASH(pIMM), HASH(pREP) ); \
ub = ((uint64_t)base) - VSIZE; \
asm("mr " HASH(REP) ", %0;\n" \
"li " HASH(IMM) ", (2*" HASH(VSIZE) ");\n" \
VLOADu(IMM,REP,Chi_01) \
VLOADu(IMM,REP,Chi_10) \
VLOADu(IMM,REP,Chi_12) \
VLOADu(IMM,REP,Chi_21) \
VLOADu(IMM,REP,Chi_30) \
VLOADu(IMM,REP,Chi_32) : : "b" (ub) : HASH(pIMM), HASH(pREP) ); \
}
// hspin(0)=fspin(0)+timesI(fspin(3));
// hspin(1)=fspin(1)+timesI(fspin(2));
#define XP_PROJMEM(base) { \
LOAD_CHIMU(base); \
asm ( \
VONE(one) \
VMADD_MII_IR(Chi_00,one,Chi_30,Chi_00) \
VMADD_MII_IR(Chi_01,one,Chi_31,Chi_01) \
VMADD_MII_IR(Chi_02,one,Chi_32,Chi_02) \
VMADD_MII_IR(Chi_10,one,Chi_20,Chi_10) \
VMADD_MII_IR(Chi_11,one,Chi_21,Chi_11) \
VMADD_MII_IR(Chi_12,one,Chi_22,Chi_12) \
); \
}
#define XM_PROJMEM(base) { \
LOAD_CHIMU(base); \
asm ( \
VONE(one) \
VMADD_II_MIR(Chi_00,one,Chi_30,Chi_00) \
VMADD_II_MIR(Chi_01,one,Chi_31,Chi_01) \
VMADD_II_MIR(Chi_02,one,Chi_32,Chi_02) \
VMADD_II_MIR(Chi_10,one,Chi_20,Chi_10) \
VMADD_II_MIR(Chi_11,one,Chi_21,Chi_11) \
VMADD_II_MIR(Chi_12,one,Chi_22,Chi_12) \
); \
}
// hspin(0)=fspin(0)-fspin(3);
// hspin(1)=fspin(1)+fspin(2);
#define YP_PROJMEM(base) { \
LOAD_CHIMU(base); \
asm ( \
VSUB(Chi_00,Chi_00,Chi_30) \
VSUB(Chi_01,Chi_01,Chi_31) \
VSUB(Chi_02,Chi_02,Chi_32) \
VADD(Chi_10,Chi_10,Chi_20) \
VADD(Chi_11,Chi_11,Chi_21) \
VADD(Chi_12,Chi_12,Chi_22) \
); \
}
#define YM_PROJMEM(base) { \
LOAD_CHIMU(base); \
asm ( \
VADD(Chi_00,Chi_00,Chi_30) \
VADD(Chi_01,Chi_01,Chi_31) \
VADD(Chi_02,Chi_02,Chi_32) \
VSUB(Chi_10,Chi_10,Chi_20) \
VSUB(Chi_11,Chi_11,Chi_21) \
VSUB(Chi_12,Chi_12,Chi_22) ); \
}
/*Gz
* 0 0 i 0 [0]+-i[2]
* 0 0 0 -i [1]-+i[3]
* -i 0 0 0
* 0 i 0 0
*/
#define ZP_PROJMEM(base) { \
LOAD_CHIMU(base); \
asm ( \
VONE(one) \
VMADD_MII_IR(Chi_00,one,Chi_20,Chi_00) \
VMADD_MII_IR(Chi_01,one,Chi_21,Chi_01) \
VMADD_MII_IR(Chi_02,one,Chi_22,Chi_02) \
VMADD_II_MIR(Chi_10,one,Chi_30,Chi_10) \
VMADD_II_MIR(Chi_11,one,Chi_31,Chi_11) \
VMADD_II_MIR(Chi_12,one,Chi_32,Chi_12) \
); \
}
#define ZM_PROJMEM(base) { \
LOAD_CHIMU(base); \
asm ( \
VONE(one) \
VMADD_II_MIR(Chi_00,one,Chi_20,Chi_00) \
VMADD_II_MIR(Chi_01,one,Chi_21,Chi_01) \
VMADD_II_MIR(Chi_02,one,Chi_22,Chi_02) \
VMADD_MII_IR(Chi_10,one,Chi_30,Chi_10) \
VMADD_MII_IR(Chi_11,one,Chi_31,Chi_11) \
VMADD_MII_IR(Chi_12,one,Chi_32,Chi_12) \
); \
}
/*Gt
* 0 0 1 0 [0]+-[2]
* 0 0 0 1 [1]+-[3]
* 1 0 0 0
* 0 1 0 0
*/
#define TP_PROJMEM(base) { \
LOAD_CHIMU(base); \
asm ( \
VADD(Chi_00,Chi_00,Chi_20) \
VADD(Chi_01,Chi_01,Chi_21) \
VADD(Chi_02,Chi_02,Chi_22) \
VADD(Chi_10,Chi_10,Chi_30) \
VADD(Chi_11,Chi_11,Chi_31) \
VADD(Chi_12,Chi_12,Chi_32) \
); \
}
#define TM_PROJMEM(base) { \
LOAD_CHIMU(base); \
asm ( \
VSUB(Chi_00,Chi_00,Chi_20) \
VSUB(Chi_01,Chi_01,Chi_21) \
VSUB(Chi_02,Chi_02,Chi_22) \
VSUB(Chi_10,Chi_10,Chi_30) \
VSUB(Chi_11,Chi_11,Chi_31) \
VSUB(Chi_12,Chi_12,Chi_32) \
); \
}
/*
fspin(0)=hspin(0);
fspin(1)=hspin(1);
fspin(2)=timesMinusI(hspin(1));
fspin(3)=timesMinusI(hspin(0));
fspin(0)+=hspin(0);
fspin(1)+=hspin(1);
fspin(2)-=timesI(hspin(1));
fspin(3)-=timesI(hspin(0));
*/
#define XP_RECON { \
asm(\
VONE(one)\
VMOV(psi_00,UChi_00) VMOV(psi_01,UChi_01) VMOV(psi_02,UChi_02)\
VMOV(psi_10,UChi_10) VMOV(psi_11,UChi_11) VMOV(psi_12,UChi_12)\
VZERO(psi_20) VZERO(psi_21) VZERO(psi_22) \
VZERO(psi_30) VZERO(psi_31) VZERO(psi_32) \
VMADD_II_MIR(psi_20,one,UChi_10,psi_20) \
VMADD_II_MIR(psi_21,one,UChi_11,psi_21) \
VMADD_II_MIR(psi_22,one,UChi_12,psi_22) \
VMADD_II_MIR(psi_30,one,UChi_00,psi_30) \
VMADD_II_MIR(psi_31,one,UChi_01,psi_31) \
VMADD_II_MIR(psi_32,one,UChi_02,psi_32) \
); \
}
#define XM_RECON { \
asm(\
VONE(one)\
VMOV(psi_00,UChi_00) VMOV(psi_01,UChi_01) VMOV(psi_02,UChi_02)\
VMOV(psi_10,UChi_10) VMOV(psi_11,UChi_11) VMOV(psi_12,UChi_12)\
VZERO(psi_20) VZERO(psi_21) VZERO(psi_22) \
VZERO(psi_30) VZERO(psi_31) VZERO(psi_32) \
VMADD_MII_IR(psi_20,one,UChi_10,psi_20) \
VMADD_MII_IR(psi_21,one,UChi_11,psi_21) \
VMADD_MII_IR(psi_22,one,UChi_12,psi_22) \
VMADD_MII_IR(psi_30,one,UChi_00,psi_30) \
VMADD_MII_IR(psi_31,one,UChi_01,psi_31) \
VMADD_MII_IR(psi_32,one,UChi_02,psi_32) \
); \
}
#define XP_RECON_ACCUM { \
asm(\
VONE(one)\
VADD(psi_00,psi_00,UChi_00) VADD(psi_01,psi_01,UChi_01) VADD(psi_02,psi_02,UChi_02) \
VADD(psi_10,psi_10,UChi_10) VADD(psi_11,psi_11,UChi_11) VADD(psi_12,psi_12,UChi_12) \
VMADD_II_MIR(psi_20,one,UChi_10,psi_20) \
VMADD_II_MIR(psi_21,one,UChi_11,psi_21) \
VMADD_II_MIR(psi_22,one,UChi_12,psi_22) \
VMADD_II_MIR(psi_30,one,UChi_00,psi_30) \
VMADD_II_MIR(psi_31,one,UChi_01,psi_31) \
VMADD_II_MIR(psi_32,one,UChi_02,psi_32) \
); \
}
#define XM_RECON_ACCUM { \
asm(\
VONE(one)\
VADD(psi_00,psi_00,UChi_00) VADD(psi_01,psi_01,UChi_01) VADD(psi_02,psi_02,UChi_02) \
VADD(psi_10,psi_10,UChi_10) VADD(psi_11,psi_11,UChi_11) VADD(psi_12,psi_12,UChi_12) \
VMADD_MII_IR(psi_20,one,UChi_10,psi_20) \
VMADD_MII_IR(psi_21,one,UChi_11,psi_21) \
VMADD_MII_IR(psi_22,one,UChi_12,psi_22) \
VMADD_MII_IR(psi_30,one,UChi_00,psi_30) \
VMADD_MII_IR(psi_31,one,UChi_01,psi_31) \
VMADD_MII_IR(psi_32,one,UChi_02,psi_32) \
); \
}
// fspin(2)+=hspin(1);
// fspin(3)-=hspin(0);
#define YP_RECON_ACCUM {\
asm(\
VADD(psi_00,psi_00,UChi_00) VADD(psi_01,psi_01,UChi_01) VADD(psi_02,psi_02,UChi_02) \
VADD(psi_10,psi_10,UChi_10) VADD(psi_11,psi_11,UChi_11) VADD(psi_12,psi_12,UChi_12) \
VADD(psi_20,psi_20,UChi_10) VADD(psi_21,psi_21,UChi_11) VADD(psi_22,psi_22,UChi_12) \
VSUB(psi_30,psi_30,UChi_00) VSUB(psi_31,psi_31,UChi_01) VSUB(psi_32,psi_32,UChi_02) \
);\
}
#define YM_RECON_ACCUM {\
asm(\
VADD(psi_00,psi_00,UChi_00) VADD(psi_01,psi_01,UChi_01) VADD(psi_02,psi_02,UChi_02) \
VADD(psi_10,psi_10,UChi_10) VADD(psi_11,psi_11,UChi_11) VADD(psi_12,psi_12,UChi_12) \
VSUB(psi_20,psi_20,UChi_10) VSUB(psi_21,psi_21,UChi_11) VSUB(psi_22,psi_22,UChi_12) \
VADD(psi_30,psi_30,UChi_00) VADD(psi_31,psi_31,UChi_01) VADD(psi_32,psi_32,UChi_02) \
);\
}
// fspin(2)-=timesI(hspin(0));
// fspin(3)+=timesI(hspin(1));
#define ZP_RECON_ACCUM {\
asm(\
VONE(one)\
VADD(psi_00,psi_00,UChi_00) VADD(psi_01,psi_01,UChi_01) VADD(psi_02,psi_02,UChi_02) \
VADD(psi_10,psi_10,UChi_10) VADD(psi_11,psi_11,UChi_11) VADD(psi_12,psi_12,UChi_12) \
VMADD_II_MIR(psi_20,one,UChi_00,psi_20) \
VMADD_II_MIR(psi_21,one,UChi_01,psi_21) \
VMADD_II_MIR(psi_22,one,UChi_02,psi_22) \
VMADD_MII_IR(psi_30,one,UChi_10,psi_30) \
VMADD_MII_IR(psi_31,one,UChi_11,psi_31) \
VMADD_MII_IR(psi_32,one,UChi_12,psi_32) \
);\
}
#define ZM_RECON_ACCUM {\
asm(\
VONE(one)\
VADD(psi_00,psi_00,UChi_00) VADD(psi_01,psi_01,UChi_01) VADD(psi_02,psi_02,UChi_02) \
VADD(psi_10,psi_10,UChi_10) VADD(psi_11,psi_11,UChi_11) VADD(psi_12,psi_12,UChi_12) \
VMADD_MII_IR(psi_20,one,UChi_00,psi_20) \
VMADD_MII_IR(psi_21,one,UChi_01,psi_21) \
VMADD_MII_IR(psi_22,one,UChi_02,psi_22) \
VMADD_II_MIR(psi_30,one,UChi_10,psi_30) \
VMADD_II_MIR(psi_31,one,UChi_11,psi_31) \
VMADD_II_MIR(psi_32,one,UChi_12,psi_32) \
);\
}
// fspin(2)+=hspin(0);
// fspin(3)+=hspin(1);
#define TP_RECON_ACCUM {\
asm(\
VADD(psi_00,psi_00,UChi_00) VADD(psi_01,psi_01,UChi_01) VADD(psi_02,psi_02,UChi_02) \
VADD(psi_10,psi_10,UChi_10) VADD(psi_11,psi_11,UChi_11) VADD(psi_12,psi_12,UChi_12) \
VADD(psi_20,psi_20,UChi_00) VADD(psi_21,psi_21,UChi_01) VADD(psi_22,psi_22,UChi_02) \
VADD(psi_30,psi_30,UChi_10) VADD(psi_31,psi_31,UChi_11) VADD(psi_32,psi_32,UChi_12) \
);\
}
#define TM_RECON_ACCUM {\
asm(\
VADD(psi_00,psi_00,UChi_00) VADD(psi_01,psi_01,UChi_01) VADD(psi_02,psi_02,UChi_02) \
VADD(psi_10,psi_10,UChi_10) VADD(psi_11,psi_11,UChi_11) VADD(psi_12,psi_12,UChi_12) \
VSUB(psi_20,psi_20,UChi_00) VSUB(psi_21,psi_21,UChi_01) VSUB(psi_22,psi_22,UChi_02) \
VSUB(psi_30,psi_30,UChi_10) VSUB(psi_31,psi_31,UChi_11) VSUB(psi_32,psi_32,UChi_12) \
);\
}
#define ADD_RESULTi(PTR,pf) \
LOAD_CHIMU(PTR) \
asm( \
      VADD(psi_00,Chi_00,psi_00)  VADD(psi_01,Chi_01,psi_01)  VADD(psi_02,Chi_02,psi_02) \
      VADD(psi_10,Chi_10,psi_10)  VADD(psi_11,Chi_11,psi_11)  VADD(psi_12,Chi_12,psi_12) \
      VADD(psi_20,Chi_20,psi_20)  VADD(psi_21,Chi_21,psi_21)  VADD(psi_22,Chi_22,psi_22) \
      VADD(psi_30,Chi_30,psi_30)  VADD(psi_31,Chi_31,psi_31)  VADD(psi_32,Chi_32,psi_32) ); \
SAVE_RESULT(PTR,pf);
#define PERMUTE_DIR3
#define PERMUTE_DIR2
#define PERMUTE_DIR1
#define PERMUTE_DIR0 { \
asm( \
VPERMI(perm_reg) \
VPERM(Chi_00,perm_reg) VPERM(Chi_01,perm_reg) VPERM(Chi_02,perm_reg) \
VPERM(Chi_10,perm_reg) VPERM(Chi_11,perm_reg) VPERM(Chi_12,perm_reg) ); \
}
#endif

lib/simd/IBM_qpx_double.h Normal file

@ -0,0 +1,46 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/simd/IBM_qpx_double.h
Copyright (C) 2015
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
// No guard; ok multi-include
#undef VSIZE
#undef VLOAD
#undef VLOADu
#undef VSPLAT
#undef VSTORE
#undef VSTOREu
#undef MULT_2SPIN_QPX_LS
#undef MULT_2SPIN_QPX
#define VSIZE VSIZEd
#define VLOAD(A,B,C) VLOADd(A,B,C)
#define VLOADu(A,B,C) VLOADud(A,B,C)
#define VSPLAT(A,B,DEST) VSPLATd(A,B,DEST)
#define VSTORE(A,B,C) VSTOREd(A,B,C)
#define VSTOREu(A,B,C) VSTOREud(A,B,C)
#define MULT_2SPIN_QPX_LS(ptr,p) MULT_2SPIN_QPX_LSd(ptr,p)
#define MULT_2SPIN_QPX(ptr,p) MULT_2SPIN_QPXd(ptr,p)

lib/simd/IBM_qpx_single.h Normal file

@ -0,0 +1,46 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/simd/IBM_qpx_single.h
Copyright (C) 2015
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
// No guard; ok multi-include
#undef VSIZE
#undef VLOAD
#undef VLOADu
#undef VSPLAT
#undef VSTORE
#undef VSTOREu
#undef MULT_2SPIN_QPX_LS
#undef MULT_2SPIN_QPX
#define VSIZE VSIZEf
#define VLOAD(A,B,C) VLOADf(A,B,C)
#define VLOADu(A,B,C) VLOADuf(A,B,C)
#define VSPLAT(A,B,DEST) VSPLATf(A,B,DEST)
#define VSTORE(A,B,C) VSTOREf(A,B,C)
#define VSTOREu(A,B,C) VSTOREuf(A,B,C)
#define MULT_2SPIN_QPX_LS(ptr,p) MULT_2SPIN_QPX_LSf(ptr,p)
#define MULT_2SPIN_QPX(ptr,p) MULT_2SPIN_QPXf(ptr,p)


@ -31,21 +31,21 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
//////////////////////////////////////////////////////////////////////////////////////////
// Register allocations for Wilson Kernel are precision indept
//////////////////////////////////////////////////////////////////////////////////////////
#define result_00 %zmm0
#define result_01 %zmm1
#define result_02 %zmm2
#define psi_00 %zmm0
#define psi_01 %zmm1
#define psi_02 %zmm2
#define result_10 %zmm3
#define result_11 %zmm4
#define result_12 %zmm5
#define psi_10 %zmm3
#define psi_11 %zmm4
#define psi_12 %zmm5
#define result_20 %zmm6
#define result_21 %zmm7
#define result_22 %zmm8
#define psi_20 %zmm6
#define psi_21 %zmm7
#define psi_22 %zmm8
#define result_30 %zmm9
#define result_31 %zmm10
#define result_32 %zmm11
#define psi_30 %zmm9
#define psi_31 %zmm10
#define psi_32 %zmm11
#define Chi_00 %zmm12
#define Chi_01 %zmm13
@ -98,34 +98,50 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
// a little as some duplication developed during trying different
// variants during optimisation. Could cut back to only those used.
//////////////////////////////////////////////////////////////////
#define LOCK_GAUGE(dir)
#define UNLOCK_GAUGE(dir)
// const SiteSpinor * ptr = & in._odata[offset];
#define LOAD_CHIMU(PTR) LOAD_CHIMUi(PTR)
#define LOAD_CHIMU(PTR) LOAD64(%r8,PTR) __asm__ ( LOAD_CHIMUi );
#define LOAD_CHI(PTR) LOAD64(%r8,PTR) __asm__ ( LOAD_CHIi );
#define SAVE_UCHI(PTR) SAVE_UCHIi(PTR)
#define SAVE_CHI(PTR) SAVE_CHIi(PTR)
#define SAVE_RESULT(PT,R) SAVE_RESULTi(PT,R)
#define ADD_RESULT(PT,R) ADD_RESULTi(PT,R)
#define LOAD_CHIMUi \
LOAD_CHIMU01i \
LOAD_CHIMU23i );
#define ZERO_PSI \
asm( VZERO(psi_00) \
VZERO(psi_01) \
VZERO(psi_02) \
VZERO(psi_10) \
VZERO(psi_11) \
VZERO(psi_12) \
VZERO(psi_20) \
VZERO(psi_21) \
VZERO(psi_22) \
VZERO(psi_30) \
VZERO(psi_31) \
VZERO(psi_32));
#define LOAD_CHIMUi \
LOAD_CHIMU01i \
LOAD_CHIMU23i
#define LOAD_CHIMU01i\
VLOAD(0,%r8,Chimu_00) \
VLOAD(1,%r8,Chimu_01) \
VLOAD(2,%r8,Chimu_02) \
VLOAD(3,%r8,Chimu_10) \
VLOAD(4,%r8,Chimu_11) \
VLOAD(5,%r8,Chimu_12)
#define LOAD_CHIMU01i \
VLOAD(0,%r8,Chimu_00) \
VLOAD(1,%r8,Chimu_01) \
VLOAD(2,%r8,Chimu_02) \
VLOAD(3,%r8,Chimu_10) \
VLOAD(4,%r8,Chimu_11) \
VLOAD(5,%r8,Chimu_12)
#define LOAD_CHIMU23i\
VLOAD(6,%r8,Chimu_20) \
VLOAD(7,%r8,Chimu_21) \
VLOAD(8,%r8,Chimu_22) \
VLOAD(9,%r8,Chimu_30) \
VLOAD(10,%r8,Chimu_31) \
VLOAD(11,%r8,Chimu_32)
#define LOAD_CHIMU23i \
VLOAD(6,%r8,Chimu_20) \
VLOAD(7,%r8,Chimu_21) \
VLOAD(8,%r8,Chimu_22) \
VLOAD(9,%r8,Chimu_30) \
VLOAD(10,%r8,Chimu_31) \
VLOAD(11,%r8,Chimu_32)
#define SHUF_CHIMU23i\
VSHUFMEM(6,%r8,Chimu_20) \
@ -135,9 +151,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
VSHUFMEM(10,%r8,Chimu_31) \
VSHUFMEM(11,%r8,Chimu_32)
// const SiteHalfSpinor *ptr = &buf[offset];
#define LOAD_CHIi \
VLOAD(0,%r8,Chi_00) \
VLOAD(1,%r8,Chi_01) \
@ -145,7 +158,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
VLOAD(3,%r8,Chi_10) \
VLOAD(4,%r8,Chi_11) \
VLOAD(5,%r8,Chi_12)
#define SAVE_UCHIi(PTR) \
LOAD64(%r8,PTR) \
@ -155,8 +167,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
VSTORE(2,%r8,UChi_02) \
VSTORE(3,%r8,UChi_10) \
VSTORE(4,%r8,UChi_11) \
VSTORE(5,%r8,UChi_12) \
);
VSTORE(5,%r8,UChi_12) );
#define SAVE_CHIi(PTR) \
LOAD64(%r8,PTR) \
@ -166,33 +177,14 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
VSTORE(2,%r8,Chi_02) \
VSTORE(3,%r8,Chi_10) \
VSTORE(4,%r8,Chi_11) \
VSTORE(5,%r8,Chi_12) \
);
VSTORE(5,%r8,Chi_12) );
#define MULT_2SPIN_DIR_PFXP(A,p) MULT_2SPIN_PFXP(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFYP(A,p) MULT_2SPIN_PFYP(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFZP(A,p) MULT_2SPIN_PFZP(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFTP(A,p) MULT_2SPIN_PFTP(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFXM(A,p) MULT_2SPIN_PFXM(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFYM(A,p) MULT_2SPIN_PFYM(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFZM(A,p) MULT_2SPIN_PFZM(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFTM(A,p) MULT_2SPIN_PFTM(&U._odata[sU](A),p)
#define MULT_2SPIN_PFXM(ptr,pf) MULT_2SPIN(ptr,pf)
#define MULT_2SPIN_PFYM(ptr,pf) MULT_2SPIN(ptr,pf)
#define MULT_2SPIN_PFZM(ptr,pf) MULT_2SPIN(ptr,pf)
#define MULT_2SPIN_PFTM(ptr,pf) MULT_2SPIN(ptr,pf)
#define MULT_2SPIN_PFTP(ptr,pf) MULT_2SPIN(ptr,pf)
#define MULT_2SPIN_PFZP(ptr,pf) MULT_2SPIN(ptr,pf)
#define MULT_2SPIN_PFYP(ptr,pf) MULT_2SPIN(ptr,pf)
#define MULT_2SPIN_PFXP(ptr,pf) MULT_2SPIN(ptr,pf)
#define MULT_2SPIN_DIR_PF(A,p) MULT_2SPIN_PF(&U._odata[sU](A),p)
#define MULT_2SPIN_PF(ptr,pf) MULT_2SPIN(ptr,pf)
//////////////////////////////////////////////////////////////////
// Dirac algebra
//////////////////////////////////////////////////////////////////
// hspin(0)=fspin(0)+timesI(fspin(3));
// hspin(1)=fspin(1)+timesI(fspin(2));
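// Illustrative scalar reference for the X+ projection described in the two
// comment lines above (a sketch only, not part of this kernel; assumes the
// timesI helper from Grid's Simd.h and [spin][colour] indexing; the names
// "XpProjReference" and "Cmplx" are made up for illustration). The X-
// projector below is identical with timesI replaced by timesMinusI.
template<class Cmplx>
inline void XpProjReference(Cmplx hspin[2][3],const Cmplx fspin[4][3]){
  for(int c=0;c<3;c++){
    hspin[0][c] = fspin[0][c] + timesI(fspin[3][c]);
    hspin[1][c] = fspin[1][c] + timesI(fspin[2][c]);
  }
}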
#define XP_PROJMEM(PTR) \
@ -257,7 +249,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
// hspin(0)=fspin(0)-timesI(fspin(3))
// hspin(1)=fspin(1)-timesI(fspin(2))
#define XM_PROJMEM(PTR) \
LOAD64(%r8,PTR)\
__asm__ ( \
@ -322,226 +313,226 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
// fspin(3)=timesMinusI(hspin(0))
#define XP_RECON __asm__ ( \
VZERO(TMP) \
VTIMESMINUSI0(UChi_00,result_30,TMP) \
VTIMESMINUSI0(UChi_10,result_20,TMP) \
VTIMESMINUSI0(UChi_01,result_31,TMP) \
VTIMESMINUSI0(UChi_11,result_21,TMP) \
VTIMESMINUSI0(UChi_02,result_32,TMP) \
VTIMESMINUSI0(UChi_12,result_22,TMP) \
VMOV(UChi_00,result_00) \
VMOV(UChi_10,result_10) \
VMOV(UChi_01,result_01) \
VMOV(UChi_11,result_11) \
VMOV(UChi_02,result_02) \
VMOV(UChi_12,result_12) \
VTIMESMINUSI1(UChi_10,result_20,TMP) \
VTIMESMINUSI1(UChi_11,result_21,TMP) \
VTIMESMINUSI1(UChi_12,result_22,TMP) \
VTIMESMINUSI1(UChi_00,result_30,TMP) \
VTIMESMINUSI1(UChi_01,result_31,TMP) \
VTIMESMINUSI1(UChi_02,result_32,TMP) \
VTIMESMINUSI2(UChi_10,result_20,TMP) \
VTIMESMINUSI2(UChi_11,result_21,TMP) \
VTIMESMINUSI2(UChi_12,result_22,TMP) \
VTIMESMINUSI2(UChi_00,result_30,TMP) \
VTIMESMINUSI2(UChi_01,result_31,TMP) \
VTIMESMINUSI2(UChi_02,result_32,TMP) \
VTIMESMINUSI0(UChi_00,psi_30,TMP) \
VTIMESMINUSI0(UChi_10,psi_20,TMP) \
VTIMESMINUSI0(UChi_01,psi_31,TMP) \
VTIMESMINUSI0(UChi_11,psi_21,TMP) \
VTIMESMINUSI0(UChi_02,psi_32,TMP) \
VTIMESMINUSI0(UChi_12,psi_22,TMP) \
VMOV(UChi_00,psi_00) \
VMOV(UChi_10,psi_10) \
VMOV(UChi_01,psi_01) \
VMOV(UChi_11,psi_11) \
VMOV(UChi_02,psi_02) \
VMOV(UChi_12,psi_12) \
VTIMESMINUSI1(UChi_10,psi_20,TMP) \
VTIMESMINUSI1(UChi_11,psi_21,TMP) \
VTIMESMINUSI1(UChi_12,psi_22,TMP) \
VTIMESMINUSI1(UChi_00,psi_30,TMP) \
VTIMESMINUSI1(UChi_01,psi_31,TMP) \
VTIMESMINUSI1(UChi_02,psi_32,TMP) \
VTIMESMINUSI2(UChi_10,psi_20,TMP) \
VTIMESMINUSI2(UChi_11,psi_21,TMP) \
VTIMESMINUSI2(UChi_12,psi_22,TMP) \
VTIMESMINUSI2(UChi_00,psi_30,TMP) \
VTIMESMINUSI2(UChi_01,psi_31,TMP) \
VTIMESMINUSI2(UChi_02,psi_32,TMP) \
);
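// Illustrative scalar reference for XP_RECON above (a sketch only, not part
// of this kernel): the lower two spin components are rebuilt as minus i times
// the rotated half spinor, matching "fspin(3)=timesMinusI(hspin(0))" in the
// comment preceding the macro. Assumes timesMinusI from Grid's Simd.h; the
// name "XpReconReference" is made up for illustration.
template<class Cmplx>
inline void XpReconReference(Cmplx psi[4][3],const Cmplx UChi[2][3]){
  for(int c=0;c<3;c++){
    psi[0][c] = UChi[0][c];              // VMOV(UChi_0c,psi_0c)
    psi[1][c] = UChi[1][c];              // VMOV(UChi_1c,psi_1c)
    psi[2][c] = timesMinusI(UChi[1][c]); // psi_2c from UChi_1c
    psi[3][c] = timesMinusI(UChi[0][c]); // psi_3c from UChi_0c
  }
}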
// NB could save 6 ops using addsub => 12 cycles
#define XP_RECON_ACCUM __asm__ ( \
VZERO(TMP)\
VACCTIMESMINUSI0(UChi_00,result_30,Z3)\
VACCTIMESMINUSI0(UChi_10,result_20,Z0)\
VACCTIMESMINUSI0(UChi_01,result_31,Z4)\
VACCTIMESMINUSI0(UChi_11,result_21,Z1)\
VACCTIMESMINUSI0(UChi_02,result_32,Z5)\
VACCTIMESMINUSI0(UChi_12,result_22,Z2)\
VADD(UChi_00,result_00,result_00)\
VADD(UChi_10,result_10,result_10)\
VADD(UChi_01,result_01,result_01)\
VADD(UChi_11,result_11,result_11)\
VADD(UChi_02,result_02,result_02)\
VADD(UChi_12,result_12,result_12)\
VACCTIMESMINUSI1(UChi_00,result_30,Z3)\
VACCTIMESMINUSI1(UChi_10,result_20,Z0)\
VACCTIMESMINUSI1(UChi_01,result_31,Z4)\
VACCTIMESMINUSI1(UChi_11,result_21,Z1)\
VACCTIMESMINUSI1(UChi_02,result_32,Z5)\
VACCTIMESMINUSI1(UChi_12,result_22,Z2)\
VACCTIMESMINUSI2(UChi_10,result_20,Z0)\
VACCTIMESMINUSI2(UChi_11,result_21,Z1)\
VACCTIMESMINUSI2(UChi_12,result_22,Z2)\
VACCTIMESMINUSI2(UChi_00,result_30,Z3)\
VACCTIMESMINUSI2(UChi_01,result_31,Z4)\
VACCTIMESMINUSI2(UChi_02,result_32,Z5)\
VACCTIMESMINUSI0(UChi_00,psi_30,Z3)\
VACCTIMESMINUSI0(UChi_10,psi_20,Z0)\
VACCTIMESMINUSI0(UChi_01,psi_31,Z4)\
VACCTIMESMINUSI0(UChi_11,psi_21,Z1)\
VACCTIMESMINUSI0(UChi_02,psi_32,Z5)\
VACCTIMESMINUSI0(UChi_12,psi_22,Z2)\
VADD(UChi_00,psi_00,psi_00)\
VADD(UChi_10,psi_10,psi_10)\
VADD(UChi_01,psi_01,psi_01)\
VADD(UChi_11,psi_11,psi_11)\
VADD(UChi_02,psi_02,psi_02)\
VADD(UChi_12,psi_12,psi_12)\
VACCTIMESMINUSI1(UChi_00,psi_30,Z3)\
VACCTIMESMINUSI1(UChi_10,psi_20,Z0)\
VACCTIMESMINUSI1(UChi_01,psi_31,Z4)\
VACCTIMESMINUSI1(UChi_11,psi_21,Z1)\
VACCTIMESMINUSI1(UChi_02,psi_32,Z5)\
VACCTIMESMINUSI1(UChi_12,psi_22,Z2)\
VACCTIMESMINUSI2(UChi_10,psi_20,Z0)\
VACCTIMESMINUSI2(UChi_11,psi_21,Z1)\
VACCTIMESMINUSI2(UChi_12,psi_22,Z2)\
VACCTIMESMINUSI2(UChi_00,psi_30,Z3)\
VACCTIMESMINUSI2(UChi_01,psi_31,Z4)\
VACCTIMESMINUSI2(UChi_02,psi_32,Z5)\
);
#define XM_RECON __asm__ ( \
VZERO(TMP)\
VTIMESI0(UChi_00,result_30,TMP)\
VTIMESI0(UChi_10,result_20,TMP)\
VTIMESI0(UChi_01,result_31,TMP)\
VTIMESI0(UChi_11,result_21,TMP)\
VTIMESI0(UChi_02,result_32,TMP)\
VTIMESI0(UChi_12,result_22,TMP)\
VMOV(UChi_00,result_00)\
VMOV(UChi_10,result_10)\
VMOV(UChi_01,result_01)\
VMOV(UChi_11,result_11)\
VMOV(UChi_02,result_02)\
VMOV(UChi_12,result_12)\
VTIMESI1(UChi_00,result_30,TMP)\
VTIMESI1(UChi_10,result_20,TMP)\
VTIMESI1(UChi_01,result_31,TMP)\
VTIMESI1(UChi_11,result_21,TMP)\
VTIMESI1(UChi_02,result_32,TMP)\
VTIMESI1(UChi_12,result_22,TMP)\
VTIMESI2(UChi_10,result_20,TMP)\
VTIMESI2(UChi_11,result_21,TMP)\
VTIMESI2(UChi_12,result_22,TMP)\
VTIMESI2(UChi_00,result_30,TMP)\
VTIMESI2(UChi_01,result_31,TMP)\
VTIMESI2(UChi_02,result_32,TMP)\
VTIMESI0(UChi_00,psi_30,TMP)\
VTIMESI0(UChi_10,psi_20,TMP)\
VTIMESI0(UChi_01,psi_31,TMP)\
VTIMESI0(UChi_11,psi_21,TMP)\
VTIMESI0(UChi_02,psi_32,TMP)\
VTIMESI0(UChi_12,psi_22,TMP)\
VMOV(UChi_00,psi_00)\
VMOV(UChi_10,psi_10)\
VMOV(UChi_01,psi_01)\
VMOV(UChi_11,psi_11)\
VMOV(UChi_02,psi_02)\
VMOV(UChi_12,psi_12)\
VTIMESI1(UChi_00,psi_30,TMP)\
VTIMESI1(UChi_10,psi_20,TMP)\
VTIMESI1(UChi_01,psi_31,TMP)\
VTIMESI1(UChi_11,psi_21,TMP)\
VTIMESI1(UChi_02,psi_32,TMP)\
VTIMESI1(UChi_12,psi_22,TMP)\
VTIMESI2(UChi_10,psi_20,TMP)\
VTIMESI2(UChi_11,psi_21,TMP)\
VTIMESI2(UChi_12,psi_22,TMP)\
VTIMESI2(UChi_00,psi_30,TMP)\
VTIMESI2(UChi_01,psi_31,TMP)\
VTIMESI2(UChi_02,psi_32,TMP)\
);
#define XM_RECON_ACCUM __asm__ ( \
VACCTIMESI0(UChi_10,result_20,Z0)\
VACCTIMESI0(UChi_00,result_30,Z3)\
VACCTIMESI0(UChi_11,result_21,Z1)\
VACCTIMESI0(UChi_01,result_31,Z4)\
VACCTIMESI0(UChi_12,result_22,Z2)\
VACCTIMESI0(UChi_02,result_32,Z5)\
VACCTIMESI0(UChi_10,psi_20,Z0)\
VACCTIMESI0(UChi_00,psi_30,Z3)\
VACCTIMESI0(UChi_11,psi_21,Z1)\
VACCTIMESI0(UChi_01,psi_31,Z4)\
VACCTIMESI0(UChi_12,psi_22,Z2)\
VACCTIMESI0(UChi_02,psi_32,Z5)\
\
VADD(UChi_10,result_10,result_10)\
VADD(UChi_00,result_00,result_00)\
VADD(UChi_11,result_11,result_11)\
VADD(UChi_01,result_01,result_01)\
VADD(UChi_12,result_12,result_12)\
VADD(UChi_02,result_02,result_02)\
VADD(UChi_10,psi_10,psi_10)\
VADD(UChi_00,psi_00,psi_00)\
VADD(UChi_11,psi_11,psi_11)\
VADD(UChi_01,psi_01,psi_01)\
VADD(UChi_12,psi_12,psi_12)\
VADD(UChi_02,psi_02,psi_02)\
\
VACCTIMESI1(UChi_10,result_20,Z0)\
VACCTIMESI1(UChi_00,result_30,Z3)\
VACCTIMESI1(UChi_11,result_21,Z1)\
VACCTIMESI1(UChi_01,result_31,Z4)\
VACCTIMESI1(UChi_12,result_22,Z2)\
VACCTIMESI1(UChi_02,result_32,Z5)\
VACCTIMESI2(UChi_10,result_20,Z0)\
VACCTIMESI2(UChi_11,result_21,Z1)\
VACCTIMESI2(UChi_12,result_22,Z2)\
VACCTIMESI2(UChi_00,result_30,Z3)\
VACCTIMESI2(UChi_01,result_31,Z4)\
VACCTIMESI2(UChi_02,result_32,Z5)\
VACCTIMESI1(UChi_10,psi_20,Z0)\
VACCTIMESI1(UChi_00,psi_30,Z3)\
VACCTIMESI1(UChi_11,psi_21,Z1)\
VACCTIMESI1(UChi_01,psi_31,Z4)\
VACCTIMESI1(UChi_12,psi_22,Z2)\
VACCTIMESI1(UChi_02,psi_32,Z5)\
VACCTIMESI2(UChi_10,psi_20,Z0)\
VACCTIMESI2(UChi_11,psi_21,Z1)\
VACCTIMESI2(UChi_12,psi_22,Z2)\
VACCTIMESI2(UChi_00,psi_30,Z3)\
VACCTIMESI2(UChi_01,psi_31,Z4)\
VACCTIMESI2(UChi_02,psi_32,Z5)\
);
#define YP_RECON_ACCUM __asm__ ( \
VADD(UChi_00,result_00,result_00)\
VADD(UChi_10,result_10,result_10)\
VADD(UChi_01,result_01,result_01)\
VADD(UChi_11,result_11,result_11)\
VADD(UChi_02,result_02,result_02)\
VADD(UChi_12,result_12,result_12)\
VADD(UChi_10,result_20,result_20)\
VADD(UChi_11,result_21,result_21)\
VADD(UChi_12,result_22,result_22)\
VSUB(UChi_00,result_30,result_30)\
VSUB(UChi_01,result_31,result_31)\
VSUB(UChi_02,result_32,result_32) );
VADD(UChi_00,psi_00,psi_00)\
VADD(UChi_10,psi_10,psi_10)\
VADD(UChi_01,psi_01,psi_01)\
VADD(UChi_11,psi_11,psi_11)\
VADD(UChi_02,psi_02,psi_02)\
VADD(UChi_12,psi_12,psi_12)\
VADD(UChi_10,psi_20,psi_20)\
VADD(UChi_11,psi_21,psi_21)\
VADD(UChi_12,psi_22,psi_22)\
VSUB(UChi_00,psi_30,psi_30)\
VSUB(UChi_01,psi_31,psi_31)\
VSUB(UChi_02,psi_32,psi_32) );
#define YM_RECON_ACCUM __asm__ ( \
VADD(UChi_00,result_00,result_00)\
VADD(UChi_10,result_10,result_10)\
VADD(UChi_01,result_01,result_01)\
VADD(UChi_11,result_11,result_11)\
VADD(UChi_02,result_02,result_02)\
VADD(UChi_12,result_12,result_12)\
VSUB(UChi_10,result_20,result_20)\
VSUB(UChi_11,result_21,result_21)\
VSUB(UChi_12,result_22,result_22)\
VADD(UChi_00,result_30,result_30)\
VADD(UChi_01,result_31,result_31)\
VADD(UChi_02,result_32,result_32) );
VADD(UChi_00,psi_00,psi_00)\
VADD(UChi_10,psi_10,psi_10)\
VADD(UChi_01,psi_01,psi_01)\
VADD(UChi_11,psi_11,psi_11)\
VADD(UChi_02,psi_02,psi_02)\
VADD(UChi_12,psi_12,psi_12)\
VSUB(UChi_10,psi_20,psi_20)\
VSUB(UChi_11,psi_21,psi_21)\
VSUB(UChi_12,psi_22,psi_22)\
VADD(UChi_00,psi_30,psi_30)\
VADD(UChi_01,psi_31,psi_31)\
VADD(UChi_02,psi_32,psi_32) );
#define ZP_RECON_ACCUM __asm__ ( \
VACCTIMESMINUSI0(UChi_00,result_20,Z0)\
VACCTIMESI0(UChi_10,result_30,Z3)\
VACCTIMESMINUSI0(UChi_01,result_21,Z1)\
VACCTIMESI0(UChi_11,result_31,Z4)\
VACCTIMESMINUSI0(UChi_02,result_22,Z2)\
VACCTIMESI0(UChi_12,result_32,Z5)\
VADD(UChi_00,result_00,result_00)\
VADD(UChi_10,result_10,result_10)\
VADD(UChi_01,result_01,result_01)\
VADD(UChi_11,result_11,result_11)\
VADD(UChi_02,result_02,result_02)\
VADD(UChi_12,result_12,result_12)\
VACCTIMESMINUSI1(UChi_00,result_20,Z0)\
VACCTIMESI1(UChi_10,result_30,Z3)\
VACCTIMESMINUSI1(UChi_01,result_21,Z1)\
VACCTIMESI1(UChi_11,result_31,Z4)\
VACCTIMESMINUSI1(UChi_02,result_22,Z2)\
VACCTIMESI1(UChi_12,result_32,Z5)\
VACCTIMESMINUSI2(UChi_00,result_20,Z0)\
VACCTIMESMINUSI2(UChi_01,result_21,Z1)\
VACCTIMESMINUSI2(UChi_02,result_22,Z2)\
VACCTIMESI2(UChi_10,result_30,Z3)\
VACCTIMESI2(UChi_11,result_31,Z4)\
VACCTIMESI2(UChi_12,result_32,Z5)\
VACCTIMESMINUSI0(UChi_00,psi_20,Z0)\
VACCTIMESI0(UChi_10,psi_30,Z3)\
VACCTIMESMINUSI0(UChi_01,psi_21,Z1)\
VACCTIMESI0(UChi_11,psi_31,Z4)\
VACCTIMESMINUSI0(UChi_02,psi_22,Z2)\
VACCTIMESI0(UChi_12,psi_32,Z5)\
VADD(UChi_00,psi_00,psi_00)\
VADD(UChi_10,psi_10,psi_10)\
VADD(UChi_01,psi_01,psi_01)\
VADD(UChi_11,psi_11,psi_11)\
VADD(UChi_02,psi_02,psi_02)\
VADD(UChi_12,psi_12,psi_12)\
VACCTIMESMINUSI1(UChi_00,psi_20,Z0)\
VACCTIMESI1(UChi_10,psi_30,Z3)\
VACCTIMESMINUSI1(UChi_01,psi_21,Z1)\
VACCTIMESI1(UChi_11,psi_31,Z4)\
VACCTIMESMINUSI1(UChi_02,psi_22,Z2)\
VACCTIMESI1(UChi_12,psi_32,Z5)\
VACCTIMESMINUSI2(UChi_00,psi_20,Z0)\
VACCTIMESMINUSI2(UChi_01,psi_21,Z1)\
VACCTIMESMINUSI2(UChi_02,psi_22,Z2)\
VACCTIMESI2(UChi_10,psi_30,Z3)\
VACCTIMESI2(UChi_11,psi_31,Z4)\
VACCTIMESI2(UChi_12,psi_32,Z5)\
);
#define ZM_RECON_ACCUM __asm__ ( \
VACCTIMESI0(UChi_00,result_20,Z0)\
VACCTIMESMINUSI0(UChi_10,result_30,Z3)\
VACCTIMESI0(UChi_01,result_21,Z1)\
VACCTIMESMINUSI0(UChi_11,result_31,Z4)\
VACCTIMESI0(UChi_02,result_22,Z2)\
VACCTIMESMINUSI0(UChi_12,result_32,Z5)\
VADD(UChi_00,result_00,result_00)\
VADD(UChi_10,result_10,result_10)\
VADD(UChi_01,result_01,result_01)\
VADD(UChi_11,result_11,result_11)\
VADD(UChi_02,result_02,result_02)\
VADD(UChi_12,result_12,result_12)\
VACCTIMESI1(UChi_00,result_20,Z0)\
VACCTIMESMINUSI1(UChi_10,result_30,Z3)\
VACCTIMESI1(UChi_01,result_21,Z1)\
VACCTIMESMINUSI1(UChi_11,result_31,Z4)\
VACCTIMESI1(UChi_02,result_22,Z2)\
VACCTIMESMINUSI1(UChi_12,result_32,Z5)\
VACCTIMESI2(UChi_00,result_20,Z0)\
VACCTIMESI2(UChi_01,result_21,Z1)\
VACCTIMESI2(UChi_02,result_22,Z2)\
VACCTIMESMINUSI2(UChi_10,result_30,Z3)\
VACCTIMESMINUSI2(UChi_11,result_31,Z4)\
VACCTIMESMINUSI2(UChi_12,result_32,Z5)\
VACCTIMESI0(UChi_00,psi_20,Z0)\
VACCTIMESMINUSI0(UChi_10,psi_30,Z3)\
VACCTIMESI0(UChi_01,psi_21,Z1)\
VACCTIMESMINUSI0(UChi_11,psi_31,Z4)\
VACCTIMESI0(UChi_02,psi_22,Z2)\
VACCTIMESMINUSI0(UChi_12,psi_32,Z5)\
VADD(UChi_00,psi_00,psi_00)\
VADD(UChi_10,psi_10,psi_10)\
VADD(UChi_01,psi_01,psi_01)\
VADD(UChi_11,psi_11,psi_11)\
VADD(UChi_02,psi_02,psi_02)\
VADD(UChi_12,psi_12,psi_12)\
VACCTIMESI1(UChi_00,psi_20,Z0)\
VACCTIMESMINUSI1(UChi_10,psi_30,Z3)\
VACCTIMESI1(UChi_01,psi_21,Z1)\
VACCTIMESMINUSI1(UChi_11,psi_31,Z4)\
VACCTIMESI1(UChi_02,psi_22,Z2)\
VACCTIMESMINUSI1(UChi_12,psi_32,Z5)\
VACCTIMESI2(UChi_00,psi_20,Z0)\
VACCTIMESI2(UChi_01,psi_21,Z1)\
VACCTIMESI2(UChi_02,psi_22,Z2)\
VACCTIMESMINUSI2(UChi_10,psi_30,Z3)\
VACCTIMESMINUSI2(UChi_11,psi_31,Z4)\
VACCTIMESMINUSI2(UChi_12,psi_32,Z5)\
);
#define TP_RECON_ACCUM __asm__ ( \
VADD(UChi_00,result_00,result_00)\
VADD(UChi_10,result_10,result_10)\
VADD(UChi_01,result_01,result_01)\
VADD(UChi_11,result_11,result_11)\
VADD(UChi_02,result_02,result_02)\
VADD(UChi_12,result_12,result_12)\
VADD(UChi_00,result_20,result_20)\
VADD(UChi_10,result_30,result_30)\
VADD(UChi_01,result_21,result_21)\
VADD(UChi_11,result_31,result_31)\
VADD(UChi_02,result_22,result_22)\
VADD(UChi_12,result_32,result_32) );
VADD(UChi_00,psi_00,psi_00)\
VADD(UChi_10,psi_10,psi_10)\
VADD(UChi_01,psi_01,psi_01)\
VADD(UChi_11,psi_11,psi_11)\
VADD(UChi_02,psi_02,psi_02)\
VADD(UChi_12,psi_12,psi_12)\
VADD(UChi_00,psi_20,psi_20)\
VADD(UChi_10,psi_30,psi_30)\
VADD(UChi_01,psi_21,psi_21)\
VADD(UChi_11,psi_31,psi_31)\
VADD(UChi_02,psi_22,psi_22)\
VADD(UChi_12,psi_32,psi_32) );
#define TM_RECON_ACCUM __asm__ ( \
VADD(UChi_00,result_00,result_00)\
VADD(UChi_10,result_10,result_10)\
VADD(UChi_01,result_01,result_01)\
VADD(UChi_11,result_11,result_11)\
VADD(UChi_02,result_02,result_02)\
VADD(UChi_12,result_12,result_12)\
VSUB(UChi_00,result_20,result_20)\
VSUB(UChi_10,result_30,result_30)\
VSUB(UChi_01,result_21,result_21)\
VSUB(UChi_11,result_31,result_31)\
VSUB(UChi_02,result_22,result_22)\
VSUB(UChi_12,result_32,result_32) );
VADD(UChi_00,psi_00,psi_00)\
VADD(UChi_10,psi_10,psi_10)\
VADD(UChi_01,psi_01,psi_01)\
VADD(UChi_11,psi_11,psi_11)\
VADD(UChi_02,psi_02,psi_02)\
VADD(UChi_12,psi_12,psi_12)\
VSUB(UChi_00,psi_20,psi_20)\
VSUB(UChi_10,psi_30,psi_30)\
VSUB(UChi_01,psi_21,psi_21)\
VSUB(UChi_11,psi_31,psi_31)\
VSUB(UChi_02,psi_22,psi_22)\
VSUB(UChi_12,psi_32,psi_32) );
#define AVX512_PF_L1
#define AVX512_PF_L2_GAUGE
@ -580,22 +571,62 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
LOAD64(%r8,PTR) \
LOAD64(%r9,pf) \
__asm__ ( \
VSTORE(0,%r8,result_00) VPREFETCH_M1(0,%r9) \
VSTORE(1,%r8,result_01) VPREFETCH_M1(1,%r9) \
VSTORE(2,%r8,result_02) VPREFETCH_M1(2,%r9) \
VSTORE(3,%r8,result_10) VPREFETCH_M1(3,%r9) \
VSTORE(4,%r8,result_11) VPREFETCH_M1(4,%r9) \
VSTORE(5,%r8,result_12) VPREFETCH_M1(5,%r9) \
VSTORE(6,%r8,result_20) VPREFETCH_M1(6,%r9) \
VSTORE(7,%r8,result_21) VPREFETCH_M1(7,%r9) \
VSTORE(8,%r8,result_22) VPREFETCH_M1(8,%r9) \
VSTORE(9,%r8,result_30) VPREFETCH_M1(9,%r9) \
VSTORE(10,%r8,result_31) VPREFETCH_M1(10,%r9) \
VSTORE(11,%r8,result_32) VPREFETCH_M1(11,%r9) \
VSTORE(0,%r8,psi_00) VPREFETCH_M1(0,%r9) \
VSTORE(1,%r8,psi_01) VPREFETCH_M1(1,%r9) \
VSTORE(2,%r8,psi_02) VPREFETCH_M1(2,%r9) \
VSTORE(3,%r8,psi_10) VPREFETCH_M1(3,%r9) \
VSTORE(4,%r8,psi_11) VPREFETCH_M1(4,%r9) \
VSTORE(5,%r8,psi_12) VPREFETCH_M1(5,%r9) \
VSTORE(6,%r8,psi_20) VPREFETCH_M1(6,%r9) \
VSTORE(7,%r8,psi_21) VPREFETCH_M1(7,%r9) \
VSTORE(8,%r8,psi_22) VPREFETCH_M1(8,%r9) \
VSTORE(9,%r8,psi_30) VPREFETCH_M1(9,%r9) \
VSTORE(10,%r8,psi_31) VPREFETCH_M1(10,%r9) \
VSTORE(11,%r8,psi_32) VPREFETCH_M1(11,%r9) \
);
#define ADD_RESULTi(PTR,pf) \
LOAD_CHIMU(PTR); \
asm(VADD(psi_00,Chimu_00,psi_00) VADD(psi_01,Chimu_01,psi_01) VADD(psi_02,Chimu_02,psi_02) \
VADD(psi_10,Chimu_10,psi_10) VADD(psi_11,Chimu_11,psi_11) VADD(psi_12,Chimu_12,psi_12) \
VADD(psi_20,Chimu_20,psi_20) VADD(psi_21,Chimu_21,psi_21) VADD(psi_22,Chimu_22,psi_22) \
VADD(psi_30,Chimu_30,psi_30) VADD(psi_31,Chimu_31,psi_31) VADD(psi_32,Chimu_32,psi_32) ); \
SAVE_RESULT(PTR,pf);
#define ADD_RESULTia(PTR,pf) \
LOAD64(%r8,PTR) \
__asm__ ( \
VADDMEM(0,%r8,psi_00,psi_00) \
VADDMEM(1,%r8,psi_01,psi_01) \
VADDMEM(2,%r8,psi_02,psi_02) \
VADDMEM(3,%r8,psi_10,psi_10) \
VADDMEM(4,%r8,psi_11,psi_11) \
VADDMEM(5,%r8,psi_12,psi_12) \
VADDMEM(6,%r8,psi_20,psi_20) \
VADDMEM(7,%r8,psi_21,psi_21) \
VADDMEM(8,%r8,psi_22,psi_22) \
VADDMEM(9,%r8,psi_30,psi_30) \
VADDMEM(10,%r8,psi_31,psi_31) \
VADDMEM(11,%r8,psi_32,psi_32) \
VSTORE(0,%r8,psi_00) \
VSTORE(1,%r8,psi_01) \
VSTORE(2,%r8,psi_02) \
VSTORE(3,%r8,psi_10) \
VSTORE(4,%r8,psi_11) \
VSTORE(5,%r8,psi_12) \
VSTORE(6,%r8,psi_20) \
VSTORE(7,%r8,psi_21) \
VSTORE(8,%r8,psi_22) \
VSTORE(9,%r8,psi_30) \
VSTORE(10,%r8,psi_31) \
VSTORE(11,%r8,psi_32) \
);
#ifdef AVX512_PF_L2_TABLE
#define PREFETCH_CHIMU(A) \
#define PREFETCH_CHIMU(A) \
LOAD64(%r9,A) \
__asm__ ( \
VPREFETCH_P1(0,%r9) \

255
lib/simd/Simd.h Normal file

@ -0,0 +1,255 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/Simd.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: neo <cossu@post.kek.jp>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_SIMD_H
#define GRID_SIMD_H
////////////////////////////////////////////////////////////////////////
// Define scalar and vector floating point types
//
// Scalar: RealF, RealD, ComplexF, ComplexD
//
// Vector: vRealF, vRealD, vComplexF, vComplexD
//
// Vector types are arch dependent
////////////////////////////////////////////////////////////////////////
#define _MM_SELECT_FOUR_FOUR(A,B,C,D) ((A<<6)|(B<<4)|(C<<2)|(D))
#define _MM_SELECT_FOUR_FOUR_STRING(A,B,C,D) "((" #A "<<6)|(" #B "<<4)|(" #C "<<2)|(" #D "))"
#define _MM_SELECT_EIGHT_TWO(A,B,C,D,E,F,G,H) ((A<<7)|(B<<6)|(C<<5)|(D<<4)|(E<<3)|(F<<2)|(G<<1)|(H))
#define _MM_SELECT_FOUR_TWO(A,B,C,D) _MM_SELECT_EIGHT_TWO(0,0,0,0,A,B,C,D)
#define _MM_SELECT_TWO_TWO(A,B) _MM_SELECT_FOUR_TWO(0,0,A,B)
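// Worked example (illustrative): _MM_SELECT_FOUR_FOUR(3,2,1,0) packs the four
// 2-bit fields as (3<<6)|(2<<4)|(1<<2)|0 = 0xE4, the usual identity immediate
// for a 4-element shuffle.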
#define RotateBit (0x100)
namespace Grid {
typedef uint32_t Integer;
typedef float RealF;
typedef double RealD;
#ifdef GRID_DEFAULT_PRECISION_DOUBLE
typedef RealD Real;
#else
typedef RealF Real;
#endif
typedef std::complex<RealF> ComplexF;
typedef std::complex<RealD> ComplexD;
typedef std::complex<Real> Complex;
inline RealF adj(const RealF & r){ return r; }
inline RealF conjugate(const RealF & r){ return r; }
inline RealF real(const RealF & r){ return r; }
inline RealD adj(const RealD & r){ return r; }
inline RealD conjugate(const RealD & r){ return r; }
inline RealD real(const RealD & r){ return r; }
inline RealD sqrt(const RealD & r){ return std::sqrt(r); }
inline ComplexD conjugate(const ComplexD& r){ return(conj(r)); }
inline ComplexD adj(const ComplexD& r){ return(conjugate(r)); }
inline ComplexF conjugate(const ComplexF& r ){ return(conj(r)); }
inline ComplexF adj(const ComplexF& r ){ return(conjugate(r)); }
inline ComplexD innerProduct(const ComplexD & l, const ComplexD & r) { return conjugate(l)*r; }
inline ComplexF innerProduct(const ComplexF & l, const ComplexF & r) { return conjugate(l)*r; }
inline RealD innerProduct(const RealD & l, const RealD & r) { return l*r; }
inline RealF innerProduct(const RealF & l, const RealF & r) { return l*r; }
inline ComplexD Reduce(const ComplexD& r){ return r; }
inline ComplexF Reduce(const ComplexF& r){ return r; }
inline RealD Reduce(const RealD& r){ return r; }
inline RealF Reduce(const RealF& r){ return r; }
inline RealD toReal(const ComplexD& r){ return real(r); }
inline RealF toReal(const ComplexF& r){ return real(r); }
inline RealD toReal(const RealD& r){ return r; }
inline RealF toReal(const RealF& r){ return r; }
////////////////////////////////////////////////////////////////////////////////
//Provide support functions for basic real and complex data types required by Grid
//Single and double precision versions. Should be able to template this once only.
////////////////////////////////////////////////////////////////////////////////
inline void mac (ComplexD * __restrict__ y,const ComplexD * __restrict__ a,const ComplexD *__restrict__ x){ *y = (*a) * (*x)+(*y); };
inline void mult(ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) * (*r);}
inline void sub (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) - (*r);}
inline void add (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) + (*r);}
// conjugate already supported for complex
inline void mac (ComplexF * __restrict__ y,const ComplexF * __restrict__ a,const ComplexF *__restrict__ x){ *y = (*a) * (*x)+(*y); }
inline void mult(ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) * (*r); }
inline void sub (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) - (*r); }
inline void add (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) + (*r); }
//conjugate already supported for complex
inline ComplexF timesI(const ComplexF &r) { return(r*ComplexF(0.0,1.0));}
inline ComplexD timesI(const ComplexD &r) { return(r*ComplexD(0.0,1.0));}
inline ComplexF timesMinusI(const ComplexF &r){ return(r*ComplexF(0.0,-1.0));}
inline ComplexD timesMinusI(const ComplexD &r){ return(r*ComplexD(0.0,-1.0));}
// define projections to real and imaginary parts
inline ComplexF projReal(const ComplexF &r){return( ComplexF(std::real(r), 0.0));}
inline ComplexD projReal(const ComplexD &r){return( ComplexD(std::real(r), 0.0));}
inline ComplexF projImag(const ComplexF &r){return (ComplexF(std::imag(r), 0.0 ));}
inline ComplexD projImag(const ComplexD &r){return (ComplexD(std::imag(r), 0.0));}
// define auxiliary functions for complex computations
inline void timesI(ComplexF &ret,const ComplexF &r) { ret = timesI(r);}
inline void timesI(ComplexD &ret,const ComplexD &r) { ret = timesI(r);}
inline void timesMinusI(ComplexF &ret,const ComplexF &r){ ret = timesMinusI(r);}
inline void timesMinusI(ComplexD &ret,const ComplexD &r){ ret = timesMinusI(r);}
inline void mac (RealD * __restrict__ y,const RealD * __restrict__ a,const RealD *__restrict__ x){ *y = (*a) * (*x)+(*y);}
inline void mult(RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) * (*r);}
inline void sub (RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) - (*r);}
inline void add (RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) + (*r);}
inline void mac (RealF * __restrict__ y,const RealF * __restrict__ a,const RealF *__restrict__ x){ *y = (*a) * (*x)+(*y); }
inline void mult(RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) * (*r); }
inline void sub (RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) - (*r); }
inline void add (RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) + (*r); }
inline void vstream(ComplexF &l, const ComplexF &r){ l=r;}
inline void vstream(ComplexD &l, const ComplexD &r){ l=r;}
inline void vstream(RealF &l, const RealF &r){ l=r;}
inline void vstream(RealD &l, const RealD &r){ l=r;}
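// Usage sketch (illustrative only, not part of this header): the scalar
// helpers above expose one interface for real and complex types, so code
// templated on the scalar type can call them uniformly. "ScalarHelpersExample"
// is a made-up name for illustration.
inline ComplexD ScalarHelpersExample(void){
  ComplexD a(1.0,2.0), b(3.0,-1.0), y(0.0,0.0);
  mac(&y,&a,&b);                      // y += a*b
  ComplexD ip = innerProduct(a,b);    // conjugate(a)*b
  vstream(y,ip);                      // plain assignment for scalars
  return y + timesI(a);               // timesI(z) == i*z
}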
class Zero{};
static Zero zero;
template<class itype> inline void zeroit(itype &arg){ arg=zero;};
template<> inline void zeroit(ComplexF &arg){ arg=0; };
template<> inline void zeroit(ComplexD &arg){ arg=0; };
template<> inline void zeroit(RealF &arg){ arg=0; };
template<> inline void zeroit(RealD &arg){ arg=0; };
//////////////////////////////////////////////////////////
// Permute
// Permute 0 every ABCDEFGH -> BA DC FE HG
// Permute 1 every ABCDEFGH -> CD AB GH EF
// Permute 2 every ABCDEFGH -> EFGH ABCD
// Permute 3 possible on longer iVector lengths (512bit = 8 double = 16 single)
// Permute 4 possible on half precision @512bit vectors.
//
// Defined inside SIMD specialization files
//////////////////////////////////////////////////////////
template<class VectorSIMD>
inline void Gpermute(VectorSIMD &y,const VectorSIMD &b,int perm);
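// Scalar reference for the permute patterns listed above (illustrative only,
// not part of this header): element j of the output takes element j^(1<<perm)
// of the input, so perm 0 swaps neighbours, perm 1 swaps pairs and perm 2
// swaps the two four-wide halves. "GpermuteReference" is a made-up name.
template<class scalar>
inline void GpermuteReference(scalar *out,const scalar *in,int nsimd,int perm){
  for(int j=0;j<nsimd;j++) out[j] = in[j ^ (1<<perm)];
}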
};
#include <Grid/simd/Grid_vector_types.h>
#include <Grid/simd/Grid_vector_unops.h>
namespace Grid {
// Default precision
#ifdef GRID_DEFAULT_PRECISION_DOUBLE
typedef vRealD vReal;
typedef vComplexD vComplex;
#else
typedef vRealF vReal;
typedef vComplexF vComplex;
#endif
inline std::ostream& operator<< (std::ostream& stream, const vComplexF &o){
int nn=vComplexF::Nsimd();
std::vector<ComplexF,alignedAllocator<ComplexF> > buf(nn);
vstore(o,&buf[0]);
stream<<"<";
for(int i=0;i<nn;i++){
stream<<buf[i];
if(i<nn-1) stream<<",";
}
stream<<">";
return stream;
}
inline std::ostream& operator<< (std::ostream& stream, const vComplexD &o){
int nn=vComplexD::Nsimd();
std::vector<ComplexD,alignedAllocator<ComplexD> > buf(nn);
vstore(o,&buf[0]);
stream<<"<";
for(int i=0;i<nn;i++){
stream<<buf[i];
if(i<nn-1) stream<<",";
}
stream<<">";
return stream;
}
inline std::ostream& operator<< (std::ostream& stream, const vRealF &o){
int nn=vRealF::Nsimd();
std::vector<RealF,alignedAllocator<RealF> > buf(nn);
vstore(o,&buf[0]);
stream<<"<";
for(int i=0;i<nn;i++){
stream<<buf[i];
if(i<nn-1) stream<<",";
}
stream<<">";
return stream;
}
inline std::ostream& operator<< (std::ostream& stream, const vRealD &o){
int nn=vRealD::Nsimd();
std::vector<RealD,alignedAllocator<RealD> > buf(nn);
vstore(o,&buf[0]);
stream<<"<";
for(int i=0;i<nn;i++){
stream<<buf[i];
if(i<nn-1) stream<<",";
}
stream<<">";
return stream;
}
inline std::ostream& operator<< (std::ostream& stream, const vInteger &o){
int nn=vInteger::Nsimd();
std::vector<Integer,alignedAllocator<Integer> > buf(nn);
vstore(o,&buf[0]);
stream<<"<";
for(int i=0;i<nn;i++){
stream<<buf[i];
if(i<nn-1) stream<<",";
}
stream<<">";
return stream;
}
}
#endif