mirror of https://github.com/paboyle/Grid.git
synced 2025-06-10 19:36:56 +01:00
Merge branch 'develop' into feature/hmc_generalise
lib/simd/BGQQPX.h (new file, 796 lines)

@@ -0,0 +1,796 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/simd/BGQQPX.h

    Copyright (C) 2015

    Author: paboyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_ASM_BGQ_QPX_H
#define GRID_ASM_BGQ_QPX_H

#include <stdint.h>  /* uint64_t, uint8_t; was <stddef.h>, which declares neither */

/*********************************************************
 * Architectural macros
 *********************************************************/
#define VLOADf(OFF,PTR,DEST)  "qvlfsux  " #DEST "," #OFF "," #PTR " ;\n"
#define VLOADd(OFF,PTR,DEST)  "qvlfdux  " #DEST "," #OFF "," #PTR " ;\n"
#define VSTOREf(OFF,PTR,SRC)  "qvstfsux " #SRC "," #OFF "," #PTR " ;\n"
#define VSTOREd(OFF,PTR,SRC)  "qvstfdux " #SRC "," #OFF "," #PTR " ;\n"
/* mnemonics and operand order follow IBM_qpx.h below; the two splats were swapped */
#define VSPLATf(A,B,DEST)     "qvlfcsxa " #DEST "," #A "," #B ";\n"
#define VSPLATd(A,B,DEST)     "qvlfcdxa " #DEST "," #A "," #B ";\n"

#define LOAD64(A,ptr)
#define VZERO(DEST)     "qvfclr " #DEST "; \n"
#define VONE(DEST)      "qvfset " #DEST "; \n"
#define VNEG(SRC,DEST)  "qvfneg " #DEST "," #SRC "; \n"
#define VMOV(DEST,A)    "qvfmr  " #DEST "," #A ";\n"  /* destination first, matching the call sites below */

#define VADD(A,B,DEST)            "qvfadd       " #DEST "," #A "," #B ";\n"
#define VSUB(A,B,DEST)            "qvfsub       " #DEST "," #A "," #B ";\n"
#define VMUL(A,B,DEST)            "qvfmul       " #DEST "," #A "," #B ";\n"
#define VMUL_RR_RI(A,B,DEST)      "qvfxmul      " #DEST "," #A "," #B ";\n"
#define VMADD(A,B,C,DEST)         "qvfmadd      " #DEST "," #A "," #B "," #C ";\n"
#define VMADD_RR_RI(A,B,C,DEST)   "qvfxmadd     " #DEST "," #A "," #B "," #C ";\n"
#define VMADD_MII_IR(A,B,C,DEST)  "qvfxxnpmadd  " #DEST "," #A "," #B "," #C ";\n"
#define VMADD_II_MIR(A,B,C,DEST)  "qvfxxcpnmadd " #DEST "," #A "," #B "," #C ";\n"  /* was qvfmadd: must be the conjugated cross-madd, cf. IBM_qpx.h */

#define CACHE_LOCK(PTR)   asm (" dcbtls %%r0, %0 \n" : : "r" (PTR) );
#define CACHE_UNLOCK(PTR) asm (" dcblc  %%r0, %0 \n" : : "r" (PTR) );
#define CACHE_FLUSH(PTR)  asm (" dcbf   %%r0, %0 \n" : : "r" (PTR) );
#define CACHE_TOUCH(PTR)  asm (" dcbt   %%r0, %0 \n" : : "r" (PTR) );

// Gauge field locking: 2 x 9 complex == 18 reals of 8/16 bytes per link,
// i.e. 144/288 bytes == 4.5/9 32-byte strides.
#define MASK_REGS             /*NOOP ON BGQ*/
#define PF_GAUGE(A)           /*NOOP ON BGQ*/
#define PREFETCH1_CHIMU(base) /*NOOP ON BGQ*/
#define PREFETCH_CHIMU(base)  /*NOOP ON BGQ*/
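
Scalar model of the two-step complex multiply that VMUL_RR_RI (qvfxmul) and VMADD_MII_IR (qvfxxnpmadd) encode, as suggested by the macro names: RR_RI puts real(a) times both parts of b into the destination; MII_IR then adds minus imag(a)*imag(b) into the real slot and imag(a)*real(b) into the imaginary slot. A minimal sketch for checking the algebra, not QPX code; qpx_cmul_model is a hypothetical name.

#include <cassert>
#include <complex>

static std::complex<double> qpx_cmul_model(std::complex<double> a,
                                           std::complex<double> b) {
  // Step 1, VMUL_RR_RI: real(a) times both parts of b.
  double t_re = a.real() * b.real();
  double t_im = a.real() * b.imag();
  // Step 2, VMADD_MII_IR: -imag(a)*imag(b) into the real slot,
  // +imag(a)*real(b) into the imaginary slot.
  t_re += -a.imag() * b.imag();
  t_im +=  a.imag() * b.real();
  return {t_re, t_im};
}

int main() {
  std::complex<double> a(1.5, -2.0), b(0.5, 3.0);
  assert(std::abs(qpx_cmul_model(a, b) - a * b) < 1e-12);
  return 0;
}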

/*********************************************************
 * Register definitions
 *********************************************************/
#define psi_00 0
#define psi_01 1
#define psi_02 2

#define psi_10 3
#define psi_11 4
#define psi_12 5

#define psi_20 6
#define psi_21 7
#define psi_22 8

#define psi_30 9
#define psi_31 10
#define psi_32 11

#define Chi_00 12
#define Chi_01 13
#define Chi_02 14

#define Chi_10 15
#define Chi_11 16
#define Chi_12 17

#define UChi_00 18
#define UChi_01 19
#define UChi_02 20

#define UChi_10 21
#define UChi_11 22
#define UChi_12 23

#define U0 24
#define U1 25
#define U2 26
#define one 27

#define REP %%r16
#define IMM %%r17

/*Alias regs*/
#define Chimu_00 Chi_00
#define Chimu_01 Chi_01
#define Chimu_02 Chi_02
#define Chimu_10 Chi_10
#define Chimu_11 Chi_11
#define Chimu_12 Chi_12   /* was Chi_02: typo, must alias the (1,2) slot */
#define Chimu_20 UChi_00
#define Chimu_21 UChi_01
#define Chimu_22 UChi_02
#define Chimu_30 UChi_10
#define Chimu_31 UChi_11
#define Chimu_32 UChi_12  /* was UChi_02: typo, must alias the (3,2) slot */

/*********************************************************
 * Macro sequences encoding QCD
 *********************************************************/
#define LOCK_GAUGE(dir)                                 \
  {                                                     \
    uint8_t *byte_addr = (uint8_t *)&U._odata[sU](dir); \
    for(int i=0;i< 18*2*BYTES_PER_WORD*8;i+=32){        \
      CACHE_LOCK(&byte_addr[i]);                        \
    }                                                   \
  }

#define UNLOCK_GAUGE(dir)                               \
  {                                                     \
    uint8_t *byte_addr = (uint8_t *)&U._odata[sU](dir); \
    for(int i=0;i< 18*2*BYTES_PER_WORD*8;i+=32){        \
      CACHE_UNLOCK(&byte_addr[i]);                      \
    }                                                   \
  }

#define MAYBEPERM(A,B)

#define PERMUTE_DIR3
#define PERMUTE_DIR2
#define PERMUTE_DIR1
#define PERMUTE_DIR0

#define MULT_2SPIN_DIR_PFXP(A,p) MULT_2SPIN(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFYP(A,p) MULT_2SPIN(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFZP(A,p) MULT_2SPIN(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFTP(A,p) MULT_2SPIN(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFXM(A,p) MULT_2SPIN(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFYM(A,p) MULT_2SPIN(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFZM(A,p) MULT_2SPIN(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFTM(A,p) MULT_2SPIN(&U._odata[sU](A),p)

/* Was MULT_SPIN: renamed so the MULT_2SPIN_DIR_PF* wrappers above resolve;
 * the body also read "base" where the parameter is "ptr". */
#define MULT_2SPIN(ptr,p) {                             \
    uint64_t ub = ((uint64_t)ptr);                      \
    asm (                                               \
   VLOAD(%0,%3,U0)                                      \
   VLOAD(%1,%3,U1)                                      \
   VLOAD(%2,%3,U2)                                      \
   VMUL_RR_RI(U0,Chi_00,UChi_00)                        \
   VMUL_RR_RI(U1,Chi_00,UChi_01)                        \
   VMUL_RR_RI(U2,Chi_00,UChi_02)                        \
   VMUL_RR_RI(U0,Chi_10,UChi_10)                        \
   VMUL_RR_RI(U1,Chi_10,UChi_11)                        \
   VMUL_RR_RI(U2,Chi_10,UChi_12)                        \
   VMADD_MII_IR(U0,Chi_00,UChi_00,UChi_00)              \
   VMADD_MII_IR(U1,Chi_00,UChi_01,UChi_01)              \
   VMADD_MII_IR(U2,Chi_00,UChi_02,UChi_02)              \
   VMADD_MII_IR(U0,Chi_10,UChi_10,UChi_10)              \
   VMADD_MII_IR(U1,Chi_10,UChi_11,UChi_11)              \
   VMADD_MII_IR(U2,Chi_10,UChi_12,UChi_12)              \
   : : "r" (0), "r" (32*3), "r" (32*6), "r" (ub ));     \
    asm (                                               \
   VLOAD(%0,%3,U0)                                      \
   VLOAD(%1,%3,U1)                                      \
   VLOAD(%2,%3,U2)                                      \
   VMADD_RR_RI(U0,Chi_01,UChi_00,UChi_00)               \
   VMADD_RR_RI(U1,Chi_01,UChi_01,UChi_01)               \
   VMADD_RR_RI(U2,Chi_01,UChi_02,UChi_02)               \
   VMADD_RR_RI(U0,Chi_11,UChi_10,UChi_10)               \
   VMADD_RR_RI(U1,Chi_11,UChi_11,UChi_11)               \
   VMADD_RR_RI(U2,Chi_11,UChi_12,UChi_12)               \
   VMADD_MII_IR(U0,Chi_01,UChi_00,UChi_00)              \
   VMADD_MII_IR(U1,Chi_01,UChi_01,UChi_01)              \
   VMADD_MII_IR(U2,Chi_01,UChi_02,UChi_02)              \
   VMADD_MII_IR(U0,Chi_11,UChi_10,UChi_10)              \
   VMADD_MII_IR(U1,Chi_11,UChi_11,UChi_11)              \
   VMADD_MII_IR(U2,Chi_11,UChi_12,UChi_12)              \
   : : "r" (32), "r" (32*4), "r" (32*7), "r" (ub ));    \
    asm (                                               \
   VLOAD(%0,%3,U0)                                      \
   VLOAD(%1,%3,U1)                                      \
   VLOAD(%2,%3,U2)                                      \
   VMADD_RR_RI(U0,Chi_02,UChi_00,UChi_00)               \
   VMADD_RR_RI(U1,Chi_02,UChi_01,UChi_01)               \
   VMADD_RR_RI(U2,Chi_02,UChi_02,UChi_02)               \
   VMADD_RR_RI(U0,Chi_12,UChi_10,UChi_10)               \
   VMADD_RR_RI(U1,Chi_12,UChi_11,UChi_11)               \
   VMADD_RR_RI(U2,Chi_12,UChi_12,UChi_12)               \
   VMADD_MII_IR(U0,Chi_02,UChi_00,UChi_00)              \
   VMADD_MII_IR(U1,Chi_02,UChi_01,UChi_01)              \
   VMADD_MII_IR(U2,Chi_02,UChi_02,UChi_02)              \
   VMADD_MII_IR(U0,Chi_12,UChi_10,UChi_10)              \
   VMADD_MII_IR(U1,Chi_12,UChi_11,UChi_11)              \
   VMADD_MII_IR(U2,Chi_12,UChi_12,UChi_12)              \
   : : "r" (32*2), "r" (32*5), "r" (32*8), "r" (ub ));  \
  }
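
The three asm blocks above implement one SU(3) link acting on a half spinor: block k loads column k of U (the three entries sit at byte offsets 32*k, 32*(k+3), 32*(k+6)) and multiply-accumulates it against colour component k of both spin components of Chi. A scalar sketch of the same arithmetic, with illustrative type names:

#include <array>
#include <complex>

using cplx = std::complex<double>;
using ColourVector = std::array<cplx, 3>;
using ColourMatrix = std::array<std::array<cplx, 3>, 3>;

// UChi = U * Chi for one spin component; the kernel does this twice,
// once per spin component of the half spinor.
ColourVector mult_su3(const ColourMatrix &U, const ColourVector &chi) {
  ColourVector uchi{};
  for (int k = 0; k < 3; k++)      // one k per asm block (column of U)
    for (int r = 0; r < 3; r++)    // three madds per block (rows)
      uchi[r] += U[r][k] * chi[k];
  return uchi;
}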

#define SAVE_RESULT(base,basep) {               \
    uint64_t ub = ((uint64_t)base) - 32;        \
    asm("mr " REP ", %0;\n\t"                   \
        "li " IMM ",32;\n\t"                    \
        VSTORE(IMM,REP,psi_00)                  \
        VSTORE(IMM,REP,psi_01)                  \
        VSTORE(IMM,REP,psi_02)                  \
        VSTORE(IMM,REP,psi_10)                  \
        VSTORE(IMM,REP,psi_11)                  \
        VSTORE(IMM,REP,psi_12)                  \
        VSTORE(IMM,REP,psi_20)                  \
        VSTORE(IMM,REP,psi_21)                  \
        VSTORE(IMM,REP,psi_22)                  \
        VSTORE(IMM,REP,psi_30)                  \
        VSTORE(IMM,REP,psi_31)                  \
        VSTORE(IMM,REP,psi_32)                  \
        : : "r" (ub) );  /* input operand was missing; %0 is ub */ \
  }

/*
 * Annoying BG/Q loads with no immediate indexing, and a big performance hit
 * when a second miss to an L1 line occurs
 */
#define LOAD_CHI(base) {                        \
    uint64_t ub = ((uint64_t)base) - 64;        \
    asm("mr " REP ", %0;\n\t"                   \
        "li " IMM ",64;\n\t"                    \
        VLOAD(IMM,REP,Chi_00)                   \
        VLOAD(IMM,REP,Chi_02)                   \
        VLOAD(IMM,REP,Chi_11) : : "r" (ub) );   \
    ub = ((uint64_t)base) - 32;                 \
    asm("mr " REP ", %0;\n\t"                   \
        "li " IMM ",64;\n\t"                    \
        VLOAD(IMM,REP,Chimu_01)                 \
        VLOAD(IMM,REP,Chimu_10)                 \
        VLOAD(IMM,REP,Chimu_12) : : "r" (ub) ); \
  }

#define LOAD_CHIMU(base) {                      \
    uint64_t ub = ((uint64_t)base) - 64;        \
    asm("mr " REP ", %0;\n\t"                   \
        "li " IMM ",64;\n\t"                    \
        VLOAD(IMM,REP,Chimu_00)                 \
        VLOAD(IMM,REP,Chimu_02)                 \
        VLOAD(IMM,REP,Chimu_11)                 \
        VLOAD(IMM,REP,Chimu_20)                 \
        VLOAD(IMM,REP,Chimu_22)                 \
        VLOAD(IMM,REP,Chimu_31) : : "r" (ub) ); \
    ub = ((uint64_t)base) - 32;                 \
    asm("mr " REP ", %0;\n\t"                   \
        "li " IMM ",64;\n\t"                    \
        VLOAD(IMM,REP,Chimu_01)                 \
        VLOAD(IMM,REP,Chimu_10)                 \
        VLOAD(IMM,REP,Chimu_12)                 \
        VLOAD(IMM,REP,Chimu_21)                 \
        VLOAD(IMM,REP,Chimu_30)                 \
        VLOAD(IMM,REP,Chimu_32) : : "r" (ub) ); \
  }
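
The split into two update-form load streams follows from the addressing: qvlfdux computes REP+IMM, loads from it, and writes the sum back to REP, so with IMM = 64 one stream walks base, base+64, base+128, ... and the other base+32, base+96, ..., touching each L1 line at most once per stream. A small model of the address sequences, assuming that pre-increment semantics:

#include <cstdio>

int main() {
  unsigned long base = 0x1000;
  unsigned long rep1 = base - 64, rep2 = base - 32; // the two "mr REP,%0" values
  for (int i = 0; i < 3; i++) {
    rep1 += 64; rep2 += 64;                         // qvlfdux: REP += IMM, then load
    std::printf("stream1 load at %#lx, stream2 load at %#lx\n", rep1, rep2);
  }
  return 0;
}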

// hspin(0)=fspin(0)+timesI(fspin(3));
// hspin(1)=fspin(1)+timesI(fspin(2));
#define XP_PROJMEM(base) {                      \
    LOAD_CHIMU(base);                           \
    asm (                                       \
   VONE(one)                                    \
   VMADD_MII_IR(one,Chimu_30,Chimu_00,Chi_00)   \
   VMADD_MII_IR(one,Chimu_31,Chimu_01,Chi_01)   \
   VMADD_MII_IR(one,Chimu_32,Chimu_02,Chi_02)   \
   VMADD_MII_IR(one,Chimu_20,Chimu_10,Chi_10)   \
   VMADD_MII_IR(one,Chimu_21,Chimu_11,Chi_11)   \
   VMADD_MII_IR(one,Chimu_22,Chimu_12,Chi_12)   \
        );                                      \
  }

#define XM_PROJMEM(base) {                      \
    LOAD_CHIMU(base);                           \
    asm (                                       \
   VONE(one)                                    \
   VMADD_II_MIR(one,Chimu_30,Chimu_00,Chi_00)   \
   VMADD_II_MIR(one,Chimu_31,Chimu_01,Chi_01)   \
   VMADD_II_MIR(one,Chimu_32,Chimu_02,Chi_02)   \
   VMADD_II_MIR(one,Chimu_20,Chimu_10,Chi_10)   \
   VMADD_II_MIR(one,Chimu_21,Chimu_11,Chi_11)   \
   VMADD_II_MIR(one,Chimu_22,Chimu_12,Chi_12)   \
        );                                      \
  }
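
Scalar model of the Xp projector: hspin(0) = fspin(0) + i*fspin(3) and hspin(1) = fspin(1) + i*fspin(2), applied per colour component, matching the comment above. A sketch only; Grid's real spinor types are vectorised.

#include <array>
#include <complex>

using cplx = std::complex<double>;
using HalfSpinor = std::array<std::array<cplx, 3>, 2>;
using Spinor     = std::array<std::array<cplx, 3>, 4>;

HalfSpinor xp_proj(const Spinor &f) {
  const cplx I(0, 1);
  HalfSpinor h{};
  for (int c = 0; c < 3; c++) {
    h[0][c] = f[0][c] + I * f[3][c]; // VMADD_MII_IR(one,Chimu_3c,Chimu_0c,Chi_0c)
    h[1][c] = f[1][c] + I * f[2][c]; // VMADD_MII_IR(one,Chimu_2c,Chimu_1c,Chi_1c)
  }
  return h;
}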

// hspin(0)=fspin(0)-fspin(3);
// hspin(1)=fspin(1)+fspin(2);
// (The destination is the last VADD/VSUB argument; the original wrote these
//  destination-first and named undefined Chi_2x/Chi_3x registers.)
#define YP_PROJMEM(base) {                      \
    LOAD_CHIMU(base);                           \
    asm (                                       \
   VSUB(Chimu_00,Chimu_30,Chi_00)               \
   VSUB(Chimu_01,Chimu_31,Chi_01)               \
   VSUB(Chimu_02,Chimu_32,Chi_02)               \
   VADD(Chimu_10,Chimu_20,Chi_10)               \
   VADD(Chimu_11,Chimu_21,Chi_11)               \
   VADD(Chimu_12,Chimu_22,Chi_12)               \
        );                                      \
  }

#define YM_PROJMEM(base) {                      \
    LOAD_CHIMU(base);                           \
    asm (                                       \
   VADD(Chimu_00,Chimu_30,Chi_00)               \
   VADD(Chimu_01,Chimu_31,Chi_01)               \
   VADD(Chimu_02,Chimu_32,Chi_02)               \
   VSUB(Chimu_10,Chimu_20,Chi_10)               \
   VSUB(Chimu_11,Chimu_21,Chi_11)               \
   VSUB(Chimu_12,Chimu_22,Chi_12)               \
        );                                      \
  }

/*Gz
 *  0 0  i  0   [0]+-i[2]
 *  0 0  0 -i   [1]-+i[3]
 * -i 0  0  0
 *  0 i  0  0
 */
#define ZP_PROJMEM(base) {                      \
    LOAD_CHIMU(base);                           \
    asm (                                       \
   VONE(one)                                    \
   VMADD_MII_IR(one,Chimu_20,Chimu_00,Chi_00)   \
   VMADD_MII_IR(one,Chimu_21,Chimu_01,Chi_01)   \
   VMADD_MII_IR(one,Chimu_22,Chimu_02,Chi_02)   \
   VMADD_II_MIR(one,Chimu_30,Chimu_10,Chi_10)   \
   VMADD_II_MIR(one,Chimu_31,Chimu_11,Chi_11)   \
   VMADD_II_MIR(one,Chimu_32,Chimu_12,Chi_12)   \
        );                                      \
  }

#define ZM_PROJMEM(base) {                      \
    LOAD_CHIMU(base);                           \
    asm (                                       \
   VONE(one)                                    \
   VMADD_II_MIR(one,Chimu_20,Chimu_00,Chi_00)   \
   VMADD_II_MIR(one,Chimu_21,Chimu_01,Chi_01)   \
   VMADD_II_MIR(one,Chimu_22,Chimu_02,Chi_02)   \
   VMADD_MII_IR(one,Chimu_30,Chimu_10,Chi_10)   \
   VMADD_MII_IR(one,Chimu_31,Chimu_11,Chi_11)   \
   VMADD_MII_IR(one,Chimu_32,Chimu_12,Chi_12)   \
        );                                      \
  }

/*Gt
 *  0 0  1  0 [0]+-[2]
 *  0 0  0  1 [1]+-[3]
 *  1 0  0  0
 *  0 1  0  0
 */
#define TP_PROJMEM(base) {                      \
    LOAD_CHIMU(base);                           \
    asm (                                       \
   VADD(Chimu_00,Chimu_20,Chi_00)  /* dest last, as in YP_PROJMEM */ \
   VADD(Chimu_01,Chimu_21,Chi_01)               \
   VADD(Chimu_02,Chimu_22,Chi_02)               \
   VADD(Chimu_10,Chimu_30,Chi_10)               \
   VADD(Chimu_11,Chimu_31,Chi_11)               \
   VADD(Chimu_12,Chimu_32,Chi_12)               \
        );                                      \
  }

#define TM_PROJMEM(base) {                      \
    LOAD_CHIMU(base);                           \
    asm (                                       \
   VSUB(Chimu_00,Chimu_20,Chi_00)               \
   VSUB(Chimu_01,Chimu_21,Chi_01)               \
   VSUB(Chimu_02,Chimu_22,Chi_02)               \
   VSUB(Chimu_10,Chimu_30,Chi_10)               \
   VSUB(Chimu_11,Chimu_31,Chi_11)               \
   VSUB(Chimu_12,Chimu_32,Chi_12)               \
        );                                      \
  }

/*
  fspin(0)=hspin(0);
  fspin(1)=hspin(1);
  fspin(2)=timesMinusI(hspin(1));
  fspin(3)=timesMinusI(hspin(0));

  fspin(0)+=hspin(0);
  fspin(1)+=hspin(1);
  fspin(2)-=timesI(hspin(1));
  fspin(3)-=timesI(hspin(0));
*/
#define XP_RECON {                              \
    asm(                                        \
   VONE(one)                                    \
   VMOV(psi_00,UChi_00) VMOV(psi_01,UChi_01) VMOV(psi_02,UChi_02) \
   VMOV(psi_10,UChi_10) VMOV(psi_11,UChi_11) VMOV(psi_12,UChi_12) \
   VZERO(psi_20) VZERO(psi_21) VZERO(psi_22)    \
   VZERO(psi_30) VZERO(psi_31) VZERO(psi_32)    \
   VMADD_II_MIR(one,UChi_10,psi_20,psi_20)      \
   VMADD_II_MIR(one,UChi_11,psi_21,psi_21)      \
   VMADD_II_MIR(one,UChi_12,psi_22,psi_22)      \
   VMADD_II_MIR(one,UChi_00,psi_30,psi_30)      \
   VMADD_II_MIR(one,UChi_01,psi_31,psi_31)      \
   VMADD_II_MIR(one,UChi_02,psi_32,psi_32)      \
        );                                      \
  }

#define XM_RECON {                              \
    asm(                                        \
   VONE(one)                                    \
   VMOV(psi_00,UChi_00) VMOV(psi_01,UChi_01) VMOV(psi_02,UChi_02) \
   VMOV(psi_10,UChi_10) VMOV(psi_11,UChi_11) VMOV(psi_12,UChi_12) \
   VZERO(psi_20) VZERO(psi_21) VZERO(psi_22)    \
   VZERO(psi_30) VZERO(psi_31) VZERO(psi_32)    \
   VMADD_MII_IR(one,UChi_10,psi_20,psi_20)      \
   VMADD_MII_IR(one,UChi_11,psi_21,psi_21)      \
   VMADD_MII_IR(one,UChi_12,psi_22,psi_22)      \
   VMADD_MII_IR(one,UChi_00,psi_30,psi_30)      \
   VMADD_MII_IR(one,UChi_01,psi_31,psi_31)      \
   VMADD_MII_IR(one,UChi_02,psi_32,psi_32)      \
        );                                      \
  }

#define XP_RECON_ACCUM {                        \
    asm(                                        \
   VONE(one)                                    \
   VADD(psi_00,UChi_00,psi_00) VADD(psi_01,UChi_01,psi_01) VADD(psi_02,UChi_02,psi_02) \
   VADD(psi_10,UChi_10,psi_10) VADD(psi_11,UChi_11,psi_11) VADD(psi_12,UChi_12,psi_12) \
   VMADD_II_MIR(one,UChi_10,psi_20,psi_20)      \
   VMADD_II_MIR(one,UChi_11,psi_21,psi_21)      \
   VMADD_II_MIR(one,UChi_12,psi_22,psi_22)      \
   VMADD_II_MIR(one,UChi_00,psi_30,psi_30)      \
   VMADD_II_MIR(one,UChi_01,psi_31,psi_31)      \
   VMADD_II_MIR(one,UChi_02,psi_32,psi_32)      \
        );                                      \
  }

#define XM_RECON_ACCUM {                        \
    asm(                                        \
   VONE(one)                                    \
   VADD(psi_00,UChi_00,psi_00) VADD(psi_01,UChi_01,psi_01) VADD(psi_02,UChi_02,psi_02) \
   VADD(psi_10,UChi_10,psi_10) VADD(psi_11,UChi_11,psi_11) VADD(psi_12,UChi_12,psi_12) \
   VMADD_MII_IR(one,UChi_10,psi_20,psi_20)      \
   VMADD_MII_IR(one,UChi_11,psi_21,psi_21)      \
   VMADD_MII_IR(one,UChi_12,psi_22,psi_22)      \
   VMADD_MII_IR(one,UChi_00,psi_30,psi_30)      \
   VMADD_MII_IR(one,UChi_01,psi_31,psi_31)      \
   VMADD_MII_IR(one,UChi_02,psi_32,psi_32)      \
        );                                      \
  }

// fspin(2)+=hspin(1);
// fspin(3)-=hspin(0);
#define YP_RECON_ACCUM {                        \
    asm(                                        \
   VADD(psi_00,UChi_00,psi_00) VADD(psi_01,UChi_01,psi_01) VADD(psi_02,UChi_02,psi_02) \
   VADD(psi_10,UChi_10,psi_10) VADD(psi_11,UChi_11,psi_11) VADD(psi_12,UChi_12,psi_12) \
   VADD(psi_20,UChi_10,psi_20) VADD(psi_21,UChi_11,psi_21) VADD(psi_22,UChi_12,psi_22) \
   VSUB(psi_30,UChi_00,psi_30) VSUB(psi_31,UChi_01,psi_31) VSUB(psi_32,UChi_02,psi_32) \
        );                                      \
  }
#define YM_RECON_ACCUM {                        \
    asm(                                        \
   VADD(psi_00,UChi_00,psi_00) VADD(psi_01,UChi_01,psi_01) VADD(psi_02,UChi_02,psi_02) \
   VADD(psi_10,UChi_10,psi_10) VADD(psi_11,UChi_11,psi_11) VADD(psi_12,UChi_12,psi_12) \
   VSUB(psi_20,UChi_10,psi_20) VSUB(psi_21,UChi_11,psi_21) VSUB(psi_22,UChi_12,psi_22) \
   VADD(psi_30,UChi_00,psi_30) VADD(psi_31,UChi_01,psi_31) VADD(psi_32,UChi_02,psi_32) \
        );                                      \
  }

// fspin(2)-=timesI(hspin(0));
// fspin(3)+=timesI(hspin(1));
#define ZP_RECON_ACCUM {                        \
    asm(                                        \
   VONE(one)                                    \
   VADD(psi_00,UChi_00,psi_00) VADD(psi_01,UChi_01,psi_01) VADD(psi_02,UChi_02,psi_02) \
   VADD(psi_10,UChi_10,psi_10) VADD(psi_11,UChi_11,psi_11) VADD(psi_12,UChi_12,psi_12) \
   VMADD_II_MIR(one,UChi_00,psi_20,psi_20)      \
   VMADD_II_MIR(one,UChi_01,psi_21,psi_21)      \
   VMADD_II_MIR(one,UChi_02,psi_22,psi_22)      \
   VMADD_MII_IR(one,UChi_10,psi_30,psi_30)      \
   VMADD_MII_IR(one,UChi_11,psi_31,psi_31)      \
   VMADD_MII_IR(one,UChi_12,psi_32,psi_32)      \
        );                                      \
  }

#define ZM_RECON_ACCUM {                        \
    asm(                                        \
   VONE(one)                                    \
   VADD(psi_00,UChi_00,psi_00) VADD(psi_01,UChi_01,psi_01) VADD(psi_02,UChi_02,psi_02) \
   VADD(psi_10,UChi_10,psi_10) VADD(psi_11,UChi_11,psi_11) VADD(psi_12,UChi_12,psi_12) \
   VMADD_MII_IR(one,UChi_00,psi_20,psi_20)      \
   VMADD_MII_IR(one,UChi_01,psi_21,psi_21)      \
   VMADD_MII_IR(one,UChi_02,psi_22,psi_22)      \
   VMADD_II_MIR(one,UChi_10,psi_30,psi_30)      \
   VMADD_II_MIR(one,UChi_11,psi_31,psi_31)      \
   VMADD_II_MIR(one,UChi_12,psi_32,psi_32)      \
        );                                      \
  }

// fspin(2)+=hspin(0);
// fspin(3)+=hspin(1);
#define TP_RECON_ACCUM {                        \
    asm(                                        \
   VADD(psi_00,UChi_00,psi_00) VADD(psi_01,UChi_01,psi_01) VADD(psi_02,UChi_02,psi_02) \
   VADD(psi_10,UChi_10,psi_10) VADD(psi_11,UChi_11,psi_11) VADD(psi_12,UChi_12,psi_12) \
   VADD(psi_20,UChi_00,psi_20) VADD(psi_21,UChi_01,psi_21) VADD(psi_22,UChi_02,psi_22) \
   VADD(psi_30,UChi_10,psi_30) VADD(psi_31,UChi_11,psi_31) VADD(psi_32,UChi_12,psi_32) \
        );                                      \
  }

#define TM_RECON_ACCUM {                        \
    asm(                                        \
   VONE(one)                                    \
   VADD(psi_00,UChi_00,psi_00) VADD(psi_01,UChi_01,psi_01) VADD(psi_02,UChi_02,psi_02) \
   VADD(psi_10,UChi_10,psi_10) VADD(psi_11,UChi_11,psi_11) VADD(psi_12,UChi_12,psi_12) \
   VSUB(psi_20,UChi_00,psi_20) VSUB(psi_21,UChi_01,psi_21) VSUB(psi_22,UChi_02,psi_22) \
   VSUB(psi_30,UChi_10,psi_30) VSUB(psi_31,UChi_11,psi_31) VSUB(psi_32,UChi_12,psi_32) \
        );                                      \
  }
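
Scalar model of XP_RECON_ACCUM above: the upper two spin components accumulate the half spinor directly, the lower two accumulate minus i times the swapped half spinor, matching the fspin(2)-=timesI(hspin(1)) / fspin(3)-=timesI(hspin(0)) comment. Sketch only.

#include <array>
#include <complex>

using cplx = std::complex<double>;
using HalfSpinor = std::array<std::array<cplx, 3>, 2>;
using Spinor     = std::array<std::array<cplx, 3>, 4>;

void xp_recon_accum(Spinor &psi, const HalfSpinor &h) {
  const cplx I(0, 1);
  for (int c = 0; c < 3; c++) {
    psi[0][c] += h[0][c];     // VADD(psi_0c,UChi_0c,psi_0c)
    psi[1][c] += h[1][c];     // VADD(psi_1c,UChi_1c,psi_1c)
    psi[2][c] -= I * h[1][c]; // VMADD_II_MIR(one,UChi_1c,psi_2c,psi_2c)
    psi[3][c] -= I * h[0][c]; // VMADD_II_MIR(one,UChi_0c,psi_3c,psi_3c)
  }
}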

uint64_t GetPFInfo(int nent,int plocal);
uint64_t GetInfo(int ptype,int local,int perm,int Xp,int ent,int plocal);

#define COMPLEX_TYPE int  /* trailing ';' removed: it broke the pointer declaration below */
int signs[4];

void testme(int osites,int ssU)
{
  int local,perm, ptype;
  int ss;  /* site index, assigned in the Ls loop below */
  uint64_t base;
  uint64_t basep;
  const uint64_t plocal =(uint64_t) & in._odata[0];

  //  vComplexF isigns[2] = { signs[0], signs[1] };
  // COMPLEX_TYPE is vComplexF or vComplexD depending
  // on the chosen precision
  COMPLEX_TYPE *isigns = &signs[0];

  MASK_REGS;
  int nmax=osites;
  for(int site=0;site<Ns;site++) {
    int sU =ssU;
    int ssn=ssU+1;
    if(ssn>=nmax) ssn=0;
    int sUn=ssn;
    for(int s=0;s<Ls;s++) {
      ss =sU*Ls+s;
      ssn=sUn*Ls+s;
      ////////////////////////////////
      // Xp
      ////////////////////////////////
      int ent=ss*8;// 2*Ndim
      int nent=ssn*8;

      PF_GAUGE(Xp);
      base = GetInfo(ptype,local,perm,Xp,ent,plocal); ent++;
      PREFETCH1_CHIMU(base);

      basep = GetPFInfo(nent,plocal); nent++;
      if ( local ) {
        LOAD64(%r10,isigns);
#ifdef KERNEL_DAG
        XP_PROJMEM(base);
#else
        XM_PROJMEM(base);
#endif
        MAYBEPERM(PERMUTE_DIR3,perm);
      } else {
        LOAD_CHI(base);
      }
      base = GetInfo(ptype,local,perm,Yp,ent,plocal); ent++;
      PREFETCH_CHIMU(base);
      {
        MULT_2SPIN_DIR_PFXP(Xp,basep);
      }
      LOAD64(%r10,isigns);
#ifdef KERNEL_DAG
      XP_RECON;
#else
      XM_RECON;
#endif
      ////////////////////////////////
      // Yp
      ////////////////////////////////
      basep = GetPFInfo(nent,plocal); nent++;
      if ( local ) {
        LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
        YP_PROJMEM(base);
#else
        YM_PROJMEM(base);
#endif
        MAYBEPERM(PERMUTE_DIR2,perm);
      } else {
        LOAD_CHI(base);
      }
      base = GetInfo(ptype,local,perm,Zp,ent,plocal); ent++;
      PREFETCH_CHIMU(base);
      {
        MULT_2SPIN_DIR_PFYP(Yp,basep);
      }
      LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
      YP_RECON_ACCUM;
#else
      YM_RECON_ACCUM;
#endif

      ////////////////////////////////
      // Zp
      ////////////////////////////////
      basep = GetPFInfo(nent,plocal); nent++;
      if ( local ) {
        LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
        ZP_PROJMEM(base);
#else
        ZM_PROJMEM(base);
#endif
        MAYBEPERM(PERMUTE_DIR1,perm);
      } else {
        LOAD_CHI(base);
      }
      base = GetInfo(ptype,local,perm,Tp,ent,plocal); ent++;
      PREFETCH_CHIMU(base);
      {
        MULT_2SPIN_DIR_PFZP(Zp,basep);
      }
      LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
      ZP_RECON_ACCUM;
#else
      ZM_RECON_ACCUM;
#endif

      ////////////////////////////////
      // Tp
      ////////////////////////////////
      basep = GetPFInfo(nent,plocal); nent++;
      if ( local ) {
        LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
        TP_PROJMEM(base);
#else
        TM_PROJMEM(base);
#endif
        MAYBEPERM(PERMUTE_DIR0,perm);
      } else {
        LOAD_CHI(base);
      }
      base = GetInfo(ptype,local,perm,Xm,ent,plocal); ent++;
      PREFETCH_CHIMU(base);
      {
        MULT_2SPIN_DIR_PFTP(Tp,basep);
      }
      LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
      TP_RECON_ACCUM;
#else
      TM_RECON_ACCUM;
#endif

      ////////////////////////////////
      // Xm
      ////////////////////////////////
#ifndef STREAM_STORE
      basep= (uint64_t) &out._odata[ss];
#endif
      //  basep= GetPFInfo(nent,plocal); nent++;
      if ( local ) {
        LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
        XM_PROJMEM(base);
#else
        XP_PROJMEM(base);
#endif
        MAYBEPERM(PERMUTE_DIR3,perm);
      } else {
        LOAD_CHI(base);
      }
      base = GetInfo(ptype,local,perm,Ym,ent,plocal); ent++;
      PREFETCH_CHIMU(base);
      {
        MULT_2SPIN_DIR_PFXM(Xm,basep);
      }
      LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
      XM_RECON_ACCUM;
#else
      XP_RECON_ACCUM;
#endif

      ////////////////////////////////
      // Ym
      ////////////////////////////////
      basep= GetPFInfo(nent,plocal); nent++;
      if ( local ) {
        LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
        YM_PROJMEM(base);
#else
        YP_PROJMEM(base);
#endif
        MAYBEPERM(PERMUTE_DIR2,perm);
      } else {
        LOAD_CHI(base);
      }
      base = GetInfo(ptype,local,perm,Zm,ent,plocal); ent++;
      PREFETCH_CHIMU(base);
      {
        MULT_2SPIN_DIR_PFYM(Ym,basep);
      }
      LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
      YM_RECON_ACCUM;
#else
      YP_RECON_ACCUM;
#endif

      ////////////////////////////////
      // Zm
      ////////////////////////////////
      basep= GetPFInfo(nent,plocal); nent++;
      if ( local ) {
        LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
        ZM_PROJMEM(base);
#else
        ZP_PROJMEM(base);
#endif
        MAYBEPERM(PERMUTE_DIR1,perm);
      } else {
        LOAD_CHI(base);
      }
      base = GetInfo(ptype,local,perm,Tm,ent,plocal); ent++;
      PREFETCH_CHIMU(base);
      {
        MULT_2SPIN_DIR_PFZM(Zm,basep);
      }
      LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
      ZM_RECON_ACCUM;
#else
      ZP_RECON_ACCUM;
#endif

      ////////////////////////////////
      // Tm
      ////////////////////////////////
      basep= GetPFInfo(nent,plocal); nent++;
      if ( local ) {
        LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
        TM_PROJMEM(base);
#else
        TP_PROJMEM(base);
#endif
        MAYBEPERM(PERMUTE_DIR0,perm);
      } else {
        LOAD_CHI(base);
      }
      base= (uint64_t) &out._odata[ss];
#ifndef STREAM_STORE
      PREFETCH_CHIMU(base);
#endif
      {
        MULT_2SPIN_DIR_PFTM(Tm,basep);
      }
      LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
      TM_RECON_ACCUM;
#else
      TP_RECON_ACCUM;
#endif

      basep= GetPFInfo(nent,plocal); nent++;
      SAVE_RESULT(base,basep);

    }
    ssU++;
  }
}


#endif
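
The per-site control flow of testme() above, reduced to a runnable skeleton: eight stencil legs, each either spin-projecting a resident neighbour (PROJMEM plus an optional permute) or fetching a pre-projected half spinor from the comms buffer (LOAD_CHI), then a link multiply and a reconstruct, with the first leg writing psi and the rest accumulating. Stubs and the local[] pattern are illustrative.

#include <cstdio>

int main() {
  const char *dirs[8] = {"Xp","Yp","Zp","Tp","Xm","Ym","Zm","Tm"};
  bool local[8] = {true, true, false, true, true, false, true, true};
  for (int d = 0; d < 8; d++) {
    if (local[d]) std::printf("%s: *_PROJMEM(base) + MAYBEPERM\n", dirs[d]);
    else          std::printf("%s: LOAD_CHI(base) from comms buffer\n", dirs[d]);
    std::printf("%s: MULT_2SPIN_DIR_PF%s, then %s\n", dirs[d], dirs[d],
                d == 0 ? "*_RECON" : "*_RECON_ACCUM");
  }
  std::printf("SAVE_RESULT(base,basep)\n");
  return 0;
}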

@@ -460,9 +460,62 @@ namespace Optimization {

    static inline __m256d Permute3(__m256d in){
      return in;
    };

  };

  struct Exchange{
    // 3210 ordering
    static inline void Exchange0(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){
      //Invertible
      //AB CD -> AC BD
      //AC BD -> AB CD
      out1= _mm256_permute2f128_ps(in1,in2,0x20);
      out2= _mm256_permute2f128_ps(in1,in2,0x31);
    };
    static inline void Exchange1(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){
      //Invertible
      // ABCD EFGH ->ABEF CDGH
      // ABEF CDGH ->ABCD EFGH
      out1= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0));
      out2= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2));
    };
    static inline void Exchange2(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){
      // Invertible ?
      // ABCD EFGH -> ACEG BDFH
      // ACEG BDFH -> AEBF CGDH
      //      out1= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0));
      //      out2= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1));
      // Bollocks; need
      // AECG BFDH -> ABCD EFGH
      out1= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0));   /*ACEG*/
      out2= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1));   /*BDFH*/
      out1= _mm256_shuffle_ps(out1,out1,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/
      out2= _mm256_shuffle_ps(out2,out2,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*BFDH (comment previously repeated AECG)*/
    };
    static inline void Exchange3(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){
      assert(0);
      return;
    };

    static inline void Exchange0(__m256d &out1,__m256d &out2,__m256d in1,__m256d in2){
      out1= _mm256_permute2f128_pd(in1,in2,0x20);
      out2= _mm256_permute2f128_pd(in1,in2,0x31);
      return;
    };
    static inline void Exchange1(__m256d &out1,__m256d &out2,__m256d in1,__m256d in2){
      out1= _mm256_shuffle_pd(in1,in2,0x0);
      out2= _mm256_shuffle_pd(in1,in2,0xF);
    };
    static inline void Exchange2(__m256d &out1,__m256d &out2,__m256d in1,__m256d in2){
      assert(0);
      return;
    };
    static inline void Exchange3(__m256d &out1,__m256d &out2,__m256d in1,__m256d in2){
      assert(0);
      return;
    };
  };
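
What the two permute2f128 selectors in Exchange0 do, modelled on plain arrays (element 0 first): immediate 0x20 concatenates the low 128-bit halves of the two inputs, 0x31 the high halves, which is the "AB CD -> AC BD" exchange; applying it twice restores the inputs. Sketch only.

#include <array>
#include <cassert>

using V8 = std::array<float, 8>;

void exchange0_model(V8 &out1, V8 &out2, const V8 &in1, const V8 &in2) {
  for (int i = 0; i < 4; i++) {
    out1[i]     = in1[i];     // 0x20: low half of in1 ...
    out1[i + 4] = in2[i];     //       ... then low half of in2
    out2[i]     = in1[i + 4]; // 0x31: high half of in1 ...
    out2[i + 4] = in2[i + 4]; //       ... then high half of in2
  }
}

int main() {
  V8 a{0,1,2,3,4,5,6,7}, b{8,9,10,11,12,13,14,15}, x{}, y{}, a2{}, b2{};
  exchange0_model(x, y, a, b);
  exchange0_model(a2, b2, x, y); // the operation is its own inverse
  assert(a == a2 && b == b2);
  return 0;
}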


#if defined (AVX2)
#define _mm256_alignr_epi32_grid(ret,a,b,n) ret=(__m256)  _mm256_alignr_epi8((__m256i)a,(__m256i)b,(n*4)%16)
#define _mm256_alignr_epi64_grid(ret,a,b,n) ret=(__m256d) _mm256_alignr_epi8((__m256i)a,(__m256i)b,(n*8)%16)

@@ -343,6 +343,52 @@ namespace Optimization {

  };

  //  On extracting face: Ah Al , Bh Bl -> Ah Bh , Al Bl
  //  On merging buffers: Ah Bh , Al Bl -> Ah Al , Bh Bl
  //  The operation is its own inverse
  struct Exchange{
    // 3210 ordering
    static inline void Exchange0(__m512 &out1,__m512 &out2,__m512 in1,__m512 in2){
      out1= _mm512_shuffle_f32x4(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0));
      out2= _mm512_shuffle_f32x4(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2));
    };
    static inline void Exchange1(__m512 &out1,__m512 &out2,__m512 in1,__m512 in2){
      out1= _mm512_shuffle_f32x4(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0));
      out2= _mm512_shuffle_f32x4(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1));
      out1= _mm512_shuffle_f32x4(out1,out1,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/
      out2= _mm512_shuffle_f32x4(out2,out2,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*BFDH*/
    };
    static inline void Exchange2(__m512 &out1,__m512 &out2,__m512 in1,__m512 in2){
      out1= _mm512_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0));
      out2= _mm512_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2));
    };
    static inline void Exchange3(__m512 &out1,__m512 &out2,__m512 in1,__m512 in2){
      out1= _mm512_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0));
      out2= _mm512_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1));
      out1= _mm512_shuffle_ps(out1,out1,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/
      out2= _mm512_shuffle_ps(out2,out2,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*BFDH*/
    };

    static inline void Exchange0(__m512d &out1,__m512d &out2,__m512d in1,__m512d in2){
      out1= _mm512_shuffle_f64x2(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0));
      out2= _mm512_shuffle_f64x2(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2));
    };
    static inline void Exchange1(__m512d &out1,__m512d &out2,__m512d in1,__m512d in2){
      out1= _mm512_shuffle_f64x2(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0));
      out2= _mm512_shuffle_f64x2(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1));
      out1= _mm512_shuffle_f64x2(out1,out1,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/
      out2= _mm512_shuffle_f64x2(out2,out2,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*BFDH*/
    };
    static inline void Exchange2(__m512d &out1,__m512d &out2,__m512d in1,__m512d in2){
      out1 = _mm512_shuffle_pd(in1,in2,0x00);
      out2 = _mm512_shuffle_pd(in1,in2,0xFF);
    };
    static inline void Exchange3(__m512d &out1,__m512d &out2,__m512d in1,__m512d in2){
      assert(0);
      return;
    };
  };

  struct Rotate{

@@ -180,6 +180,22 @@ namespace Optimization {
  /////////////////////////////////////////////////////
  // Arithmetic operations
  /////////////////////////////////////////////////////

#define FLOAT_WRAP_3(fn, pref)\
  pref vector4float fn(vector4float a, vector4float b, vector4float c) \
  {\
    vector4double ad, bd, rd, cd; \
    vector4float r;\
    \
    ad = Vset()(a);\
    bd = Vset()(b);\
    cd = Vset()(c);\
    rd = fn(ad, bd, cd); \
    Vstore()(rd, r);\
    \
    return r;\
  }
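
The pattern FLOAT_WRAP_3 captures, written out in plain C++: QPX arithmetic only exists for vector4double, so the single-precision overload round-trips through the widening/narrowing functors (Vset/Vstore in the real code, modelled here by vset/vstore). A sketch under those assumptions.

#include <array>

using vec4f = std::array<float, 4>;
using vec4d = std::array<double, 4>;

static vec4d vset(const vec4f &a) {            // models Vset()(a)
  return {a[0], a[1], a[2], a[3]};
}
static void vstore(const vec4d &d, vec4f &f) { // models Vstore()(rd, r)
  for (int i = 0; i < 4; i++) f[i] = (float)d[i];
}

template <class Fn3>
vec4f float_wrap_3(Fn3 fn, vec4f a, vec4f b, vec4f c) {
  vec4d ad = vset(a), bd = vset(b), cd = vset(c); // widen all operands
  vec4d rd = fn(ad, bd, cd);                      // double-precision kernel
  vec4f r;
  vstore(rd, r);                                  // narrow the result
  return r;
}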

#define FLOAT_WRAP_2(fn, pref)\
  pref vector4float fn(vector4float a, vector4float b)\
  {\
@@ -259,6 +275,13 @@ namespace Optimization {
    }
    FLOAT_WRAP_2(operator(), inline)
  };
  struct MaddRealPart{
    // Complex double
    inline vector4double operator()(vector4double a, vector4double b,vector4double c){
      return vec_xmadd(a, b, c);
    }
    FLOAT_WRAP_3(operator(), inline)
  };
  struct MultComplex{
    // Complex double
    inline vector4double operator()(vector4double a, vector4double b){

@@ -368,19 +391,36 @@ namespace Optimization {
  };

  struct Rotate{

    template<int n> static inline vector4double tRotate(vector4double v){
      if ( n==1 ) return vec_perm(v, v, vec_gpci(01230));
      if ( n==2 ) return vec_perm(v, v, vec_gpci(02301));
      if ( n==3 ) return vec_perm(v, v, vec_gpci(03012));
      return v;
    };
    template<int n> static inline vector4float tRotate(vector4float a)
    {
      vector4double ad, rd;
      vector4float r;
      ad = Vset()(a);
      rd = tRotate<n>(ad);
      Vstore()(rd, r);
      return r;
    };

    static inline vector4double rotate(vector4double v, int n){
      switch(n){
      case 0:
        return v;
        break;
      case 1:
-       return vec_perm(v, v, vec_gpci(01230));
+       return tRotate<1>(v);
        break;
      case 2:
-       return vec_perm(v, v, vec_gpci(02301));
+       return tRotate<2>(v);
        break;
      case 3:
-       return vec_perm(v, v, vec_gpci(03012));
+       return tRotate<3>(v);
        break;
      default: assert(0);
      }
@@ -389,11 +429,9 @@ namespace Optimization {
    static inline vector4float rotate(vector4float v, int n){
      vector4double vd, rd;
      vector4float r;

      vd = Vset()(v);
      rd = rotate(vd, n);
      Vstore()(rd, r);

      return r;
    }
  };

@@ -484,6 +522,7 @@ typedef Optimization::Mult MultSIMD;
typedef Optimization::Div DivSIMD;
typedef Optimization::MultComplex MultComplexSIMD;
typedef Optimization::MultRealPart MultRealPartSIMD;
typedef Optimization::MaddRealPart MaddRealPartSIMD;
typedef Optimization::Conj ConjSIMD;
typedef Optimization::TimesMinusI TimesMinusISIMD;
typedef Optimization::TimesI TimesISIMD;

@@ -326,7 +326,43 @@ namespace Optimization {
    static inline __m128d Permute3(__m128d in){
      return in;
    };
  };

  struct Exchange{
    // 3210 ordering
    static inline void Exchange0(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){
      out1= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0));
      out2= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2));
    };
    static inline void Exchange1(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){
      out1= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0));
      out2= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1));
    };
    static inline void Exchange2(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){
      assert(0);
      return;
    };
    static inline void Exchange3(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){
      assert(0);
      return;
    };

    static inline void Exchange0(__m128d &out1,__m128d &out2,__m128d in1,__m128d in2){
      out1= _mm_shuffle_pd(in1,in2,0x0);
      out2= _mm_shuffle_pd(in1,in2,0x3);
    };
    static inline void Exchange1(__m128d &out1,__m128d &out2,__m128d in1,__m128d in2){
      assert(0);
      return;
    };
    static inline void Exchange2(__m128d &out1,__m128d &out2,__m128d in1,__m128d in2){
      assert(0);
      return;
    };
    static inline void Exchange3(__m128d &out1,__m128d &out2,__m128d in1,__m128d in2){
      assert(0);
      return;
    };
  };

  struct Rotate{

@@ -350,6 +350,27 @@ class Grid_simd {
    return ret;
  }

  ///////////////////////
  // Exchange
  // Al Ah , Bl Bh -> Al Bl , Ah Bh
  ///////////////////////
  friend inline void exchange(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2,int n)
  {
    if (n==3) {
      Optimization::Exchange::Exchange3(out1.v,out2.v,in1.v,in2.v);
      //      std::cout << " Exchange3 "<< out1<<" "<< out2<<" <- " << in1 << " "<<in2<<std::endl;
    } else if(n==2) {
      Optimization::Exchange::Exchange2(out1.v,out2.v,in1.v,in2.v);
      //      std::cout << " Exchange2 "<< out1<<" "<< out2<<" <- " << in1 << " "<<in2<<std::endl;
    } else if(n==1) {
      Optimization::Exchange::Exchange1(out1.v,out2.v,in1.v,in2.v);
      //      std::cout << " Exchange1 "<< out1<<" "<< out2<<" <- " << in1 << " "<<in2<<std::endl;
    } else if(n==0) {
      Optimization::Exchange::Exchange0(out1.v,out2.v,in1.v,in2.v);
      //      std::cout << " Exchange0 "<< out1<<" "<< out2<<" <- " << in1 << " "<<in2<<std::endl;
    }
  }

  ////////////////////////////////////////////////////////////////////
  // General permute; assumes vector length is same across
  // all subtypes; may not be a good assumption, but could
@@ -372,23 +393,11 @@ class Grid_simd {
    int dist = perm & 0xF;
    y = rotate(b, dist);
    return;
  }
- switch (perm) {
- case 3:
-   permute3(y, b);
-   break;
- case 2:
-   permute2(y, b);
-   break;
- case 1:
-   permute1(y, b);
-   break;
- case 0:
-   permute0(y, b);
-   break;
- default:
-   assert(0);
- }
-}
+ else if(perm==3) permute3(y, b);
+ else if(perm==2) permute2(y, b);
+ else if(perm==1) permute1(y, b);
+ else if(perm==0) permute0(y, b);
+}
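
The rotate path above (taken when the rotate bit is set in perm) reduces to a lane rotation by dist = perm & 0xF. A model on plain arrays, assuming rotate(b, n) maps lane i of the result to lane (i+n) mod N of the input:

#include <array>

template <int N>
std::array<float, N> rotate_model(const std::array<float, N> &b, int dist) {
  std::array<float, N> y{};
  for (int i = 0; i < N; i++) y[i] = b[(i + dist) % N];
  return y;
}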

  ///////////////////////////////
@@ -457,6 +466,8 @@ inline void rbroadcast(Grid_simd<S,V> &ret,const Grid_simd<S,V> &src,int lane){
  ret.v = unary<V>(real(typepun[lane]), VsplatSIMD());
}

///////////////////////
// Splat
///////////////////////
lib/simd/IBM_qpx.h (new file, 598 lines)

@@ -0,0 +1,598 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/simd/IBM_qpx.h  (the header said BGQQPX.h, a copy-paste slip)

    Copyright (C) 2015

    Author: paboyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_ASM_BGQ_QPX_H  /* note: shares this guard with BGQQPX.h */
#define GRID_ASM_BGQ_QPX_H

#include <stdint.h>

/*********************************************************
 * Register definitions
 *********************************************************/
#define psi_00 0
#define psi_01 1
#define psi_02 2

#define psi_10 3
#define psi_11 4
#define psi_12 5

#define psi_20 6
#define psi_21 7
#define psi_22 8

#define psi_30 9
#define psi_31 10
#define psi_32 11

#define Chi_00 12
#define Chi_01 13
#define Chi_02 14

#define Chi_10 15
#define Chi_11 16
#define Chi_12 17

#define UChi_00 18
#define UChi_01 19
#define UChi_02 20

#define UChi_10 21
#define UChi_11 22
#define UChi_12 23

#define U0 24
#define U1 25
#define U2 26
#define one 27
#define perm_reg 28

#define REP  %%r16
#define IMM  %%r17
#define pREP %r16
#define pIMM %r17

#define PPC_INST_DCBTLS 0x7c00014c
#define PPC_INST_DCBLC  0x7c00030c
#define __PPC_CT(t)  (((t) & 0x0f) << 21)
#define ___PPC_RA(a) (((a) & 0x1f) << 16)
#define ___PPC_RB(b) (((b) & 0x1f) << 11)

#define LOCK_SET   ".long (" HASH(PPC_INST_DCBTLS) "|" HASH(___PPC_RB(16)) ")\n"
#define LOCK_CLEAR ".long (" HASH(PPC_INST_DCBLC)  "|" HASH(___PPC_RB(16)) ")\n"

/*Alias regs for incoming fourspinor on neighbour site*/
#define Chi_20 UChi_00
#define Chi_21 UChi_01
#define Chi_22 UChi_02
#define Chi_30 UChi_10
#define Chi_31 UChi_11
#define Chi_32 UChi_12

/*********************************************************
 * Architectural macros
 *********************************************************/
#define HASHit(A) #A
#define HASH(A) HASHit(A)
#define LOAD64(A,ptr)
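
HASH is the usual two-level stringification idiom: # stringifies its argument exactly as written, so a single level would yield "REP" rather than the register text; the extra level forces macro expansion first. A minimal standalone demonstration (REG and the _demo names stand in for REP/IMM and the header's HASHit/HASH):

#include <cstdio>

#define REG %%r16
#define HASHit_demo(A) #A
#define HASH_demo(A) HASHit_demo(A)

int main() {
  std::printf("%s\n", HASHit_demo(REG)); // prints: REG
  std::printf("%s\n", HASH_demo(REG));   // prints: %%r16
  return 0;
}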


#define MASK_REGS             /*NOOP ON BGQ*/
#define PF_GAUGE(A)           /*NOOP ON BGQ*/
#define PREFETCH1_CHIMU(base) /*NOOP ON BGQ*/
#define PREFETCH_CHIMU(base)  /*NOOP ON BGQ*/

#define VLOADf(OFF,PTR,DEST)  "qvlfsx   " #DEST "," #PTR "," #OFF " ;\n"
#define VLOADuf(OFF,PTR,DEST) "qvlfsux  " #DEST "," #PTR "," #OFF " ;\n"
#define VSTOREf(OFF,PTR,SRC)  "qvstfsx  " #SRC "," #PTR "," #OFF " ;\n"
#define VSTOREuf(OFF,PTR,SRC) "qvstfsux " #SRC "," #PTR "," #OFF " ;\n"
#define VSPLATf(A,B,DEST)     "qvlfcsxa " #DEST "," #A "," #B ";\n"
#define VSIZEf (16)

#define VPERMIi(p)  "qvgpci " #p ", 1217;\n"
#define VPERMi(A,p) "qvfperm " #A "," #A "," #A "," #p ";\n"
#define VPERMI(p)   VPERMIi(p)
#define VPERM(A,p)  VPERMi(A,p)

#define VLOADd(OFF,PTR,DEST)  "qvlfdx   " #DEST "," #PTR "," #OFF " ;\n"
#define VLOADud(OFF,PTR,DEST) "qvlfdux  " #DEST "," #PTR "," #OFF " ;\n"
#define VSTOREd(OFF,PTR,SRC)  "qvstfdx  " #SRC "," #PTR "," #OFF " ;\n"
#define VSTOREud(OFF,PTR,SRC) "qvstfdux " #SRC "," #PTR "," #OFF " ;\n"
#define VSPLATd(A,B,DEST)     "qvlfcdxa " #DEST "," #A "," #B ";\n"
#define VSIZEd (32)

// QPX manual operand ordering: QRT (the destination) comes first
#define VZEROi(DEST)              "qvfset " #DEST "; \n qvfsub " #DEST "," #DEST "," #DEST ";\n"
#define VONEi(DEST)               "qvfset " #DEST "; \n"
#define VMOVi(DEST,A)             "qvfmr  " #DEST "," #A ";\n"
#define VADDi(DEST,A,B)           "qvfadd " #DEST "," #A "," #B ";\n"
#define VSUBi(DEST,A,B)           "qvfsub " #DEST "," #A "," #B ";\n"
#define VMULi(DEST,A,B)           "qvfmul " #DEST "," #A "," #B ";\n"
#define VMUL_RR_RIi(DEST,A,B)     "qvfxmul " #DEST "," #A "," #B ";\n"
#define VMADDi(DEST,A,B,C)        "qvfmadd " #DEST "," #A "," #B "," #C ";\n"
#define VMADD_RR_RIi(DEST,A,B,C)  "qvfxmadd " #DEST "," #A "," #B "," #C ";\n"
#define VMADD_MII_IRi(DEST,A,B,C) "qvfxxnpmadd " #DEST "," #B "," #A "," #C ";\n"
#define VMADD_II_MIRi(DEST,A,B,C) "qvfxxcpnmadd " #DEST "," #B "," #A "," #C ";\n"

#define VZERO(C)              VZEROi(C)
#define VONE(C)               VONEi(C)
#define VMOV(C,A)             VMOVi(C,A)
#define VADD(A,B,C)           VADDi(A,B,C)
#define VSUB(A,B,C)           VSUBi(A,B,C)
#define VMUL(A,B,C)           VMULi(A,B,C)
#define VMUL_RR_RI(A,B,C)     VMUL_RR_RIi(A,B,C)
#define VMADD(A,B,C,D)        VMADDi(A,B,C,D)
#define VMADD_RR_RI(A,B,C,D)  VMADD_RR_RIi(A,B,C,D)
#define VMADD_MII_IR(A,B,C,D) VMADD_MII_IRi(A,B,C,D)
#define VMADD_II_MIR(A,B,C,D) VMADD_II_MIRi(A,B,C,D)

/*********************************************************
 * Macro sequences encoding QCD
 *********************************************************/
#define LOCK_GAUGE(dir)                                     \
  {                                                         \
    uint64_t byte_addr = (uint64_t)&U._odata[sU];           \
    int count = (sizeof(U._odata[0])+63)/64;                \
    asm (" mtctr %0 \n"                                     \
         " mr " HASH(REP) ", %1\n"                          \
         " li " HASH(IMM) ", 64\n"                          \
         "0:\n"                                             \
         LOCK_SET                                           \
         " add " HASH(REP) "," HASH(IMM) "," HASH(REP) "\n" \
         " bdnz 0b\n"                                       \
         : : "b" (count), "b" (byte_addr) );                \
  }

#define UNLOCK_GAUGE(dir)                                   \
  {                                                         \
    uint64_t byte_addr = (uint64_t)&U._odata[sU];           \
    int count = (sizeof(U._odata[0])+63)/64;                \
    asm (" mtctr %0 \n"                                     \
         " mr " HASH(REP) ", %1\n"                          \
         " li " HASH(IMM) ", 64\n"                          \
         "0:\n"                                             \
         LOCK_CLEAR                                         \
         " add " HASH(REP) "," HASH(IMM) "," HASH(REP) "\n" \
         " bdnz 0b\n"                                       \
         : : "b" (count), "b" (byte_addr) );                \
  }
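
The count fed to mtctr rounds sizeof(U._odata[0]) up to whole 64-byte L1 lines, so the dcbtls/dcblc loop covers the object exactly once. Worked through for the two link sizes quoted in the BGQQPX.h comment earlier (144 and 288 bytes):

#include <cassert>

int main() {
  assert((144 + 63) / 64 == 3); // 2.25 lines -> lock 3 whole lines
  assert((288 + 63) / 64 == 5); // 4.5 lines  -> lock 5 whole lines
  return 0;
}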

#define ZERO_PSI     \
  VZERO(psi_00)      \
  VZERO(psi_01)      \
  VZERO(psi_02)      \
  VZERO(psi_10)      \
  VZERO(psi_11)      \
  VZERO(psi_12)      \
  VZERO(psi_20)      \
  VZERO(psi_21)      \
  VZERO(psi_22)      \
  VZERO(psi_30)      \
  VZERO(psi_31)      \
  VZERO(psi_32)

#define MULT_2SPIN_QPX_LSd(ptr,p) MULT_2SPIN_QPX_INTERNAL(ptr,p,VSPLAT,16)
#define MULT_2SPIN_QPX_LSf(ptr,p) MULT_2SPIN_QPX_INTERNAL(ptr,p,VSPLAT,8)
#define MULT_2SPIN_QPXd(ptr,p)    MULT_2SPIN_QPX_INTERNAL(ptr,p,VLOAD,32)
#define MULT_2SPIN_QPXf(ptr,p)    MULT_2SPIN_QPX_INTERNAL(ptr,p,VLOAD,16)

#define MULT_2SPIN_QPX_INTERNAL(ptr,p,ULOAD,USKIP) {            \
    uint64_t ub = ((uint64_t)ptr);                              \
    asm (                                                       \
   ULOAD(%0,%3,U0)                                              \
   ULOAD(%1,%3,U1)                                              \
   ULOAD(%2,%3,U2)                                              \
   VMUL_RR_RI(UChi_00,U0,Chi_00)                                \
   VMUL_RR_RI(UChi_01,U1,Chi_00)                                \
   VMUL_RR_RI(UChi_02,U2,Chi_00)                                \
   VMUL_RR_RI(UChi_10,U0,Chi_10)                                \
   VMUL_RR_RI(UChi_11,U1,Chi_10)                                \
   VMUL_RR_RI(UChi_12,U2,Chi_10)                                \
   VMADD_MII_IR(UChi_00,U0,Chi_00,UChi_00)                      \
   VMADD_MII_IR(UChi_01,U1,Chi_00,UChi_01)                      \
   VMADD_MII_IR(UChi_02,U2,Chi_00,UChi_02)                      \
   VMADD_MII_IR(UChi_10,U0,Chi_10,UChi_10)                      \
   VMADD_MII_IR(UChi_11,U1,Chi_10,UChi_11)                      \
   VMADD_MII_IR(UChi_12,U2,Chi_10,UChi_12)                      \
   : : "b" (0), "b" (USKIP*3), "b" (USKIP*6), "b" (ub ));       \
    asm (                                                       \
   ULOAD(%0,%3,U0)                                              \
   ULOAD(%1,%3,U1)                                              \
   ULOAD(%2,%3,U2)                                              \
   VMADD_RR_RI(UChi_00,U0,Chi_01,UChi_00)                       \
   VMADD_RR_RI(UChi_01,U1,Chi_01,UChi_01)                       \
   VMADD_RR_RI(UChi_02,U2,Chi_01,UChi_02)                       \
   VMADD_RR_RI(UChi_10,U0,Chi_11,UChi_10)                       \
   VMADD_RR_RI(UChi_11,U1,Chi_11,UChi_11)                       \
   VMADD_RR_RI(UChi_12,U2,Chi_11,UChi_12)                       \
   VMADD_MII_IR(UChi_00,U0,Chi_01,UChi_00)                      \
   VMADD_MII_IR(UChi_01,U1,Chi_01,UChi_01)                      \
   VMADD_MII_IR(UChi_02,U2,Chi_01,UChi_02)                      \
   VMADD_MII_IR(UChi_10,U0,Chi_11,UChi_10)                      \
   VMADD_MII_IR(UChi_11,U1,Chi_11,UChi_11)                      \
   VMADD_MII_IR(UChi_12,U2,Chi_11,UChi_12)                      \
   : : "b" (USKIP*1), "b" (USKIP*4), "b" (USKIP*7), "b" (ub )); \
    asm (                                                       \
   ULOAD(%0,%3,U0)                                              \
   ULOAD(%1,%3,U1)                                              \
   ULOAD(%2,%3,U2)                                              \
   VMADD_RR_RI(UChi_00,U0,Chi_02,UChi_00)                       \
   VMADD_RR_RI(UChi_01,U1,Chi_02,UChi_01)                       \
   VMADD_RR_RI(UChi_02,U2,Chi_02,UChi_02)                       \
   VMADD_RR_RI(UChi_10,U0,Chi_12,UChi_10)                       \
   VMADD_RR_RI(UChi_11,U1,Chi_12,UChi_11)                       \
   VMADD_RR_RI(UChi_12,U2,Chi_12,UChi_12)                       \
   VMADD_MII_IR(UChi_00,U0,Chi_02,UChi_00)                      \
   VMADD_MII_IR(UChi_01,U1,Chi_02,UChi_01)                      \
   VMADD_MII_IR(UChi_02,U2,Chi_02,UChi_02)                      \
   VMADD_MII_IR(UChi_10,U0,Chi_12,UChi_10)                      \
   VMADD_MII_IR(UChi_11,U1,Chi_12,UChi_11)                      \
   VMADD_MII_IR(UChi_12,U2,Chi_12,UChi_12)                      \
   : : "b" (USKIP*2), "b" (USKIP*5), "b" (USKIP*8), "b" (ub )); \
  }
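
How USKIP is chosen in the four variants above: it is the byte stride between consecutive gauge-matrix elements, which depends on whether each element is held as one complex scalar (splatted onto all lanes by VSPLAT) or as a full four-wide QPX vector (VLOAD). A compile-time check of the four strides, assuming those layouts:

#include <complex>

static_assert(sizeof(std::complex<float>)  == 8,  "LSf: VSPLAT stride, USKIP=8");
static_assert(sizeof(std::complex<double>) == 16, "LSd: VSPLAT stride, USKIP=16");
static_assert(4 * sizeof(float)  == 16, "QPXf: vector4float stride, USKIP=16");
static_assert(4 * sizeof(double) == 32, "QPXd: vector4double stride, USKIP=32");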


#define MULT_2SPIN_DIR_PF(A,p) MULT_2SPIN_PF(&U._odata[sU](A),p)
#define MULT_2SPIN_PF(ptr,pf)  MULT_2SPIN(ptr,pf)

#define SAVE_RESULT(base,basep) {                       \
    uint64_t ub = ((uint64_t)base) - (VSIZE);           \
    asm("mr " HASH(REP) ", %0;\n"                       \
        "li " HASH(IMM) "," HASH(VSIZE) " ;\n"          \
        VSTOREu(IMM,REP,psi_00)                         \
        VSTOREu(IMM,REP,psi_01)                         \
        VSTOREu(IMM,REP,psi_02)                         \
        VSTOREu(IMM,REP,psi_10)                         \
        VSTOREu(IMM,REP,psi_11)                         \
        VSTOREu(IMM,REP,psi_12)                         \
        VSTOREu(IMM,REP,psi_20)                         \
        VSTOREu(IMM,REP,psi_21)                         \
        VSTOREu(IMM,REP,psi_22)                         \
        VSTOREu(IMM,REP,psi_30)                         \
        VSTOREu(IMM,REP,psi_31)                         \
        VSTOREu(IMM,REP,psi_32)                         \
        : : "b" (ub) : HASH(pIMM), HASH(pREP) );        \
  }

/*
 * Annoying BG/Q loads with no immediate indexing, and a big performance hit
 * when a second miss to an L1 line occurs
 */
#define LOAD_CHI(base) {                                \
    uint64_t ub = ((uint64_t)base) - (2*VSIZE);         \
    asm("mr " HASH(REP) ",%0 ;\n"                       \
        "li " HASH(IMM) ",(2*" HASH(VSIZE) ");\n"       \
        VLOADu(IMM,REP,Chi_00)                          \
        VLOADu(IMM,REP,Chi_02)                          \
        VLOADu(IMM,REP,Chi_11) : : "b" (ub) : HASH(pIMM), HASH(pREP) ); \
    ub = ((uint64_t)base) - VSIZE;                      \
    asm("mr " HASH(REP) ", %0;\n"                       \
        "li " HASH(IMM) ",(2*" HASH(VSIZE) ");\n"       \
        VLOADu(IMM,REP,Chi_01)                          \
        VLOADu(IMM,REP,Chi_10)                          \
        VLOADu(IMM,REP,Chi_12) : : "b" (ub) : HASH(pIMM), HASH(pREP) ); \
  }

#define LOAD_CHIMU(base) {                              \
    uint64_t ub = ((uint64_t)base) - (2*VSIZE);         \
    asm("mr " HASH(REP) ",%0;\n"                        \
        "li " HASH(IMM) ",(2*" HASH(VSIZE) ");\n"       \
        VLOADu(IMM,REP,Chi_00)                          \
        VLOADu(IMM,REP,Chi_02)                          \
        VLOADu(IMM,REP,Chi_11)                          \
        VLOADu(IMM,REP,Chi_20)                          \
        VLOADu(IMM,REP,Chi_22)                          \
        VLOADu(IMM,REP,Chi_31) : : "b" (ub) : HASH(pIMM), HASH(pREP) ); \
    ub = ((uint64_t)base) - VSIZE;                      \
    asm("mr " HASH(REP) ", %0;\n"                       \
        "li " HASH(IMM) ", (2*" HASH(VSIZE) ");\n"      \
        VLOADu(IMM,REP,Chi_01)                          \
        VLOADu(IMM,REP,Chi_10)                          \
        VLOADu(IMM,REP,Chi_12)                          \
        VLOADu(IMM,REP,Chi_21)                          \
        VLOADu(IMM,REP,Chi_30)                          \
        VLOADu(IMM,REP,Chi_32) : : "b" (ub) : HASH(pIMM), HASH(pREP) ); \
  }

// hspin(0)=fspin(0)+timesI(fspin(3));
// hspin(1)=fspin(1)+timesI(fspin(2));
#define XP_PROJMEM(base) {                      \
    LOAD_CHIMU(base);                           \
    asm (                                       \
   VONE(one)                                    \
   VMADD_MII_IR(Chi_00,one,Chi_30,Chi_00)       \
   VMADD_MII_IR(Chi_01,one,Chi_31,Chi_01)       \
   VMADD_MII_IR(Chi_02,one,Chi_32,Chi_02)       \
   VMADD_MII_IR(Chi_10,one,Chi_20,Chi_10)       \
   VMADD_MII_IR(Chi_11,one,Chi_21,Chi_11)       \
   VMADD_MII_IR(Chi_12,one,Chi_22,Chi_12)       \
        );                                      \
  }

#define XM_PROJMEM(base) {                      \
    LOAD_CHIMU(base);                           \
    asm (                                       \
   VONE(one)                                    \
   VMADD_II_MIR(Chi_00,one,Chi_30,Chi_00)       \
   VMADD_II_MIR(Chi_01,one,Chi_31,Chi_01)       \
   VMADD_II_MIR(Chi_02,one,Chi_32,Chi_02)       \
   VMADD_II_MIR(Chi_10,one,Chi_20,Chi_10)       \
   VMADD_II_MIR(Chi_11,one,Chi_21,Chi_11)       \
   VMADD_II_MIR(Chi_12,one,Chi_22,Chi_12)       \
        );                                      \
  }

// hspin(0)=fspin(0)-fspin(3);
// hspin(1)=fspin(1)+fspin(2);
#define YP_PROJMEM(base) {                      \
    LOAD_CHIMU(base);                           \
    asm (                                       \
   VSUB(Chi_00,Chi_00,Chi_30)                   \
   VSUB(Chi_01,Chi_01,Chi_31)                   \
   VSUB(Chi_02,Chi_02,Chi_32)                   \
   VADD(Chi_10,Chi_10,Chi_20)                   \
   VADD(Chi_11,Chi_11,Chi_21)                   \
   VADD(Chi_12,Chi_12,Chi_22)                   \
        );                                      \
  }

#define YM_PROJMEM(base) {                      \
    LOAD_CHIMU(base);                           \
    asm (                                       \
   VADD(Chi_00,Chi_00,Chi_30)                   \
   VADD(Chi_01,Chi_01,Chi_31)                   \
   VADD(Chi_02,Chi_02,Chi_32)                   \
   VSUB(Chi_10,Chi_10,Chi_20)                   \
   VSUB(Chi_11,Chi_11,Chi_21)                   \
   VSUB(Chi_12,Chi_12,Chi_22)                   \
        );                                      \
  }

/*Gz
 *  0 0  i  0   [0]+-i[2]
 *  0 0  0 -i   [1]-+i[3]
 * -i 0  0  0
 *  0 i  0  0
 */
#define ZP_PROJMEM(base) {                      \
    LOAD_CHIMU(base);                           \
    asm (                                       \
   VONE(one)                                    \
   VMADD_MII_IR(Chi_00,one,Chi_20,Chi_00)       \
   VMADD_MII_IR(Chi_01,one,Chi_21,Chi_01)       \
   VMADD_MII_IR(Chi_02,one,Chi_22,Chi_02)       \
   VMADD_II_MIR(Chi_10,one,Chi_30,Chi_10)       \
   VMADD_II_MIR(Chi_11,one,Chi_31,Chi_11)       \
   VMADD_II_MIR(Chi_12,one,Chi_32,Chi_12)       \
        );                                      \
  }

#define ZM_PROJMEM(base) {                      \
    LOAD_CHIMU(base);                           \
    asm (                                       \
   VONE(one)                                    \
   VMADD_II_MIR(Chi_00,one,Chi_20,Chi_00)       \
   VMADD_II_MIR(Chi_01,one,Chi_21,Chi_01)       \
   VMADD_II_MIR(Chi_02,one,Chi_22,Chi_02)       \
   VMADD_MII_IR(Chi_10,one,Chi_30,Chi_10)       \
   VMADD_MII_IR(Chi_11,one,Chi_31,Chi_11)       \
   VMADD_MII_IR(Chi_12,one,Chi_32,Chi_12)       \
        );                                      \
  }

/*Gt
 *  0 0  1  0 [0]+-[2]
 *  0 0  0  1 [1]+-[3]
 *  1 0  0  0
 *  0 1  0  0
 */
#define TP_PROJMEM(base) {                      \
    LOAD_CHIMU(base);                           \
    asm (                                       \
   VADD(Chi_00,Chi_00,Chi_20)                   \
   VADD(Chi_01,Chi_01,Chi_21)                   \
   VADD(Chi_02,Chi_02,Chi_22)                   \
   VADD(Chi_10,Chi_10,Chi_30)                   \
   VADD(Chi_11,Chi_11,Chi_31)                   \
   VADD(Chi_12,Chi_12,Chi_32)                   \
        );                                      \
  }

#define TM_PROJMEM(base) {                      \
    LOAD_CHIMU(base);                           \
    asm (                                       \
   VSUB(Chi_00,Chi_00,Chi_20)                   \
   VSUB(Chi_01,Chi_01,Chi_21)                   \
   VSUB(Chi_02,Chi_02,Chi_22)                   \
   VSUB(Chi_10,Chi_10,Chi_30)                   \
   VSUB(Chi_11,Chi_11,Chi_31)                   \
   VSUB(Chi_12,Chi_12,Chi_32)                   \
        );                                      \
  }
|
||||
|
||||
/*
|
||||
fspin(0)=hspin(0);
|
||||
fspin(1)=hspin(1);
|
||||
fspin(2)=timesMinusI(hspin(1));
|
||||
fspin(3)=timesMinusI(hspin(0));
|
||||
|
||||
fspin(0)+=hspin(0);
|
||||
fspin(1)+=hspin(1);
|
||||
fspin(2)-=timesI(hspin(1));
|
||||
fspin(3)-=timesI(hspin(0));
|
||||
*/
|
||||
#define XP_RECON { \
|
||||
asm(\
|
||||
VONE(one)\
|
||||
VMOV(psi_00,UChi_00) VMOV(psi_01,UChi_01) VMOV(psi_02,UChi_02)\
|
||||
VMOV(psi_10,UChi_10) VMOV(psi_11,UChi_11) VMOV(psi_12,UChi_12)\
|
||||
VZERO(psi_20) VZERO(psi_21) VZERO(psi_22) \
|
||||
VZERO(psi_30) VZERO(psi_31) VZERO(psi_32) \
|
||||
VMADD_II_MIR(psi_20,one,UChi_10,psi_20) \
|
||||
VMADD_II_MIR(psi_21,one,UChi_11,psi_21) \
|
||||
VMADD_II_MIR(psi_22,one,UChi_12,psi_22) \
|
||||
VMADD_II_MIR(psi_30,one,UChi_00,psi_30) \
|
||||
VMADD_II_MIR(psi_31,one,UChi_01,psi_31) \
|
||||
VMADD_II_MIR(psi_32,one,UChi_02,psi_32) \
|
||||
); \
|
||||
}

#define XM_RECON { \
    asm(\
      VONE(one)\
      VMOV(psi_00,UChi_00) VMOV(psi_01,UChi_01) VMOV(psi_02,UChi_02)\
      VMOV(psi_10,UChi_10) VMOV(psi_11,UChi_11) VMOV(psi_12,UChi_12)\
      VZERO(psi_20) VZERO(psi_21) VZERO(psi_22) \
      VZERO(psi_30) VZERO(psi_31) VZERO(psi_32) \
      VMADD_MII_IR(psi_20,one,UChi_10,psi_20) \
      VMADD_MII_IR(psi_21,one,UChi_11,psi_21) \
      VMADD_MII_IR(psi_22,one,UChi_12,psi_22) \
      VMADD_MII_IR(psi_30,one,UChi_00,psi_30) \
      VMADD_MII_IR(psi_31,one,UChi_01,psi_31) \
      VMADD_MII_IR(psi_32,one,UChi_02,psi_32) \
      ); \
  }

#define XP_RECON_ACCUM { \
    asm(\
      VONE(one)\
      VADD(psi_00,psi_00,UChi_00) VADD(psi_01,psi_01,UChi_01) VADD(psi_02,psi_02,UChi_02) \
      VADD(psi_10,psi_10,UChi_10) VADD(psi_11,psi_11,UChi_11) VADD(psi_12,psi_12,UChi_12) \
      VMADD_II_MIR(psi_20,one,UChi_10,psi_20) \
      VMADD_II_MIR(psi_21,one,UChi_11,psi_21) \
      VMADD_II_MIR(psi_22,one,UChi_12,psi_22) \
      VMADD_II_MIR(psi_30,one,UChi_00,psi_30) \
      VMADD_II_MIR(psi_31,one,UChi_01,psi_31) \
      VMADD_II_MIR(psi_32,one,UChi_02,psi_32) \
      ); \
  }

#define XM_RECON_ACCUM { \
    asm(\
      VONE(one)\
      VADD(psi_00,psi_00,UChi_00) VADD(psi_01,psi_01,UChi_01) VADD(psi_02,psi_02,UChi_02) \
      VADD(psi_10,psi_10,UChi_10) VADD(psi_11,psi_11,UChi_11) VADD(psi_12,psi_12,UChi_12) \
      VMADD_MII_IR(psi_20,one,UChi_10,psi_20) \
      VMADD_MII_IR(psi_21,one,UChi_11,psi_21) \
      VMADD_MII_IR(psi_22,one,UChi_12,psi_22) \
      VMADD_MII_IR(psi_30,one,UChi_00,psi_30) \
      VMADD_MII_IR(psi_31,one,UChi_01,psi_31) \
      VMADD_MII_IR(psi_32,one,UChi_02,psi_32) \
      ); \
  }

// fspin(2)+=hspin(1);
// fspin(3)-=hspin(0);
#define YP_RECON_ACCUM { \
    asm(\
      VADD(psi_00,psi_00,UChi_00) VADD(psi_01,psi_01,UChi_01) VADD(psi_02,psi_02,UChi_02) \
      VADD(psi_10,psi_10,UChi_10) VADD(psi_11,psi_11,UChi_11) VADD(psi_12,psi_12,UChi_12) \
      VADD(psi_20,psi_20,UChi_10) VADD(psi_21,psi_21,UChi_11) VADD(psi_22,psi_22,UChi_12) \
      VSUB(psi_30,psi_30,UChi_00) VSUB(psi_31,psi_31,UChi_01) VSUB(psi_32,psi_32,UChi_02) \
      );\
  }
#define YM_RECON_ACCUM { \
    asm(\
      VADD(psi_00,psi_00,UChi_00) VADD(psi_01,psi_01,UChi_01) VADD(psi_02,psi_02,UChi_02) \
      VADD(psi_10,psi_10,UChi_10) VADD(psi_11,psi_11,UChi_11) VADD(psi_12,psi_12,UChi_12) \
      VSUB(psi_20,psi_20,UChi_10) VSUB(psi_21,psi_21,UChi_11) VSUB(psi_22,psi_22,UChi_12) \
      VADD(psi_30,psi_30,UChi_00) VADD(psi_31,psi_31,UChi_01) VADD(psi_32,psi_32,UChi_02) \
      );\
  }
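
/* Illustrative scalar model of YP_RECON_ACCUM (per the comment above): the
 * half spinor is accumulated into all four spin components, with
 * fspin(2)+=hspin(1) and fspin(3)-=hspin(0). A sketch only, assuming C99
 * complex; the helper name is hypothetical. */
#if 0
#include <complex.h>
static inline void yp_recon_accum_scalar(double complex psi[4][3],
                                         const double complex hspin[2][3]) {
  for (int c = 0; c < 3; c++) {
    psi[0][c] += hspin[0][c];
    psi[1][c] += hspin[1][c];
    psi[2][c] += hspin[1][c];
    psi[3][c] -= hspin[0][c];
  }
}
#endif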

// fspin(2)-=timesI(hspin(0));
// fspin(3)+=timesI(hspin(1));
#define ZP_RECON_ACCUM { \
    asm(\
      VONE(one)\
      VADD(psi_00,psi_00,UChi_00) VADD(psi_01,psi_01,UChi_01) VADD(psi_02,psi_02,UChi_02) \
      VADD(psi_10,psi_10,UChi_10) VADD(psi_11,psi_11,UChi_11) VADD(psi_12,psi_12,UChi_12) \
      VMADD_II_MIR(psi_20,one,UChi_00,psi_20) \
      VMADD_II_MIR(psi_21,one,UChi_01,psi_21) \
      VMADD_II_MIR(psi_22,one,UChi_02,psi_22) \
      VMADD_MII_IR(psi_30,one,UChi_10,psi_30) \
      VMADD_MII_IR(psi_31,one,UChi_11,psi_31) \
      VMADD_MII_IR(psi_32,one,UChi_12,psi_32) \
      );\
  }

#define ZM_RECON_ACCUM { \
    asm(\
      VONE(one)\
      VADD(psi_00,psi_00,UChi_00) VADD(psi_01,psi_01,UChi_01) VADD(psi_02,psi_02,UChi_02) \
      VADD(psi_10,psi_10,UChi_10) VADD(psi_11,psi_11,UChi_11) VADD(psi_12,psi_12,UChi_12) \
      VMADD_MII_IR(psi_20,one,UChi_00,psi_20) \
      VMADD_MII_IR(psi_21,one,UChi_01,psi_21) \
      VMADD_MII_IR(psi_22,one,UChi_02,psi_22) \
      VMADD_II_MIR(psi_30,one,UChi_10,psi_30) \
      VMADD_II_MIR(psi_31,one,UChi_11,psi_31) \
      VMADD_II_MIR(psi_32,one,UChi_12,psi_32) \
      );\
  }

// fspin(2)+=hspin(0);
// fspin(3)+=hspin(1);
#define TP_RECON_ACCUM { \
    asm(\
      VADD(psi_00,psi_00,UChi_00) VADD(psi_01,psi_01,UChi_01) VADD(psi_02,psi_02,UChi_02) \
      VADD(psi_10,psi_10,UChi_10) VADD(psi_11,psi_11,UChi_11) VADD(psi_12,psi_12,UChi_12) \
      VADD(psi_20,psi_20,UChi_00) VADD(psi_21,psi_21,UChi_01) VADD(psi_22,psi_22,UChi_02) \
      VADD(psi_30,psi_30,UChi_10) VADD(psi_31,psi_31,UChi_11) VADD(psi_32,psi_32,UChi_12) \
      );\
  }

#define TM_RECON_ACCUM { \
    asm(\
      VADD(psi_00,psi_00,UChi_00) VADD(psi_01,psi_01,UChi_01) VADD(psi_02,psi_02,UChi_02) \
      VADD(psi_10,psi_10,UChi_10) VADD(psi_11,psi_11,UChi_11) VADD(psi_12,psi_12,UChi_12) \
      VSUB(psi_20,psi_20,UChi_00) VSUB(psi_21,psi_21,UChi_01) VSUB(psi_22,psi_22,UChi_02) \
      VSUB(psi_30,psi_30,UChi_10) VSUB(psi_31,psi_31,UChi_11) VSUB(psi_32,psi_32,UChi_12) \
      );\
  }

#define ADD_RESULTi(PTR,pf) \
  LOAD_CHIMU(PTR); \
  asm( \
    VADD(psi_00,chi_00,psi_00) VADD(psi_01,chi_01,psi_01) VADD(psi_02,chi_02,psi_02) \
    VADD(psi_10,chi_10,psi_10) VADD(psi_11,chi_11,psi_11) VADD(psi_12,chi_12,psi_12) \
    VADD(psi_20,chi_20,psi_20) VADD(psi_21,chi_21,psi_21) VADD(psi_22,chi_22,psi_22) \
    VADD(psi_30,chi_30,psi_30) VADD(psi_31,chi_31,psi_31) VADD(psi_32,chi_32,psi_32) ); \
  SAVE_RESULT(PTR,pf);

#define PERMUTE_DIR3
#define PERMUTE_DIR2
#define PERMUTE_DIR1

#define PERMUTE_DIR0 { \
    asm( \
      VPERMI(perm_reg) \
      VPERM(Chi_00,perm_reg) VPERM(Chi_01,perm_reg) VPERM(Chi_02,perm_reg) \
      VPERM(Chi_10,perm_reg) VPERM(Chi_11,perm_reg) VPERM(Chi_12,perm_reg) ); \
  }

#endif
46
lib/simd/IBM_qpx_double.h
Normal file
@ -0,0 +1,46 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/simd/IBM_qpx_double.h

    Copyright (C) 2015

Author: paboyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
// No guard; ok multi-include
#undef VSIZE
#undef VLOAD
#undef VLOADu
#undef VSPLAT
#undef VSTORE
#undef VSTOREu
#undef MULT_2SPIN_QPX_LS
#undef MULT_2SPIN_QPX

#define VSIZE VSIZEd
#define VLOAD(A,B,C)     VLOADd(A,B,C)
#define VLOADu(A,B,C)    VLOADud(A,B,C)
#define VSPLAT(A,B,DEST) VSPLATd(A,B,DEST)
#define VSTORE(A,B,C)    VSTOREd(A,B,C)
#define VSTOREu(A,B,C)   VSTOREud(A,B,C)
#define MULT_2SPIN_QPX_LS(ptr,p) MULT_2SPIN_QPX_LSd(ptr,p)
#define MULT_2SPIN_QPX(ptr,p)    MULT_2SPIN_QPXd(ptr,p)
46
lib/simd/IBM_qpx_single.h
Normal file
@ -0,0 +1,46 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/simd/IBM_qpx_single.h

    Copyright (C) 2015

Author: paboyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
// No guard; ok multi-include
#undef VSIZE
#undef VLOAD
#undef VLOADu
#undef VSPLAT
#undef VSTORE
#undef VSTOREu
#undef MULT_2SPIN_QPX_LS
#undef MULT_2SPIN_QPX

#define VSIZE VSIZEf
#define VLOAD(A,B,C)     VLOADf(A,B,C)
#define VLOADu(A,B,C)    VLOADuf(A,B,C)
#define VSPLAT(A,B,DEST) VSPLATf(A,B,DEST)
#define VSTORE(A,B,C)    VSTOREf(A,B,C)
#define VSTOREu(A,B,C)   VSTOREuf(A,B,C)
#define MULT_2SPIN_QPX_LS(ptr,p) MULT_2SPIN_QPX_LSf(ptr,p)
#define MULT_2SPIN_QPX(ptr,p)    MULT_2SPIN_QPXf(ptr,p)
@ -31,21 +31,21 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
//////////////////////////////////////////////////////////////////////////////////////////
// Register allocations for the Wilson kernel are precision independent
//////////////////////////////////////////////////////////////////////////////////////////
#define result_00 %zmm0
#define result_01 %zmm1
#define result_02 %zmm2
#define psi_00 %zmm0
#define psi_01 %zmm1
#define psi_02 %zmm2

#define result_10 %zmm3
#define result_11 %zmm4
#define result_12 %zmm5
#define psi_10 %zmm3
#define psi_11 %zmm4
#define psi_12 %zmm5

#define result_20 %zmm6
#define result_21 %zmm7
#define result_22 %zmm8
#define psi_20 %zmm6
#define psi_21 %zmm7
#define psi_22 %zmm8

#define result_30 %zmm9
#define result_31 %zmm10
#define result_32 %zmm11
#define psi_30 %zmm9
#define psi_31 %zmm10
#define psi_32 %zmm11

#define Chi_00 %zmm12
#define Chi_01 %zmm13
@ -98,34 +98,50 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
// a little as some duplication developed during trying different
// variants during optimisation. Could cut back to only those used.
//////////////////////////////////////////////////////////////////
#define LOCK_GAUGE(dir)
#define UNLOCK_GAUGE(dir)

// const SiteSpinor * ptr = & in._odata[offset];
#define LOAD_CHIMU(PTR) LOAD_CHIMUi(PTR)
#define LOAD_CHIMU(PTR) LOAD64(%r8,PTR) __asm__ ( LOAD_CHIMUi );
#define LOAD_CHI(PTR)   LOAD64(%r8,PTR) __asm__ ( LOAD_CHIi );
#define SAVE_UCHI(PTR)  SAVE_UCHIi(PTR)
#define SAVE_CHI(PTR)   SAVE_CHIi(PTR)
#define SAVE_RESULT(PT,R) SAVE_RESULTi(PT,R)
#define ADD_RESULT(PT,R)  ADD_RESULTi(PT,R)

#define LOAD_CHIMUi \
  LOAD_CHIMU01i \
  LOAD_CHIMU23i );
#define ZERO_PSI \
  asm( VZERO(psi_00) \
       VZERO(psi_01) \
       VZERO(psi_02) \
       VZERO(psi_10) \
       VZERO(psi_11) \
       VZERO(psi_12) \
       VZERO(psi_20) \
       VZERO(psi_21) \
       VZERO(psi_22) \
       VZERO(psi_30) \
       VZERO(psi_31) \
       VZERO(psi_32));

#define LOAD_CHIMUi \
  LOAD_CHIMU01i \
  LOAD_CHIMU23i

#define LOAD_CHIMU01i\
  VLOAD(0,%r8,Chimu_00) \
  VLOAD(1,%r8,Chimu_01) \
  VLOAD(2,%r8,Chimu_02) \
  VLOAD(3,%r8,Chimu_10) \
  VLOAD(4,%r8,Chimu_11) \
  VLOAD(5,%r8,Chimu_12)
#define LOAD_CHIMU01i \
  VLOAD(0,%r8,Chimu_00) \
  VLOAD(1,%r8,Chimu_01) \
  VLOAD(2,%r8,Chimu_02) \
  VLOAD(3,%r8,Chimu_10) \
  VLOAD(4,%r8,Chimu_11) \
  VLOAD(5,%r8,Chimu_12)

#define LOAD_CHIMU23i\
  VLOAD(6,%r8,Chimu_20) \
  VLOAD(7,%r8,Chimu_21) \
  VLOAD(8,%r8,Chimu_22) \
  VLOAD(9,%r8,Chimu_30) \
  VLOAD(10,%r8,Chimu_31) \
  VLOAD(11,%r8,Chimu_32)
#define LOAD_CHIMU23i \
  VLOAD(6,%r8,Chimu_20) \
  VLOAD(7,%r8,Chimu_21) \
  VLOAD(8,%r8,Chimu_22) \
  VLOAD(9,%r8,Chimu_30) \
  VLOAD(10,%r8,Chimu_31) \
  VLOAD(11,%r8,Chimu_32)

#define SHUF_CHIMU23i\
  VSHUFMEM(6,%r8,Chimu_20) \
@ -135,9 +151,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
  VSHUFMEM(10,%r8,Chimu_31) \
  VSHUFMEM(11,%r8,Chimu_32)

// const SiteHalfSpinor *ptr = &buf[offset];

#define LOAD_CHIi \
  VLOAD(0,%r8,Chi_00) \
  VLOAD(1,%r8,Chi_01) \
@ -145,7 +158,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
  VLOAD(3,%r8,Chi_10) \
  VLOAD(4,%r8,Chi_11) \
  VLOAD(5,%r8,Chi_12)

#define SAVE_UCHIi(PTR) \
  LOAD64(%r8,PTR) \
@ -155,8 +167,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
  VSTORE(2,%r8,UChi_02) \
  VSTORE(3,%r8,UChi_10) \
  VSTORE(4,%r8,UChi_11) \
  VSTORE(5,%r8,UChi_12) \
  );
  VSTORE(5,%r8,UChi_12) );

#define SAVE_CHIi(PTR) \
  LOAD64(%r8,PTR) \
@ -166,33 +177,14 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
  VSTORE(2,%r8,Chi_02) \
  VSTORE(3,%r8,Chi_10) \
  VSTORE(4,%r8,Chi_11) \
  VSTORE(5,%r8,Chi_12) \
  );
  VSTORE(5,%r8,Chi_12) );

#define MULT_2SPIN_DIR_PFXP(A,p) MULT_2SPIN_PFXP(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFYP(A,p) MULT_2SPIN_PFYP(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFZP(A,p) MULT_2SPIN_PFZP(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFTP(A,p) MULT_2SPIN_PFTP(&U._odata[sU](A),p)

#define MULT_2SPIN_DIR_PFXM(A,p) MULT_2SPIN_PFXM(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFYM(A,p) MULT_2SPIN_PFYM(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFZM(A,p) MULT_2SPIN_PFZM(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFTM(A,p) MULT_2SPIN_PFTM(&U._odata[sU](A),p)

#define MULT_2SPIN_PFXM(ptr,pf) MULT_2SPIN(ptr,pf)
#define MULT_2SPIN_PFYM(ptr,pf) MULT_2SPIN(ptr,pf)
#define MULT_2SPIN_PFZM(ptr,pf) MULT_2SPIN(ptr,pf)
#define MULT_2SPIN_PFTM(ptr,pf) MULT_2SPIN(ptr,pf)
#define MULT_2SPIN_PFTP(ptr,pf) MULT_2SPIN(ptr,pf)
#define MULT_2SPIN_PFZP(ptr,pf) MULT_2SPIN(ptr,pf)
#define MULT_2SPIN_PFYP(ptr,pf) MULT_2SPIN(ptr,pf)
#define MULT_2SPIN_PFXP(ptr,pf) MULT_2SPIN(ptr,pf)
#define MULT_2SPIN_DIR_PF(A,p)  MULT_2SPIN_PF(&U._odata[sU](A),p)
#define MULT_2SPIN_PF(ptr,pf)   MULT_2SPIN(ptr,pf)
//////////////////////////////////////////////////////////////////
// Dirac algebra
//////////////////////////////////////////////////////////////////

// hspin(0)=fspin(0)+timesI(fspin(3));
// hspin(1)=fspin(1)+timesI(fspin(2));
#define XP_PROJMEM(PTR) \
@ -257,7 +249,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>

// hspin(0)=fspin(0)-timesI(fspin(3))
// hspin(1)=fspin(1)-timesI(fspin(2))

#define XM_PROJMEM(PTR) \
  LOAD64(%r8,PTR)\
  __asm__ ( \
@ -322,226 +313,226 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
// fspin(3)=timesMinusI(hspin(0))
#define XP_RECON __asm__ ( \
  VZERO(TMP) \
  VTIMESMINUSI0(UChi_00,result_30,TMP) \
  VTIMESMINUSI0(UChi_10,result_20,TMP) \
  VTIMESMINUSI0(UChi_01,result_31,TMP) \
  VTIMESMINUSI0(UChi_11,result_21,TMP) \
  VTIMESMINUSI0(UChi_02,result_32,TMP) \
  VTIMESMINUSI0(UChi_12,result_22,TMP) \
  VMOV(UChi_00,result_00) \
  VMOV(UChi_10,result_10) \
  VMOV(UChi_01,result_01) \
  VMOV(UChi_11,result_11) \
  VMOV(UChi_02,result_02) \
  VMOV(UChi_12,result_12) \
  VTIMESMINUSI1(UChi_10,result_20,TMP) \
  VTIMESMINUSI1(UChi_11,result_21,TMP) \
  VTIMESMINUSI1(UChi_12,result_22,TMP) \
  VTIMESMINUSI1(UChi_00,result_30,TMP) \
  VTIMESMINUSI1(UChi_01,result_31,TMP) \
  VTIMESMINUSI1(UChi_02,result_32,TMP) \
  VTIMESMINUSI2(UChi_10,result_20,TMP) \
  VTIMESMINUSI2(UChi_11,result_21,TMP) \
  VTIMESMINUSI2(UChi_12,result_22,TMP) \
  VTIMESMINUSI2(UChi_00,result_30,TMP) \
  VTIMESMINUSI2(UChi_01,result_31,TMP) \
  VTIMESMINUSI2(UChi_02,result_32,TMP) \
  VTIMESMINUSI0(UChi_00,psi_30,TMP) \
  VTIMESMINUSI0(UChi_10,psi_20,TMP) \
  VTIMESMINUSI0(UChi_01,psi_31,TMP) \
  VTIMESMINUSI0(UChi_11,psi_21,TMP) \
  VTIMESMINUSI0(UChi_02,psi_32,TMP) \
  VTIMESMINUSI0(UChi_12,psi_22,TMP) \
  VMOV(UChi_00,psi_00) \
  VMOV(UChi_10,psi_10) \
  VMOV(UChi_01,psi_01) \
  VMOV(UChi_11,psi_11) \
  VMOV(UChi_02,psi_02) \
  VMOV(UChi_12,psi_12) \
  VTIMESMINUSI1(UChi_10,psi_20,TMP) \
  VTIMESMINUSI1(UChi_11,psi_21,TMP) \
  VTIMESMINUSI1(UChi_12,psi_22,TMP) \
  VTIMESMINUSI1(UChi_00,psi_30,TMP) \
  VTIMESMINUSI1(UChi_01,psi_31,TMP) \
  VTIMESMINUSI1(UChi_02,psi_32,TMP) \
  VTIMESMINUSI2(UChi_10,psi_20,TMP) \
  VTIMESMINUSI2(UChi_11,psi_21,TMP) \
  VTIMESMINUSI2(UChi_12,psi_22,TMP) \
  VTIMESMINUSI2(UChi_00,psi_30,TMP) \
  VTIMESMINUSI2(UChi_01,psi_31,TMP) \
  VTIMESMINUSI2(UChi_02,psi_32,TMP) \
  );
// NB could save 6 ops using addsub => 12 cycles
#define XP_RECON_ACCUM __asm__ ( \
  VZERO(TMP)\
  VACCTIMESMINUSI0(UChi_00,result_30,Z3)\
  VACCTIMESMINUSI0(UChi_10,result_20,Z0)\
  VACCTIMESMINUSI0(UChi_01,result_31,Z4)\
  VACCTIMESMINUSI0(UChi_11,result_21,Z1)\
  VACCTIMESMINUSI0(UChi_02,result_32,Z5)\
  VACCTIMESMINUSI0(UChi_12,result_22,Z2)\
  VADD(UChi_00,result_00,result_00)\
  VADD(UChi_10,result_10,result_10)\
  VADD(UChi_01,result_01,result_01)\
  VADD(UChi_11,result_11,result_11)\
  VADD(UChi_02,result_02,result_02)\
  VADD(UChi_12,result_12,result_12)\
  VACCTIMESMINUSI1(UChi_00,result_30,Z3)\
  VACCTIMESMINUSI1(UChi_10,result_20,Z0)\
  VACCTIMESMINUSI1(UChi_01,result_31,Z4)\
  VACCTIMESMINUSI1(UChi_11,result_21,Z1)\
  VACCTIMESMINUSI1(UChi_02,result_32,Z5)\
  VACCTIMESMINUSI1(UChi_12,result_22,Z2)\
  VACCTIMESMINUSI2(UChi_10,result_20,Z0)\
  VACCTIMESMINUSI2(UChi_11,result_21,Z1)\
  VACCTIMESMINUSI2(UChi_12,result_22,Z2)\
  VACCTIMESMINUSI2(UChi_00,result_30,Z3)\
  VACCTIMESMINUSI2(UChi_01,result_31,Z4)\
  VACCTIMESMINUSI2(UChi_02,result_32,Z5)\
  VACCTIMESMINUSI0(UChi_00,psi_30,Z3)\
  VACCTIMESMINUSI0(UChi_10,psi_20,Z0)\
  VACCTIMESMINUSI0(UChi_01,psi_31,Z4)\
  VACCTIMESMINUSI0(UChi_11,psi_21,Z1)\
  VACCTIMESMINUSI0(UChi_02,psi_32,Z5)\
  VACCTIMESMINUSI0(UChi_12,psi_22,Z2)\
  VADD(UChi_00,psi_00,psi_00)\
  VADD(UChi_10,psi_10,psi_10)\
  VADD(UChi_01,psi_01,psi_01)\
  VADD(UChi_11,psi_11,psi_11)\
  VADD(UChi_02,psi_02,psi_02)\
  VADD(UChi_12,psi_12,psi_12)\
  VACCTIMESMINUSI1(UChi_00,psi_30,Z3)\
  VACCTIMESMINUSI1(UChi_10,psi_20,Z0)\
  VACCTIMESMINUSI1(UChi_01,psi_31,Z4)\
  VACCTIMESMINUSI1(UChi_11,psi_21,Z1)\
  VACCTIMESMINUSI1(UChi_02,psi_32,Z5)\
  VACCTIMESMINUSI1(UChi_12,psi_22,Z2)\
  VACCTIMESMINUSI2(UChi_10,psi_20,Z0)\
  VACCTIMESMINUSI2(UChi_11,psi_21,Z1)\
  VACCTIMESMINUSI2(UChi_12,psi_22,Z2)\
  VACCTIMESMINUSI2(UChi_00,psi_30,Z3)\
  VACCTIMESMINUSI2(UChi_01,psi_31,Z4)\
  VACCTIMESMINUSI2(UChi_02,psi_32,Z5)\
  );

#define XM_RECON __asm__ ( \
  VZERO(TMP)\
  VTIMESI0(UChi_00,result_30,TMP)\
  VTIMESI0(UChi_10,result_20,TMP)\
  VTIMESI0(UChi_01,result_31,TMP)\
  VTIMESI0(UChi_11,result_21,TMP)\
  VTIMESI0(UChi_02,result_32,TMP)\
  VTIMESI0(UChi_12,result_22,TMP)\
  VMOV(UChi_00,result_00)\
  VMOV(UChi_10,result_10)\
  VMOV(UChi_01,result_01)\
  VMOV(UChi_11,result_11)\
  VMOV(UChi_02,result_02)\
  VMOV(UChi_12,result_12)\
  VTIMESI1(UChi_00,result_30,TMP)\
  VTIMESI1(UChi_10,result_20,TMP)\
  VTIMESI1(UChi_01,result_31,TMP)\
  VTIMESI1(UChi_11,result_21,TMP)\
  VTIMESI1(UChi_02,result_32,TMP)\
  VTIMESI1(UChi_12,result_22,TMP)\
  VTIMESI2(UChi_10,result_20,TMP)\
  VTIMESI2(UChi_11,result_21,TMP)\
  VTIMESI2(UChi_12,result_22,TMP)\
  VTIMESI2(UChi_00,result_30,TMP)\
  VTIMESI2(UChi_01,result_31,TMP)\
  VTIMESI2(UChi_02,result_32,TMP)\
  VTIMESI0(UChi_00,psi_30,TMP)\
  VTIMESI0(UChi_10,psi_20,TMP)\
  VTIMESI0(UChi_01,psi_31,TMP)\
  VTIMESI0(UChi_11,psi_21,TMP)\
  VTIMESI0(UChi_02,psi_32,TMP)\
  VTIMESI0(UChi_12,psi_22,TMP)\
  VMOV(UChi_00,psi_00)\
  VMOV(UChi_10,psi_10)\
  VMOV(UChi_01,psi_01)\
  VMOV(UChi_11,psi_11)\
  VMOV(UChi_02,psi_02)\
  VMOV(UChi_12,psi_12)\
  VTIMESI1(UChi_00,psi_30,TMP)\
  VTIMESI1(UChi_10,psi_20,TMP)\
  VTIMESI1(UChi_01,psi_31,TMP)\
  VTIMESI1(UChi_11,psi_21,TMP)\
  VTIMESI1(UChi_02,psi_32,TMP)\
  VTIMESI1(UChi_12,psi_22,TMP)\
  VTIMESI2(UChi_10,psi_20,TMP)\
  VTIMESI2(UChi_11,psi_21,TMP)\
  VTIMESI2(UChi_12,psi_22,TMP)\
  VTIMESI2(UChi_00,psi_30,TMP)\
  VTIMESI2(UChi_01,psi_31,TMP)\
  VTIMESI2(UChi_02,psi_32,TMP)\
  );

#define XM_RECON_ACCUM __asm__ ( \
  VACCTIMESI0(UChi_10,result_20,Z0)\
  VACCTIMESI0(UChi_00,result_30,Z3)\
  VACCTIMESI0(UChi_11,result_21,Z1)\
  VACCTIMESI0(UChi_01,result_31,Z4)\
  VACCTIMESI0(UChi_12,result_22,Z2)\
  VACCTIMESI0(UChi_02,result_32,Z5)\
  VACCTIMESI0(UChi_10,psi_20,Z0)\
  VACCTIMESI0(UChi_00,psi_30,Z3)\
  VACCTIMESI0(UChi_11,psi_21,Z1)\
  VACCTIMESI0(UChi_01,psi_31,Z4)\
  VACCTIMESI0(UChi_12,psi_22,Z2)\
  VACCTIMESI0(UChi_02,psi_32,Z5)\
  \
  VADD(UChi_10,result_10,result_10)\
  VADD(UChi_00,result_00,result_00)\
  VADD(UChi_11,result_11,result_11)\
  VADD(UChi_01,result_01,result_01)\
  VADD(UChi_12,result_12,result_12)\
  VADD(UChi_02,result_02,result_02)\
  VADD(UChi_10,psi_10,psi_10)\
  VADD(UChi_00,psi_00,psi_00)\
  VADD(UChi_11,psi_11,psi_11)\
  VADD(UChi_01,psi_01,psi_01)\
  VADD(UChi_12,psi_12,psi_12)\
  VADD(UChi_02,psi_02,psi_02)\
  \
  VACCTIMESI1(UChi_10,result_20,Z0)\
  VACCTIMESI1(UChi_00,result_30,Z3)\
  VACCTIMESI1(UChi_11,result_21,Z1)\
  VACCTIMESI1(UChi_01,result_31,Z4)\
  VACCTIMESI1(UChi_12,result_22,Z2)\
  VACCTIMESI1(UChi_02,result_32,Z5)\
  VACCTIMESI2(UChi_10,result_20,Z0)\
  VACCTIMESI2(UChi_11,result_21,Z1)\
  VACCTIMESI2(UChi_12,result_22,Z2)\
  VACCTIMESI2(UChi_00,result_30,Z3)\
  VACCTIMESI2(UChi_01,result_31,Z4)\
  VACCTIMESI2(UChi_02,result_32,Z5)\
  VACCTIMESI1(UChi_10,psi_20,Z0)\
  VACCTIMESI1(UChi_00,psi_30,Z3)\
  VACCTIMESI1(UChi_11,psi_21,Z1)\
  VACCTIMESI1(UChi_01,psi_31,Z4)\
  VACCTIMESI1(UChi_12,psi_22,Z2)\
  VACCTIMESI1(UChi_02,psi_32,Z5)\
  VACCTIMESI2(UChi_10,psi_20,Z0)\
  VACCTIMESI2(UChi_11,psi_21,Z1)\
  VACCTIMESI2(UChi_12,psi_22,Z2)\
  VACCTIMESI2(UChi_00,psi_30,Z3)\
  VACCTIMESI2(UChi_01,psi_31,Z4)\
  VACCTIMESI2(UChi_02,psi_32,Z5)\
  );

#define YP_RECON_ACCUM __asm__ ( \
  VADD(UChi_00,result_00,result_00)\
  VADD(UChi_10,result_10,result_10)\
  VADD(UChi_01,result_01,result_01)\
  VADD(UChi_11,result_11,result_11)\
  VADD(UChi_02,result_02,result_02)\
  VADD(UChi_12,result_12,result_12)\
  VADD(UChi_10,result_20,result_20)\
  VADD(UChi_11,result_21,result_21)\
  VADD(UChi_12,result_22,result_22)\
  VSUB(UChi_00,result_30,result_30)\
  VSUB(UChi_01,result_31,result_31)\
  VSUB(UChi_02,result_32,result_32) );
  VADD(UChi_00,psi_00,psi_00)\
  VADD(UChi_10,psi_10,psi_10)\
  VADD(UChi_01,psi_01,psi_01)\
  VADD(UChi_11,psi_11,psi_11)\
  VADD(UChi_02,psi_02,psi_02)\
  VADD(UChi_12,psi_12,psi_12)\
  VADD(UChi_10,psi_20,psi_20)\
  VADD(UChi_11,psi_21,psi_21)\
  VADD(UChi_12,psi_22,psi_22)\
  VSUB(UChi_00,psi_30,psi_30)\
  VSUB(UChi_01,psi_31,psi_31)\
  VSUB(UChi_02,psi_32,psi_32) );

#define YM_RECON_ACCUM __asm__ ( \
  VADD(UChi_00,result_00,result_00)\
  VADD(UChi_10,result_10,result_10)\
  VADD(UChi_01,result_01,result_01)\
  VADD(UChi_11,result_11,result_11)\
  VADD(UChi_02,result_02,result_02)\
  VADD(UChi_12,result_12,result_12)\
  VSUB(UChi_10,result_20,result_20)\
  VSUB(UChi_11,result_21,result_21)\
  VSUB(UChi_12,result_22,result_22)\
  VADD(UChi_00,result_30,result_30)\
  VADD(UChi_01,result_31,result_31)\
  VADD(UChi_02,result_32,result_32) );
  VADD(UChi_00,psi_00,psi_00)\
  VADD(UChi_10,psi_10,psi_10)\
  VADD(UChi_01,psi_01,psi_01)\
  VADD(UChi_11,psi_11,psi_11)\
  VADD(UChi_02,psi_02,psi_02)\
  VADD(UChi_12,psi_12,psi_12)\
  VSUB(UChi_10,psi_20,psi_20)\
  VSUB(UChi_11,psi_21,psi_21)\
  VSUB(UChi_12,psi_22,psi_22)\
  VADD(UChi_00,psi_30,psi_30)\
  VADD(UChi_01,psi_31,psi_31)\
  VADD(UChi_02,psi_32,psi_32) );

#define ZP_RECON_ACCUM __asm__ ( \
  VACCTIMESMINUSI0(UChi_00,result_20,Z0)\
  VACCTIMESI0(UChi_10,result_30,Z3)\
  VACCTIMESMINUSI0(UChi_01,result_21,Z1)\
  VACCTIMESI0(UChi_11,result_31,Z4)\
  VACCTIMESMINUSI0(UChi_02,result_22,Z2)\
  VACCTIMESI0(UChi_12,result_32,Z5)\
  VADD(UChi_00,result_00,result_00)\
  VADD(UChi_10,result_10,result_10)\
  VADD(UChi_01,result_01,result_01)\
  VADD(UChi_11,result_11,result_11)\
  VADD(UChi_02,result_02,result_02)\
  VADD(UChi_12,result_12,result_12)\
  VACCTIMESMINUSI1(UChi_00,result_20,Z0)\
  VACCTIMESI1(UChi_10,result_30,Z3)\
  VACCTIMESMINUSI1(UChi_01,result_21,Z1)\
  VACCTIMESI1(UChi_11,result_31,Z4)\
  VACCTIMESMINUSI1(UChi_02,result_22,Z2)\
  VACCTIMESI1(UChi_12,result_32,Z5)\
  VACCTIMESMINUSI2(UChi_00,result_20,Z0)\
  VACCTIMESMINUSI2(UChi_01,result_21,Z1)\
  VACCTIMESMINUSI2(UChi_02,result_22,Z2)\
  VACCTIMESI2(UChi_10,result_30,Z3)\
  VACCTIMESI2(UChi_11,result_31,Z4)\
  VACCTIMESI2(UChi_12,result_32,Z5)\
  VACCTIMESMINUSI0(UChi_00,psi_20,Z0)\
  VACCTIMESI0(UChi_10,psi_30,Z3)\
  VACCTIMESMINUSI0(UChi_01,psi_21,Z1)\
  VACCTIMESI0(UChi_11,psi_31,Z4)\
  VACCTIMESMINUSI0(UChi_02,psi_22,Z2)\
  VACCTIMESI0(UChi_12,psi_32,Z5)\
  VADD(UChi_00,psi_00,psi_00)\
  VADD(UChi_10,psi_10,psi_10)\
  VADD(UChi_01,psi_01,psi_01)\
  VADD(UChi_11,psi_11,psi_11)\
  VADD(UChi_02,psi_02,psi_02)\
  VADD(UChi_12,psi_12,psi_12)\
  VACCTIMESMINUSI1(UChi_00,psi_20,Z0)\
  VACCTIMESI1(UChi_10,psi_30,Z3)\
  VACCTIMESMINUSI1(UChi_01,psi_21,Z1)\
  VACCTIMESI1(UChi_11,psi_31,Z4)\
  VACCTIMESMINUSI1(UChi_02,psi_22,Z2)\
  VACCTIMESI1(UChi_12,psi_32,Z5)\
  VACCTIMESMINUSI2(UChi_00,psi_20,Z0)\
  VACCTIMESMINUSI2(UChi_01,psi_21,Z1)\
  VACCTIMESMINUSI2(UChi_02,psi_22,Z2)\
  VACCTIMESI2(UChi_10,psi_30,Z3)\
  VACCTIMESI2(UChi_11,psi_31,Z4)\
  VACCTIMESI2(UChi_12,psi_32,Z5)\
  );

#define ZM_RECON_ACCUM __asm__ ( \
  VACCTIMESI0(UChi_00,result_20,Z0)\
  VACCTIMESMINUSI0(UChi_10,result_30,Z3)\
  VACCTIMESI0(UChi_01,result_21,Z1)\
  VACCTIMESMINUSI0(UChi_11,result_31,Z4)\
  VACCTIMESI0(UChi_02,result_22,Z2)\
  VACCTIMESMINUSI0(UChi_12,result_32,Z5)\
  VADD(UChi_00,result_00,result_00)\
  VADD(UChi_10,result_10,result_10)\
  VADD(UChi_01,result_01,result_01)\
  VADD(UChi_11,result_11,result_11)\
  VADD(UChi_02,result_02,result_02)\
  VADD(UChi_12,result_12,result_12)\
  VACCTIMESI1(UChi_00,result_20,Z0)\
  VACCTIMESMINUSI1(UChi_10,result_30,Z3)\
  VACCTIMESI1(UChi_01,result_21,Z1)\
  VACCTIMESMINUSI1(UChi_11,result_31,Z4)\
  VACCTIMESI1(UChi_02,result_22,Z2)\
  VACCTIMESMINUSI1(UChi_12,result_32,Z5)\
  VACCTIMESI2(UChi_00,result_20,Z0)\
  VACCTIMESI2(UChi_01,result_21,Z1)\
  VACCTIMESI2(UChi_02,result_22,Z2)\
  VACCTIMESMINUSI2(UChi_10,result_30,Z3)\
  VACCTIMESMINUSI2(UChi_11,result_31,Z4)\
  VACCTIMESMINUSI2(UChi_12,result_32,Z5)\
  VACCTIMESI0(UChi_00,psi_20,Z0)\
  VACCTIMESMINUSI0(UChi_10,psi_30,Z3)\
  VACCTIMESI0(UChi_01,psi_21,Z1)\
  VACCTIMESMINUSI0(UChi_11,psi_31,Z4)\
  VACCTIMESI0(UChi_02,psi_22,Z2)\
  VACCTIMESMINUSI0(UChi_12,psi_32,Z5)\
  VADD(UChi_00,psi_00,psi_00)\
  VADD(UChi_10,psi_10,psi_10)\
  VADD(UChi_01,psi_01,psi_01)\
  VADD(UChi_11,psi_11,psi_11)\
  VADD(UChi_02,psi_02,psi_02)\
  VADD(UChi_12,psi_12,psi_12)\
  VACCTIMESI1(UChi_00,psi_20,Z0)\
  VACCTIMESMINUSI1(UChi_10,psi_30,Z3)\
  VACCTIMESI1(UChi_01,psi_21,Z1)\
  VACCTIMESMINUSI1(UChi_11,psi_31,Z4)\
  VACCTIMESI1(UChi_02,psi_22,Z2)\
  VACCTIMESMINUSI1(UChi_12,psi_32,Z5)\
  VACCTIMESI2(UChi_00,psi_20,Z0)\
  VACCTIMESI2(UChi_01,psi_21,Z1)\
  VACCTIMESI2(UChi_02,psi_22,Z2)\
  VACCTIMESMINUSI2(UChi_10,psi_30,Z3)\
  VACCTIMESMINUSI2(UChi_11,psi_31,Z4)\
  VACCTIMESMINUSI2(UChi_12,psi_32,Z5)\
  );

#define TP_RECON_ACCUM __asm__ ( \
  VADD(UChi_00,result_00,result_00)\
  VADD(UChi_10,result_10,result_10)\
  VADD(UChi_01,result_01,result_01)\
  VADD(UChi_11,result_11,result_11)\
  VADD(UChi_02,result_02,result_02)\
  VADD(UChi_12,result_12,result_12)\
  VADD(UChi_00,result_20,result_20)\
  VADD(UChi_10,result_30,result_30)\
  VADD(UChi_01,result_21,result_21)\
  VADD(UChi_11,result_31,result_31)\
  VADD(UChi_02,result_22,result_22)\
  VADD(UChi_12,result_32,result_32) );
  VADD(UChi_00,psi_00,psi_00)\
  VADD(UChi_10,psi_10,psi_10)\
  VADD(UChi_01,psi_01,psi_01)\
  VADD(UChi_11,psi_11,psi_11)\
  VADD(UChi_02,psi_02,psi_02)\
  VADD(UChi_12,psi_12,psi_12)\
  VADD(UChi_00,psi_20,psi_20)\
  VADD(UChi_10,psi_30,psi_30)\
  VADD(UChi_01,psi_21,psi_21)\
  VADD(UChi_11,psi_31,psi_31)\
  VADD(UChi_02,psi_22,psi_22)\
  VADD(UChi_12,psi_32,psi_32) );

#define TM_RECON_ACCUM __asm__ ( \
  VADD(UChi_00,result_00,result_00)\
  VADD(UChi_10,result_10,result_10)\
  VADD(UChi_01,result_01,result_01)\
  VADD(UChi_11,result_11,result_11)\
  VADD(UChi_02,result_02,result_02)\
  VADD(UChi_12,result_12,result_12)\
  VSUB(UChi_00,result_20,result_20)\
  VSUB(UChi_10,result_30,result_30)\
  VSUB(UChi_01,result_21,result_21)\
  VSUB(UChi_11,result_31,result_31)\
  VSUB(UChi_02,result_22,result_22)\
  VSUB(UChi_12,result_32,result_32) );
  VADD(UChi_00,psi_00,psi_00)\
  VADD(UChi_10,psi_10,psi_10)\
  VADD(UChi_01,psi_01,psi_01)\
  VADD(UChi_11,psi_11,psi_11)\
  VADD(UChi_02,psi_02,psi_02)\
  VADD(UChi_12,psi_12,psi_12)\
  VSUB(UChi_00,psi_20,psi_20)\
  VSUB(UChi_10,psi_30,psi_30)\
  VSUB(UChi_01,psi_21,psi_21)\
  VSUB(UChi_11,psi_31,psi_31)\
  VSUB(UChi_02,psi_22,psi_22)\
  VSUB(UChi_12,psi_32,psi_32) );

#define AVX512_PF_L1
#define AVX512_PF_L2_GAUGE
@ -580,22 +571,62 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
  LOAD64(%r8,PTR) \
  LOAD64(%r9,pf) \
  __asm__ ( \
  VSTORE(0,%r8,result_00)  VPREFETCH_M1(0,%r9) \
  VSTORE(1,%r8,result_01)  VPREFETCH_M1(1,%r9) \
  VSTORE(2,%r8,result_02)  VPREFETCH_M1(2,%r9) \
  VSTORE(3,%r8,result_10)  VPREFETCH_M1(3,%r9) \
  VSTORE(4,%r8,result_11)  VPREFETCH_M1(4,%r9) \
  VSTORE(5,%r8,result_12)  VPREFETCH_M1(5,%r9) \
  VSTORE(6,%r8,result_20)  VPREFETCH_M1(6,%r9) \
  VSTORE(7,%r8,result_21)  VPREFETCH_M1(7,%r9) \
  VSTORE(8,%r8,result_22)  VPREFETCH_M1(8,%r9) \
  VSTORE(9,%r8,result_30)  VPREFETCH_M1(9,%r9) \
  VSTORE(10,%r8,result_31) VPREFETCH_M1(10,%r9) \
  VSTORE(11,%r8,result_32) VPREFETCH_M1(11,%r9) \
  VSTORE(0,%r8,psi_00)  VPREFETCH_M1(0,%r9) \
  VSTORE(1,%r8,psi_01)  VPREFETCH_M1(1,%r9) \
  VSTORE(2,%r8,psi_02)  VPREFETCH_M1(2,%r9) \
  VSTORE(3,%r8,psi_10)  VPREFETCH_M1(3,%r9) \
  VSTORE(4,%r8,psi_11)  VPREFETCH_M1(4,%r9) \
  VSTORE(5,%r8,psi_12)  VPREFETCH_M1(5,%r9) \
  VSTORE(6,%r8,psi_20)  VPREFETCH_M1(6,%r9) \
  VSTORE(7,%r8,psi_21)  VPREFETCH_M1(7,%r9) \
  VSTORE(8,%r8,psi_22)  VPREFETCH_M1(8,%r9) \
  VSTORE(9,%r8,psi_30)  VPREFETCH_M1(9,%r9) \
  VSTORE(10,%r8,psi_31) VPREFETCH_M1(10,%r9) \
  VSTORE(11,%r8,psi_32) VPREFETCH_M1(11,%r9) \
  );

#define ADD_RESULTi(PTR,pf) \
  LOAD_CHIMU(PTR); \
  asm(VADD(psi_00,Chimu_00,psi_00) VADD(psi_01,Chimu_01,psi_01) VADD(psi_02,Chimu_02,psi_02) \
      VADD(psi_10,Chimu_10,psi_10) VADD(psi_11,Chimu_11,psi_11) VADD(psi_12,Chimu_12,psi_12) \
      VADD(psi_20,Chimu_20,psi_20) VADD(psi_21,Chimu_21,psi_21) VADD(psi_22,Chimu_22,psi_22) \
      VADD(psi_30,Chimu_30,psi_30) VADD(psi_31,Chimu_31,psi_31) VADD(psi_32,Chimu_32,psi_32) ); \
  SAVE_RESULT(PTR,pf);

#define ADD_RESULTia(PTR,pf) \
  LOAD64(%r8,PTR) \
  __asm__ ( \
  VADDMEM(0,%r8,psi_00,psi_00) \
  VADDMEM(1,%r8,psi_01,psi_01) \
  VADDMEM(2,%r8,psi_02,psi_02) \
  VADDMEM(3,%r8,psi_10,psi_10) \
  VADDMEM(4,%r8,psi_11,psi_11) \
  VADDMEM(5,%r8,psi_12,psi_12) \
  VADDMEM(6,%r8,psi_20,psi_20) \
  VADDMEM(7,%r8,psi_21,psi_21) \
  VADDMEM(8,%r8,psi_22,psi_22) \
  VADDMEM(9,%r8,psi_30,psi_30) \
  VADDMEM(10,%r8,psi_31,psi_31) \
  VADDMEM(11,%r8,psi_32,psi_32) \
  VSTORE(0,%r8,psi_00) \
  VSTORE(1,%r8,psi_01) \
  VSTORE(2,%r8,psi_02) \
  VSTORE(3,%r8,psi_10) \
  VSTORE(4,%r8,psi_11) \
  VSTORE(5,%r8,psi_12) \
  VSTORE(6,%r8,psi_20) \
  VSTORE(7,%r8,psi_21) \
  VSTORE(8,%r8,psi_22) \
  VSTORE(9,%r8,psi_30) \
  VSTORE(10,%r8,psi_31) \
  VSTORE(11,%r8,psi_32) \
  );

#ifdef AVX512_PF_L2_TABLE
#define PREFETCH_CHIMU(A) \
  LOAD64(%r9,A) \
  __asm__ ( \
  VPREFETCH_P1(0,%r9) \
255
lib/simd/Simd.h
Normal file
@ -0,0 +1,255 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/Simd.h

    Copyright (C) 2015

Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: neo <cossu@post.kek.jp>
Author: paboyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution
    directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_SIMD_H
#define GRID_SIMD_H

////////////////////////////////////////////////////////////////////////
// Define scalar and vector floating point types
//
// Scalar: RealF, RealD, ComplexF, ComplexD
//
// Vector: vRealF, vRealD, vComplexF, vComplexD
//
// Vector types are arch dependent
////////////////////////////////////////////////////////////////////////

#define _MM_SELECT_FOUR_FOUR(A,B,C,D)        ((A<<6)|(B<<4)|(C<<2)|(D))
#define _MM_SELECT_FOUR_FOUR_STRING(A,B,C,D) "((" #A "<<6)|(" #B "<<4)|(" #C "<<2)|(" #D "))"
#define _MM_SELECT_EIGHT_TWO(A,B,C,D,E,F,G,H) ((A<<7)|(B<<6)|(C<<5)|(D<<4)|(E<<3)|(F<<2)|(G<<1)|(H))
#define _MM_SELECT_FOUR_TWO(A,B,C,D) _MM_SELECT_EIGHT_TWO(0,0,0,0,A,B,C,D)
#define _MM_SELECT_TWO_TWO(A,B)      _MM_SELECT_FOUR_TWO(0,0,A,B)
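// Worked example (illustrative comment, not in the original source): these
// macros pack per-lane selectors into a single shuffle immediate. For
// instance, the identity four-lane selector is
//   _MM_SELECT_FOUR_FOUR(3,2,1,0) == (3<<6)|(2<<4)|(1<<2)|0 == 0xE4,
// the same value as the classic _MM_SHUFFLE(3,2,1,0).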

#define RotateBit (0x100)

namespace Grid {

  typedef uint32_t Integer;

  typedef float  RealF;
  typedef double RealD;
#ifdef GRID_DEFAULT_PRECISION_DOUBLE
  typedef RealD Real;
#else
  typedef RealF Real;
#endif

  typedef std::complex<RealF> ComplexF;
  typedef std::complex<RealD> ComplexD;
  typedef std::complex<Real>  Complex;

  inline RealF adj(const RealF & r){ return r; }
  inline RealF conjugate(const RealF & r){ return r; }
  inline RealF real(const RealF & r){ return r; }

  inline RealD adj(const RealD & r){ return r; }
  inline RealD conjugate(const RealD & r){ return r; }
  inline RealD real(const RealD & r){ return r; }

  inline RealD sqrt(const RealD & r){ return std::sqrt(r); }

  inline ComplexD conjugate(const ComplexD& r){ return(conj(r)); }
  inline ComplexD adj(const ComplexD& r){ return(conjugate(r)); }
  inline ComplexF conjugate(const ComplexF& r ){ return(conj(r)); }
  inline ComplexF adj(const ComplexF& r ){ return(conjugate(r)); }

  inline ComplexD innerProduct(const ComplexD & l, const ComplexD & r) { return conjugate(l)*r; }
  inline ComplexF innerProduct(const ComplexF & l, const ComplexF & r) { return conjugate(l)*r; }
  inline RealD innerProduct(const RealD & l, const RealD & r) { return l*r; }
  inline RealF innerProduct(const RealF & l, const RealF & r) { return l*r; }

  inline ComplexD Reduce(const ComplexD& r){ return r; }
  inline ComplexF Reduce(const ComplexF& r){ return r; }
  inline RealD Reduce(const RealD& r){ return r; }
  inline RealF Reduce(const RealF& r){ return r; }

  inline RealD toReal(const ComplexD& r){ return real(r); }
  inline RealF toReal(const ComplexF& r){ return real(r); }
  inline RealD toReal(const RealD& r){ return r; }
  inline RealF toReal(const RealF& r){ return r; }

  ////////////////////////////////////////////////////////////////////////////////
  // Provide support functions for basic real and complex data types required by Grid.
  // Single and double precision versions. Should be able to template this once only.
  ////////////////////////////////////////////////////////////////////////////////
  inline void mac (ComplexD * __restrict__ y,const ComplexD * __restrict__ a,const ComplexD *__restrict__ x){ *y = (*a) * (*x)+(*y); };
  inline void mult(ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) * (*r);}
  inline void sub (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) - (*r);}
  inline void add (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) + (*r);}
  // conjugate already supported for complex

  inline void mac (ComplexF * __restrict__ y,const ComplexF * __restrict__ a,const ComplexF *__restrict__ x){ *y = (*a) * (*x)+(*y); }
  inline void mult(ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) * (*r); }
  inline void sub (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) - (*r); }
  inline void add (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) + (*r); }

  // conjugate already supported for complex

  inline ComplexF timesI(const ComplexF &r)     { return(r*ComplexF(0.0,1.0));}
  inline ComplexD timesI(const ComplexD &r)     { return(r*ComplexD(0.0,1.0));}
  inline ComplexF timesMinusI(const ComplexF &r){ return(r*ComplexF(0.0,-1.0));}
  inline ComplexD timesMinusI(const ComplexD &r){ return(r*ComplexD(0.0,-1.0));}

  // define projections to real and imaginary parts
  inline ComplexF projReal(const ComplexF &r){return( ComplexF(std::real(r), 0.0));}
  inline ComplexD projReal(const ComplexD &r){return( ComplexD(std::real(r), 0.0));}
  inline ComplexF projImag(const ComplexF &r){return (ComplexF(std::imag(r), 0.0 ));}
  inline ComplexD projImag(const ComplexD &r){return (ComplexD(std::imag(r), 0.0));}

  // define auxiliary functions for complex computations
  inline void timesI(ComplexF &ret,const ComplexF &r)     { ret = timesI(r);}
  inline void timesI(ComplexD &ret,const ComplexD &r)     { ret = timesI(r);}
  inline void timesMinusI(ComplexF &ret,const ComplexF &r){ ret = timesMinusI(r);}
  inline void timesMinusI(ComplexD &ret,const ComplexD &r){ ret = timesMinusI(r);}

  inline void mac (RealD * __restrict__ y,const RealD * __restrict__ a,const RealD *__restrict__ x){ *y = (*a) * (*x)+(*y);}
  inline void mult(RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) * (*r);}
  inline void sub (RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) - (*r);}
  inline void add (RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) + (*r);}

  inline void mac (RealF * __restrict__ y,const RealF * __restrict__ a,const RealF *__restrict__ x){ *y = (*a) * (*x)+(*y); }
  inline void mult(RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) * (*r); }
  inline void sub (RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) - (*r); }
  inline void add (RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) + (*r); }

  inline void vstream(ComplexF &l, const ComplexF &r){ l=r;}
  inline void vstream(ComplexD &l, const ComplexD &r){ l=r;}
  inline void vstream(RealF &l, const RealF &r){ l=r;}
  inline void vstream(RealD &l, const RealD &r){ l=r;}

  class Zero{};
  static Zero zero;
  template<class itype> inline void zeroit(itype &arg){ arg=zero;};
  template<>            inline void zeroit(ComplexF &arg){ arg=0; };
  template<>            inline void zeroit(ComplexD &arg){ arg=0; };
  template<>            inline void zeroit(RealF &arg){ arg=0; };
  template<>            inline void zeroit(RealD &arg){ arg=0; };

  //////////////////////////////////////////////////////////
  // Permute
  // Permute 0 every ABCDEFGH -> BA DC FE HG
  // Permute 1 every ABCDEFGH -> CD AB GH EF
  // Permute 2 every ABCDEFGH -> EFGH ABCD
  // Permute 3 possible on longer iVector lengths (512bit = 8 double = 16 single)
  // Permute 4 possible on half precision @512bit vectors.
  //
  // Defined inside SIMD specialization files
  //////////////////////////////////////////////////////////
  template<class VectorSIMD>
  inline void Gpermute(VectorSIMD &y,const VectorSIMD &b,int perm);
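
  // Illustrative sketch only (not part of Grid): a scalar model of the permute
  // patterns documented above, on a plain 8-lane array (assumes <array>).
  // Gpermute itself is specialised per SIMD backend; this shows the index arithmetic.
#if 0
  template<class T>
  std::array<T,8> permute_model(const std::array<T,8> &in, int perm) {
    std::array<T,8> out;
    const int block = 1 << perm;          // Permute n swaps blocks of 2^n lanes
    for (int i = 0; i < 8; i++) out[i] = in[i ^ block];
    return out;  // perm=0: BADCFEHG, perm=1: CDABGHEF, perm=2: EFGHABCD
  }
#endif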

};

#include <Grid/simd/Grid_vector_types.h>
#include <Grid/simd/Grid_vector_unops.h>

namespace Grid {
  // Default precision
#ifdef GRID_DEFAULT_PRECISION_DOUBLE
  typedef vRealD vReal;
  typedef vComplexD vComplex;
#else
  typedef vRealF vReal;
  typedef vComplexF vComplex;
#endif

  inline std::ostream& operator<< (std::ostream& stream, const vComplexF &o){
    int nn=vComplexF::Nsimd();
    std::vector<ComplexF,alignedAllocator<ComplexF> > buf(nn);
    vstore(o,&buf[0]);
    stream<<"<";
    for(int i=0;i<nn;i++){
      stream<<buf[i];
      if(i<nn-1) stream<<",";
    }
    stream<<">";
    return stream;
  }

  inline std::ostream& operator<< (std::ostream& stream, const vComplexD &o){
    int nn=vComplexD::Nsimd();
    std::vector<ComplexD,alignedAllocator<ComplexD> > buf(nn);
    vstore(o,&buf[0]);
    stream<<"<";
    for(int i=0;i<nn;i++){
      stream<<buf[i];
      if(i<nn-1) stream<<",";
    }
    stream<<">";
    return stream;
  }

  inline std::ostream& operator<< (std::ostream& stream, const vRealF &o){
    int nn=vRealF::Nsimd();
    std::vector<RealF,alignedAllocator<RealF> > buf(nn);
    vstore(o,&buf[0]);
    stream<<"<";
    for(int i=0;i<nn;i++){
      stream<<buf[i];
      if(i<nn-1) stream<<",";
    }
    stream<<">";
    return stream;
  }

  inline std::ostream& operator<< (std::ostream& stream, const vRealD &o){
    int nn=vRealD::Nsimd();
    std::vector<RealD,alignedAllocator<RealD> > buf(nn);
    vstore(o,&buf[0]);
    stream<<"<";
    for(int i=0;i<nn;i++){
      stream<<buf[i];
      if(i<nn-1) stream<<",";
    }
    stream<<">";
    return stream;
  }
  inline std::ostream& operator<< (std::ostream& stream, const vInteger &o){
    int nn=vInteger::Nsimd();
    std::vector<Integer,alignedAllocator<Integer> > buf(nn);
    vstore(o,&buf[0]);
    stream<<"<";
    for(int i=0;i<nn;i++){
      stream<<buf[i];
      if(i<nn-1) stream<<",";
    }
    stream<<">";
    return stream;
  }

}
#endif