mirror of
https://github.com/paboyle/Grid.git
synced 2025-06-13 12:47:05 +01:00
Large change with KNL preparation
This commit is contained in:
@ -367,6 +367,9 @@ namespace Grid {
|
||||
// Complex overload: fills ret by splatting the complex constant (0.0, 0.0).
// Open question left by the original author: would an xor be a better way to zero?
template <class S, class V, IfComplex<S> = 0>
inline void vzero(Grid_simd<S, V> &ret)
{
  vsplat(ret, S(0.0, 0.0));
}
|
||||
// Complex-only: fills ret by splatting the imaginary unit, S(0.0, 1.0).
template <class S, class V, IfComplex<S> = 0>
inline void vcomplex_i(Grid_simd<S, V> &ret)
{
  vsplat(ret, S(0.0, 1.0));
}
|
||||
|
||||
// Complex-only: fills ret by splatting S(1.0, -1.0), i.e. +1 in the real slot
// and -1 in the imaginary slot (a sign mask for the imaginary part).
template <class S, class V, IfComplex<S> = 0>
inline void visign(Grid_simd<S, V> &ret)
{
  vsplat(ret, S(1.0, -1.0));
}
|
||||
// Complex-only: fills ret by splatting S(-1.0, 1.0), i.e. -1 in the real slot
// and +1 in the imaginary slot (a sign mask for the real part).
template <class S, class V, IfComplex<S> = 0>
inline void vrsign(Grid_simd<S, V> &ret)
{
  vsplat(ret, S(-1.0, 1.0));
}
|
||||
|
||||
// Overloads for real (non-complex) scalar types, selected via IfReal<S>
|
||||
// Real overload: fills ret by splatting the scalar constant 1.0.
template <class S, class V, IfReal<S> = 0>
inline void vone(Grid_simd<S, V> &ret)
{
  vsplat(ret, S(1.0));
}
|
||||
// Real overload: fills ret by splatting the scalar constant 0.0.
template <class S, class V, IfReal<S> = 0>
inline void vzero(Grid_simd<S, V> &ret)
{
  vsplat(ret, S(0.0));
}
|
||||
|
@ -87,14 +87,39 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
// Double precision: read the vector at byte offset OFF*64 from A and, within each 128-bit lane,
// copy 32-bit elements 1,0 into both halves, i.e. duplicate the low 64 bits
// (presumably the real part of a complex double, going by the "R" in the name).
#define VMOVRDUPd(OFF,A,DEST) "vpshufd $0x44," #OFF "*64(" #A ")," #DEST ";\n" // 32 bit level: 1,0,1,0 (imm 0x44; the pattern 1,0,3,2 would be imm 0x4e)
|
||||
// Double precision: read the vector at byte offset OFF*64 from A and, within each 128-bit lane,
// copy 32-bit elements 3,2 into both halves, i.e. duplicate the high 64 bits
// (presumably the imaginary part of a complex double, going by the "I" in the name).
#define VMOVIDUPd(OFF,A,DEST) "vpshufd $0xee," #OFF "*64(" #A ")," #DEST ";\n" // 32 bit level: 3,2,3,2
|
||||
|
||||
// Single precision: load from byte offset OFF*64 of PTR with vmovsldup, which duplicates
// each even-indexed 32-bit element into the odd slot above it (presumably the real parts).
#define VMOVRDUPf(OFF,PTR,DEST) "vmovsldup " #OFF "*64(" #PTR "), " #DEST ";\n"
|
||||
// Single precision: load from byte offset OFF*64 of PTR with vmovshdup, which duplicates
// each odd-indexed 32-bit element into the even slot below it (presumably the imaginary parts).
#define VMOVIDUPf(OFF,PTR,DEST) "vmovshdup " #OFF "*64(" #PTR "), " #DEST ";\n"
|
||||
|
||||
// Register-to-register duplication helpers (same shuffles as the VMOV*DUP* macros, but the
// source is the operand SRC rather than a 64-byte-strided memory address).
// VRDUPd: duplicate the low 64 bits of each 128-bit lane of SRC into DEST.
#define VRDUPd(SRC,DEST) "vpshufd $0x44," #SRC"," #DEST ";\n" // 32 bit level: 1,0,1,0 (imm 0x44; the pattern 1,0,3,2 would be imm 0x4e)
|
||||
// VRDUPf: duplicate the even-indexed 32-bit elements of SRC into DEST.
#define VRDUPf(SRC,DEST) "vmovsldup " #SRC ", " #DEST ";\n"
|
||||
// VIDUPd: duplicate the high 64 bits of each 128-bit lane of SRC into DEST.
#define VIDUPd(SRC,DEST) "vpshufd $0xee," #SRC"," #DEST ";\n" // 32 bit level: 3,2,3,2
|
||||
// VIDUPf: duplicate the odd-indexed 32-bit elements of SRC into DEST.
#define VIDUPf(SRC,DEST) "vmovshdup " #SRC ", " #DEST ";\n"
|
||||
|
||||
// Scalar broadcast loads: fill every element of DEST with one scalar read from memory.
// Note the addressing differs from the VMOV*DUP* macros: here OFF is scaled by 16 bytes
// (double) or 8 bytes (float), i.e. the size of two scalars, and the +0 / +8 (+4) selects
// the first or second scalar of that pair (presumably real / imaginary part).
#define VBCASTRDUPd(OFF,A,DEST) "vbroadcastsd (" #OFF "*16+0)(" #A ")," #DEST ";\n"
|
||||
#define VBCASTIDUPd(OFF,A,DEST) "vbroadcastsd (" #OFF "*16+8)(" #A ")," #DEST ";\n"
|
||||
#define VBCASTRDUPf(OFF,PTR,DEST) "vbroadcastss (" #OFF "*8 +0)(" #PTR "), " #DEST ";\n"
|
||||
#define VBCASTIDUPf(OFF,PTR,DEST) "vbroadcastss (" #OFF "*8 +4)(" #PTR "), " #DEST ";\n"
|
||||
|
||||
// Fused multiply with alternating add/subtract (vfmaddsub231): accum is both the third
// input and the destination. Per the instruction definition the product A*B has accum
// subtracted in even-indexed elements and added in odd-indexed elements.
// Register-operand forms, single (f) and double (d) precision:
#define VMADDSUBf(A,B,accum) "vfmaddsub231ps " #A "," #B "," #accum ";\n"
|
||||
#define VMADDSUBd(A,B,accum) "vfmaddsub231pd " #A "," #B "," #accum ";\n"
|
||||
// Memory-operand forms: the first multiplicand is the full vector at byte offset O*64 of P.
#define VMADDSUBMEMf(O,P,B,accum) "vfmaddsub231ps " #O"*64("#P "),"#B "," #accum ";\n"
|
||||
#define VMADDSUBMEMd(O,P,B,accum) "vfmaddsub231pd " #O"*64("#P "),"#B "," #accum ";\n"
|
||||
|
||||
|
||||
// Embedded-broadcast forms: the memory operand is a single scalar that the instruction
// itself broadcasts to all elements ({1to16} for float, {1to8} for double), so no separate
// DUP/broadcast instruction is needed. O is scaled by the size of two scalars (8 bytes for
// float, 16 for double); the trailing +0 or +4/+8 picks the first or second scalar of the
// pair (presumably real / imaginary part, per the RDUP / IDUP naming).
// Single precision:
#define VMADDSUBRDUPf(O,P,B,accum) "vfmaddsub231ps (" #O"*8+0)("#P "){1to16},"#B "," #accum ";\n"
|
||||
#define VMADDSUBIDUPf(O,P,B,accum) "vfmaddsub231ps (" #O"*8+4)("#P "){1to16},"#B "," #accum ";\n"
|
||||
#define VMULRDUPf(O,P,B,accum) "vmulps (" #O"*8+0)("#P "){1to16},"#B "," #accum ";\n"
|
||||
#define VMULIDUPf(O,P,B,accum) "vmulps (" #O"*8+4)("#P "){1to16},"#B "," #accum ";\n"
|
||||
|
||||
// Double precision:
#define VMADDSUBRDUPd(O,P,B,accum) "vfmaddsub231pd (" #O"*16+0)("#P "){1to8},"#B "," #accum ";\n"
|
||||
#define VMADDSUBIDUPd(O,P,B,accum) "vfmaddsub231pd (" #O"*16+8)("#P "){1to8},"#B "," #accum ";\n"
|
||||
#define VMULRDUPd(O,P,B,accum) "vmulpd (" #O"*16+0)("#P "){1to8},"#B "," #accum ";\n"
|
||||
#define VMULIDUPd(O,P,B,accum) "vmulpd (" #O"*16+8)("#P "){1to8},"#B "," #accum ";\n"
|
||||
/*
|
||||
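* The VTIMESI / VTIMESMINUSI macros are used only in the non-accumulating X-direction recon
|
||||
* (XM_RECON / XP_RECON); one could instead zero the result registers and use the RECON_ACCUM form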
|
||||
*/
|
||||
|
||||
|
||||
#define VTIMESI0f(A,DEST, Z) VSHUFf(A,DEST)
|
||||
#define VTIMESI1f(A,DEST, Z) "vaddps " #DEST "," #Z "," #DEST"{%k6}" ";\n"
|
||||
#define VTIMESI2f(A,DEST, Z) "vsubps " #DEST "," #Z "," #DEST"{%k7}" ";\n"
|
||||
@ -111,6 +136,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
#define VTIMESMINUSI1d(A,DEST,Z) "vsubpd " #DEST "," #Z "," #DEST"{%k6}" ";\n"
|
||||
#define VTIMESMINUSI2d(A,DEST,Z) "vaddpd " #DEST "," #Z "," #DEST"{%k7}" ";\n"
|
||||
|
||||
#if 0
|
||||
|
||||
#define VACCTIMESMINUSI0f(A,ACC,tmp) VSHUFf(A,tmp)
|
||||
#define VACCTIMESMINUSI1f(A,ACC,tmp) "vsubps " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
|
||||
#define VACCTIMESMINUSI2f(A,ACC,tmp) "vaddps " #tmp "," #ACC "," #ACC"{%k7}" ";\n"
|
||||
@ -127,6 +154,35 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
#define VACCTIMESI1d(A,ACC,tmp) "vaddpd " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
|
||||
#define VACCTIMESI2d(A,ACC,tmp) "vsubpd " #tmp "," #ACC "," #ACC"{%k7}" ";\n"
|
||||
|
||||
#else
|
||||
|
||||
// o_p must point to floating 1.0f/d
|
||||
//
|
||||
// Ai, Ar -> tmp (r i)
|
||||
// tmp *1.0
|
||||
// ACC i - Ar ; ACC r + Ai
|
||||
// Accumulate -i*A into ACC, split into three stages so callers can interleave them:
//   stage 0: shuffle A into tmp (VSHUF* is defined elsewhere; presumably swaps real/imag)
//   stage 1: multiply-add tmp with the constant vector addressed by slot 1 of %r10 into ACC
//            (VMADDMEM* is defined elsewhere -- NOTE(review): confirm its operand order and
//            that %r10 has been loaded with the sign-constant table before use)
//   stage 2: intentionally empty in this variant; kept so call sites stay uniform.
#define VACCTIMESMINUSI0f(A,ACC,tmp) VSHUFf(A,tmp)
|
||||
#define VACCTIMESMINUSI1f(A,ACC,tmp) VMADDMEMf(1,%r10,tmp,ACC)
|
||||
#define VACCTIMESMINUSI2f(A,ACC,tmp)
|
||||
|
||||
|
||||
// Double-precision equivalents of the three stages above.
#define VACCTIMESMINUSI0d(A,ACC,tmp) VSHUFd(A,tmp)
|
||||
#define VACCTIMESMINUSI1d(A,ACC,tmp) VMADDMEMd(1,%r10,tmp,ACC)
|
||||
#define VACCTIMESMINUSI2d(A,ACC,tmp)
|
||||
|
||||
// Ai, Ar -> tmp (r i)
|
||||
// tmp *1.0
|
||||
// ACC i + Ar ; ACC r - Ai
|
||||
// Accumulate +i*A into ACC in three stages (mirror of the VACCTIMESMINUSI* macros):
//   stage 0: shuffle A into tmp (VSHUF* is defined elsewhere; presumably swaps real/imag)
//   stage 1: multiply-add tmp with the constant vector addressed by slot 0 of %r10 into ACC
//            (VMADDMEM* is defined elsewhere -- NOTE(review): confirm its operand order and
//            that %r10 has been loaded with the sign-constant table before use)
//   stage 2: intentionally empty in this variant; kept so call sites stay uniform.
#define VACCTIMESI0f(A,ACC,tmp) VSHUFf(A,tmp)
|
||||
#define VACCTIMESI1f(A,ACC,tmp) VMADDMEMf(0,%r10,tmp,ACC)
|
||||
#define VACCTIMESI2f(A,ACC,tmp)
|
||||
|
||||
// Double-precision equivalents of the three stages above.
#define VACCTIMESI0d(A,ACC,tmp) VSHUFd(A,tmp)
|
||||
#define VACCTIMESI1d(A,ACC,tmp) VMADDMEMd(0,%r10,tmp,ACC)
|
||||
#define VACCTIMESI2d(A,ACC,tmp)
|
||||
|
||||
#endif
|
||||
|
||||
#define VPERM0f(A,B) "vshuff32x4 $0x4e," #A "," #B "," #B ";\n"
|
||||
#define VPERM1f(A,B) "vshuff32x4 $0xb1," #A "," #B "," #B ";\n"
|
||||
#define VPERM2f(A,B) "vshufps $0x4e," #A "," #B "," #B ";\n"
|
||||
|
@ -1,92 +0,0 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/simd/Avx512Asm.h
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#ifndef GRID_ASM_AV512_ADDSUB_H
|
||||
#define GRID_ASM_AV512_ADDSUB_H
|
||||
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// Building blocks for SU3 x 2spinor
|
||||
// Load columns of U
|
||||
// 18 U DUP's rr/ii
|
||||
// 6 Chi shuffles ir,ri
|
||||
// 6muls, 30 fmaddsubs
|
||||
////////////////////////////////////////////////////////////////
|
||||
#define MULT_ADDSUB_2SPIN(ptr) \
|
||||
LOAD64(%r8,ptr) \
|
||||
__asm__ ( \
|
||||
VMOVIDUPf(0,%r8,Z0 ) \
|
||||
VMOVIDUPf(3,%r8,Z1 )\
|
||||
VMOVIDUPf(6,%r8,Z2 )\
|
||||
VSHUFf(Chi_00,T1) \
|
||||
VSHUFf(Chi_10,T2) \
|
||||
\
|
||||
VMULf(Z0,T1,UChi_00) VMOVRDUPf(0,%r8,Z3 ) \
|
||||
VMULf(Z0,T2,UChi_10) VMOVRDUPf(3,%r8,Z4 ) \
|
||||
VMULf(Z1,T1,UChi_01) VMOVRDUPf(6,%r8,Z5 ) \
|
||||
VMULf(Z1,T2,UChi_11) VMOVIDUPf(1,%r8,Z0 ) \
|
||||
VMULf(Z2,T1,UChi_02) VMOVIDUPf(4,%r8,Z1 ) \
|
||||
VMULf(Z2,T2,UChi_12) VMOVIDUPf(7,%r8,Z2 ) \
|
||||
\
|
||||
VMADDSUBf(Z3,Chi_00,UChi_00) VSHUFf(Chi_01,T1) \
|
||||
VMADDSUBf(Z3,Chi_10,UChi_10) VSHUFf(Chi_11,T2) \
|
||||
VMADDSUBf(Z4,Chi_00,UChi_01) VMOVRDUPf(1,%r8,Z3 ) \
|
||||
VMADDSUBf(Z4,Chi_10,UChi_11)\
|
||||
VMADDSUBf(Z5,Chi_00,UChi_02) VMOVRDUPf(4,%r8,Z4 ) \
|
||||
VMADDSUBf(Z5,Chi_10,UChi_12)\
|
||||
\
|
||||
VMADDSUBf(Z0,T1,UChi_00) VMOVRDUPf(7,%r8,Z5 ) \
|
||||
VMADDSUBf(Z0,T2,UChi_10)\
|
||||
VMADDSUBf(Z1,T1,UChi_01) VMOVIDUPf(2,%r8,Z0 ) \
|
||||
VMADDSUBf(Z1,T2,UChi_11)\
|
||||
VMADDSUBf(Z2,T1,UChi_02) VMOVIDUPf(5,%r8,Z1 ) \
|
||||
VMADDSUBf(Z2,T2,UChi_12) VMOVIDUPf(8,%r8,Z2 ) \
|
||||
\
|
||||
VMADDSUBf(Z3,Chi_01,UChi_00) VSHUFf(Chi_02,T1) \
|
||||
VMADDSUBf(Z3,Chi_11,UChi_10) VSHUFf(Chi_12,T2) \
|
||||
VMADDSUBf(Z4,Chi_01,UChi_01) VMOVRDUPf(2,%r8,Z3 ) \
|
||||
VMADDSUBf(Z4,Chi_11,UChi_11)\
|
||||
VMADDSUBf(Z5,Chi_01,UChi_02) VMOVRDUPf(5,%r8,Z4 ) \
|
||||
VMADDSUBf(Z5,Chi_11,UChi_12)\
|
||||
\
|
||||
VMADDSUBf(Z0,T1,UChi_00) VMOVRDUPf(8,%r8,Z5 ) \
|
||||
VMADDSUBf(Z0,T2,UChi_10)\
|
||||
VMADDSUBf(Z1,T1,UChi_01)\
|
||||
VMADDSUBf(Z1,T2,UChi_11)\
|
||||
VMADDSUBf(Z2,T1,UChi_02)\
|
||||
VMADDSUBf(Z2,T2,UChi_12)\
|
||||
\
|
||||
VMADDSUBf(Z3,Chi_02,UChi_00)\
|
||||
VMADDSUBf(Z3,Chi_12,UChi_10)\
|
||||
VMADDSUBf(Z4,Chi_02,UChi_01)\
|
||||
VMADDSUBf(Z4,Chi_12,UChi_11)\
|
||||
VMADDSUBf(Z5,Chi_02,UChi_02)\
|
||||
VMADDSUBf(Z5,Chi_12,UChi_12)\
|
||||
);
|
||||
|
||||
|
||||
#endif
|
@ -86,8 +86,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
#define VMOVf(A,DEST) "vmovaps " #A ", " #DEST ";\n"
|
||||
#define VMOVd(A,DEST) "vmovapd " #A ", " #DEST ";\n"
|
||||
|
||||
#define VPREFETCHG(O,A)
|
||||
#define VPREFETCHW(O,A)
|
||||
#define VPREFETCHG(O,A) "prefetcht0 "#O"*64("#A");\n"
|
||||
#define VPREFETCH2(O,A) "prefetcht1 "#O"*64("#A");\n"
|
||||
#define VPREFETCHW(O,A) "prefetchwt1 "#O"*64("#A");\n"
|
||||
#define VEVICT(O,A)
|
||||
|
||||
//"vprefetche0 "#O"*64("#A");\n" "vprefetche1 ("#O"+12)*64("#A");\n"
|
||||
|
@ -133,3 +133,22 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
|
||||
#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
|
||||
|
||||
|
||||
#undef VRDUP
|
||||
#undef VIDUP
|
||||
#undef VMADDSUBMEM
|
||||
#undef VMADDMEM
|
||||
#undef VMULMEM
|
||||
#define VRDUP(SRC,DEST) VRDUPd(SRC,DEST)
|
||||
#define VIDUP(SRC,DEST) VIDUPd(SRC,DEST)
|
||||
#define VMADDSUBMEM(O,P,B,accum) VMADDSUBMEMd(O,P,B,accum)
|
||||
#define VMADDMEM(O,P,B,accum) VMADDMEMd(O,P,B,accum)
|
||||
#define VMULMEM(O,P,B,accum) VMULMEMd(O,P,B,accum)
|
||||
#undef VMADDSUBRDUP
|
||||
#undef VMADDSUBIDUP
|
||||
#undef VMULRDUP
|
||||
#undef VMULIDUP
|
||||
#define VMADDSUBRDUP(O,P,B,accum) VMADDSUBRDUPd(O,P,B,accum)
|
||||
#define VMADDSUBIDUP(O,P,B,accum) VMADDSUBIDUPd(O,P,B,accum)
|
||||
#define VMULRDUP(O,P,B,accum) VMULRDUPd(O,P,B,accum)
|
||||
#define VMULIDUP(O,P,B,accum) VMULIDUPd(O,P,B,accum)
|
||||
|
@ -116,7 +116,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
#define VMADDSUB(A,B,accum) VMADDSUBf(A,B,accum)
|
||||
#define VSHUF(A,B) VSHUFf(A,B)
|
||||
|
||||
|
||||
#undef ZEND1
|
||||
#undef ZEND2
|
||||
#undef ZLOAD
|
||||
@ -133,3 +132,24 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
|
||||
#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
|
||||
|
||||
#undef VRDUP
|
||||
#undef VIDUP
|
||||
#undef VMADDSUBMEM
|
||||
#undef VMADDMEM
|
||||
#undef VMULMEM
|
||||
|
||||
#define VRDUP(SRC,DEST) VRDUPf(SRC,DEST)
|
||||
#define VIDUP(SRC,DEST) VIDUPf(SRC,DEST)
|
||||
#define VMADDSUBMEM(O,P,B,accum) VMADDSUBMEMf(O,P,B,accum)
|
||||
#define VMADDMEM(O,P,B,accum) VMADDMEMf(O,P,B,accum)
|
||||
#define VMULMEM(O,P,B,accum) VMULMEMf(O,P,B,accum)
|
||||
|
||||
#undef VMADDSUBRDUP
|
||||
#undef VMADDSUBIDUP
|
||||
#undef VMULRDUP
|
||||
#undef VMULIDUP
|
||||
#define VMADDSUBRDUP(O,P,B,accum) VMADDSUBRDUPf(O,P,B,accum)
|
||||
#define VMADDSUBIDUP(O,P,B,accum) VMADDSUBIDUPf(O,P,B,accum)
|
||||
#define VMULRDUP(O,P,B,accum) VMULRDUPf(O,P,B,accum)
|
||||
#define VMULIDUP(O,P,B,accum) VMULIDUPf(O,P,B,accum)
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
/*************************************************************************************
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
@ -27,9 +27,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
/* END LEGAL */
|
||||
#ifndef GRID_ASM_INTEL_512_QCD_H
|
||||
#define GRID_ASM_INTEL_512_QCD_H
|
||||
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Register allocations for the Wilson kernel are independent of precision and of IMCI/AVX512
|
||||
// Register allocations for the Wilson kernel are independent of precision
|
||||
//////////////////////////////////////////////////////////////////////////////////////////
|
||||
#define result_00 %zmm0
|
||||
#define result_01 %zmm1
|
||||
@ -64,7 +64,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
#define UChi_12 %zmm23
|
||||
|
||||
#define Uir %zmm24
|
||||
//#define ONE %zmm24
|
||||
#define Uri %zmm25
|
||||
#define T1 %zmm24
|
||||
#define T2 %zmm25
|
||||
@ -92,13 +91,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
#define Chimu_32 UChi_12
|
||||
|
||||
#include <simd/Intel512common.h>
|
||||
#ifdef AVX512
|
||||
#include <simd/Intel512avx.h>
|
||||
//#include <simd/Intel512avxAddsub.h> // Alternate implementation
|
||||
#endif
|
||||
#ifdef IMCI
|
||||
#include <simd/Intel512imci.h>
|
||||
#endif
|
||||
|
||||
//////////////////////////////////////////////////////////////////
|
||||
// Macros used to build the Wilson kernel -- these could be rationalised and simplified
|
||||
@ -193,47 +186,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
VSTORE(11,%r8,result_32) \
|
||||
);
|
||||
|
||||
// auto ptr = &U._odata[sU](A);
|
||||
// A plan for lifting loads
|
||||
// can use Z2/3/4/5/U/U for U field in first step.
|
||||
// can use Chi_00, Chi_10, U U for U field in second step
|
||||
// can use Chi_00, Chi_10, Chi_01,11, U U for U field in third step
|
||||
// This makes it possible to lift ALL loads earlier by a few cycles and relieve out-of-order (OoO) pressure if needed.
|
||||
// KNL is DUAL issue for FP, and lifting these loads is potentially important.
|
||||
// Need detailed profile data to be sure.
|
||||
#if 0
|
||||
#define PREFETCH_U(A) \
|
||||
LOAD64(%r8,&U._odata[sU](A)) \
|
||||
__asm__ ( \
|
||||
VPREFETCHG(0,%r8) \
|
||||
VPREFETCHG(1,%r8) \
|
||||
VPREFETCHG(2,%r8) \
|
||||
VPREFETCHG(3,%r8) \
|
||||
VPREFETCHG(4,%r8) \
|
||||
VPREFETCHG(5,%r8) \
|
||||
VPREFETCHG(6,%r8) \
|
||||
VPREFETCHG(7,%r8) \
|
||||
VPREFETCHG(8,%r8) );
|
||||
|
||||
#define PREFETCH_R(A) \
|
||||
LOAD64(%r8,&out._odata[ss]) \
|
||||
__asm__ ( \
|
||||
VPREFETCHW(0,%r8) \
|
||||
VPREFETCHW(1,%r8) \
|
||||
VPREFETCHW(2,%r8) \
|
||||
VPREFETCHW(3,%r8) \
|
||||
VPREFETCHW(4,%r8) \
|
||||
VPREFETCHW(5,%r8) \
|
||||
VPREFETCHW(6,%r8) \
|
||||
VPREFETCHW(7,%r8) \
|
||||
VPREFETCHW(8,%r8) \
|
||||
VPREFETCHW(9,%r8) \
|
||||
VPREFETCHW(10,%r8) \
|
||||
VPREFETCHW(11,%r8) );
|
||||
#endif
|
||||
|
||||
#define MULT_2SPIN_DIR(A) MULT_2SPIN(&U._odata[sU](A))
|
||||
|
||||
#define MULT_2SPIN_DIR_PFXP(A,p) MULT_2SPIN_PFXP(&U._odata[sU](A),p)
|
||||
#define MULT_2SPIN_DIR_PFYP(A,p) MULT_2SPIN_PFYP(&U._odata[sU](A),p)
|
||||
#define MULT_2SPIN_DIR_PFZP(A,p) MULT_2SPIN_PFZP(&U._odata[sU](A),p)
|
||||
@ -244,131 +196,14 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
#define MULT_2SPIN_DIR_PFZM(A,p) MULT_2SPIN_PFZM(&U._odata[sU](A),p)
|
||||
#define MULT_2SPIN_DIR_PFTM(A,p) MULT_2SPIN_PFTM(&U._odata[sU](A),p)
|
||||
|
||||
#if 0
|
||||
#define MULT_2SPIN_UNOPT(ptr) \
|
||||
LOAD64(%r8,ptr) \
|
||||
__asm__ ( \
|
||||
ZLOAD (0,%r8,UChi_01,UChi_11) \
|
||||
ZLOAD (3,%r8,UChi_02,UChi_12) \
|
||||
ZLOAD (6,%r8,Uri,Uir) \
|
||||
ZMUL (UChi_01,UChi_11,Chi_00,UChi_00,Z0) \
|
||||
ZMUL (UChi_01,UChi_11,Chi_10,UChi_10,Z1) \
|
||||
ZMUL (UChi_02,UChi_12,Chi_00,UChi_01,Z2) \
|
||||
ZMUL (UChi_02,UChi_12,Chi_10,UChi_11,Z3) \
|
||||
ZMUL (Uri,Uir, Chi_00,UChi_02,Z4) \
|
||||
ZMUL (Uri,Uir, Chi_10,UChi_12,Z5) \
|
||||
\
|
||||
ZLOAD (1,%r8,Uri,Uir) \
|
||||
ZLOAD (4,%r8,Chi_00, Chi_10) \
|
||||
ZMADD (Uri,Uir, Chi_01,UChi_00,Z0) \
|
||||
ZMADD (Uri,Uir, Chi_11,UChi_10,Z1) \
|
||||
ZLOAD (7,%r8,Uri,Uir) \
|
||||
ZMADD (Chi_00, Chi_10,Chi_01,UChi_01,Z2) \
|
||||
ZMADD (Chi_00, Chi_10,Chi_11,UChi_11,Z3) \
|
||||
ZLOAD (2,%r8,Chi_00,Chi_10) \
|
||||
ZMADD(Uri,Uir, Chi_01,UChi_02,Z4) \
|
||||
ZMADD(Uri,Uir, Chi_11,UChi_12,Z5) \
|
||||
\
|
||||
ZLOAD (5,%r8,Uri,Uir) \
|
||||
ZMADD (Chi_00,Chi_10, Chi_02,UChi_00,Z0) \
|
||||
ZMADD (Chi_00,Chi_10, Chi_12,UChi_10,Z1) \
|
||||
ZLOAD (8,%r8,Chi_00,Chi_10) \
|
||||
ZMADD (Uri,Uir, Chi_02,UChi_01,Z2) \
|
||||
ZMADD (Uri,Uir, Chi_12,UChi_11,Z3) \
|
||||
ZMADD(Chi_00,Chi_10, Chi_02,UChi_02,Z4) \
|
||||
ZMADD(Chi_00,Chi_10, Chi_12,UChi_12,Z5) \
|
||||
\
|
||||
ZEND1(UChi_00,Z0,Chi_01) \
|
||||
ZEND1(UChi_10,Z1,Chi_11) \
|
||||
ZEND1(UChi_01,Z2,Chi_00) \
|
||||
ZEND1(UChi_11,Z3,Chi_10) \
|
||||
ZEND1(UChi_02,Z4,Chi_02) \
|
||||
ZEND1(UChi_12,Z5,Chi_12) \
|
||||
ZEND2(UChi_00,Z0,Chi_01) \
|
||||
ZEND2(UChi_10,Z1,Chi_11) \
|
||||
ZEND2(UChi_01,Z2,Chi_00) \
|
||||
ZEND2(UChi_11,Z3,Chi_10) \
|
||||
ZEND2(UChi_02,Z4,Chi_02) \
|
||||
ZEND2(UChi_12,Z5,Chi_12) );
|
||||
#endif
|
||||
|
||||
#define MULT_2SPIN_PFXM(ptr,pf) MULT_2SPIN(ptr)
|
||||
#define MULT_2SPIN_PFYM(ptr,pf) MULT_2SPIN(ptr)
|
||||
#define MULT_2SPIN_PFZM(ptr,pf) MULT_2SPIN(ptr)
|
||||
#define MULT_2SPIN_PFTM(ptr,pf) MULT_2SPIN(ptr)
|
||||
#define MULT_2SPIN_PFTP(ptr,pf) MULT_2SPIN(ptr)
|
||||
#define MULT_2SPIN_PFZP(ptr,pf) MULT_2SPIN(ptr)
|
||||
#define MULT_2SPIN_PFYP(ptr,pf) MULT_2SPIN(ptr)
|
||||
#define MULT_2SPIN_PFXP(ptr,pf) MULT_2SPIN(ptr)
|
||||
|
||||
// MULT_2SPINa(ptr) MULT_2SPIN_PF(ptr,ptr,VPREFETCHG);
|
||||
|
||||
#if 0
|
||||
#define MULT_2SPIN_PF(ptr,pf,VPF) \
|
||||
LOAD64(%r8,ptr) \
|
||||
LOAD64(%r9,pf) \
|
||||
__asm__ ( \
|
||||
ZMULMEM2SP(0,%r8,Uri,Chi_00,Chi_10,UChi_00,Z0,UChi_10,Z1) \
|
||||
VPF(0,%r9) \
|
||||
ZMULMEM2SP(3,%r8,Uri,Chi_00,Chi_10,UChi_01,Z2,UChi_11,Z3) \
|
||||
VPF(1,%r9) \
|
||||
ZMULMEM2SP(6,%r8,Uri,Chi_00,Chi_10,UChi_02,Z4,UChi_12,Z5) \
|
||||
VPF(2,%r9) \
|
||||
\
|
||||
ZMADDMEM2SP(1,%r8,Uri,Chi_01,Chi_11,UChi_00,Z0,UChi_10,Z1) \
|
||||
VPF(3,%r9) \
|
||||
ZMADDMEM2SP(4,%r8,Uri,Chi_01,Chi_11,UChi_01,Z2,UChi_11,Z3) \
|
||||
VPF(4,%r9) \
|
||||
ZMADDMEM2SP(7,%r8,Uri,Chi_01,Chi_11,UChi_02,Z4,UChi_12,Z5) \
|
||||
VPF(5,%r9) \
|
||||
\
|
||||
ZMADDMEM2SP(2,%r8,Uri,Chi_02,Chi_12,UChi_00,Z0,UChi_10,Z1) \
|
||||
VPF(6,%r9) \
|
||||
ZMADDMEM2SP(5,%r8,Uri,Chi_02,Chi_12,UChi_01,Z2,UChi_11,Z3) \
|
||||
VPF(7,%r9) \
|
||||
ZMADDMEM2SP(8,%r8,Uri,Chi_02,Chi_12,UChi_02,Z4,UChi_12,Z5) \
|
||||
VPF(8,%r9) \
|
||||
\
|
||||
ZEND1(UChi_00,Z0,Chi_01) \
|
||||
ZEND1(UChi_10,Z1,Chi_11) \
|
||||
ZEND1(UChi_01,Z2,Chi_00) \
|
||||
ZEND1(UChi_11,Z3,Chi_10) \
|
||||
VPF(9,%r9) \
|
||||
ZEND1(UChi_02,Z4,Chi_02) \
|
||||
ZEND1(UChi_12,Z5,Chi_12) \
|
||||
ZEND2(UChi_00,Z0,Chi_01) \
|
||||
ZEND2(UChi_10,Z1,Chi_11) \
|
||||
VPF(10,%r9) \
|
||||
ZEND2(UChi_01,Z2,Chi_00) \
|
||||
ZEND2(UChi_11,Z3,Chi_10) \
|
||||
ZEND2(UChi_02,Z4,Chi_02) \
|
||||
VPF(11,%r9) \
|
||||
ZEND2(UChi_12,Z5,Chi_12) );
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
#define MULT_2SPIN_PFNONE(ptr,pf,VPF) \
|
||||
LOAD64(%r8,ptr) \
|
||||
LOAD64(%r9,pf) \
|
||||
__asm__ ( \
|
||||
VPF(0,%r9) \
|
||||
VPF(1,%r9) \
|
||||
VPF(2,%r9) \
|
||||
\
|
||||
VPF(3,%r9) \
|
||||
VPF(4,%r9) \
|
||||
VPF(5,%r9) \
|
||||
\
|
||||
VPF(6,%r9) \
|
||||
VPF(7,%r9) \
|
||||
VPF(8,%r9) \
|
||||
\
|
||||
VPF(9,%r9) \
|
||||
VPF(10,%r9) \
|
||||
VPF(11,%r9) );
|
||||
#endif
|
||||
|
||||
// Pretty much Perfectly Pipelined
|
||||
#define MULT_2SPIN_PFXM(ptr,pf) MULT_2SPIN(ptr,pf)
|
||||
#define MULT_2SPIN_PFYM(ptr,pf) MULT_2SPIN(ptr,pf)
|
||||
#define MULT_2SPIN_PFZM(ptr,pf) MULT_2SPIN(ptr,pf)
|
||||
#define MULT_2SPIN_PFTM(ptr,pf) MULT_2SPIN(ptr,pf)
|
||||
#define MULT_2SPIN_PFTP(ptr,pf) MULT_2SPIN(ptr,pf)
|
||||
#define MULT_2SPIN_PFZP(ptr,pf) MULT_2SPIN(ptr,pf)
|
||||
#define MULT_2SPIN_PFYP(ptr,pf) MULT_2SPIN(ptr,pf)
|
||||
#define MULT_2SPIN_PFXP(ptr,pf) MULT_2SPIN(ptr,pf)
|
||||
|
||||
//////////////////////////////////////////////////////////////////
|
||||
// Dirac algebra
|
||||
@ -490,7 +325,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
LOAD64(%r8,ptr) \
|
||||
__asm__ ( \
|
||||
LOAD_CHIMU01i \
|
||||
VSUBMEM(6,%r8 ,Chimu_00,Chi_00) \
|
||||
VSUBMEM(6,%r8,Chimu_00,Chi_00) \
|
||||
VSUBMEM(7,%r8,Chimu_01,Chi_01) \
|
||||
VSUBMEM(8,%r8,Chimu_02,Chi_02) \
|
||||
VSUBMEM(9,%r8,Chimu_10,Chi_10) \
|
||||
@ -503,18 +338,18 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
// fspin(3)=timesMinusI(hspin(0))
|
||||
#define XP_RECON __asm__ ( \
|
||||
VZERO(TMP) \
|
||||
VMOV(UChi_00,result_00) \
|
||||
VMOV(UChi_01,result_01) \
|
||||
VMOV(UChi_02,result_02) \
|
||||
VMOV(UChi_10,result_10) \
|
||||
VMOV(UChi_11,result_11) \
|
||||
VMOV(UChi_12,result_12) \
|
||||
VTIMESMINUSI0(UChi_10,result_20,TMP) \
|
||||
VTIMESMINUSI0(UChi_11,result_21,TMP) \
|
||||
VTIMESMINUSI0(UChi_12,result_22,TMP) \
|
||||
VTIMESMINUSI0(UChi_00,result_30,TMP) \
|
||||
VTIMESMINUSI0(UChi_10,result_20,TMP) \
|
||||
VTIMESMINUSI0(UChi_01,result_31,TMP) \
|
||||
VTIMESMINUSI0(UChi_11,result_21,TMP) \
|
||||
VTIMESMINUSI0(UChi_02,result_32,TMP) \
|
||||
VTIMESMINUSI0(UChi_12,result_22,TMP) \
|
||||
VMOV(UChi_00,result_00) \
|
||||
VMOV(UChi_10,result_10) \
|
||||
VMOV(UChi_01,result_01) \
|
||||
VMOV(UChi_11,result_11) \
|
||||
VMOV(UChi_02,result_02) \
|
||||
VMOV(UChi_12,result_12) \
|
||||
VTIMESMINUSI1(UChi_10,result_20,TMP) \
|
||||
VTIMESMINUSI1(UChi_11,result_21,TMP) \
|
||||
VTIMESMINUSI1(UChi_12,result_22,TMP) \
|
||||
@ -531,24 +366,24 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
// NB could save 6 ops using addsub => 12 cycles
|
||||
#define XP_RECON_ACCUM __asm__ ( \
|
||||
VZERO(TMP)\
|
||||
VADD(UChi_00,result_00,result_00)\
|
||||
VADD(UChi_01,result_01,result_01)\
|
||||
VADD(UChi_02,result_02,result_02)\
|
||||
VADD(UChi_10,result_10,result_10)\
|
||||
VADD(UChi_11,result_11,result_11)\
|
||||
VADD(UChi_12,result_12,result_12)\
|
||||
VACCTIMESMINUSI0(UChi_10,result_20,Z0)\
|
||||
VACCTIMESMINUSI0(UChi_11,result_21,Z1)\
|
||||
VACCTIMESMINUSI0(UChi_12,result_22,Z2)\
|
||||
VACCTIMESMINUSI0(UChi_00,result_30,Z3)\
|
||||
VACCTIMESMINUSI0(UChi_10,result_20,Z0)\
|
||||
VACCTIMESMINUSI0(UChi_01,result_31,Z4)\
|
||||
VACCTIMESMINUSI0(UChi_11,result_21,Z1)\
|
||||
VACCTIMESMINUSI0(UChi_02,result_32,Z5)\
|
||||
VACCTIMESMINUSI1(UChi_10,result_20,Z0)\
|
||||
VACCTIMESMINUSI1(UChi_11,result_21,Z1)\
|
||||
VACCTIMESMINUSI1(UChi_12,result_22,Z2)\
|
||||
VACCTIMESMINUSI0(UChi_12,result_22,Z2)\
|
||||
VADD(UChi_00,result_00,result_00)\
|
||||
VADD(UChi_10,result_10,result_10)\
|
||||
VADD(UChi_01,result_01,result_01)\
|
||||
VADD(UChi_11,result_11,result_11)\
|
||||
VADD(UChi_02,result_02,result_02)\
|
||||
VADD(UChi_12,result_12,result_12)\
|
||||
VACCTIMESMINUSI1(UChi_00,result_30,Z3)\
|
||||
VACCTIMESMINUSI1(UChi_10,result_20,Z0)\
|
||||
VACCTIMESMINUSI1(UChi_01,result_31,Z4)\
|
||||
VACCTIMESMINUSI1(UChi_11,result_21,Z1)\
|
||||
VACCTIMESMINUSI1(UChi_02,result_32,Z5)\
|
||||
VACCTIMESMINUSI1(UChi_12,result_22,Z2)\
|
||||
VACCTIMESMINUSI2(UChi_10,result_20,Z0)\
|
||||
VACCTIMESMINUSI2(UChi_11,result_21,Z1)\
|
||||
VACCTIMESMINUSI2(UChi_12,result_22,Z2)\
|
||||
@ -559,24 +394,24 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
#define XM_RECON __asm__ ( \
|
||||
VZERO(TMP)\
|
||||
VMOV(UChi_00,result_00)\
|
||||
VMOV(UChi_01,result_01)\
|
||||
VMOV(UChi_02,result_02)\
|
||||
VMOV(UChi_10,result_10)\
|
||||
VMOV(UChi_11,result_11)\
|
||||
VMOV(UChi_12,result_12)\
|
||||
VTIMESI0(UChi_10,result_20,TMP)\
|
||||
VTIMESI0(UChi_11,result_21,TMP)\
|
||||
VTIMESI0(UChi_12,result_22,TMP)\
|
||||
VTIMESI0(UChi_00,result_30,TMP)\
|
||||
VTIMESI0(UChi_10,result_20,TMP)\
|
||||
VTIMESI0(UChi_01,result_31,TMP)\
|
||||
VTIMESI0(UChi_11,result_21,TMP)\
|
||||
VTIMESI0(UChi_02,result_32,TMP)\
|
||||
VTIMESI1(UChi_10,result_20,TMP)\
|
||||
VTIMESI1(UChi_11,result_21,TMP)\
|
||||
VTIMESI1(UChi_12,result_22,TMP)\
|
||||
VTIMESI0(UChi_12,result_22,TMP)\
|
||||
VMOV(UChi_00,result_00)\
|
||||
VMOV(UChi_10,result_10)\
|
||||
VMOV(UChi_01,result_01)\
|
||||
VMOV(UChi_11,result_11)\
|
||||
VMOV(UChi_02,result_02)\
|
||||
VMOV(UChi_12,result_12)\
|
||||
VTIMESI1(UChi_00,result_30,TMP)\
|
||||
VTIMESI1(UChi_10,result_20,TMP)\
|
||||
VTIMESI1(UChi_01,result_31,TMP)\
|
||||
VTIMESI1(UChi_11,result_21,TMP)\
|
||||
VTIMESI1(UChi_02,result_32,TMP)\
|
||||
VTIMESI1(UChi_12,result_22,TMP)\
|
||||
VTIMESI2(UChi_10,result_20,TMP)\
|
||||
VTIMESI2(UChi_11,result_21,TMP)\
|
||||
VTIMESI2(UChi_12,result_22,TMP)\
|
||||
@ -586,23 +421,25 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
);
|
||||
|
||||
#define XM_RECON_ACCUM __asm__ ( \
|
||||
VADD(UChi_00,result_00,result_00)\
|
||||
VADD(UChi_01,result_01,result_01)\
|
||||
VADD(UChi_02,result_02,result_02)\
|
||||
VADD(UChi_10,result_10,result_10)\
|
||||
VADD(UChi_11,result_11,result_11)\
|
||||
VADD(UChi_12,result_12,result_12)\
|
||||
VACCTIMESI0(UChi_10,result_20,Z0)\
|
||||
VACCTIMESI0(UChi_11,result_21,Z1)\
|
||||
VACCTIMESI0(UChi_12,result_22,Z2)\
|
||||
VACCTIMESI0(UChi_00,result_30,Z3)\
|
||||
VACCTIMESI0(UChi_11,result_21,Z1)\
|
||||
VACCTIMESI0(UChi_01,result_31,Z4)\
|
||||
VACCTIMESI0(UChi_12,result_22,Z2)\
|
||||
VACCTIMESI0(UChi_02,result_32,Z5)\
|
||||
\
|
||||
VADD(UChi_10,result_10,result_10)\
|
||||
VADD(UChi_00,result_00,result_00)\
|
||||
VADD(UChi_11,result_11,result_11)\
|
||||
VADD(UChi_01,result_01,result_01)\
|
||||
VADD(UChi_12,result_12,result_12)\
|
||||
VADD(UChi_02,result_02,result_02)\
|
||||
\
|
||||
VACCTIMESI1(UChi_10,result_20,Z0)\
|
||||
VACCTIMESI1(UChi_11,result_21,Z1)\
|
||||
VACCTIMESI1(UChi_12,result_22,Z2)\
|
||||
VACCTIMESI1(UChi_00,result_30,Z3)\
|
||||
VACCTIMESI1(UChi_11,result_21,Z1)\
|
||||
VACCTIMESI1(UChi_01,result_31,Z4)\
|
||||
VACCTIMESI1(UChi_12,result_22,Z2)\
|
||||
VACCTIMESI1(UChi_02,result_32,Z5)\
|
||||
VACCTIMESI2(UChi_10,result_20,Z0)\
|
||||
VACCTIMESI2(UChi_11,result_21,Z1)\
|
||||
@ -614,10 +451,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
#define YP_RECON_ACCUM __asm__ ( \
|
||||
VADD(UChi_00,result_00,result_00)\
|
||||
VADD(UChi_01,result_01,result_01)\
|
||||
VADD(UChi_02,result_02,result_02)\
|
||||
VADD(UChi_10,result_10,result_10)\
|
||||
VADD(UChi_01,result_01,result_01)\
|
||||
VADD(UChi_11,result_11,result_11)\
|
||||
VADD(UChi_02,result_02,result_02)\
|
||||
VADD(UChi_12,result_12,result_12)\
|
||||
VADD(UChi_10,result_20,result_20)\
|
||||
VADD(UChi_11,result_21,result_21)\
|
||||
@ -628,10 +465,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
#define YM_RECON_ACCUM __asm__ ( \
|
||||
VADD(UChi_00,result_00,result_00)\
|
||||
VADD(UChi_01,result_01,result_01)\
|
||||
VADD(UChi_02,result_02,result_02)\
|
||||
VADD(UChi_10,result_10,result_10)\
|
||||
VADD(UChi_01,result_01,result_01)\
|
||||
VADD(UChi_11,result_11,result_11)\
|
||||
VADD(UChi_02,result_02,result_02)\
|
||||
VADD(UChi_12,result_12,result_12)\
|
||||
VSUB(UChi_10,result_20,result_20)\
|
||||
VSUB(UChi_11,result_21,result_21)\
|
||||
@ -641,23 +478,23 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
VADD(UChi_02,result_32,result_32) );
|
||||
|
||||
#define ZP_RECON_ACCUM __asm__ ( \
|
||||
VADD(UChi_00,result_00,result_00)\
|
||||
VADD(UChi_01,result_01,result_01)\
|
||||
VADD(UChi_02,result_02,result_02)\
|
||||
VADD(UChi_10,result_10,result_10)\
|
||||
VADD(UChi_11,result_11,result_11)\
|
||||
VADD(UChi_12,result_12,result_12)\
|
||||
VACCTIMESMINUSI0(UChi_00,result_20,Z0)\
|
||||
VACCTIMESMINUSI0(UChi_01,result_21,Z1)\
|
||||
VACCTIMESMINUSI0(UChi_02,result_22,Z2)\
|
||||
VACCTIMESI0(UChi_10,result_30,Z3)\
|
||||
VACCTIMESMINUSI0(UChi_01,result_21,Z1)\
|
||||
VACCTIMESI0(UChi_11,result_31,Z4)\
|
||||
VACCTIMESMINUSI0(UChi_02,result_22,Z2)\
|
||||
VACCTIMESI0(UChi_12,result_32,Z5)\
|
||||
VADD(UChi_00,result_00,result_00)\
|
||||
VADD(UChi_10,result_10,result_10)\
|
||||
VADD(UChi_01,result_01,result_01)\
|
||||
VADD(UChi_11,result_11,result_11)\
|
||||
VADD(UChi_02,result_02,result_02)\
|
||||
VADD(UChi_12,result_12,result_12)\
|
||||
VACCTIMESMINUSI1(UChi_00,result_20,Z0)\
|
||||
VACCTIMESMINUSI1(UChi_01,result_21,Z1)\
|
||||
VACCTIMESMINUSI1(UChi_02,result_22,Z2)\
|
||||
VACCTIMESI1(UChi_10,result_30,Z3)\
|
||||
VACCTIMESMINUSI1(UChi_01,result_21,Z1)\
|
||||
VACCTIMESI1(UChi_11,result_31,Z4)\
|
||||
VACCTIMESMINUSI1(UChi_02,result_22,Z2)\
|
||||
VACCTIMESI1(UChi_12,result_32,Z5)\
|
||||
VACCTIMESMINUSI2(UChi_00,result_20,Z0)\
|
||||
VACCTIMESMINUSI2(UChi_01,result_21,Z1)\
|
||||
@ -668,23 +505,23 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
);
|
||||
|
||||
#define ZM_RECON_ACCUM __asm__ ( \
|
||||
VADD(UChi_00,result_00,result_00)\
|
||||
VADD(UChi_01,result_01,result_01)\
|
||||
VADD(UChi_02,result_02,result_02)\
|
||||
VADD(UChi_10,result_10,result_10)\
|
||||
VADD(UChi_11,result_11,result_11)\
|
||||
VADD(UChi_12,result_12,result_12)\
|
||||
VACCTIMESI0(UChi_00,result_20,Z0)\
|
||||
VACCTIMESI0(UChi_01,result_21,Z1)\
|
||||
VACCTIMESI0(UChi_02,result_22,Z2)\
|
||||
VACCTIMESMINUSI0(UChi_10,result_30,Z3)\
|
||||
VACCTIMESI0(UChi_01,result_21,Z1)\
|
||||
VACCTIMESMINUSI0(UChi_11,result_31,Z4)\
|
||||
VACCTIMESI0(UChi_02,result_22,Z2)\
|
||||
VACCTIMESMINUSI0(UChi_12,result_32,Z5)\
|
||||
VADD(UChi_00,result_00,result_00)\
|
||||
VADD(UChi_10,result_10,result_10)\
|
||||
VADD(UChi_01,result_01,result_01)\
|
||||
VADD(UChi_11,result_11,result_11)\
|
||||
VADD(UChi_02,result_02,result_02)\
|
||||
VADD(UChi_12,result_12,result_12)\
|
||||
VACCTIMESI1(UChi_00,result_20,Z0)\
|
||||
VACCTIMESI1(UChi_01,result_21,Z1)\
|
||||
VACCTIMESI1(UChi_02,result_22,Z2)\
|
||||
VACCTIMESMINUSI1(UChi_10,result_30,Z3)\
|
||||
VACCTIMESI1(UChi_01,result_21,Z1)\
|
||||
VACCTIMESMINUSI1(UChi_11,result_31,Z4)\
|
||||
VACCTIMESI1(UChi_02,result_22,Z2)\
|
||||
VACCTIMESMINUSI1(UChi_12,result_32,Z5)\
|
||||
VACCTIMESI2(UChi_00,result_20,Z0)\
|
||||
VACCTIMESI2(UChi_01,result_21,Z1)\
|
||||
@ -696,30 +533,30 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
#define TP_RECON_ACCUM __asm__ ( \
|
||||
VADD(UChi_00,result_00,result_00)\
|
||||
VADD(UChi_01,result_01,result_01)\
|
||||
VADD(UChi_02,result_02,result_02)\
|
||||
VADD(UChi_10,result_10,result_10)\
|
||||
VADD(UChi_01,result_01,result_01)\
|
||||
VADD(UChi_11,result_11,result_11)\
|
||||
VADD(UChi_02,result_02,result_02)\
|
||||
VADD(UChi_12,result_12,result_12)\
|
||||
VADD(UChi_00,result_20,result_20)\
|
||||
VADD(UChi_01,result_21,result_21)\
|
||||
VADD(UChi_02,result_22,result_22)\
|
||||
VADD(UChi_10,result_30,result_30)\
|
||||
VADD(UChi_01,result_21,result_21)\
|
||||
VADD(UChi_11,result_31,result_31)\
|
||||
VADD(UChi_02,result_22,result_22)\
|
||||
VADD(UChi_12,result_32,result_32) );
|
||||
|
||||
#define TM_RECON_ACCUM __asm__ ( \
|
||||
VADD(UChi_00,result_00,result_00)\
|
||||
VADD(UChi_01,result_01,result_01)\
|
||||
VADD(UChi_02,result_02,result_02)\
|
||||
VADD(UChi_10,result_10,result_10)\
|
||||
VADD(UChi_01,result_01,result_01)\
|
||||
VADD(UChi_11,result_11,result_11)\
|
||||
VADD(UChi_02,result_02,result_02)\
|
||||
VADD(UChi_12,result_12,result_12)\
|
||||
VSUB(UChi_00,result_20,result_20)\
|
||||
VSUB(UChi_01,result_21,result_21)\
|
||||
VSUB(UChi_02,result_22,result_22)\
|
||||
VSUB(UChi_10,result_30,result_30)\
|
||||
VSUB(UChi_01,result_21,result_21)\
|
||||
VSUB(UChi_11,result_31,result_31)\
|
||||
VSUB(UChi_02,result_22,result_22)\
|
||||
VSUB(UChi_12,result_32,result_32) );
|
||||
|
||||
//define PREFETCH_CHIMU(A)
|
||||
@ -758,63 +595,200 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
#define MULT_ADDSUB_2SPIN1(ptr) \
|
||||
LOAD64(%r8,ptr)
|
||||
/*
|
||||
* __asm__ ( \
|
||||
);
|
||||
VMUL(Z0,%zmm2,%zmm3) \
|
||||
*/
|
||||
// MULT_ADDSUB_2SPIN(ptr): LOAD64(%r8,ptr) followed by one inline-asm statement.
//   - VMOVIDUP / VMOVRDUP fill Z0..Z5 from offsets 0..8 relative to %r8.
//   - VSHUF writes T1 / T2 from the Chi_0x / Chi_1x registers.
//   - VMUL initialises UChi_00..UChi_12; VMADDSUB then updates them, using
//     Z0..Z2 with T1/T2 and Z3..Z5 with the Chi registers.
//   - Loads and shuffles for the next step sit on the same lines as the
//     arithmetic of the current one.
// NOTE(review): the offsets are presumably in 64-byte units, as in the
// VMOVRDUPf / VMOVIDUPf definitions earlier in the file - confirm which
// variant VMOVIDUP / VMOVRDUP alias to.
// NOTE(review): VSHUF, VMUL and VMADDSUB are defined outside this chunk;
// the access pattern looks like a 3x3 complex matrix applied to two
// 3-component vectors - confirm.
// NOTE(review): a second #define of MULT_ADDSUB_2SPIN appears further down in
// this listing; the two are probably the old and new sides of a diff.
#define MULT_ADDSUB_2SPIN(ptr) \
|
||||
LOAD64(%r8,ptr) \
|
||||
__asm__ ( \
|
||||
VMOVIDUP(0,%r8,Z0 ) \
|
||||
VMOVIDUP(3,%r8,Z1 )\
|
||||
VMOVIDUP(6,%r8,Z2 )\
|
||||
VSHUF(Chi_00,T1) \
|
||||
VSHUF(Chi_10,T2) \
|
||||
\
|
||||
VMUL(Z0,T1,UChi_00) VMOVRDUP(0,%r8,Z3 ) \
|
||||
VMUL(Z0,T2,UChi_10) VMOVRDUP(3,%r8,Z4 ) \
|
||||
VMUL(Z1,T1,UChi_01) VMOVRDUP(6,%r8,Z5 ) \
|
||||
VMUL(Z1,T2,UChi_11) VMOVIDUP(1,%r8,Z0 ) \
|
||||
VMUL(Z2,T1,UChi_02) VMOVIDUP(4,%r8,Z1 ) \
|
||||
VMUL(Z2,T2,UChi_12) VMOVIDUP(7,%r8,Z2 ) \
|
||||
\
|
||||
VMADDSUB(Z3,Chi_00,UChi_00) VSHUF(Chi_01,T1) \
|
||||
VMADDSUB(Z3,Chi_10,UChi_10) VSHUF(Chi_11,T2) \
|
||||
VMADDSUB(Z4,Chi_00,UChi_01) VMOVRDUP(1,%r8,Z3 ) \
|
||||
VMADDSUB(Z4,Chi_10,UChi_11)\
|
||||
VMADDSUB(Z5,Chi_00,UChi_02) VMOVRDUP(4,%r8,Z4 ) \
|
||||
VMADDSUB(Z5,Chi_10,UChi_12)\
|
||||
\
|
||||
VMADDSUB(Z0,T1,UChi_00) VMOVRDUP(7,%r8,Z5 ) \
|
||||
VMADDSUB(Z0,T2,UChi_10)\
|
||||
VMADDSUB(Z1,T1,UChi_01) VMOVIDUP(2,%r8,Z0 ) \
|
||||
VMADDSUB(Z1,T2,UChi_11)\
|
||||
VMADDSUB(Z2,T1,UChi_02) VMOVIDUP(5,%r8,Z1 ) \
|
||||
VMADDSUB(Z2,T2,UChi_12) VMOVIDUP(8,%r8,Z2 ) \
|
||||
\
|
||||
VMADDSUB(Z3,Chi_01,UChi_00) VSHUF(Chi_02,T1) \
|
||||
VMADDSUB(Z3,Chi_11,UChi_10) VSHUF(Chi_12,T2) \
|
||||
VMADDSUB(Z4,Chi_01,UChi_01) VMOVRDUP(2,%r8,Z3 ) \
|
||||
VMADDSUB(Z4,Chi_11,UChi_11)\
|
||||
VMADDSUB(Z5,Chi_01,UChi_02) VMOVRDUP(5,%r8,Z4 ) \
|
||||
VMADDSUB(Z5,Chi_11,UChi_12)\
|
||||
\
|
||||
VMADDSUB(Z0,T1,UChi_00) VMOVRDUP(8,%r8,Z5 ) \
|
||||
VMADDSUB(Z0,T2,UChi_10)\
|
||||
VMADDSUB(Z1,T1,UChi_01)\
|
||||
VMADDSUB(Z1,T2,UChi_11)\
|
||||
VMADDSUB(Z2,T1,UChi_02)\
|
||||
VMADDSUB(Z2,T2,UChi_12)\
|
||||
\
|
||||
VMADDSUB(Z3,Chi_02,UChi_00)\
|
||||
VMADDSUB(Z3,Chi_12,UChi_10)\
|
||||
VMADDSUB(Z4,Chi_02,UChi_01)\
|
||||
VMADDSUB(Z4,Chi_12,UChi_11)\
|
||||
VMADDSUB(Z5,Chi_02,UChi_02)\
|
||||
VMADDSUB(Z5,Chi_12,UChi_12)\
|
||||
);
|
||||
|
||||
// MULT_2SPIN(ptr) is a plain alias that forwards to MULT_ADDSUB_2SPIN(ptr).
#define MULT_2SPIN(ptr) MULT_ADDSUB_2SPIN(ptr)
|
||||
// MULT_ADDSUB_2SPIN(ptr): LOAD64(%r8,ptr) followed by one inline-asm statement.
// The statement uses VMOVIDUP / VMOVRDUP loads at offsets 0..8 from %r8 into
// Z0..Z5, VSHUF of Chi_* into T1/T2, VMUL to initialise UChi_00..UChi_12 and
// VMADDSUB to update them.
// In this definition the first VSHUF is issued before the loads, and the
// T2 shuffles for Chi_11 / Chi_12 are each placed on the fourth line of their
// VMADDSUB group.
// The bare numeric comments and the final "61 insns" are the author's own
// running markers.
// NOTE(review): the offsets are presumably in 64-byte units, as in the
// VMOVRDUPf / VMOVIDUPf definitions earlier in the file - confirm.
// NOTE(review): an earlier #define of the same name appears above in this
// listing with a different schedule; the two are probably the old and new
// sides of a diff, so only one is expected in the real header - confirm.
#define MULT_ADDSUB_2SPIN(ptr) \
|
||||
LOAD64(%r8,ptr) \
|
||||
__asm__ ( \
|
||||
VSHUF(Chi_00,T1) \
|
||||
VMOVIDUP(0,%r8,Z0 ) \
|
||||
VMOVIDUP(3,%r8,Z1 ) \
|
||||
VMOVIDUP(6,%r8,Z2 ) VSHUF(Chi_10,T2) \
|
||||
/*6*/ \
|
||||
VMUL(Z0,T1,UChi_00) VMOVRDUP(0,%r8,Z3 ) \
|
||||
VMUL(Z0,T2,UChi_10) VMOVRDUP(3,%r8,Z4 ) \
|
||||
VMUL(Z1,T1,UChi_01) VMOVRDUP(6,%r8,Z5 ) \
|
||||
VMUL(Z1,T2,UChi_11) VMOVIDUP(1,%r8,Z0 ) \
|
||||
VMUL(Z2,T1,UChi_02) VMOVIDUP(4,%r8,Z1 ) \
|
||||
VMUL(Z2,T2,UChi_12) VMOVIDUP(7,%r8,Z2 ) \
|
||||
/*18*/ \
|
||||
VMADDSUB(Z3,Chi_00,UChi_00) VSHUF(Chi_01,T1) \
|
||||
VMADDSUB(Z3,Chi_10,UChi_10) \
|
||||
VMADDSUB(Z4,Chi_00,UChi_01) VMOVRDUP(1,%r8,Z3 ) \
|
||||
VMADDSUB(Z4,Chi_10,UChi_11) VSHUF(Chi_11,T2) \
|
||||
VMADDSUB(Z5,Chi_00,UChi_02) VMOVRDUP(4,%r8,Z4 ) \
|
||||
VMADDSUB(Z5,Chi_10,UChi_12) \
|
||||
/*28*/ \
|
||||
VMADDSUB(Z0,T1,UChi_00) VMOVRDUP(7,%r8,Z5 ) \
|
||||
VMADDSUB(Z0,T2,UChi_10) \
|
||||
VMADDSUB(Z1,T1,UChi_01) VMOVIDUP(2,%r8,Z0 ) \
|
||||
VMADDSUB(Z1,T2,UChi_11) \
|
||||
VMADDSUB(Z2,T1,UChi_02) VMOVIDUP(5,%r8,Z1 ) \
|
||||
VMADDSUB(Z2,T2,UChi_12) VMOVIDUP(8,%r8,Z2 ) \
|
||||
/*38*/ \
|
||||
VMADDSUB(Z3,Chi_01,UChi_00) VSHUF(Chi_02,T1) \
|
||||
VMADDSUB(Z3,Chi_11,UChi_10) \
|
||||
VMADDSUB(Z4,Chi_01,UChi_01) VMOVRDUP(2,%r8,Z3 ) \
|
||||
VMADDSUB(Z4,Chi_11,UChi_11) VSHUF(Chi_12,T2) \
|
||||
VMADDSUB(Z5,Chi_01,UChi_02) VMOVRDUP(5,%r8,Z4 ) \
|
||||
VMADDSUB(Z5,Chi_11,UChi_12) \
|
||||
/*48*/ \
|
||||
VMADDSUB(Z0,T1,UChi_00) VMOVRDUP(8,%r8,Z5 ) \
|
||||
VMADDSUB(Z0,T2,UChi_10) \
|
||||
VMADDSUB(Z1,T1,UChi_01) \
|
||||
VMADDSUB(Z1,T2,UChi_11) \
|
||||
VMADDSUB(Z2,T1,UChi_02) \
|
||||
VMADDSUB(Z2,T2,UChi_12) \
|
||||
/*55*/ \
|
||||
VMADDSUB(Z3,Chi_02,UChi_00) \
|
||||
VMADDSUB(Z3,Chi_12,UChi_10) \
|
||||
VMADDSUB(Z4,Chi_02,UChi_01) \
|
||||
VMADDSUB(Z4,Chi_12,UChi_11) \
|
||||
VMADDSUB(Z5,Chi_02,UChi_02) \
|
||||
VMADDSUB(Z5,Chi_12,UChi_12) \
|
||||
/*61 insns*/ );
|
||||
|
||||
|
||||
// MULT_ADDSUB_2SPIN_LS(ptr,pf): LOAD64(%r8,ptr) and LOAD64(%r9,pf), then one
// inline-asm statement.
//   - Arithmetic uses VMULIDUP / VMADDSUBRDUP / VMADDSUBIDUP, which take an
//     (offset, %r8) operand directly instead of a preloaded Z register.
//   - VSHUF writes T1 / T2 from the Chi_0x / Chi_1x registers.
//   - Results go to UChi_00..UChi_12.
//   - Prefetches are interleaved between the arithmetic groups:
//     VPREFETCHG for offsets 0..11 of %r9, VPREFETCH2 for offsets 12..23 of
//     %r9, then VPREFETCHG(2..3) and VPREFETCH2(4..5) relative to %r8.
// NOTE(review): the VPREFETCHG / VPREFETCH2 definitions (and the cache level
// each targets) are outside this chunk; the offset units are presumably
// 64-byte lines as for the other (offset, pointer) macros - confirm.
// The bare numeric comments are the author's own running markers.
#define MULT_ADDSUB_2SPIN_LS(ptr,pf) \
|
||||
LOAD64(%r8,ptr) \
|
||||
LOAD64(%r9,pf) \
|
||||
__asm__ ( \
|
||||
VSHUF(Chi_00,T1) VSHUF(Chi_10,T2) \
|
||||
VMULIDUP(0,%r8,T1,UChi_00) VMULIDUP(0,%r8,T2,UChi_10) \
|
||||
VMULIDUP(3,%r8,T1,UChi_01) VMULIDUP(3,%r8,T2,UChi_11) \
|
||||
VMULIDUP(6,%r8,T1,UChi_02) VMULIDUP(6,%r8,T2,UChi_12) \
|
||||
VPREFETCHG(0,%r9) \
|
||||
VPREFETCHG(1,%r9) \
|
||||
VPREFETCHG(2,%r9) \
|
||||
VPREFETCHG(3,%r9) \
|
||||
/*8*/ \
|
||||
VSHUF(Chi_01,T1) VSHUF(Chi_11,T2) \
|
||||
VMADDSUBRDUP(0,%r8,Chi_00,UChi_00) VMADDSUBRDUP(0,%r8,Chi_10,UChi_10) \
|
||||
VMADDSUBRDUP(3,%r8,Chi_00,UChi_01) VMADDSUBRDUP(3,%r8,Chi_10,UChi_11) \
|
||||
VMADDSUBRDUP(6,%r8,Chi_00,UChi_02) VMADDSUBRDUP(6,%r8,Chi_10,UChi_12) \
|
||||
VPREFETCHG(4,%r9) \
|
||||
VPREFETCHG(5,%r9) \
|
||||
VPREFETCHG(6,%r9) \
|
||||
VPREFETCHG(7,%r9) \
|
||||
/*16*/ \
|
||||
VMADDSUBIDUP(1,%r8,T1,UChi_00) VMADDSUBIDUP(1,%r8,T2,UChi_10) \
|
||||
VMADDSUBIDUP(4,%r8,T1,UChi_01) VMADDSUBIDUP(4,%r8,T2,UChi_11) \
|
||||
VMADDSUBIDUP(7,%r8,T1,UChi_02) VMADDSUBIDUP(7,%r8,T2,UChi_12) \
|
||||
VPREFETCHG(8,%r9) \
|
||||
VPREFETCHG(9,%r9) \
|
||||
VPREFETCHG(10,%r9) \
|
||||
VPREFETCHG(11,%r9) \
|
||||
/*22*/ \
|
||||
VSHUF(Chi_02,T1) VSHUF(Chi_12,T2) \
|
||||
VMADDSUBRDUP(1,%r8,Chi_01,UChi_00) VMADDSUBRDUP(1,%r8,Chi_11,UChi_10) \
|
||||
VMADDSUBRDUP(4,%r8,Chi_01,UChi_01) VMADDSUBRDUP(4,%r8,Chi_11,UChi_11) \
|
||||
VMADDSUBRDUP(7,%r8,Chi_01,UChi_02) VMADDSUBRDUP(7,%r8,Chi_11,UChi_12) \
|
||||
VPREFETCH2(12,%r9) \
|
||||
VPREFETCH2(13,%r9) \
|
||||
VPREFETCH2(14,%r9) \
|
||||
VPREFETCH2(15,%r9) \
|
||||
/*30*/ \
|
||||
VMADDSUBIDUP(2,%r8,T1,UChi_00) VMADDSUBIDUP(2,%r8,T2,UChi_10) \
|
||||
VMADDSUBIDUP(5,%r8,T1,UChi_01) VMADDSUBIDUP(5,%r8,T2,UChi_11) \
|
||||
VPREFETCH2(16,%r9) \
|
||||
VPREFETCH2(17,%r9) \
|
||||
VPREFETCH2(18,%r9) \
|
||||
VPREFETCH2(19,%r9) \
|
||||
VMADDSUBIDUP(8,%r8,T1,UChi_02) VMADDSUBIDUP(8,%r8,T2,UChi_12) \
|
||||
/*36*/ \
|
||||
VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r8,Chi_12,UChi_10) \
|
||||
VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r8,Chi_12,UChi_11) \
|
||||
VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r8,Chi_12,UChi_12) \
|
||||
VPREFETCH2(20,%r9) \
|
||||
VPREFETCH2(21,%r9) \
|
||||
VPREFETCH2(22,%r9) \
|
||||
VPREFETCH2(23,%r9) \
|
||||
VPREFETCHG(2,%r8) \
|
||||
VPREFETCHG(3,%r8) \
|
||||
VPREFETCH2(4,%r8) \
|
||||
VPREFETCH2(5,%r8) \
|
||||
/*42 insns*/ );
|
||||
|
||||
// MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf): the same VSHUF / VMULIDUP / VMADDSUBRDUP /
// VMADDSUBIDUP sequence as MULT_ADDSUB_2SPIN_LS, but with no active prefetch
// instructions (the only two VPREFETCHG lines are commented out).
// pf is still loaded into %r9 by LOAD64, although nothing in the asm body
// below references %r9.
// NOTE(review): the closing "42 insns" marker matches the one on the
// prefetching variant, so it is probably a copy that was never updated -
// confirm.
#define MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf) \
|
||||
LOAD64(%r8,ptr) \
|
||||
LOAD64(%r9,pf) \
|
||||
__asm__ ( \
|
||||
VSHUF(Chi_00,T1) VSHUF(Chi_10,T2) \
|
||||
VMULIDUP(0,%r8,T1,UChi_00) VMULIDUP(0,%r8,T2,UChi_10) \
|
||||
VMULIDUP(3,%r8,T1,UChi_01) VMULIDUP(3,%r8,T2,UChi_11) \
|
||||
VMULIDUP(6,%r8,T1,UChi_02) VMULIDUP(6,%r8,T2,UChi_12) \
|
||||
/*8*/ \
|
||||
VSHUF(Chi_01,T1) VSHUF(Chi_11,T2) \
|
||||
VMADDSUBRDUP(0,%r8,Chi_00,UChi_00) VMADDSUBRDUP(0,%r8,Chi_10,UChi_10) \
|
||||
VMADDSUBRDUP(3,%r8,Chi_00,UChi_01) VMADDSUBRDUP(3,%r8,Chi_10,UChi_11) \
|
||||
VMADDSUBRDUP(6,%r8,Chi_00,UChi_02) VMADDSUBRDUP(6,%r8,Chi_10,UChi_12) \
|
||||
/*16*/ \
|
||||
VMADDSUBIDUP(1,%r8,T1,UChi_00) VMADDSUBIDUP(1,%r8,T2,UChi_10) \
|
||||
VMADDSUBIDUP(4,%r8,T1,UChi_01) VMADDSUBIDUP(4,%r8,T2,UChi_11) \
|
||||
VMADDSUBIDUP(7,%r8,T1,UChi_02) VMADDSUBIDUP(7,%r8,T2,UChi_12) \
|
||||
/*22*/ \
|
||||
VSHUF(Chi_02,T1) VSHUF(Chi_12,T2) \
|
||||
VMADDSUBRDUP(1,%r8,Chi_01,UChi_00) VMADDSUBRDUP(1,%r8,Chi_11,UChi_10) \
|
||||
VMADDSUBRDUP(4,%r8,Chi_01,UChi_01) VMADDSUBRDUP(4,%r8,Chi_11,UChi_11) \
|
||||
VMADDSUBRDUP(7,%r8,Chi_01,UChi_02) VMADDSUBRDUP(7,%r8,Chi_11,UChi_12) \
|
||||
/*30*/ \
|
||||
VMADDSUBIDUP(2,%r8,T1,UChi_00) VMADDSUBIDUP(2,%r8,T2,UChi_10) \
|
||||
VMADDSUBIDUP(5,%r8,T1,UChi_01) VMADDSUBIDUP(5,%r8,T2,UChi_11) \
|
||||
VMADDSUBIDUP(8,%r8,T1,UChi_02) VMADDSUBIDUP(8,%r8,T2,UChi_12) \
|
||||
/*36*/ \
|
||||
VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r8,Chi_12,UChi_10) \
|
||||
VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r8,Chi_12,UChi_11) \
|
||||
VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r8,Chi_12,UChi_12) \
|
||||
/* VPREFETCHG(2,%r8)*/ \
|
||||
/* VPREFETCHG(3,%r8)*/ \
|
||||
/*42 insns*/ );
|
||||
|
||||
|
||||
// Z6 is an alias for Chi_00. MULT_ADDSUB_2SPIN_NEW below names Z6 alongside
// Z1..Z5 in its VMUL / VMADD / VADD lines, so those lines operate on the
// Chi_00 register.
#define Z6 Chi_00
|
||||
// MULT_ADDSUB_2SPIN_NEW(ptr): LOAD64(%r8,ptr) followed by one inline-asm
// statement that builds two sets of partial results and combines them:
//   - VSHUFMEM loads into Z0 from an (offset, %r8) operand. VMUL / VMADD lines
//     name Z0, a Chi_* register (rewritten in place by VIDUP) and one of
//     Z1..Z6.
//   - VRDUP writes T1 / T2 from Chi_*. VMULMEM / VMADDMEM lines name an
//     (offset, %r8) operand, T1/T2 and one of UChi_00..UChi_12.
//   - VMADDSUBMEM is applied to Z1..Z6 for offsets 2, 5 and 8.
//   - The closing VADD lines pair each Zn with a UChi_* register.
// Z6 is #defined as Chi_00. Chi_00 is named explicitly only before the first
// line that names Z6.
// The "cycles" and "stall" comments are the author's own scheduling notes.
// NOTE(review): operand order and destinations of VMUL / VMADD / VADD and
// the *MEM variants are defined outside this chunk - confirm.
// NOTE(review): unlike the sibling macros, this one ends with ")" and no
// trailing ";", so call sites presumably supply the semicolon - confirm.
#define MULT_ADDSUB_2SPIN_NEW(ptr) \
|
||||
LOAD64(%r8,ptr) \
|
||||
__asm__ ( \
|
||||
VSHUFMEM(0,%r8,Z0) \
|
||||
VRDUP(Chi_00,T1) VIDUP(Chi_00,Chi_00) \
|
||||
VRDUP(Chi_10,T2) VIDUP(Chi_10,Chi_10) \
|
||||
VMUL(Z0,Chi_00,Z1) VMUL(Z0,Chi_10,Z2) \
|
||||
VSHUFMEM(3,%r8,Z0) \
|
||||
VMUL(Z0,Chi_00,Z3) VMUL(Z0,Chi_10,Z4) \
|
||||
VSHUFMEM(6,%r8,Z0) \
|
||||
VMUL(Z0,Chi_00,Z5) VMUL(Z0,Chi_10,Z6) \
|
||||
VMULMEM(0,%r8,T1,UChi_00) VMULMEM(0,%r8,T2,UChi_10) \
|
||||
VMULMEM(3,%r8,T1,UChi_01) VMULMEM(3,%r8,T2,UChi_11) \
|
||||
VMULMEM(6,%r8,T1,UChi_02) VMULMEM(6,%r8,T2,UChi_12) \
|
||||
/*11 cycles*/ \
|
||||
VSHUFMEM(1,%r8,Z0) \
|
||||
VRDUP(Chi_01,T1) VIDUP(Chi_01,Chi_01) \
|
||||
VRDUP(Chi_11,T2) VIDUP(Chi_11,Chi_11) \
|
||||
VMADD(Z0,Chi_01,Z1) VMADD(Z0,Chi_11,Z2) \
|
||||
VSHUFMEM(4,%r8,Z0) \
|
||||
VMADD(Z0,Chi_01,Z3) VMADD(Z0,Chi_11,Z4) \
|
||||
VSHUFMEM(7,%r8,Z0) \
|
||||
VMADD(Z0,Chi_01,Z5) VMADD(Z0,Chi_11,Z6) \
|
||||
VMADDMEM(1,%r8,T1,UChi_00) VMADDMEM(1,%r8,T2,UChi_10) \
|
||||
VMADDMEM(4,%r8,T1,UChi_01) VMADDMEM(4,%r8,T2,UChi_11) \
|
||||
VMADDMEM(7,%r8,T1,UChi_02) VMADDMEM(7,%r8,T2,UChi_12) \
|
||||
/*22 cycles*/ \
|
||||
VSHUFMEM(2,%r8,Z0) \
|
||||
VRDUP(Chi_02,T1) VIDUP(Chi_02,Chi_02) \
|
||||
VRDUP(Chi_12,T2) VIDUP(Chi_12,Chi_12) \
|
||||
VMADD(Z0,Chi_02,Z1) VMADD(Z0,Chi_12,Z2) \
|
||||
VSHUFMEM(5,%r8,Z0) \
|
||||
VMADD(Z0,Chi_02,Z3) VMADD(Z0,Chi_12,Z4) \
|
||||
VSHUFMEM(8,%r8,Z0) \
|
||||
VMADD(Z0,Chi_02,Z5) VMADD(Z0,Chi_12,Z6) \
|
||||
/*33 cycles*/ \
|
||||
VMADDSUBMEM(2,%r8,T1,Z1) VMADDSUBMEM(2,%r8,T2,Z2) \
|
||||
VMADDSUBMEM(5,%r8,T1,Z3) VMADDSUBMEM(5,%r8,T2,Z4) \
|
||||
VMADDSUBMEM(8,%r8,T1,Z5) VMADDSUBMEM(8,%r8,T2,Z6) \
|
||||
/*stall*/ \
|
||||
/*stall*/ \
|
||||
/*stall*/ \
|
||||
VADD(Z1,UChi_00,UChi_00) VADD(Z2,UChi_10,UChi_10) \
|
||||
VADD(Z3,UChi_01,UChi_01) VADD(Z4,UChi_11,UChi_11) \
|
||||
VADD(Z5,UChi_02,UChi_02) VADD(Z6,UChi_12,UChi_12) )
|
||||
|
||||
|
||||
#endif
|
||||
|
Reference in New Issue
Block a user