mirror of
https://github.com/paboyle/Grid.git
synced 2025-04-04 19:25:56 +01:00
BGQ performance ASM
This commit is contained in:
parent
04ae7929a3
commit
eabf316ed9
162
lib/qcd/action/fermion/WilsonKernelsAsmAvx512.h
Normal file
162
lib/qcd/action/fermion/WilsonKernelsAsmAvx512.h
Normal file
@ -0,0 +1,162 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/WilsonKernelsAsmAvx512.h
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
|
||||
|
||||
#if defined(AVX512)
|
||||
///////////////////////////////////////////////////////////
|
||||
// If we are AVX512 specialise the single precision routine
|
||||
///////////////////////////////////////////////////////////
|
||||
#include <simd/Intel512wilson.h>
|
||||
#include <simd/Intel512single.h>
|
||||
|
||||
static Vector<vComplexF> signsF;
|
||||
|
||||
template<typename vtype>
|
||||
int setupSigns(Vector<vtype>& signs ){
|
||||
Vector<vtype> bother(2);
|
||||
signs = bother;
|
||||
vrsign(signs[0]);
|
||||
visign(signs[1]);
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int signInitF = setupSigns(signsF);
|
||||
#define MAYBEPERM(A,perm) if (perm) { A ; }
|
||||
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
|
||||
#define COMPLEX_SIGNS(isigns) vComplexF *isigns = &signsF[0];
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// XYZT vectorised, undag Kernel, single
|
||||
/////////////////////////////////////////////////////////////////
|
||||
#undef KERNEL_DAG
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// XYZT vectorised, dag Kernel, single
|
||||
/////////////////////////////////////////////////////////////////
|
||||
#define KERNEL_DAG
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
#undef MAYBEPERM
|
||||
#undef MULT_2SPIN
|
||||
#define MAYBEPERM(A,B)
|
||||
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// Ls vectorised, undag Kernel, single
|
||||
/////////////////////////////////////////////////////////////////
|
||||
#undef KERNEL_DAG
|
||||
template<> void
|
||||
WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// Ls vectorised, dag Kernel, single
|
||||
/////////////////////////////////////////////////////////////////
|
||||
#define KERNEL_DAG
|
||||
template<> void
|
||||
WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
#undef COMPLEX_SIGNS
|
||||
#undef MAYBEPERM
|
||||
#undef MULT_2SPIN
|
||||
|
||||
///////////////////////////////////////////////////////////
|
||||
// If we are AVX512 specialise the double precision routine
|
||||
///////////////////////////////////////////////////////////
|
||||
|
||||
#include <simd/Intel512double.h>
|
||||
|
||||
static Vector<vComplexD> signsD;
|
||||
static int signInitD = setupSigns(signsD);
|
||||
|
||||
#define MAYBEPERM(A,perm) if (perm) { A ; }
|
||||
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
|
||||
#define COMPLEX_SIGNS(isigns) vComplexD *isigns = &signsD[0];
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// XYZT Vectorised, undag Kernel, double
|
||||
/////////////////////////////////////////////////////////////////
|
||||
#undef KERNEL_DAG
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
/////////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// XYZT Vectorised, dag Kernel, double
|
||||
/////////////////////////////////////////////////////////////////
|
||||
#define KERNEL_DAG
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplD>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
/////////////////////////////////////////////////////////////////
|
||||
|
||||
#undef MAYBEPERM
|
||||
#undef MULT_2SPIN
|
||||
#define MAYBEPERM(A,B)
|
||||
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// Ls vectorised, undag Kernel, double
|
||||
/////////////////////////////////////////////////////////////////
|
||||
#undef KERNEL_DAG
|
||||
template<> void
|
||||
WilsonKernels<DomainWallVec5dImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
/////////////////////////////////////////////////////////////////
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// Ls vectorised, dag Kernel, double
|
||||
/////////////////////////////////////////////////////////////////
|
||||
#define KERNEL_DAG
|
||||
template<> void
|
||||
WilsonKernels<DomainWallVec5dImplD>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
/////////////////////////////////////////////////////////////////
|
||||
|
||||
#undef COMPLEX_SIGNS
|
||||
#undef MAYBEPERM
|
||||
#undef MULT_2SPIN
|
||||
|
||||
#endif //AVX512
|
@ -7,12 +7,15 @@
|
||||
// vComplexF isigns[2] = { signs[0], signs[1] };
|
||||
//COMPLEX_TYPE is vComplexF or vComplexD depending
|
||||
//on the chosen precision
|
||||
COMPLEX_TYPE *isigns = &signs[0];
|
||||
|
||||
COMPLEX_SIGNS(isigns);
|
||||
MASK_REGS;
|
||||
int nmax=U._grid->oSites();
|
||||
for(int site=0;site<Ns;site++) {
|
||||
int sU =lo.Reorder(ssU);
|
||||
|
||||
LOCK_GAUGE(0);
|
||||
|
||||
|
||||
int ssn=ssU+1;
|
||||
if(ssn>=nmax) ssn=0;
|
||||
int sUn=lo.Reorder(ssn);
|
||||
@ -251,5 +254,6 @@
|
||||
|
||||
}
|
||||
ssU++;
|
||||
UNLOCK_GAUGE(0);
|
||||
}
|
||||
}
|
||||
|
146
lib/qcd/action/fermion/WilsonKernelsAsmQPX.h
Normal file
146
lib/qcd/action/fermion/WilsonKernelsAsmQPX.h
Normal file
@ -0,0 +1,146 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/WilsonKernelsAsmQPX.h
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
|
||||
|
||||
#if defined(QPX)
|
||||
|
||||
///////////////////////////////////////////////////////////
|
||||
// If we are QPX specialise the single precision routine
|
||||
///////////////////////////////////////////////////////////
|
||||
|
||||
#include <simd/IBM_qpx.h>
|
||||
#include <simd/IBM_qpx_single.h>
|
||||
|
||||
#define MAYBEPERM(A,perm) if (perm) { A ; }
|
||||
#define MULT_2SPIN(ptr,pf) MULT_2SPIN_QPX(ptr,pf)
|
||||
#define COMPLEX_SIGNS(isigns)
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// XYZT vectorised, undag Kernel, single
|
||||
/////////////////////////////////////////////////////////////////
|
||||
#undef KERNEL_DAG
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// XYZT vectorised, dag Kernel, single
|
||||
/////////////////////////////////////////////////////////////////
|
||||
#define KERNEL_DAG
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
#undef MAYBEPERM
|
||||
#undef MULT_2SPIN
|
||||
#define MAYBEPERM(A,B)
|
||||
#define MULT_2SPIN(ptr,pf) MULT_2SPIN_QPX_LS(ptr,pf)
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// Ls vectorised, undag Kernel, single
|
||||
/////////////////////////////////////////////////////////////////
|
||||
#undef KERNEL_DAG
|
||||
template<> void
|
||||
WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// Ls vectorised, dag Kernel, single
|
||||
/////////////////////////////////////////////////////////////////
|
||||
#define KERNEL_DAG
|
||||
template<> void
|
||||
WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
#undef MAYBEPERM
|
||||
#undef MULT_2SPIN
|
||||
|
||||
///////////////////////////////////////////////////////////
|
||||
// DP routines
|
||||
///////////////////////////////////////////////////////////
|
||||
|
||||
#include <simd/IBM_qpx_double.h>
|
||||
|
||||
#define MAYBEPERM(A,perm) if (perm) { A ; }
|
||||
#define MULT_2SPIN(ptr,pf) MULT_2SPIN_QPX(ptr,pf)
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// XYZT Vectorised, undag Kernel, double
|
||||
/////////////////////////////////////////////////////////////////
|
||||
#undef KERNEL_DAG
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
/////////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// XYZT Vectorised, dag Kernel, double
|
||||
/////////////////////////////////////////////////////////////////
|
||||
#define KERNEL_DAG
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplD>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
/////////////////////////////////////////////////////////////////
|
||||
|
||||
#undef MAYBEPERM
|
||||
#undef MULT_2SPIN
|
||||
#define MAYBEPERM(A,B)
|
||||
#define MULT_2SPIN(ptr,pf) MULT_2SPIN_QPX_LS(ptr,pf)
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// Ls vectorised, undag Kernel, double
|
||||
/////////////////////////////////////////////////////////////////
|
||||
#undef KERNEL_DAG
|
||||
template<> void
|
||||
WilsonKernels<DomainWallVec5dImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
/////////////////////////////////////////////////////////////////
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// Ls vectorised, dag Kernel, double
|
||||
/////////////////////////////////////////////////////////////////
|
||||
#define KERNEL_DAG
|
||||
template<> void
|
||||
WilsonKernels<DomainWallVec5dImplD>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
/////////////////////////////////////////////////////////////////
|
||||
|
||||
#undef MAYBEPERM
|
||||
#undef MULT_2SPIN
|
||||
|
||||
#endif
|
619
lib/simd/IBM_qpx.h
Normal file
619
lib/simd/IBM_qpx.h
Normal file
@ -0,0 +1,619 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/simd/IBM_qpx.h
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#ifndef GRID_ASM_BGQ_QPX_H
|
||||
#define GRID_ASM_BGQ_QPX_H
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
/*********************************************************
|
||||
* Register definitions
|
||||
*********************************************************/
|
||||
#define psi_00 0
|
||||
#define psi_01 1
|
||||
#define psi_02 2
|
||||
|
||||
#define psi_10 3
|
||||
#define psi_11 4
|
||||
#define psi_12 5
|
||||
|
||||
#define psi_20 6
|
||||
#define psi_21 7
|
||||
#define psi_22 8
|
||||
|
||||
#define psi_30 9
|
||||
#define psi_31 10
|
||||
#define psi_32 11
|
||||
|
||||
#define Chi_00 12
|
||||
#define Chi_01 13
|
||||
#define Chi_02 14
|
||||
|
||||
#define Chi_10 15
|
||||
#define Chi_11 16
|
||||
#define Chi_12 17
|
||||
|
||||
#define UChi_00 18
|
||||
#define UChi_01 19
|
||||
#define UChi_02 20
|
||||
|
||||
#define UChi_10 21
|
||||
#define UChi_11 22
|
||||
#define UChi_12 23
|
||||
|
||||
#define U0 24
|
||||
#define U1 25
|
||||
#define U2 26
|
||||
#define one 27
|
||||
#define perm_reg 28
|
||||
|
||||
#define REP %%r16
|
||||
#define IMM %%r17
|
||||
#define pREP %r16
|
||||
#define pIMM %r17
|
||||
|
||||
#define PPC_INST_DCBTLS 0x7c00014c
|
||||
#define PPC_INST_DCBLC 0x7c00030c
|
||||
#define __PPC_CT(t) (((t) & 0x0f) << 21)
|
||||
#define ___PPC_RA(a) (((a) & 0x1f) << 16)
|
||||
#define ___PPC_RB(b) (((b) & 0x1f) << 11)
|
||||
|
||||
#define LOCK_SET ".long (" HASH(PPC_INST_DCBTLS) "|" HASH(___PPC_RB(16)) ")\n"
|
||||
#define LOCK_CLEAR ".long (" HASH(PPC_INST_DCBLC) "|" HASH(___PPC_RB(16)) ")\n"
|
||||
|
||||
/*Alias regs for incoming fourspinor on neighbour site*/
|
||||
#define Chi_20 UChi_00
|
||||
#define Chi_21 UChi_01
|
||||
#define Chi_22 UChi_02
|
||||
#define Chi_30 UChi_10
|
||||
#define Chi_31 UChi_11
|
||||
#define Chi_32 UChi_12
|
||||
|
||||
/*********************************************************
|
||||
* Architectural macros
|
||||
*********************************************************/
|
||||
#define HASHit(A) #A
|
||||
#define HASH(A) HASHit(A)
|
||||
#define LOAD64(A,ptr)
|
||||
|
||||
|
||||
#define MASK_REGS /*NOOP ON BGQ*/
|
||||
#define PF_GAUGE(A) /*NOOP ON BGQ*/
|
||||
#define PREFETCH1_CHIMU(base) /*NOOP ON BGQ*/
|
||||
#define PREFETCH_CHIMU(base) /*NOOP ON BGQ*/
|
||||
|
||||
#define VLOADf(OFF,PTR,DEST) "qvlfsx " #DEST "," #PTR "," #OFF " ;\n"
|
||||
#define VLOADuf(OFF,PTR,DEST) "qvlfsux " #DEST "," #PTR "," #OFF " ;\n"
|
||||
#define VSTOREf(OFF,PTR,SRC) "qvstfsx " #SRC "," #PTR "," #OFF " ;\n"
|
||||
#define VSTOREuf(OFF,PTR,SRC) "qvstfsux " #SRC "," #PTR "," #OFF " ;\n"
|
||||
#define VSPLATf(A,B,DEST) "qvlfcsxa " #DEST "," #A "," #B ";\n"
|
||||
#define VSIZEf (16)
|
||||
|
||||
#define VPERMIi(p) "qvgpci " #p ", 1217;\n"
|
||||
#define VPERMi(A,p) "qvfperm " #A "," #A "," #A "," #p ";\n"
|
||||
#define VPERMI(p) VPERMIi(p)
|
||||
#define VPERM(A,p) VPERMi(A,p)
|
||||
|
||||
#define VLOADd(OFF,PTR,DEST) "qvlfdx " #DEST "," #PTR "," #OFF " ;\n"
|
||||
#define VLOADud(OFF,PTR,DEST) "qvlfdux " #DEST "," #PTR "," #OFF " ;\n"
|
||||
#define VSTOREd(OFF,PTR,SRC) "qvstfdx " #SRC "," #PTR "," #OFF " ;\n"
|
||||
#define VSTOREud(OFF,PTR,SRC) "qvstfdux " #SRC "," #PTR "," #OFF " ;\n"
|
||||
#define VSPLATd(A,B,DEST) "qvlfcdxa " #DEST "," #A "," #B ";\n"
|
||||
#define VSIZEd (32)
|
||||
|
||||
// QPX manual ordering QRT comes first (dest)
|
||||
#define VZEROi(DEST) "qvfset " #DEST "; \n qvfsub " #DEST "," #DEST "," #DEST ";\n"
|
||||
#define VONEi(DEST) "qvfset " #DEST "; \n"
|
||||
#define VMOVi(DEST,A) "qvfmr " #DEST "," #A ";\n"
|
||||
#define VADDi(DEST,A,B) "qvfadd " #DEST "," #A "," #B ";\n"
|
||||
#define VSUBi(DEST,A,B) "qvfsub " #DEST "," #A "," #B ";\n"
|
||||
#define VMULi(DEST,A,B) "qvfmul " #DEST "," #A "," #B ";\n"
|
||||
#define VMUL_RR_RIi(DEST,A,B) "qvfxmul " #DEST "," #A "," #B ";\n"
|
||||
#define VMADDi(DEST,A,B,C) "qvfmadd " #DEST "," #A "," #B ","#C ";\n"
|
||||
#define VMADD_RR_RIi(DEST,A,B,C) "qvfxmadd " #DEST "," #A "," #B ","#C ";\n"
|
||||
#define VMADD_MII_IRi(DEST,A,B,C) "qvfxxnpmadd " #DEST "," #B "," #A ","#C ";\n"
|
||||
#define VMADD_II_MIRi(DEST,A,B,C) "qvfxxcpnmadd " #DEST "," #B "," #A ","#C ";\n"
|
||||
|
||||
#define VZERO(C) VZEROi(C)
|
||||
#define VONE(C) VONEi(C)
|
||||
#define VMOV(C,A) VMOVi(C,A)
|
||||
#define VADD(A,B,C) VADDi(A,B,C)
|
||||
#define VSUB(A,B,C) VSUBi(A,B,C)
|
||||
#define VMUL(A,B,C) VMULi(A,B,C)
|
||||
#define VMUL_RR_RI(A,B,C) VMUL_RR_RIi(A,B,C)
|
||||
#define VMADD(A,B,C,D) VMADDi(A,B,C,D)
|
||||
#define VMADD_RR_RI(A,B,C,D) VMADD_RR_RIi(A,B,C,D)
|
||||
#define VMADD_MII_IR(A,B,C,D) VMADD_MII_IRi(A,B,C,D)
|
||||
#define VMADD_II_MIR(A,B,C,D) VMADD_II_MIRi(A,B,C,D)
|
||||
|
||||
/*********************************************************
|
||||
* Macro sequences encoding QCD
|
||||
*********************************************************/
|
||||
#define LOCK_GAUGEa(dir)
|
||||
/*
 * LOCK_GAUGE(dir): emit the LOCK_SET instruction word once per 64-byte step
 * over the storage of U._odata[sU].
 *
 * Expects `U` and `sU` to be in scope at the expansion site; `dir` is not
 * referenced. The step count is sizeof(U._odata[0]) rounded up to a multiple
 * of 64 bytes.
 *
 * Register use inside the asm:
 *   CTR       - loop counter (mtctr / bdnz)
 *   r16 (REP) - running byte address; LOCK_SET encodes r16 as its RB operand
 *   r17 (IMM) - constant stride of 64
 * All three are overwritten, so they are listed as clobbers. r16/r17 are
 * callee-saved in the PowerPC ELF ABI; without the clobbers the compiler may
 * keep live values in them or place an input operand there.
 *
 * NOTE(review): LOCK_SET is built from PPC_INST_DCBTLS, presumably the
 * "data cache block touch and lock set" instruction -- confirm against the
 * BG/Q core manual.
 */
#define LOCK_GAUGE(dir)                                                 \
  {                                                                     \
    uint64_t byte_addr = (uint64_t)&U._odata[sU];                       \
    int count = (sizeof(U._odata[0])+63)/64;                            \
    asm (" mtctr %0 \n"                                                 \
         " mr " HASH(REP) ", %1\n"                                      \
         " li " HASH(IMM) ", 64\n"                                      \
         "0:\n"                                                         \
         LOCK_SET                                                       \
         " add " HASH(REP) "," HASH(IMM) "," HASH(REP) "\n"             \
         " bdnz 0b\n"                                                   \
         : : "b" (count), "b" (byte_addr)                               \
         : HASH(pIMM), HASH(pREP), "ctr" );                             \
  }
|
||||
|
||||
#define UNLOCK_GAUGEa(dir)
|
||||
|
||||
/*
 * UNLOCK_GAUGE(dir): emit the LOCK_CLEAR instruction word once per 64-byte
 * step over the storage of U._odata[sU].
 *
 * Expects `U` and `sU` to be in scope at the expansion site; `dir` is not
 * referenced. The step count is sizeof(U._odata[0]) rounded up to a multiple
 * of 64 bytes.
 *
 * Register use inside the asm:
 *   CTR       - loop counter (mtctr / bdnz)
 *   r16 (REP) - running byte address; LOCK_CLEAR encodes r16 as its RB operand
 *   r17 (IMM) - constant stride of 64
 * All three are overwritten, so they are listed as clobbers. r16/r17 are
 * callee-saved in the PowerPC ELF ABI; without the clobbers the compiler may
 * keep live values in them or place an input operand there.
 *
 * NOTE(review): LOCK_CLEAR is built from PPC_INST_DCBLC, presumably the
 * "data cache block lock clear" instruction -- confirm against the BG/Q core
 * manual.
 */
#define UNLOCK_GAUGE(dir)                                               \
  {                                                                     \
    uint64_t byte_addr = (uint64_t)&U._odata[sU];                       \
    int count = (sizeof(U._odata[0])+63)/64;                            \
    asm (" mtctr %0 \n"                                                 \
         " mr " HASH(REP) ", %1\n"                                      \
         " li " HASH(IMM) ", 64\n"                                      \
         "0:\n"                                                         \
         LOCK_CLEAR                                                     \
         " add " HASH(REP) "," HASH(IMM) "," HASH(REP) "\n"             \
         " bdnz 0b\n"                                                   \
         : : "b" (count), "b" (byte_addr)                               \
         : HASH(pIMM), HASH(pREP), "ctr" );                             \
  }
|
||||
|
||||
#define MULT_2SPIN_QPX_LSd(ptr,p) MULT_2SPIN_QPX_INTERNAL(ptr,p,VSPLAT,16)
|
||||
#define MULT_2SPIN_QPX_LSf(ptr,p) MULT_2SPIN_QPX_INTERNAL(ptr,p,VSPLAT,8)
|
||||
#define MULT_2SPIN_QPXd(ptr,p) MULT_2SPIN_QPX_INTERNAL(ptr,p,VLOAD,32)
|
||||
#define MULT_2SPIN_QPXf(ptr,p) MULT_2SPIN_QPX_INTERNAL(ptr,p,VLOAD,16)
|
||||
|
||||
#define MULT_2SPIN_QPX_INTERNALa(ptr,p,ULOAD,USKIP) { \
|
||||
asm (VMOV(UChi_00,Chi_00) \
|
||||
VMOV(UChi_01,Chi_01) \
|
||||
VMOV(UChi_02,Chi_02) \
|
||||
VMOV(UChi_10,Chi_10) \
|
||||
VMOV(UChi_11,Chi_11) \
|
||||
VMOV(UChi_12,Chi_12) ); \
|
||||
}
|
||||
|
||||
#define MULT_2SPIN_QPX_INTERNAL(ptr,p,ULOAD,USKIP) { \
|
||||
uint64_t ub = ((uint64_t)ptr); \
|
||||
asm ( \
|
||||
ULOAD(%0,%3,U0) \
|
||||
ULOAD(%1,%3,U1) \
|
||||
ULOAD(%2,%3,U2) \
|
||||
VMUL_RR_RI(UChi_00,U0,Chi_00) \
|
||||
VMUL_RR_RI(UChi_01,U1,Chi_00) \
|
||||
VMUL_RR_RI(UChi_02,U2,Chi_00) \
|
||||
VMUL_RR_RI(UChi_10,U0,Chi_10) \
|
||||
VMUL_RR_RI(UChi_11,U1,Chi_10) \
|
||||
VMUL_RR_RI(UChi_12,U2,Chi_10) \
|
||||
VMADD_MII_IR(UChi_00,U0,Chi_00,UChi_00) \
|
||||
VMADD_MII_IR(UChi_01,U1,Chi_00,UChi_01) \
|
||||
VMADD_MII_IR(UChi_02,U2,Chi_00,UChi_02) \
|
||||
VMADD_MII_IR(UChi_10,U0,Chi_10,UChi_10) \
|
||||
VMADD_MII_IR(UChi_11,U1,Chi_10,UChi_11) \
|
||||
VMADD_MII_IR(UChi_12,U2,Chi_10,UChi_12) \
|
||||
: : "b" (0), "b" (USKIP*3), "b" (USKIP*6), "b" (ub )); \
|
||||
asm ( \
|
||||
ULOAD(%0,%3,U0) \
|
||||
ULOAD(%1,%3,U1) \
|
||||
ULOAD(%2,%3,U2) \
|
||||
VMADD_RR_RI(UChi_00,U0,Chi_01,UChi_00) \
|
||||
VMADD_RR_RI(UChi_01,U1,Chi_01,UChi_01) \
|
||||
VMADD_RR_RI(UChi_02,U2,Chi_01,UChi_02) \
|
||||
VMADD_RR_RI(UChi_10,U0,Chi_11,UChi_10) \
|
||||
VMADD_RR_RI(UChi_11,U1,Chi_11,UChi_11) \
|
||||
VMADD_RR_RI(UChi_12,U2,Chi_11,UChi_12) \
|
||||
VMADD_MII_IR(UChi_00,U0,Chi_01,UChi_00) \
|
||||
VMADD_MII_IR(UChi_01,U1,Chi_01,UChi_01) \
|
||||
VMADD_MII_IR(UChi_02,U2,Chi_01,UChi_02) \
|
||||
VMADD_MII_IR(UChi_10,U0,Chi_11,UChi_10) \
|
||||
VMADD_MII_IR(UChi_11,U1,Chi_11,UChi_11) \
|
||||
VMADD_MII_IR(UChi_12,U2,Chi_11,UChi_12) \
|
||||
: : "b" (USKIP*1), "b" (USKIP*4), "b" (USKIP*7), "b" (ub )); \
|
||||
asm ( \
|
||||
ULOAD(%0,%3,U0) \
|
||||
ULOAD(%1,%3,U1) \
|
||||
ULOAD(%2,%3,U2) \
|
||||
VMADD_RR_RI(UChi_00,U0,Chi_02,UChi_00) \
|
||||
VMADD_RR_RI(UChi_01,U1,Chi_02,UChi_01) \
|
||||
VMADD_RR_RI(UChi_02,U2,Chi_02,UChi_02) \
|
||||
VMADD_RR_RI(UChi_10,U0,Chi_12,UChi_10) \
|
||||
VMADD_RR_RI(UChi_11,U1,Chi_12,UChi_11) \
|
||||
VMADD_RR_RI(UChi_12,U2,Chi_12,UChi_12) \
|
||||
VMADD_MII_IR(UChi_00,U0,Chi_02,UChi_00) \
|
||||
VMADD_MII_IR(UChi_01,U1,Chi_02,UChi_01) \
|
||||
VMADD_MII_IR(UChi_02,U2,Chi_02,UChi_02) \
|
||||
VMADD_MII_IR(UChi_10,U0,Chi_12,UChi_10) \
|
||||
VMADD_MII_IR(UChi_11,U1,Chi_12,UChi_11) \
|
||||
VMADD_MII_IR(UChi_12,U2,Chi_12,UChi_12) \
|
||||
: : "b" (USKIP*2), "b" (USKIP*5), "b" (USKIP*8), "b" (ub )); \
|
||||
}
|
||||
|
||||
#define MULT_2SPIN_DIR_PFXP(A,p) MULT_2SPIN(&U._odata[sU](A),p)
|
||||
#define MULT_2SPIN_DIR_PFYP(A,p) MULT_2SPIN(&U._odata[sU](A),p)
|
||||
#define MULT_2SPIN_DIR_PFZP(A,p) MULT_2SPIN(&U._odata[sU](A),p)
|
||||
#define MULT_2SPIN_DIR_PFTP(A,p) MULT_2SPIN(&U._odata[sU](A),p)
|
||||
#define MULT_2SPIN_DIR_PFXM(A,p) MULT_2SPIN(&U._odata[sU](A),p)
|
||||
#define MULT_2SPIN_DIR_PFYM(A,p) MULT_2SPIN(&U._odata[sU](A),p)
|
||||
#define MULT_2SPIN_DIR_PFZM(A,p) MULT_2SPIN(&U._odata[sU](A),p)
|
||||
#define MULT_2SPIN_DIR_PFTM(A,p) MULT_2SPIN(&U._odata[sU](A),p)
|
||||
|
||||
/*
 * SAVE_RESULT(base,basep): store the twelve psi_* vector registers
 * (psi_00 .. psi_32, in that order) to consecutive VSIZE-byte slots starting
 * at `base`. `basep` is accepted but not referenced.
 *
 * r16 (REP) is loaded with base - VSIZE and r17 (IMM) with VSIZE before the
 * twelve VSTOREu stores.
 * NOTE(review): VSIZE and VSTOREu are defined by the precision-specific
 * header included alongside this one; VSTOREu presumably maps to the
 * update-form store (VSTOREuf / VSTOREud), which advances REP by IMM before
 * each store, hence the initial "- VSIZE" bias -- confirm.
 *
 * The asm writes memory that is not named in its operand list, so "memory"
 * is declared as a clobber in addition to the two scratch GPRs; this stops
 * the compiler from reordering or caching accesses to the destination across
 * the asm.
 */
#define SAVE_RESULT(base,basep) {\
    uint64_t ub = ((uint64_t)base) - (VSIZE);                           \
    asm("mr " HASH(REP) ", %0;\n"                                       \
        "li " HASH(IMM) "," HASH(VSIZE)" ;\n"                           \
        VSTOREu(IMM,REP,psi_00)                                         \
        VSTOREu(IMM,REP,psi_01)                                         \
        VSTOREu(IMM,REP,psi_02)                                         \
        VSTOREu(IMM,REP,psi_10)                                         \
        VSTOREu(IMM,REP,psi_11)                                         \
        VSTOREu(IMM,REP,psi_12)                                         \
        VSTOREu(IMM,REP,psi_20)                                         \
        VSTOREu(IMM,REP,psi_21)                                         \
        VSTOREu(IMM,REP,psi_22)                                         \
        VSTOREu(IMM,REP,psi_30)                                         \
        VSTOREu(IMM,REP,psi_31)                                         \
        VSTOREu(IMM,REP,psi_32)                                         \
        : : "b" (ub) : HASH(pIMM), HASH(pREP), "memory" );              \
  }
|
||||
|
||||
/*
|
||||
*Annoying BG/Q loads with no immediate indexing and big performance hit
|
||||
*when second miss to a L1 line occurs
|
||||
*/
|
||||
#define LOAD_CHI(base) { \
|
||||
uint64_t ub = ((uint64_t)base) - (2*VSIZE); \
|
||||
asm("mr " HASH(REP) ",%0 ;\n" \
|
||||
"li " HASH(IMM) ",(2*" HASH(VSIZE) ");\n" \
|
||||
VLOADu(IMM,REP,Chi_00) \
|
||||
VLOADu(IMM,REP,Chi_02) \
|
||||
VLOADu(IMM,REP,Chi_11) : : "b" (ub) : HASH(pIMM), HASH(pREP) ); \
|
||||
ub = ((uint64_t)base) - VSIZE; \
|
||||
asm("mr " HASH(REP) ", %0;\n" \
|
||||
"li " HASH(IMM) ",(2*" HASH(VSIZE) ");\n" \
|
||||
VLOADu(IMM,REP,Chi_01) \
|
||||
VLOADu(IMM,REP,Chi_10) \
|
||||
VLOADu(IMM,REP,Chi_12) : : "b" (ub) : HASH(pIMM), HASH(pREP) ); \
|
||||
}
|
||||
|
||||
#define LOAD_CHIa(base) { \
|
||||
uint64_t ub = ((uint64_t)base) - (VSIZE); \
|
||||
asm("mr " HASH(REP) ",%0 ;\n" \
|
||||
"li " HASH(IMM) "," HASH(VSIZE) ";\n" \
|
||||
VLOADu(IMM,REP,Chi_00) \
|
||||
VLOADu(IMM,REP,Chi_01) \
|
||||
VLOADu(IMM,REP,Chi_02) \
|
||||
VLOADu(IMM,REP,Chi_10) \
|
||||
VLOADu(IMM,REP,Chi_11) \
|
||||
VLOADu(IMM,REP,Chi_12) : : "b" (ub) : HASH(pIMM), HASH(pREP) ); \
|
||||
}
|
||||
|
||||
#define LOAD_CHIMUa(base) { \
|
||||
uint64_t ub = ((uint64_t)base) - (VSIZE); \
|
||||
asm("mr " HASH(REP) ",%0 ;\n" \
|
||||
"li " HASH(IMM) "," HASH(VSIZE) ";\n" \
|
||||
VLOADu(IMM,REP,Chi_00) \
|
||||
VLOADu(IMM,REP,Chi_01) \
|
||||
VLOADu(IMM,REP,Chi_02) \
|
||||
VLOADu(IMM,REP,Chi_10) \
|
||||
VLOADu(IMM,REP,Chi_11) \
|
||||
VLOADu(IMM,REP,Chi_12) \
|
||||
VLOADu(IMM,REP,Chi_20) \
|
||||
VLOADu(IMM,REP,Chi_21) \
|
||||
VLOADu(IMM,REP,Chi_22) \
|
||||
VLOADu(IMM,REP,Chi_30) \
|
||||
VLOADu(IMM,REP,Chi_31) \
|
||||
VLOADu(IMM,REP,Chi_32) : : "b" (ub) : HASH(pIMM), HASH(pREP) ); \
|
||||
}
|
||||
|
||||
#define LOAD_CHIMU(base) { \
|
||||
uint64_t ub = ((uint64_t)base) - (2*VSIZE); \
|
||||
asm("mr " HASH(REP) ",%0;\n" \
|
||||
"li " HASH(IMM) ",(2*" HASH(VSIZE) ");\n" \
|
||||
VLOADu(IMM,REP,Chi_00) \
|
||||
VLOADu(IMM,REP,Chi_02) \
|
||||
VLOADu(IMM,REP,Chi_11) \
|
||||
VLOADu(IMM,REP,Chi_20) \
|
||||
VLOADu(IMM,REP,Chi_22) \
|
||||
VLOADu(IMM,REP,Chi_31) : : "b" (ub) : HASH(pIMM), HASH(pREP) ); \
|
||||
ub = ((uint64_t)base) - VSIZE; \
|
||||
asm("mr " HASH(REP) ", %0;\n" \
|
||||
"li " HASH(IMM) ", (2*" HASH(VSIZE) ");\n" \
|
||||
VLOADu(IMM,REP,Chi_01) \
|
||||
VLOADu(IMM,REP,Chi_10) \
|
||||
VLOADu(IMM,REP,Chi_12) \
|
||||
VLOADu(IMM,REP,Chi_21) \
|
||||
VLOADu(IMM,REP,Chi_30) \
|
||||
VLOADu(IMM,REP,Chi_32) : : "b" (ub) : HASH(pIMM), HASH(pREP) ); \
|
||||
}
|
||||
|
||||
// hspin(0)=fspin(0)+timesI(fspin(3));
|
||||
// hspin(1)=fspin(1)+timesI(fspin(2));
|
||||
#define XP_PROJMEM(base) { \
|
||||
LOAD_CHIMU(base); \
|
||||
asm ( \
|
||||
VONE(one) \
|
||||
VMADD_MII_IR(Chi_00,one,Chi_30,Chi_00) \
|
||||
VMADD_MII_IR(Chi_01,one,Chi_31,Chi_01) \
|
||||
VMADD_MII_IR(Chi_02,one,Chi_32,Chi_02) \
|
||||
VMADD_MII_IR(Chi_10,one,Chi_20,Chi_10) \
|
||||
VMADD_MII_IR(Chi_11,one,Chi_21,Chi_11) \
|
||||
VMADD_MII_IR(Chi_12,one,Chi_22,Chi_12) \
|
||||
); \
|
||||
}
|
||||
|
||||
#define XM_PROJMEM(base) { \
|
||||
LOAD_CHIMU(base); \
|
||||
asm ( \
|
||||
VONE(one) \
|
||||
VMADD_II_MIR(Chi_00,one,Chi_30,Chi_00) \
|
||||
VMADD_II_MIR(Chi_01,one,Chi_31,Chi_01) \
|
||||
VMADD_II_MIR(Chi_02,one,Chi_32,Chi_02) \
|
||||
VMADD_II_MIR(Chi_10,one,Chi_20,Chi_10) \
|
||||
VMADD_II_MIR(Chi_11,one,Chi_21,Chi_11) \
|
||||
VMADD_II_MIR(Chi_12,one,Chi_22,Chi_12) \
|
||||
); \
|
||||
}
|
||||
|
||||
// hspin(0)=fspin(0)-fspin(3);
|
||||
// hspin(1)=fspin(1)+fspin(2);
|
||||
#define YP_PROJMEM(base) { \
|
||||
LOAD_CHIMU(base); \
|
||||
asm ( \
|
||||
VSUB(Chi_00,Chi_00,Chi_30) \
|
||||
VSUB(Chi_01,Chi_01,Chi_31) \
|
||||
VSUB(Chi_02,Chi_02,Chi_32) \
|
||||
VADD(Chi_10,Chi_10,Chi_20) \
|
||||
VADD(Chi_11,Chi_11,Chi_21) \
|
||||
VADD(Chi_12,Chi_12,Chi_22) \
|
||||
); \
|
||||
}
|
||||
|
||||
#define YM_PROJMEM(base) { \
|
||||
LOAD_CHIMU(base); \
|
||||
asm ( \
|
||||
VADD(Chi_00,Chi_00,Chi_30) \
|
||||
VADD(Chi_01,Chi_01,Chi_31) \
|
||||
VADD(Chi_02,Chi_02,Chi_32) \
|
||||
VSUB(Chi_10,Chi_10,Chi_20) \
|
||||
VSUB(Chi_11,Chi_11,Chi_21) \
|
||||
VSUB(Chi_12,Chi_12,Chi_22) ); \
|
||||
}
|
||||
|
||||
/*Gz
|
||||
* 0 0 i 0 [0]+-i[2]
|
||||
* 0 0 0 -i [1]-+i[3]
|
||||
* -i 0 0 0
|
||||
* 0 i 0 0
|
||||
*/
|
||||
#define ZP_PROJMEM(base) { \
|
||||
LOAD_CHIMU(base); \
|
||||
asm ( \
|
||||
VONE(one) \
|
||||
VMADD_MII_IR(Chi_00,one,Chi_20,Chi_00) \
|
||||
VMADD_MII_IR(Chi_01,one,Chi_21,Chi_01) \
|
||||
VMADD_MII_IR(Chi_02,one,Chi_22,Chi_02) \
|
||||
VMADD_II_MIR(Chi_10,one,Chi_30,Chi_10) \
|
||||
VMADD_II_MIR(Chi_11,one,Chi_31,Chi_11) \
|
||||
VMADD_II_MIR(Chi_12,one,Chi_32,Chi_12) \
|
||||
); \
|
||||
}
|
||||
|
||||
// ZM_PROJMEM(base)
//   Runs LOAD_CHIMU(base); then, after VONE(one), issues
//   VMADD_II_MIR on (Chi_0c, Chi_2c) and VMADD_MII_IR on (Chi_1c, Chi_3c)
//   for c = 0,1,2.
#define ZM_PROJMEM(base) { \
    LOAD_CHIMU(base); \
    asm ( \
      VONE(one) \
      /* rows 0 and 2 */ \
      VMADD_II_MIR(Chi_00,one,Chi_20,Chi_00) VMADD_II_MIR(Chi_01,one,Chi_21,Chi_01) VMADD_II_MIR(Chi_02,one,Chi_22,Chi_02) \
      /* rows 1 and 3 */ \
      VMADD_MII_IR(Chi_10,one,Chi_30,Chi_10) VMADD_MII_IR(Chi_11,one,Chi_31,Chi_11) VMADD_MII_IR(Chi_12,one,Chi_32,Chi_12) \
    ); \
}
|
||||
/* Gt
 *  0 0 1 0      [0]+-[2]
 *  0 0 0 1      [1]+-[3]
 *  1 0 0 0
 *  0 1 0 0
 *
 * TP_PROJMEM(base)
 *   Runs LOAD_CHIMU(base), then VADD on (Chi_0c, Chi_2c) and on
 *   (Chi_1c, Chi_3c) for c = 0,1,2.
 */
#define TP_PROJMEM(base) { \
    LOAD_CHIMU(base); \
    asm ( \
      /* row 0 plus row 2 */ \
      VADD(Chi_00,Chi_00,Chi_20) VADD(Chi_01,Chi_01,Chi_21) VADD(Chi_02,Chi_02,Chi_22) \
      /* row 1 plus row 3 */ \
      VADD(Chi_10,Chi_10,Chi_30) VADD(Chi_11,Chi_11,Chi_31) VADD(Chi_12,Chi_12,Chi_32) \
    ); \
}
|
||||
|
||||
// TM_PROJMEM(base)
//   Runs LOAD_CHIMU(base), then VSUB on (Chi_0c, Chi_2c) and on
//   (Chi_1c, Chi_3c) for c = 0,1,2.
#define TM_PROJMEM(base) { \
    LOAD_CHIMU(base); \
    asm ( \
      /* row 0 minus row 2 */ \
      VSUB(Chi_00,Chi_00,Chi_20) VSUB(Chi_01,Chi_01,Chi_21) VSUB(Chi_02,Chi_02,Chi_22) \
      /* row 1 minus row 3 */ \
      VSUB(Chi_10,Chi_10,Chi_30) VSUB(Chi_11,Chi_11,Chi_31) VSUB(Chi_12,Chi_12,Chi_32) \
    ); \
}
|
||||
|
||||
/*
 * X-plus reconstruction, plain form:
 *   fspin(0)=hspin(0);
 *   fspin(1)=hspin(1);
 *   fspin(2)=timesMinusI(hspin(1));
 *   fspin(3)=timesMinusI(hspin(0));
 *
 * X-plus reconstruction, accumulating form:
 *   fspin(0)+=hspin(0);
 *   fspin(1)+=hspin(1);
 *   fspin(2)-=timesI(hspin(1));
 *   fspin(3)-=timesI(hspin(0));
 *
 * XP_RECON: VMOV psi_0c/psi_1c from UChi_0c/UChi_1c, VZERO psi_2c/psi_3c, then
 * VMADD_II_MIR on (psi_2c, UChi_1c) and (psi_3c, UChi_0c) for c = 0,1,2.
 */
#define XP_RECON { \
    asm( \
      VONE(one) \
      /* rows 0,1 taken from UChi */ \
      VMOV(psi_00,UChi_00) VMOV(psi_01,UChi_01) VMOV(psi_02,UChi_02) \
      VMOV(psi_10,UChi_10) VMOV(psi_11,UChi_11) VMOV(psi_12,UChi_12) \
      /* rows 2,3 cleared before the multiply-adds below */ \
      VZERO(psi_20) VZERO(psi_21) VZERO(psi_22) \
      VZERO(psi_30) VZERO(psi_31) VZERO(psi_32) \
      /* row 2 from UChi row 1 */ \
      VMADD_II_MIR(psi_20,one,UChi_10,psi_20) VMADD_II_MIR(psi_21,one,UChi_11,psi_21) VMADD_II_MIR(psi_22,one,UChi_12,psi_22) \
      /* row 3 from UChi row 0 */ \
      VMADD_II_MIR(psi_30,one,UChi_00,psi_30) VMADD_II_MIR(psi_31,one,UChi_01,psi_31) VMADD_II_MIR(psi_32,one,UChi_02,psi_32) \
    ); \
}
|
||||
|
||||
// XM_RECON
//   VMOV psi_0c/psi_1c from UChi_0c/UChi_1c, VZERO psi_2c/psi_3c, then
//   VMADD_MII_IR on (psi_2c, UChi_1c) and (psi_3c, UChi_0c) for c = 0,1,2.
#define XM_RECON { \
    asm( \
      VONE(one) \
      /* rows 0,1 taken from UChi */ \
      VMOV(psi_00,UChi_00) VMOV(psi_01,UChi_01) VMOV(psi_02,UChi_02) \
      VMOV(psi_10,UChi_10) VMOV(psi_11,UChi_11) VMOV(psi_12,UChi_12) \
      /* rows 2,3 cleared before the multiply-adds below */ \
      VZERO(psi_20) VZERO(psi_21) VZERO(psi_22) \
      VZERO(psi_30) VZERO(psi_31) VZERO(psi_32) \
      /* row 2 from UChi row 1 */ \
      VMADD_MII_IR(psi_20,one,UChi_10,psi_20) VMADD_MII_IR(psi_21,one,UChi_11,psi_21) VMADD_MII_IR(psi_22,one,UChi_12,psi_22) \
      /* row 3 from UChi row 0 */ \
      VMADD_MII_IR(psi_30,one,UChi_00,psi_30) VMADD_MII_IR(psi_31,one,UChi_01,psi_31) VMADD_MII_IR(psi_32,one,UChi_02,psi_32) \
    ); \
}
|
||||
|
||||
// XP_RECON_ACCUM
//   VADD UChi_0c/UChi_1c into psi_0c/psi_1c, then VMADD_II_MIR on
//   (psi_2c, UChi_1c) and (psi_3c, UChi_0c) for c = 0,1,2.
#define XP_RECON_ACCUM { \
    asm( \
      VONE(one) \
      /* rows 0,1 */ \
      VADD(psi_00,psi_00,UChi_00) VADD(psi_01,psi_01,UChi_01) VADD(psi_02,psi_02,UChi_02) \
      VADD(psi_10,psi_10,UChi_10) VADD(psi_11,psi_11,UChi_11) VADD(psi_12,psi_12,UChi_12) \
      /* row 2 with UChi row 1 */ \
      VMADD_II_MIR(psi_20,one,UChi_10,psi_20) VMADD_II_MIR(psi_21,one,UChi_11,psi_21) VMADD_II_MIR(psi_22,one,UChi_12,psi_22) \
      /* row 3 with UChi row 0 */ \
      VMADD_II_MIR(psi_30,one,UChi_00,psi_30) VMADD_II_MIR(psi_31,one,UChi_01,psi_31) VMADD_II_MIR(psi_32,one,UChi_02,psi_32) \
    ); \
}
|
||||
|
||||
// XM_RECON_ACCUM
//   VADD UChi_0c/UChi_1c into psi_0c/psi_1c, then VMADD_MII_IR on
//   (psi_2c, UChi_1c) and (psi_3c, UChi_0c) for c = 0,1,2.
#define XM_RECON_ACCUM { \
    asm( \
      VONE(one) \
      /* rows 0,1 */ \
      VADD(psi_00,psi_00,UChi_00) VADD(psi_01,psi_01,UChi_01) VADD(psi_02,psi_02,UChi_02) \
      VADD(psi_10,psi_10,UChi_10) VADD(psi_11,psi_11,UChi_11) VADD(psi_12,psi_12,UChi_12) \
      /* row 2 with UChi row 1 */ \
      VMADD_MII_IR(psi_20,one,UChi_10,psi_20) VMADD_MII_IR(psi_21,one,UChi_11,psi_21) VMADD_MII_IR(psi_22,one,UChi_12,psi_22) \
      /* row 3 with UChi row 0 */ \
      VMADD_MII_IR(psi_30,one,UChi_00,psi_30) VMADD_MII_IR(psi_31,one,UChi_01,psi_31) VMADD_MII_IR(psi_32,one,UChi_02,psi_32) \
    ); \
}
|
||||
|
||||
// YP_RECON_ACCUM
//   fspin(2) += hspin(1);
//   fspin(3) -= hspin(0);
//   Rows 0,1 of psi get VADD of the matching UChi rows; row 2 gets VADD of
//   UChi row 1 and row 3 gets VSUB of UChi row 0.
#define YP_RECON_ACCUM { \
    asm( \
      /* rows 0,1 */ \
      VADD(psi_00,psi_00,UChi_00) VADD(psi_01,psi_01,UChi_01) VADD(psi_02,psi_02,UChi_02) \
      VADD(psi_10,psi_10,UChi_10) VADD(psi_11,psi_11,UChi_11) VADD(psi_12,psi_12,UChi_12) \
      /* rows 2,3 */ \
      VADD(psi_20,psi_20,UChi_10) VADD(psi_21,psi_21,UChi_11) VADD(psi_22,psi_22,UChi_12) \
      VSUB(psi_30,psi_30,UChi_00) VSUB(psi_31,psi_31,UChi_01) VSUB(psi_32,psi_32,UChi_02) \
    ); \
}
|
||||
// YM_RECON_ACCUM
//   Rows 0,1 of psi get VADD of the matching UChi rows; row 2 gets VSUB of
//   UChi row 1 and row 3 gets VADD of UChi row 0.
#define YM_RECON_ACCUM { \
    asm( \
      /* rows 0,1 */ \
      VADD(psi_00,psi_00,UChi_00) VADD(psi_01,psi_01,UChi_01) VADD(psi_02,psi_02,UChi_02) \
      VADD(psi_10,psi_10,UChi_10) VADD(psi_11,psi_11,UChi_11) VADD(psi_12,psi_12,UChi_12) \
      /* rows 2,3 */ \
      VSUB(psi_20,psi_20,UChi_10) VSUB(psi_21,psi_21,UChi_11) VSUB(psi_22,psi_22,UChi_12) \
      VADD(psi_30,psi_30,UChi_00) VADD(psi_31,psi_31,UChi_01) VADD(psi_32,psi_32,UChi_02) \
    ); \
}
|
||||
|
||||
// ZP_RECON_ACCUM
//   fspin(2) -= timesI(hspin(0));
//   fspin(3) += timesI(hspin(1));
//   Rows 0,1 of psi get VADD of the matching UChi rows; row 2 uses
//   VMADD_II_MIR with UChi row 0 and row 3 uses VMADD_MII_IR with UChi row 1.
#define ZP_RECON_ACCUM { \
    asm( \
      VONE(one) \
      /* rows 0,1 */ \
      VADD(psi_00,psi_00,UChi_00) VADD(psi_01,psi_01,UChi_01) VADD(psi_02,psi_02,UChi_02) \
      VADD(psi_10,psi_10,UChi_10) VADD(psi_11,psi_11,UChi_11) VADD(psi_12,psi_12,UChi_12) \
      /* row 2 with UChi row 0 */ \
      VMADD_II_MIR(psi_20,one,UChi_00,psi_20) VMADD_II_MIR(psi_21,one,UChi_01,psi_21) VMADD_II_MIR(psi_22,one,UChi_02,psi_22) \
      /* row 3 with UChi row 1 */ \
      VMADD_MII_IR(psi_30,one,UChi_10,psi_30) VMADD_MII_IR(psi_31,one,UChi_11,psi_31) VMADD_MII_IR(psi_32,one,UChi_12,psi_32) \
    ); \
}
|
||||
|
||||
// ZM_RECON_ACCUM
//   Rows 0,1 of psi get VADD of the matching UChi rows; row 2 uses
//   VMADD_MII_IR with UChi row 0 and row 3 uses VMADD_II_MIR with UChi row 1.
#define ZM_RECON_ACCUM { \
    asm( \
      VONE(one) \
      /* rows 0,1 */ \
      VADD(psi_00,psi_00,UChi_00) VADD(psi_01,psi_01,UChi_01) VADD(psi_02,psi_02,UChi_02) \
      VADD(psi_10,psi_10,UChi_10) VADD(psi_11,psi_11,UChi_11) VADD(psi_12,psi_12,UChi_12) \
      /* row 2 with UChi row 0 */ \
      VMADD_MII_IR(psi_20,one,UChi_00,psi_20) VMADD_MII_IR(psi_21,one,UChi_01,psi_21) VMADD_MII_IR(psi_22,one,UChi_02,psi_22) \
      /* row 3 with UChi row 1 */ \
      VMADD_II_MIR(psi_30,one,UChi_10,psi_30) VMADD_II_MIR(psi_31,one,UChi_11,psi_31) VMADD_II_MIR(psi_32,one,UChi_12,psi_32) \
    ); \
}
|
||||
|
||||
// TP_RECON_ACCUM
//   fspin(2) += hspin(0);
//   fspin(3) += hspin(1);
//   Rows 0,1 of psi get VADD of the matching UChi rows; rows 2,3 get VADD of
//   UChi rows 0,1 respectively.
#define TP_RECON_ACCUM { \
    asm( \
      /* rows 0,1 */ \
      VADD(psi_00,psi_00,UChi_00) VADD(psi_01,psi_01,UChi_01) VADD(psi_02,psi_02,UChi_02) \
      VADD(psi_10,psi_10,UChi_10) VADD(psi_11,psi_11,UChi_11) VADD(psi_12,psi_12,UChi_12) \
      /* rows 2,3 */ \
      VADD(psi_20,psi_20,UChi_00) VADD(psi_21,psi_21,UChi_01) VADD(psi_22,psi_22,UChi_02) \
      VADD(psi_30,psi_30,UChi_10) VADD(psi_31,psi_31,UChi_11) VADD(psi_32,psi_32,UChi_12) \
    ); \
}
|
||||
|
||||
// TM_RECON_ACCUM
//   Rows 0,1 of psi get VADD of the matching UChi rows; rows 2,3 get VSUB of
//   UChi rows 0,1 respectively.
#define TM_RECON_ACCUM { \
    asm( \
      /* rows 0,1 */ \
      VADD(psi_00,psi_00,UChi_00) VADD(psi_01,psi_01,UChi_01) VADD(psi_02,psi_02,UChi_02) \
      VADD(psi_10,psi_10,UChi_10) VADD(psi_11,psi_11,UChi_11) VADD(psi_12,psi_12,UChi_12) \
      /* rows 2,3 */ \
      VSUB(psi_20,psi_20,UChi_00) VSUB(psi_21,psi_21,UChi_01) VSUB(psi_22,psi_22,UChi_02) \
      VSUB(psi_30,psi_30,UChi_10) VSUB(psi_31,psi_31,UChi_11) VSUB(psi_32,psi_32,UChi_12) \
    ); \
}
|
||||
|
||||
// Permute hooks, one per direction index.
// PERMUTE_DIR3, PERMUTE_DIR2 and PERMUTE_DIR1 are defined empty, so they
// expand to nothing here.
#define PERMUTE_DIR3
#define PERMUTE_DIR2
#define PERMUTE_DIR1

// PERMUTE_DIR0 issues VPERMI(perm_reg) and then VPERM(..., perm_reg) on each of
// the six registers Chi_00..Chi_02 and Chi_10..Chi_12.
#define PERMUTE_DIR0 { \
    asm( \
      VPERMI(perm_reg) \
      VPERM(Chi_00,perm_reg) \
      VPERM(Chi_01,perm_reg) \
      VPERM(Chi_02,perm_reg) \
      VPERM(Chi_10,perm_reg) \
      VPERM(Chi_11,perm_reg) \
      VPERM(Chi_12,perm_reg) \
    ); \
}
|
||||
|
||||
#endif
|
46
lib/simd/IBM_qpx_double.h
Normal file
46
lib/simd/IBM_qpx_double.h
Normal file
@ -0,0 +1,46 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/simd/IBM_qpx_double.h
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
// No guard; ok multi-include.
// Rebinds the precision-neutral QPX macro names to their "d"-suffixed
// counterparts (VSIZEd, VLOADd, ...). Each name is #undef'd immediately before
// it is redefined, so including this file after another precision header
// simply switches the aliases.
// NOTE(review): the "d" variants are defined elsewhere; presumably they are the
// double-precision forms -- confirm in the QPX header that provides them.

#undef  VSIZE
#define VSIZE VSIZEd

#undef  VLOAD
#define VLOAD(A,B,C) VLOADd(A,B,C)

#undef  VLOADu
#define VLOADu(A,B,C) VLOADud(A,B,C)

#undef  VSPLAT
#define VSPLAT(A,B,DEST) VSPLATd(A,B,DEST)

#undef  VSTORE
#define VSTORE(A,B,C) VSTOREd(A,B,C)

#undef  VSTOREu
#define VSTOREu(A,B,C) VSTOREud(A,B,C)

#undef  MULT_2SPIN_QPX_LS
#define MULT_2SPIN_QPX_LS(ptr,p) MULT_2SPIN_QPX_LSd(ptr,p)

#undef  MULT_2SPIN_QPX
#define MULT_2SPIN_QPX(ptr,p) MULT_2SPIN_QPXd(ptr,p)
|
||||
|
46
lib/simd/IBM_qpx_single.h
Normal file
46
lib/simd/IBM_qpx_single.h
Normal file
@ -0,0 +1,46 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/simd/IBM_qpx_single.h
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
// No guard; ok multi-include.
// Rebinds the precision-neutral QPX macro names to their "f"-suffixed
// counterparts (VSIZEf, VLOADf, ...). Each name is #undef'd immediately before
// it is redefined, so including this file after another precision header
// simply switches the aliases.
// NOTE(review): the "f" variants are defined elsewhere; presumably they are the
// single-precision forms -- confirm in the QPX header that provides them.

#undef  VSIZE
#define VSIZE VSIZEf

#undef  VLOAD
#define VLOAD(A,B,C) VLOADf(A,B,C)

#undef  VLOADu
#define VLOADu(A,B,C) VLOADuf(A,B,C)

#undef  VSPLAT
#define VSPLAT(A,B,DEST) VSPLATf(A,B,DEST)

#undef  VSTORE
#define VSTORE(A,B,C) VSTOREf(A,B,C)

#undef  VSTOREu
#define VSTOREu(A,B,C) VSTOREuf(A,B,C)

#undef  MULT_2SPIN_QPX_LS
#define MULT_2SPIN_QPX_LS(ptr,p) MULT_2SPIN_QPX_LSf(ptr,p)

#undef  MULT_2SPIN_QPX
#define MULT_2SPIN_QPX(ptr,p) MULT_2SPIN_QPXf(ptr,p)
|
||||
|
Loading…
x
Reference in New Issue
Block a user