mirror of
https://github.com/paboyle/Grid.git
synced 2025-04-04 19:25:56 +01:00
Cleaning up the single/double kernel implementation switch
This commit is contained in:
parent
60d965f79e
commit
8052556275
@ -32,81 +32,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
|
|
||||||
#include <simd/Intel512wilson.h>
|
#include <simd/Intel512wilson.h>
|
||||||
|
|
||||||
#undef VLOAD
|
#include <simd/Intel512single.h>
|
||||||
#undef VSTORE
|
|
||||||
#undef VMUL
|
|
||||||
#undef VMADD
|
|
||||||
#undef ZEND
|
|
||||||
#undef ZLOAD
|
|
||||||
#undef ZMUL
|
|
||||||
#undef ZMADD
|
|
||||||
#undef VZERO
|
|
||||||
#undef VTIMESI
|
|
||||||
#undef VTIMESMINUSI
|
|
||||||
#undef VMOVIDUP
|
|
||||||
#undef VMOVRDUP
|
|
||||||
#undef VMADDSUB
|
|
||||||
#undef VSHUF
|
|
||||||
|
|
||||||
#define VZERO(A) VZEROf(A)
|
|
||||||
#define VMOV(A,B) VMOVf(A,B)
|
|
||||||
#define VLOAD(OFF,PTR,DEST) VLOADf(OFF,PTR,DEST)
|
|
||||||
#define VSTORE(OFF,PTR,SRC) VSTOREf(OFF,PTR,SRC)
|
|
||||||
|
|
||||||
#define VADD(A,B,C) VADDf(A,B,C)
|
|
||||||
#define VSUB(A,B,C) VSUBf(A,B,C)
|
|
||||||
#define VMUL(Uri,Uir,Chi) VMULf(Uri,Uir,Chi)
|
|
||||||
#define VMADD(Uri,Uir,Chi) VMADDf(Uri,Uir,Chi)
|
|
||||||
|
|
||||||
#define VTIMESI(A,B,C) VTIMESIf(A,B,C)
|
|
||||||
#define VTIMESMINUSI(A,B,C) VTIMESMINUSIf(A,B,C)
|
|
||||||
#define VACCTIMESI(A,B,C) VACCTIMESIf(A,B,C)
|
|
||||||
#define VACCTIMESMINUSI(A,B,C) VACCTIMESMINUSIf(A,B,C)
|
|
||||||
|
|
||||||
#define VTIMESI0(A,B,C) VTIMESI0f(A,B,C)
|
|
||||||
#define VTIMESMINUSI0(A,B,C) VTIMESMINUSI0f(A,B,C)
|
|
||||||
#define VACCTIMESI0(A,B,C) VACCTIMESI0f(A,B,C)
|
|
||||||
#define VACCTIMESMINUSI0(A,B,C) VACCTIMESMINUSI0f(A,B,C)
|
|
||||||
|
|
||||||
#define VTIMESI1(A,B,C) VTIMESI1f(A,B,C)
|
|
||||||
#define VTIMESMINUSI1(A,B,C) VTIMESMINUSI1f(A,B,C)
|
|
||||||
#define VACCTIMESI1(A,B,C) VACCTIMESI1f(A,B,C)
|
|
||||||
#define VACCTIMESMINUSI1(A,B,C) VACCTIMESMINUSI1f(A,B,C)
|
|
||||||
|
|
||||||
#define VTIMESI2(A,B,C) VTIMESI2f(A,B,C)
|
|
||||||
#define VTIMESMINUSI2(A,B,C) VTIMESMINUSI2f(A,B,C)
|
|
||||||
#define VACCTIMESI2(A,B,C) VACCTIMESI2f(A,B,C)
|
|
||||||
#define VACCTIMESMINUSI2(A,B,C) VACCTIMESMINUSI2f(A,B,C)
|
|
||||||
|
|
||||||
#define VACCTIMESI1MEM(A,ACC,O,P) VACCTIMESI1MEMf(A,ACC,O,P)
|
|
||||||
#define VACCTIMESI2MEM(A,ACC,O,P) VACCTIMESI2MEMf(A,ACC,O,P)
|
|
||||||
#define VACCTIMESMINUSI1MEM(A,ACC,O,P) VACCTIMESMINUSI1MEMf(A,ACC,O,P)
|
|
||||||
#define VACCTIMESMINUSI2MEM(A,ACC,O,P) VACCTIMESMINUSI2MEMf(A,ACC,O,P)
|
|
||||||
|
|
||||||
#define VPERM0(A,B) VPERM0f(A,B)
|
|
||||||
#define VPERM1(A,B) VPERM1f(A,B)
|
|
||||||
#define VPERM2(A,B) VPERM2f(A,B)
|
|
||||||
#define VPERM3(A,B) VPERM3f(A,B)
|
|
||||||
#define VSHUFMEM(OFF,A,DEST) VSHUFMEMf(OFF,A,DEST)
|
|
||||||
|
|
||||||
#define ZEND1(A,B,C) ZEND1f(A,B,C)
|
|
||||||
#define ZEND2(A,B,C) ZEND2f(A,B,C)
|
|
||||||
#define ZLOAD(A,B,C,D) ZLOADf(A,B,C,D)
|
|
||||||
#define ZMUL(A,B,C,D,E) ZMULf(A,B,C,D,E)
|
|
||||||
#define ZMADD(A,B,C,D,E) ZMADDf(A,B,C,D,E)
|
|
||||||
|
|
||||||
#define ZMUL(A,B,C,D,E) ZMULf(A,B,C,D,E)
|
|
||||||
#define ZMADD(A,B,C,D,E) ZMADDf(A,B,C,D,E)
|
|
||||||
|
|
||||||
#define VADDMEM(O,A,B,C) VADDMEMf(O,A,B,C)
|
|
||||||
#define VSUBMEM(O,A,B,C) VSUBMEMf(O,A,B,C)
|
|
||||||
#define VMOVIDUP(A,B,C) VMOVIDUPf(A,B,C)
|
|
||||||
#define VMOVRDUP(A,B,C) VMOVRDUPf(A,B,C)
|
|
||||||
#define VMADDSUB(A,B,accum) VMADDSUBf(A,B,accum)
|
|
||||||
#define VSHUF(A,B) VSHUFf(A,B)
|
|
||||||
|
|
||||||
#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
|
|
||||||
#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
|
|
||||||
|
|
||||||
namespace Grid {
|
namespace Grid {
|
||||||
namespace QCD {
|
namespace QCD {
|
||||||
@ -136,26 +63,6 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
|
|||||||
|
|
||||||
SE=st.GetEntry(ptype,Xm,ss);
|
SE=st.GetEntry(ptype,Xm,ss);
|
||||||
|
|
||||||
#if 0
|
|
||||||
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
|
|
||||||
else pf=(void *)&pbuf[SE->_offset];
|
|
||||||
|
|
||||||
LOAD64(%r9,pf);
|
|
||||||
__asm__(
|
|
||||||
VPREFETCH(0,%r9)
|
|
||||||
VPREFETCH(1,%r9)
|
|
||||||
VPREFETCH(2,%r9)
|
|
||||||
VPREFETCH(3,%r9)
|
|
||||||
VPREFETCH(4,%r9)
|
|
||||||
VPREFETCH(5,%r9)
|
|
||||||
VPREFETCH(6,%r9)
|
|
||||||
VPREFETCH(7,%r9)
|
|
||||||
VPREFETCH(8,%r9)
|
|
||||||
VPREFETCH(9,%r9)
|
|
||||||
VPREFETCH(10,%r9)
|
|
||||||
VPREFETCH(11,%r9) );
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Xm
|
// Xm
|
||||||
offset = SE->_offset;
|
offset = SE->_offset;
|
||||||
local = SE->_is_local;
|
local = SE->_is_local;
|
||||||
@ -322,8 +229,6 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
|
|||||||
offset = SE->_offset;
|
offset = SE->_offset;
|
||||||
local = SE->_is_local;
|
local = SE->_is_local;
|
||||||
|
|
||||||
// PREFETCH_R(A);
|
|
||||||
|
|
||||||
// Prefetch
|
// Prefetch
|
||||||
SE=st.GetEntry(ptype,Xm,(ss+1)%osites);
|
SE=st.GetEntry(ptype,Xm,(ss+1)%osites);
|
||||||
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
|
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
|
||||||
|
135
lib/simd/Intel512double.h
Normal file
135
lib/simd/Intel512double.h
Normal file
@ -0,0 +1,135 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./lib/simd/Intel512double.h
|
||||||
|
|
||||||
|
Copyright (C) 2015
|
||||||
|
|
||||||
|
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
// No include guard: this header may be included multiple times; every macro is #undef'd before it is redefined.
|
||||||
|
#undef VZERO
|
||||||
|
#undef VMOV
|
||||||
|
#undef VLOAD
|
||||||
|
#undef VSTORE
|
||||||
|
#define VZERO(A) VZEROd(A)
|
||||||
|
#define VMOV(A,B) VMOVd(A,B)
|
||||||
|
#define VLOAD(OFF,PTR,DEST) VLOADd(OFF,PTR,DEST)
|
||||||
|
#define VSTORE(OFF,PTR,SRC) VSTOREd(OFF,PTR,SRC)
|
||||||
|
|
||||||
|
#undef VADD
|
||||||
|
#undef VSUB
|
||||||
|
#undef VMUL
|
||||||
|
#undef VMADD
|
||||||
|
#define VADD(A,B,C) VADDd(A,B,C)
|
||||||
|
#define VSUB(A,B,C) VSUBd(A,B,C)
|
||||||
|
#define VMUL(Uri,Uir,Chi) VMULd(Uri,Uir,Chi)
|
||||||
|
#define VMADD(Uri,Uir,Chi) VMADDd(Uri,Uir,Chi)
|
||||||
|
|
||||||
|
|
||||||
|
#undef VTIMESI
|
||||||
|
#undef VTIMESI0
|
||||||
|
#undef VTIMESI1
|
||||||
|
#undef VTIMESI2
|
||||||
|
#define VTIMESI(A,B,C) VTIMESId(A,B,C)
|
||||||
|
#define VTIMESI0(A,B,C) VTIMESI0d(A,B,C)
|
||||||
|
#define VTIMESI1(A,B,C) VTIMESI1d(A,B,C)
|
||||||
|
#define VTIMESI2(A,B,C) VTIMESI2d(A,B,C)
|
||||||
|
|
||||||
|
#undef VTIMESMINUSI
|
||||||
|
#undef VTIMESMINUSI0
|
||||||
|
#undef VTIMESMINUSI1
|
||||||
|
#undef VTIMESMINUSI2
|
||||||
|
#define VTIMESMINUSI(A,B,C) VTIMESMINUSId(A,B,C)
|
||||||
|
#define VTIMESMINUSI0(A,B,C) VTIMESMINUSI0d(A,B,C)
|
||||||
|
#define VTIMESMINUSI1(A,B,C) VTIMESMINUSI1d(A,B,C)
|
||||||
|
#define VTIMESMINUSI2(A,B,C) VTIMESMINUSI2d(A,B,C)
|
||||||
|
|
||||||
|
#undef VACCTIMESI
|
||||||
|
#undef VACCTIMESI0
|
||||||
|
#undef VACCTIMESI1
|
||||||
|
#undef VACCTIMESI2
|
||||||
|
#define VACCTIMESI(A,B,C) VACCTIMESId(A,B,C)
|
||||||
|
#define VACCTIMESI0(A,B,C) VACCTIMESI0d(A,B,C)
|
||||||
|
#define VACCTIMESI1(A,B,C) VACCTIMESI1d(A,B,C)
|
||||||
|
#define VACCTIMESI2(A,B,C) VACCTIMESI2d(A,B,C)
|
||||||
|
|
||||||
|
#undef VACCTIMESMINUSI
|
||||||
|
#undef VACCTIMESMINUSI0
|
||||||
|
#undef VACCTIMESMINUSI1
|
||||||
|
#undef VACCTIMESMINUSI2
|
||||||
|
#define VACCTIMESMINUSI(A,B,C) VACCTIMESMINUSId(A,B,C)
|
||||||
|
#define VACCTIMESMINUSI0(A,B,C) VACCTIMESMINUSI0d(A,B,C)
|
||||||
|
#define VACCTIMESMINUSI1(A,B,C) VACCTIMESMINUSI1d(A,B,C)
|
||||||
|
#define VACCTIMESMINUSI2(A,B,C) VACCTIMESMINUSI2d(A,B,C)
|
||||||
|
|
||||||
|
#undef VACCTIMESI1MEM
|
||||||
|
#undef VACCTIMESI2MEM
|
||||||
|
#define VACCTIMESI1MEM(A,ACC,O,P) VACCTIMESI1MEMd(A,ACC,O,P)
|
||||||
|
#define VACCTIMESI2MEM(A,ACC,O,P) VACCTIMESI2MEMd(A,ACC,O,P)
|
||||||
|
|
||||||
|
#undef VACCTIMESMINUSI1MEM
|
||||||
|
#undef VACCTIMESMINUSI2MEM
|
||||||
|
#define VACCTIMESMINUSI1MEM(A,ACC,O,P) VACCTIMESMINUSI1MEMd(A,ACC,O,P)
|
||||||
|
#define VACCTIMESMINUSI2MEM(A,ACC,O,P) VACCTIMESMINUSI2MEMd(A,ACC,O,P)
|
||||||
|
|
||||||
|
#undef VPERM0
|
||||||
|
#undef VPERM1
|
||||||
|
#undef VPERM2
|
||||||
|
#undef VPERM3
|
||||||
|
#define VPERM0(A,B) VPERM0d(A,B)
|
||||||
|
#define VPERM1(A,B) VPERM1d(A,B)
|
||||||
|
#define VPERM2(A,B) VPERM2d(A,B)
|
||||||
|
#define VPERM3(A,B) VPERM3d(A,B)
|
||||||
|
|
||||||
|
#undef VSHUFMEM
|
||||||
|
#undef VADDMEM
|
||||||
|
#undef VSUBMEM
|
||||||
|
#define VSHUFMEM(OFF,A,DEST) VSHUFMEMd(OFF,A,DEST)
|
||||||
|
#define VADDMEM(O,A,B,C) VADDMEMd(O,A,B,C)
|
||||||
|
#define VSUBMEM(O,A,B,C) VSUBMEMd(O,A,B,C)
|
||||||
|
|
||||||
|
// Point the precision-neutral VMOVIDUP / VMOVRDUP / VMADDSUB / VSHUF names at
// their "d"-suffixed implementations. Each name is #undef'd first so that this
// header can be included after another precision selection without a macro
// redefinition clash.
#undef VMOVIDUP
#undef VMOVRDUP
#undef VMADDSUB
#undef VSHUF
#define VMOVIDUP(A,B,C) VMOVIDUPd(A,B,C)
#define VMOVRDUP(A,B,C) VMOVRDUPd(A,B,C)
#define VMADDSUB(A,B,accum) VMADDSUBd(A,B,accum)
#define VSHUF(A,B) VSHUFd(A,B)
|
||||||
|
|
||||||
|
|
||||||
|
#undef ZEND1
|
||||||
|
#undef ZEND2
|
||||||
|
#undef ZLOAD
|
||||||
|
#undef ZMUL
|
||||||
|
#undef ZMADD
|
||||||
|
#undef ZMULMEM2SP
|
||||||
|
#undef ZMADDMEM2SP
|
||||||
|
|
||||||
|
#define ZEND1(A,B,C) ZEND1d(A,B,C)
|
||||||
|
#define ZEND2(A,B,C) ZEND2d(A,B,C)
|
||||||
|
#define ZLOAD(A,B,C,D) ZLOADd(A,B,C,D)
|
||||||
|
#define ZMUL(A,B,C,D,E) ZMULd(A,B,C,D,E)
|
||||||
|
#define ZMADD(A,B,C,D,E) ZMADDd(A,B,C,D,E)
|
||||||
|
#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
|
||||||
|
#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
|
||||||
|
|
135
lib/simd/Intel512single.h
Normal file
135
lib/simd/Intel512single.h
Normal file
@ -0,0 +1,135 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./lib/simd/Intel512single.h
|
||||||
|
|
||||||
|
Copyright (C) 2015
|
||||||
|
|
||||||
|
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
// No include guard: this header may be included multiple times; every macro is #undef'd before it is redefined.
|
||||||
|
#undef VZERO
|
||||||
|
#undef VMOV
|
||||||
|
#undef VLOAD
|
||||||
|
#undef VSTORE
|
||||||
|
#define VZERO(A) VZEROf(A)
|
||||||
|
#define VMOV(A,B) VMOVf(A,B)
|
||||||
|
#define VLOAD(OFF,PTR,DEST) VLOADf(OFF,PTR,DEST)
|
||||||
|
#define VSTORE(OFF,PTR,SRC) VSTOREf(OFF,PTR,SRC)
|
||||||
|
|
||||||
|
#undef VADD
|
||||||
|
#undef VSUB
|
||||||
|
#undef VMUL
|
||||||
|
#undef VMADD
|
||||||
|
#define VADD(A,B,C) VADDf(A,B,C)
|
||||||
|
#define VSUB(A,B,C) VSUBf(A,B,C)
|
||||||
|
#define VMUL(Uri,Uir,Chi) VMULf(Uri,Uir,Chi)
|
||||||
|
#define VMADD(Uri,Uir,Chi) VMADDf(Uri,Uir,Chi)
|
||||||
|
|
||||||
|
|
||||||
|
#undef VTIMESI
|
||||||
|
#undef VTIMESI0
|
||||||
|
#undef VTIMESI1
|
||||||
|
#undef VTIMESI2
|
||||||
|
#define VTIMESI(A,B,C) VTIMESIf(A,B,C)
|
||||||
|
#define VTIMESI0(A,B,C) VTIMESI0f(A,B,C)
|
||||||
|
#define VTIMESI1(A,B,C) VTIMESI1f(A,B,C)
|
||||||
|
#define VTIMESI2(A,B,C) VTIMESI2f(A,B,C)
|
||||||
|
|
||||||
|
#undef VTIMESMINUSI
|
||||||
|
#undef VTIMESMINUSI0
|
||||||
|
#undef VTIMESMINUSI1
|
||||||
|
#undef VTIMESMINUSI2
|
||||||
|
#define VTIMESMINUSI(A,B,C) VTIMESMINUSIf(A,B,C)
|
||||||
|
#define VTIMESMINUSI0(A,B,C) VTIMESMINUSI0f(A,B,C)
|
||||||
|
#define VTIMESMINUSI1(A,B,C) VTIMESMINUSI1f(A,B,C)
|
||||||
|
#define VTIMESMINUSI2(A,B,C) VTIMESMINUSI2f(A,B,C)
|
||||||
|
|
||||||
|
#undef VACCTIMESI
|
||||||
|
#undef VACCTIMESI0
|
||||||
|
#undef VACCTIMESI1
|
||||||
|
#undef VACCTIMESI2
|
||||||
|
#define VACCTIMESI(A,B,C) VACCTIMESIf(A,B,C)
|
||||||
|
#define VACCTIMESI0(A,B,C) VACCTIMESI0f(A,B,C)
|
||||||
|
#define VACCTIMESI1(A,B,C) VACCTIMESI1f(A,B,C)
|
||||||
|
#define VACCTIMESI2(A,B,C) VACCTIMESI2f(A,B,C)
|
||||||
|
|
||||||
|
#undef VACCTIMESMINUSI
|
||||||
|
#undef VACCTIMESMINUSI0
|
||||||
|
#undef VACCTIMESMINUSI1
|
||||||
|
#undef VACCTIMESMINUSI2
|
||||||
|
#define VACCTIMESMINUSI(A,B,C) VACCTIMESMINUSIf(A,B,C)
|
||||||
|
#define VACCTIMESMINUSI0(A,B,C) VACCTIMESMINUSI0f(A,B,C)
|
||||||
|
#define VACCTIMESMINUSI1(A,B,C) VACCTIMESMINUSI1f(A,B,C)
|
||||||
|
#define VACCTIMESMINUSI2(A,B,C) VACCTIMESMINUSI2f(A,B,C)
|
||||||
|
|
||||||
|
#undef VACCTIMESI1MEM
|
||||||
|
#undef VACCTIMESI2MEM
|
||||||
|
#define VACCTIMESI1MEM(A,ACC,O,P) VACCTIMESI1MEMf(A,ACC,O,P)
|
||||||
|
#define VACCTIMESI2MEM(A,ACC,O,P) VACCTIMESI2MEMf(A,ACC,O,P)
|
||||||
|
|
||||||
|
#undef VACCTIMESMINUSI1MEM
|
||||||
|
#undef VACCTIMESMINUSI2MEM
|
||||||
|
#define VACCTIMESMINUSI1MEM(A,ACC,O,P) VACCTIMESMINUSI1MEMf(A,ACC,O,P)
|
||||||
|
#define VACCTIMESMINUSI2MEM(A,ACC,O,P) VACCTIMESMINUSI2MEMf(A,ACC,O,P)
|
||||||
|
|
||||||
|
#undef VPERM0
|
||||||
|
#undef VPERM1
|
||||||
|
#undef VPERM2
|
||||||
|
#undef VPERM3
|
||||||
|
#define VPERM0(A,B) VPERM0f(A,B)
|
||||||
|
#define VPERM1(A,B) VPERM1f(A,B)
|
||||||
|
#define VPERM2(A,B) VPERM2f(A,B)
|
||||||
|
#define VPERM3(A,B) VPERM3f(A,B)
|
||||||
|
|
||||||
|
#undef VSHUFMEM
|
||||||
|
#undef VADDMEM
|
||||||
|
#undef VSUBMEM
|
||||||
|
#define VSHUFMEM(OFF,A,DEST) VSHUFMEMf(OFF,A,DEST)
|
||||||
|
#define VADDMEM(O,A,B,C) VADDMEMf(O,A,B,C)
|
||||||
|
#define VSUBMEM(O,A,B,C) VSUBMEMf(O,A,B,C)
|
||||||
|
|
||||||
|
// Point the precision-neutral VMOVIDUP / VMOVRDUP / VMADDSUB / VSHUF names at
// their "f"-suffixed implementations. Each name is #undef'd first so that this
// header can be included after another precision selection without a macro
// redefinition clash.
#undef VMOVIDUP
#undef VMOVRDUP
#undef VMADDSUB
#undef VSHUF
#define VMOVIDUP(A,B,C) VMOVIDUPf(A,B,C)
#define VMOVRDUP(A,B,C) VMOVRDUPf(A,B,C)
#define VMADDSUB(A,B,accum) VMADDSUBf(A,B,accum)
#define VSHUF(A,B) VSHUFf(A,B)
|
||||||
|
|
||||||
|
|
||||||
|
#undef ZEND1
|
||||||
|
#undef ZEND2
|
||||||
|
#undef ZLOAD
|
||||||
|
#undef ZMUL
|
||||||
|
#undef ZMADD
|
||||||
|
#undef ZMULMEM2SP
|
||||||
|
#undef ZMADDMEM2SP
|
||||||
|
|
||||||
|
#define ZEND1(A,B,C) ZEND1f(A,B,C)
|
||||||
|
#define ZEND2(A,B,C) ZEND2f(A,B,C)
|
||||||
|
#define ZLOAD(A,B,C,D) ZLOADf(A,B,C,D)
|
||||||
|
#define ZMUL(A,B,C,D,E) ZMULf(A,B,C,D,E)
|
||||||
|
#define ZMADD(A,B,C,D,E) ZMADDf(A,B,C,D,E)
|
||||||
|
#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
|
||||||
|
#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
|
||||||
|
|
@ -201,7 +201,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
// Enables to lift ALL loads earlier by a few cycles and alleviate OoO pressure if needed.
|
// Enables to lift ALL loads earlier by a few cycles and alleviate OoO pressure if needed.
|
||||||
// KNL is DUAL issue for FP, and lifting these loads is potentially important.
|
// KNL is DUAL issue for FP, and lifting these loads is potentially important.
|
||||||
// Need detailed profile data to be sure.
|
// Need detailed profile data to be sure.
|
||||||
|
#if 0
|
||||||
#define PREFETCH_U(A) \
|
#define PREFETCH_U(A) \
|
||||||
LOAD64(%r8,&U._odata[sU](A)) \
|
LOAD64(%r8,&U._odata[sU](A)) \
|
||||||
__asm__ ( \
|
__asm__ ( \
|
||||||
@ -230,7 +230,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
VPREFETCHW(9,%r8) \
|
VPREFETCHW(9,%r8) \
|
||||||
VPREFETCHW(10,%r8) \
|
VPREFETCHW(10,%r8) \
|
||||||
VPREFETCHW(11,%r8) );
|
VPREFETCHW(11,%r8) );
|
||||||
|
#endif
|
||||||
|
|
||||||
#define MULT_2SPIN_DIR(A) MULT_2SPIN(&U._odata[sU](A))
|
#define MULT_2SPIN_DIR(A) MULT_2SPIN(&U._odata[sU](A))
|
||||||
|
|
||||||
@ -244,6 +244,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
#define MULT_2SPIN_DIR_PFZM(A,p) MULT_2SPIN_PFZM(&U._odata[sU](A),p)
|
#define MULT_2SPIN_DIR_PFZM(A,p) MULT_2SPIN_PFZM(&U._odata[sU](A),p)
|
||||||
#define MULT_2SPIN_DIR_PFTM(A,p) MULT_2SPIN_PFTM(&U._odata[sU](A),p)
|
#define MULT_2SPIN_DIR_PFTM(A,p) MULT_2SPIN_PFTM(&U._odata[sU](A),p)
|
||||||
|
|
||||||
|
#if 0
|
||||||
#define MULT_2SPIN_UNOPT(ptr) \
|
#define MULT_2SPIN_UNOPT(ptr) \
|
||||||
LOAD64(%r8,ptr) \
|
LOAD64(%r8,ptr) \
|
||||||
__asm__ ( \
|
__asm__ ( \
|
||||||
@ -289,6 +290,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
ZEND2(UChi_11,Z3,Chi_10) \
|
ZEND2(UChi_11,Z3,Chi_10) \
|
||||||
ZEND2(UChi_02,Z4,Chi_02) \
|
ZEND2(UChi_02,Z4,Chi_02) \
|
||||||
ZEND2(UChi_12,Z5,Chi_12) );
|
ZEND2(UChi_12,Z5,Chi_12) );
|
||||||
|
#endif
|
||||||
|
|
||||||
#define MULT_2SPIN_PFXM(ptr,pf) MULT_2SPIN(ptr)
|
#define MULT_2SPIN_PFXM(ptr,pf) MULT_2SPIN(ptr)
|
||||||
#define MULT_2SPIN_PFYM(ptr,pf) MULT_2SPIN(ptr)
|
#define MULT_2SPIN_PFYM(ptr,pf) MULT_2SPIN(ptr)
|
||||||
@ -299,10 +301,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
#define MULT_2SPIN_PFYP(ptr,pf) MULT_2SPIN(ptr)
|
#define MULT_2SPIN_PFYP(ptr,pf) MULT_2SPIN(ptr)
|
||||||
#define MULT_2SPIN_PFXP(ptr,pf) MULT_2SPIN(ptr)
|
#define MULT_2SPIN_PFXP(ptr,pf) MULT_2SPIN(ptr)
|
||||||
|
|
||||||
#define MULT_2SPINa(ptr) MULT_2SPIN_PF(ptr,ptr,VPREFETCHG);
|
// MULT_2SPINa(ptr) MULT_2SPIN_PF(ptr,ptr,VPREFETCHG);
|
||||||
#define MULT_2SPIN(ptr) MULT_ADDSUB_2SPIN(ptr);
|
#define MULT_2SPIN(ptr) MULT_ADDSUB_2SPIN(ptr);
|
||||||
|
|
||||||
|
#if 0
|
||||||
#define MULT_2SPIN_PF(ptr,pf,VPF) \
|
#define MULT_2SPIN_PF(ptr,pf,VPF) \
|
||||||
LOAD64(%r8,ptr) \
|
LOAD64(%r8,ptr) \
|
||||||
LOAD64(%r9,pf) \
|
LOAD64(%r9,pf) \
|
||||||
@ -343,8 +345,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
ZEND2(UChi_02,Z4,Chi_02) \
|
ZEND2(UChi_02,Z4,Chi_02) \
|
||||||
VPF(11,%r9) \
|
VPF(11,%r9) \
|
||||||
ZEND2(UChi_12,Z5,Chi_12) );
|
ZEND2(UChi_12,Z5,Chi_12) );
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if 0
|
||||||
#define MULT_2SPIN_PFNONE(ptr,pf,VPF) \
|
#define MULT_2SPIN_PFNONE(ptr,pf,VPF) \
|
||||||
LOAD64(%r8,ptr) \
|
LOAD64(%r8,ptr) \
|
||||||
LOAD64(%r9,pf) \
|
LOAD64(%r9,pf) \
|
||||||
@ -364,7 +367,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
VPF(9,%r9) \
|
VPF(9,%r9) \
|
||||||
VPF(10,%r9) \
|
VPF(10,%r9) \
|
||||||
VPF(11,%r9) );
|
VPF(11,%r9) );
|
||||||
|
#endif
|
||||||
|
|
||||||
// Pretty much Perfectly Pipelined
|
// Pretty much Perfectly Pipelined
|
||||||
|
|
||||||
@ -720,7 +723,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
VSUB(UChi_11,result_31,result_31)\
|
VSUB(UChi_11,result_31,result_31)\
|
||||||
VSUB(UChi_12,result_32,result_32) );
|
VSUB(UChi_12,result_32,result_32) );
|
||||||
|
|
||||||
#define PREFETCH_CHIMU(A)
|
//define PREFETCH_CHIMU(A)
|
||||||
|
|
||||||
#define PERMUTE_DIR0 __asm__ ( \
|
#define PERMUTE_DIR0 __asm__ ( \
|
||||||
VPERM0(Chi_00,Chi_00) \
|
VPERM0(Chi_00,Chi_00) \
|
||||||
|
Loading…
x
Reference in New Issue
Block a user