From 805255627593de52c8e68fe994e0a7ee9481faaa Mon Sep 17 00:00:00 2001 From: paboyle Date: Thu, 31 Mar 2016 14:51:32 +0100 Subject: [PATCH 1/9] Cleaning up the single/double kernel implementation switch --- lib/qcd/action/fermion/WilsonKernelsAsm.cc | 97 +-------------- lib/simd/Intel512double.h | 135 +++++++++++++++++++++ lib/simd/Intel512single.h | 135 +++++++++++++++++++++ lib/simd/Intel512wilson.h | 17 +-- 4 files changed, 281 insertions(+), 103 deletions(-) create mode 100644 lib/simd/Intel512double.h create mode 100644 lib/simd/Intel512single.h diff --git a/lib/qcd/action/fermion/WilsonKernelsAsm.cc b/lib/qcd/action/fermion/WilsonKernelsAsm.cc index b5f016f5..bdda199f 100644 --- a/lib/qcd/action/fermion/WilsonKernelsAsm.cc +++ b/lib/qcd/action/fermion/WilsonKernelsAsm.cc @@ -32,81 +32,8 @@ Author: paboyle #include -#undef VLOAD -#undef VSTORE -#undef VMUL -#undef VMADD -#undef ZEND -#undef ZLOAD -#undef ZMUL -#undef ZMADD -#undef VZERO -#undef VTIMESI -#undef VTIMESMINUSI -#undef VMOVIDUP -#undef VMOVRDUP -#undef VMADDSUB -#undef VSHUF +#include -#define VZERO(A) VZEROf(A) -#define VMOV(A,B) VMOVf(A,B) -#define VLOAD(OFF,PTR,DEST) VLOADf(OFF,PTR,DEST) -#define VSTORE(OFF,PTR,SRC) VSTOREf(OFF,PTR,SRC) - -#define VADD(A,B,C) VADDf(A,B,C) -#define VSUB(A,B,C) VSUBf(A,B,C) -#define VMUL(Uri,Uir,Chi) VMULf(Uri,Uir,Chi) -#define VMADD(Uri,Uir,Chi) VMADDf(Uri,Uir,Chi) - -#define VTIMESI(A,B,C) VTIMESIf(A,B,C) -#define VTIMESMINUSI(A,B,C) VTIMESMINUSIf(A,B,C) -#define VACCTIMESI(A,B,C) VACCTIMESIf(A,B,C) -#define VACCTIMESMINUSI(A,B,C) VACCTIMESMINUSIf(A,B,C) - -#define VTIMESI0(A,B,C) VTIMESI0f(A,B,C) -#define VTIMESMINUSI0(A,B,C) VTIMESMINUSI0f(A,B,C) -#define VACCTIMESI0(A,B,C) VACCTIMESI0f(A,B,C) -#define VACCTIMESMINUSI0(A,B,C) VACCTIMESMINUSI0f(A,B,C) - -#define VTIMESI1(A,B,C) VTIMESI1f(A,B,C) -#define VTIMESMINUSI1(A,B,C) VTIMESMINUSI1f(A,B,C) -#define VACCTIMESI1(A,B,C) VACCTIMESI1f(A,B,C) -#define VACCTIMESMINUSI1(A,B,C) VACCTIMESMINUSI1f(A,B,C) - -#define VTIMESI2(A,B,C) VTIMESI2f(A,B,C) -#define VTIMESMINUSI2(A,B,C) VTIMESMINUSI2f(A,B,C) -#define VACCTIMESI2(A,B,C) VACCTIMESI2f(A,B,C) -#define VACCTIMESMINUSI2(A,B,C) VACCTIMESMINUSI2f(A,B,C) - -#define VACCTIMESI1MEM(A,ACC,O,P) VACCTIMESI1MEMf(A,ACC,O,P) -#define VACCTIMESI2MEM(A,ACC,O,P) VACCTIMESI2MEMf(A,ACC,O,P) -#define VACCTIMESMINUSI1MEM(A,ACC,O,P) VACCTIMESMINUSI1MEMf(A,ACC,O,P) -#define VACCTIMESMINUSI2MEM(A,ACC,O,P) VACCTIMESMINUSI2MEMf(A,ACC,O,P) - -#define VPERM0(A,B) VPERM0f(A,B) -#define VPERM1(A,B) VPERM1f(A,B) -#define VPERM2(A,B) VPERM2f(A,B) -#define VPERM3(A,B) VPERM3f(A,B) -#define VSHUFMEM(OFF,A,DEST) VSHUFMEMf(OFF,A,DEST) - -#define ZEND1(A,B,C) ZEND1f(A,B,C) -#define ZEND2(A,B,C) ZEND2f(A,B,C) -#define ZLOAD(A,B,C,D) ZLOADf(A,B,C,D) -#define ZMUL(A,B,C,D,E) ZMULf(A,B,C,D,E) -#define ZMADD(A,B,C,D,E) ZMADDf(A,B,C,D,E) - -#define ZMUL(A,B,C,D,E) ZMULf(A,B,C,D,E) -#define ZMADD(A,B,C,D,E) ZMADDf(A,B,C,D,E) - -#define VADDMEM(O,A,B,C) VADDMEMf(O,A,B,C) -#define VSUBMEM(O,A,B,C) VSUBMEMf(O,A,B,C) -#define VMOVIDUP(A,B,C) VMOVIDUPf(A,B,C) -#define VMOVRDUP(A,B,C) VMOVRDUPf(A,B,C) -#define VMADDSUB(A,B,accum) VMADDSUBf(A,B,accum) -#define VSHUF(A,B) VSHUFf(A,B) - -#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) -#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) namespace Grid { namespace QCD { @@ -136,26 +63,6 @@ void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField SE=st.GetEntry(ptype,Xm,ss); -#if 0 - if (SE->_is_local) pf=(void *)&plocal[SE->_offset]; - else pf=(void *)&pbuf[SE->_offset]; - - LOAD64(%r9,pf); - __asm__( - VPREFETCH(0,%r9) - VPREFETCH(1,%r9) - VPREFETCH(2,%r9) - VPREFETCH(3,%r9) - VPREFETCH(4,%r9) - VPREFETCH(5,%r9) - VPREFETCH(6,%r9) - VPREFETCH(7,%r9) - VPREFETCH(8,%r9) - VPREFETCH(9,%r9) - VPREFETCH(10,%r9) - VPREFETCH(11,%r9) ); -#endif - // Xm offset = SE->_offset; local = SE->_is_local; @@ -322,8 +229,6 @@ void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField offset = SE->_offset; local = SE->_is_local; - // PREFETCH_R(A); - // Prefetch SE=st.GetEntry(ptype,Xm,(ss+1)%osites); if (SE->_is_local) pf=(void *)&plocal[SE->_offset]; diff --git a/lib/simd/Intel512double.h b/lib/simd/Intel512double.h new file mode 100644 index 00000000..a2e9d38f --- /dev/null +++ b/lib/simd/Intel512double.h @@ -0,0 +1,135 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/simd/Avx512Asm.h + + Copyright (C) 2015 + +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +// No guard can be multiply included as undef clearage +#undef VZERO +#undef VMOV +#undef VLOAD +#undef VSTORE +#define VZERO(A) VZEROd(A) +#define VMOV(A,B) VMOVd(A,B) +#define VLOAD(OFF,PTR,DEST) VLOADd(OFF,PTR,DEST) +#define VSTORE(OFF,PTR,SRC) VSTOREd(OFF,PTR,SRC) + +#undef VADD +#undef VSUB +#undef VMUL +#undef VMADD +#define VADD(A,B,C) VADDd(A,B,C) +#define VSUB(A,B,C) VSUBd(A,B,C) +#define VMUL(Uri,Uir,Chi) VMULd(Uri,Uir,Chi) +#define VMADD(Uri,Uir,Chi) VMADDd(Uri,Uir,Chi) + + +#undef VTIMESI +#undef VTIMESI0 +#undef VTIMESI1 +#undef VTIMESI2 +#define VTIMESI(A,B,C) VTIMESId(A,B,C) +#define VTIMESI0(A,B,C) VTIMESI0d(A,B,C) +#define VTIMESI1(A,B,C) VTIMESI1d(A,B,C) +#define VTIMESI2(A,B,C) VTIMESI2d(A,B,C) + +#undef VTIMESMINUSI +#undef VTIMESMINUSI0 +#undef VTIMESMINUSI1 +#undef VTIMESMINUSI2 +#define VTIMESMINUSI(A,B,C) VTIMESMINUSId(A,B,C) +#define VTIMESMINUSI0(A,B,C) VTIMESMINUSI0d(A,B,C) +#define VTIMESMINUSI1(A,B,C) VTIMESMINUSI1d(A,B,C) +#define VTIMESMINUSI2(A,B,C) VTIMESMINUSI2d(A,B,C) + +#undef VACCTIMESI +#undef VACCTIMESI0 +#undef VACCTIMESI1 +#undef VACCTIMESI2 +#define VACCTIMESI(A,B,C) VACCTIMESId(A,B,C) +#define VACCTIMESI0(A,B,C) VACCTIMESI0d(A,B,C) +#define VACCTIMESI1(A,B,C) VACCTIMESI1d(A,B,C) +#define VACCTIMESI2(A,B,C) VACCTIMESI2d(A,B,C) + +#undef VACCTIMESMINUSI +#undef VACCTIMESMINUSI0 +#undef VACCTIMESMINUSI1 +#undef VACCTIMESMINUSI2 +#define VACCTIMESMINUSI(A,B,C) VACCTIMESMINUSId(A,B,C) +#define VACCTIMESMINUSI0(A,B,C) VACCTIMESMINUSI0d(A,B,C) +#define VACCTIMESMINUSI1(A,B,C) VACCTIMESMINUSI1d(A,B,C) +#define VACCTIMESMINUSI2(A,B,C) VACCTIMESMINUSI2d(A,B,C) + +#undef VACCTIMESI1MEM +#undef VACCTIMESI2MEM +#define VACCTIMESI1MEM(A,ACC,O,P) VACCTIMESI1MEMd(A,ACC,O,P) +#define VACCTIMESI2MEM(A,ACC,O,P) VACCTIMESI2MEMd(A,ACC,O,P) + +#undef VACCTIMESMINUSI1MEM +#undef VACCTIMESMINUSI2MEM +#define VACCTIMESMINUSI1MEM(A,ACC,O,P) VACCTIMESMINUSI1MEMd(A,ACC,O,P) +#define VACCTIMESMINUSI2MEM(A,ACC,O,P) VACCTIMESMINUSI2MEMd(A,ACC,O,P) + +#undef VPERM0 +#undef VPERM1 +#undef VPERM2 +#undef VPERM3 +#define VPERM0(A,B) VPERM0d(A,B) +#define VPERM1(A,B) VPERM1d(A,B) +#define VPERM2(A,B) VPERM2d(A,B) +#define VPERM3(A,B) VPERM3d(A,B) + +#undef VSHUFMEM +#undef VADDMEM +#undef VSUBMEM +#define VSHUFMEM(OFF,A,DEST) VSHUFMEMd(OFF,A,DEST) +#define VADDMEM(O,A,B,C) VADDMEMd(O,A,B,C) +#define VSUBMEM(O,A,B,C) VSUBMEMd(O,A,B,C) + +#undef VMOVIDUP +#undef VMOVRDUP +#undef VMADDSBUB +#undef VSHUF +#define VMOVIDUP(A,B,C) VMOVIDUPd(A,B,C) +#define VMOVRDUP(A,B,C) VMOVRDUPd(A,B,C) +#define VMADDSUB(A,B,accum) VMADDSUBd(A,B,accum) +#define VSHUD(A,B) VSHUFd(A,B) + + +#undef ZEND1 +#undef ZEND2 +#undef ZLOAD +#undef ZMUL +#undef ZMADD +#undef ZMULMEM2SP +#undef ZMADDMEM2SP + +#define ZEND1(A,B,C) ZEND1d(A,B,C) +#define ZEND2(A,B,C) ZEND2d(A,B,C) +#define ZLOAD(A,B,C,D) ZLOADd(A,B,C,D) +#define ZMUL(A,B,C,D,E) ZMULd(A,B,C,D,E) +#define ZMADD(A,B,C,D,E) ZMADDd(A,B,C,D,E) +#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) +#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) + diff --git a/lib/simd/Intel512single.h b/lib/simd/Intel512single.h new file mode 100644 index 00000000..c94a7852 --- /dev/null +++ b/lib/simd/Intel512single.h @@ -0,0 +1,135 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/simd/Avx512Asm.h + + Copyright (C) 2015 + +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +// No guard can be multiply included as undef clearge of macros +#undef VZERO +#undef VMOV +#undef VLOAD +#undef VSTORE +#define VZERO(A) VZEROf(A) +#define VMOV(A,B) VMOVf(A,B) +#define VLOAD(OFF,PTR,DEST) VLOADf(OFF,PTR,DEST) +#define VSTORE(OFF,PTR,SRC) VSTOREf(OFF,PTR,SRC) + +#undef VADD +#undef VSUB +#undef VMUL +#undef VMADD +#define VADD(A,B,C) VADDf(A,B,C) +#define VSUB(A,B,C) VSUBf(A,B,C) +#define VMUL(Uri,Uir,Chi) VMULf(Uri,Uir,Chi) +#define VMADD(Uri,Uir,Chi) VMADDf(Uri,Uir,Chi) + + +#undef VTIMESI +#undef VTIMESI0 +#undef VTIMESI1 +#undef VTIMESI2 +#define VTIMESI(A,B,C) VTIMESIf(A,B,C) +#define VTIMESI0(A,B,C) VTIMESI0f(A,B,C) +#define VTIMESI1(A,B,C) VTIMESI1f(A,B,C) +#define VTIMESI2(A,B,C) VTIMESI2f(A,B,C) + +#undef VTIMESMINUSI +#undef VTIMESMINUSI0 +#undef VTIMESMINUSI1 +#undef VTIMESMINUSI2 +#define VTIMESMINUSI(A,B,C) VTIMESMINUSIf(A,B,C) +#define VTIMESMINUSI0(A,B,C) VTIMESMINUSI0f(A,B,C) +#define VTIMESMINUSI1(A,B,C) VTIMESMINUSI1f(A,B,C) +#define VTIMESMINUSI2(A,B,C) VTIMESMINUSI2f(A,B,C) + +#undef VACCTIMESI +#undef VACCTIMESI0 +#undef VACCTIMESI1 +#undef VACCTIMESI2 +#define VACCTIMESI(A,B,C) VACCTIMESIf(A,B,C) +#define VACCTIMESI0(A,B,C) VACCTIMESI0f(A,B,C) +#define VACCTIMESI1(A,B,C) VACCTIMESI1f(A,B,C) +#define VACCTIMESI2(A,B,C) VACCTIMESI2f(A,B,C) + +#undef VACCTIMESMINUSI +#undef VACCTIMESMINUSI0 +#undef VACCTIMESMINUSI1 +#undef VACCTIMESMINUSI2 +#define VACCTIMESMINUSI(A,B,C) VACCTIMESMINUSIf(A,B,C) +#define VACCTIMESMINUSI0(A,B,C) VACCTIMESMINUSI0f(A,B,C) +#define VACCTIMESMINUSI1(A,B,C) VACCTIMESMINUSI1f(A,B,C) +#define VACCTIMESMINUSI2(A,B,C) VACCTIMESMINUSI2f(A,B,C) + +#undef VACCTIMESI1MEM +#undef VACCTIMESI2MEM +#define VACCTIMESI1MEM(A,ACC,O,P) VACCTIMESI1MEMf(A,ACC,O,P) +#define VACCTIMESI2MEM(A,ACC,O,P) VACCTIMESI2MEMf(A,ACC,O,P) + +#undef VACCTIMESMINUSI1MEM +#undef VACCTIMESMINUSI2MEM +#define VACCTIMESMINUSI1MEM(A,ACC,O,P) VACCTIMESMINUSI1MEMf(A,ACC,O,P) +#define VACCTIMESMINUSI2MEM(A,ACC,O,P) VACCTIMESMINUSI2MEMf(A,ACC,O,P) + +#undef VPERM0 +#undef VPERM1 +#undef VPERM2 +#undef VPERM3 +#define VPERM0(A,B) VPERM0f(A,B) +#define VPERM1(A,B) VPERM1f(A,B) +#define VPERM2(A,B) VPERM2f(A,B) +#define VPERM3(A,B) VPERM3f(A,B) + +#undef VSHUFMEM +#undef VADDMEM +#undef VSUBMEM +#define VSHUFMEM(OFF,A,DEST) VSHUFMEMf(OFF,A,DEST) +#define VADDMEM(O,A,B,C) VADDMEMf(O,A,B,C) +#define VSUBMEM(O,A,B,C) VSUBMEMf(O,A,B,C) + +#undef VMOVIDUP +#undef VMOVRDUP +#undef VMADDSBUB +#undef VSHUF +#define VMOVIDUP(A,B,C) VMOVIDUPf(A,B,C) +#define VMOVRDUP(A,B,C) VMOVRDUPf(A,B,C) +#define VMADDSUB(A,B,accum) VMADDSUBf(A,B,accum) +#define VSHUF(A,B) VSHUFf(A,B) + + +#undef ZEND1 +#undef ZEND2 +#undef ZLOAD +#undef ZMUL +#undef ZMADD +#undef ZMULMEM2SP +#undef ZMADDMEM2SP + +#define ZEND1(A,B,C) ZEND1f(A,B,C) +#define ZEND2(A,B,C) ZEND2f(A,B,C) +#define ZLOAD(A,B,C,D) ZLOADf(A,B,C,D) +#define ZMUL(A,B,C,D,E) ZMULf(A,B,C,D,E) +#define ZMADD(A,B,C,D,E) ZMADDf(A,B,C,D,E) +#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) +#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) + diff --git a/lib/simd/Intel512wilson.h b/lib/simd/Intel512wilson.h index 2d0e1e35..64087ea6 100644 --- a/lib/simd/Intel512wilson.h +++ b/lib/simd/Intel512wilson.h @@ -201,7 +201,7 @@ Author: paboyle // Enables to lift ALL loads earlier by a few cycles and alleviate OoO pressure if needed. // KNL is DUAL issue for FP, and lifting these loads is potentially important. // Need detailed profile data to be sure. - +#if 0 #define PREFETCH_U(A) \ LOAD64(%r8,&U._odata[sU](A)) \ __asm__ ( \ @@ -230,7 +230,7 @@ Author: paboyle VPREFETCHW(9,%r8) \ VPREFETCHW(10,%r8) \ VPREFETCHW(11,%r8) ); - +#endif #define MULT_2SPIN_DIR(A) MULT_2SPIN(&U._odata[sU](A)) @@ -244,6 +244,7 @@ Author: paboyle #define MULT_2SPIN_DIR_PFZM(A,p) MULT_2SPIN_PFZM(&U._odata[sU](A),p) #define MULT_2SPIN_DIR_PFTM(A,p) MULT_2SPIN_PFTM(&U._odata[sU](A),p) +#if 0 #define MULT_2SPIN_UNOPT(ptr) \ LOAD64(%r8,ptr) \ __asm__ ( \ @@ -289,6 +290,7 @@ Author: paboyle ZEND2(UChi_11,Z3,Chi_10) \ ZEND2(UChi_02,Z4,Chi_02) \ ZEND2(UChi_12,Z5,Chi_12) ); +#endif #define MULT_2SPIN_PFXM(ptr,pf) MULT_2SPIN(ptr) #define MULT_2SPIN_PFYM(ptr,pf) MULT_2SPIN(ptr) @@ -299,10 +301,10 @@ Author: paboyle #define MULT_2SPIN_PFYP(ptr,pf) MULT_2SPIN(ptr) #define MULT_2SPIN_PFXP(ptr,pf) MULT_2SPIN(ptr) -#define MULT_2SPINa(ptr) MULT_2SPIN_PF(ptr,ptr,VPREFETCHG); +// MULT_2SPINa(ptr) MULT_2SPIN_PF(ptr,ptr,VPREFETCHG); #define MULT_2SPIN(ptr) MULT_ADDSUB_2SPIN(ptr); - +#if 0 #define MULT_2SPIN_PF(ptr,pf,VPF) \ LOAD64(%r8,ptr) \ LOAD64(%r9,pf) \ @@ -343,8 +345,9 @@ Author: paboyle ZEND2(UChi_02,Z4,Chi_02) \ VPF(11,%r9) \ ZEND2(UChi_12,Z5,Chi_12) ); +#endif - +#if 0 #define MULT_2SPIN_PFNONE(ptr,pf,VPF) \ LOAD64(%r8,ptr) \ LOAD64(%r9,pf) \ @@ -364,7 +367,7 @@ Author: paboyle VPF(9,%r9) \ VPF(10,%r9) \ VPF(11,%r9) ); - +#endif // Pretty much Perfectly Pipelined @@ -720,7 +723,7 @@ Author: paboyle VSUB(UChi_11,result_31,result_31)\ VSUB(UChi_12,result_32,result_32) ); -#define PREFETCH_CHIMU(A) +//define PREFETCH_CHIMU(A) #define PERMUTE_DIR0 __asm__ ( \ VPERM0(Chi_00,Chi_00) \ From f7b1060aedb8a5a264897cb2a6d9119a3e96206a Mon Sep 17 00:00:00 2001 From: paboyle Date: Thu, 31 Mar 2016 14:52:37 +0100 Subject: [PATCH 2/9] Use headers to clear macros and sub precision --- tests/Test_zmm.cc | 78 ++--------------------------------------------- 1 file changed, 2 insertions(+), 76 deletions(-) diff --git a/tests/Test_zmm.cc b/tests/Test_zmm.cc index 0417ce8d..90d92c46 100644 --- a/tests/Test_zmm.cc +++ b/tests/Test_zmm.cc @@ -252,39 +252,7 @@ int main(int argc,char **argv) #endif } -#undef VLOAD -#undef VSTORE -#undef VMUL -#undef VMADD -#undef ZEND1 -#undef ZEND2 -#undef ZLOAD -#undef ZMUL -#undef ZMADD -#undef VMOVIDUP -#undef VMOVRDUP -#undef VMADDSUB -#undef VSHUF - -#define VZERO(A) VZEROd(A) -#define VTIMESI(A,B,C) VTIMESId(A,B,C) -#define VTIMESMINUSI(A,B,C) VTIMESMINUSId(A,B,C) - -#define VLOAD(OFF,PTR,DEST) VLOADd(OFF,PTR,DEST) -#define VSTORE(OFF,PTR,SRC) VSTOREd(OFF,PTR,SRC) -#define VMUL(Uri,Uir,Chi) VMULd(Uri,Uir,Chi) -#define VMADD(Uri,Uir,Chi) VMADDd(Uri,Uir,Chi) -#define ZEND1(A,B,C) ZEND1d(A,B,C) -#define ZEND2(A,B,C) ZEND2d(A,B,C) -#define ZLOAD(A,B,C,D) ZLOADd(A,B,C,D) -#define ZMUL(A,B,C,D,E) ZMULd(A,B,C,D,E) -#define ZMADD(A,B,C,D,E) ZMADDd(A,B,C,D,E) -#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) -#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) -#define VMOVIDUP(A,B,C) VMOVIDUPd(A,B,C) -#define VMOVRDUP(A,B,C) VMOVRDUPd(A,B,C) -#define VMADDSUB(A,B,accum) VMADDSUBd(A,B,accum) -#define VSHUF(A,B) VSHUFd(A,B) +#include #define zz Z0 @@ -415,49 +383,7 @@ void WilsonDslashAvx512(void *ptr1,void *ptr2,void *ptr3) } -#undef VLOAD -#undef VSTORE -#undef VMUL -#undef VMADD -#undef ZEND1 -#undef ZEND2 -#undef ZLOAD -#undef ZMUL -#undef ZMADD -#undef VZERO -#undef VTIMESI -#undef VTIMESI0 -#undef VTIMESI1 -#undef VTIMESI2 -#undef VTIMESMINUSI -#undef ZMULMEM2SP -#undef ZMADDMEM2SP -#undef VMOVIDUP -#undef VMOVRDUP -#undef VMADDSUB -#undef VSHUF - -#define VZERO(A) VZEROf(A) -#define VMOV(A,B) VMOVf(A,B) -#define VADD(A,B,C) VADDf(A,B,C) -#define VSUB(A,B,C) VSUBf(A,B,C) -#define VTIMESI(A,B,C) VTIMESIf(A,B,C) -#define VTIMESMINUSI(A,B,C) VTIMESMINUSIf(A,B,C) -#define VLOAD(OFF,PTR,DEST) VLOADf(OFF,PTR,DEST) -#define VSTORE(OFF,PTR,SRC) VSTOREf(OFF,PTR,SRC) -#define VMUL(Uri,Uir,Chi) VMULf(Uri,Uir,Chi) -#define VMADD(Uri,Uir,Chi) VMADDf(Uri,Uir,Chi) -#define ZEND1(A,B,C) ZEND1f(A,B,C) -#define ZEND2(A,B,C) ZEND2f(A,B,C) -#define ZLOAD(A,B,C,D) ZLOADf(A,B,C,D) -#define ZMUL(A,B,C,D,E) ZMULf(A,B,C,D,E) -#define ZMADD(A,B,C,D,E) ZMADDf(A,B,C,D,E) -#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) -#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) -#define VMOVIDUP(A,B,C) VMOVIDUPf(A,B,C) -#define VMOVRDUP(A,B,C) VMOVRDUPf(A,B,C) -#define VMADDSUB(A,B,accum) VMADDSUBf(A,B,accum) -#define VSHUF(A,B) VSHUFf(A,B) +#include void ZmulF(void *ptr1,void *ptr2,void *ptr3) { From f473ef75910c4c1950c15d1cc0c77b354b53031f Mon Sep 17 00:00:00 2001 From: paboyle Date: Thu, 31 Mar 2016 07:47:42 -0700 Subject: [PATCH 3/9] Fixing the compile --- lib/simd/Intel512double.h | 4 ++-- lib/simd/Intel512single.h | 2 +- lib/simd/Intel512wilson.h | 3 ++- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/lib/simd/Intel512double.h b/lib/simd/Intel512double.h index a2e9d38f..2b2b9099 100644 --- a/lib/simd/Intel512double.h +++ b/lib/simd/Intel512double.h @@ -109,12 +109,12 @@ Author: paboyle #undef VMOVIDUP #undef VMOVRDUP -#undef VMADDSBUB +#undef VMADDSUB #undef VSHUF #define VMOVIDUP(A,B,C) VMOVIDUPd(A,B,C) #define VMOVRDUP(A,B,C) VMOVRDUPd(A,B,C) #define VMADDSUB(A,B,accum) VMADDSUBd(A,B,accum) -#define VSHUD(A,B) VSHUFd(A,B) +#define VSHUF(A,B) VSHUFd(A,B) #undef ZEND1 diff --git a/lib/simd/Intel512single.h b/lib/simd/Intel512single.h index c94a7852..3a39c6a4 100644 --- a/lib/simd/Intel512single.h +++ b/lib/simd/Intel512single.h @@ -109,7 +109,7 @@ Author: paboyle #undef VMOVIDUP #undef VMOVRDUP -#undef VMADDSBUB +#undef VMADDSUB #undef VSHUF #define VMOVIDUP(A,B,C) VMOVIDUPf(A,B,C) #define VMOVRDUP(A,B,C) VMOVRDUPf(A,B,C) diff --git a/lib/simd/Intel512wilson.h b/lib/simd/Intel512wilson.h index 64087ea6..243b89ed 100644 --- a/lib/simd/Intel512wilson.h +++ b/lib/simd/Intel512wilson.h @@ -302,7 +302,6 @@ Author: paboyle #define MULT_2SPIN_PFXP(ptr,pf) MULT_2SPIN(ptr) // MULT_2SPINa(ptr) MULT_2SPIN_PF(ptr,ptr,VPREFETCHG); -#define MULT_2SPIN(ptr) MULT_ADDSUB_2SPIN(ptr); #if 0 #define MULT_2SPIN_PF(ptr,pf,VPF) \ @@ -816,4 +815,6 @@ Author: paboyle VMADDSUB(Z5,Chi_12,UChi_12)\ ); +#define MULT_2SPIN(ptr) MULT_ADDSUB_2SPIN(ptr) + #endif From e67fc2be183bc7da011ee087e1e0219e7843d11d Mon Sep 17 00:00:00 2001 From: paboyle Date: Thu, 31 Mar 2016 16:00:37 +0100 Subject: [PATCH 4/9] Adding a trial for openmp overhead minimisation --- lib/qcd/action/fermion/WilsonFermion5D.cc | 120 ++++++++++++++++++++++ lib/qcd/action/fermion/WilsonFermion5D.h | 7 ++ 2 files changed, 127 insertions(+) diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc index b78f030e..581a3fc5 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.cc +++ b/lib/qcd/action/fermion/WilsonFermion5D.cc @@ -418,6 +418,126 @@ PARALLEL_FOR_LOOP alltime+=usecond(); } +template +void WilsonFermion5D::DhopInternalOMPbench(StencilImpl & st, LebesgueOrder &lo, + DoubledGaugeField & U, + const FermionField &in, FermionField &out,int dag) +{ + // assert((dag==DaggerNo) ||(dag==DaggerYes)); + alltime-=usecond(); + Compressor compressor(dag); + + // Assume balanced KMP_AFFINITY; this is forced in GridThread.h + + int threads = GridThread::GetThreads(); + int HT = GridThread::GetHyperThreads(); + int cores = GridThread::GetCores(); + int nwork = U._grid->oSites(); + + commtime -=usecond(); + auto handle = st.HaloExchangeBegin(in,compressor); + st.HaloExchangeComplete(handle); + commtime +=usecond(); + + jointime -=usecond(); + jointime +=usecond(); + + // Dhop takes the 4d grid from U, and makes a 5d index for fermion + // Not loop ordering and data layout. + // Designed to create + // - per thread reuse in L1 cache for U + // - 8 linear access unit stride streams per thread for Fermion for hw prefetchable. + +#pragma omp parallel + { + for(int jjj=0;jjj<1000;jjj++){ +#pragma omp barrier + dslashtime -=usecond(); + if ( dag == DaggerYes ) { + if( this->HandOptDslash ) { +#pragma omp for + for(int ss=0;ssoSites();ss++){ + int sU=ss; + for(int s=0;soSites();ss++){ + { + int sd; + for(sd=0;sdAsmOptDslash ) { + // for(int i=0;i<1;i++){ + // for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){ + // PerformanceCounter Counter(i); + // Counter.Start(); + +#pragma omp for + for(int t=0;tHandOptDslash ) { +#pragma omp for + + for(int ss=0;ssoSites();ss++){ + int sU=ss; + for(int s=0;soSites();ss++){ + int sU=ss; + for(int s=0;s void WilsonFermion5D::DhopInternalCommsOverlapCompute(StencilImpl & st, LebesgueOrder &lo, DoubledGaugeField & U, diff --git a/lib/qcd/action/fermion/WilsonFermion5D.h b/lib/qcd/action/fermion/WilsonFermion5D.h index 30e663e8..164a3c1a 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.h +++ b/lib/qcd/action/fermion/WilsonFermion5D.h @@ -120,6 +120,13 @@ namespace Grid { FermionField &out, int dag); + void DhopInternalOMPbench(StencilImpl & st, + LebesgueOrder &lo, + DoubledGaugeField &U, + const FermionField &in, + FermionField &out, + int dag); + void DhopInternalCommsThenCompute(StencilImpl & st, LebesgueOrder &lo, DoubledGaugeField &U, From e8dddb1596f271c15fa2161dca47f6ea672c8918 Mon Sep 17 00:00:00 2001 From: paboyle Date: Wed, 6 Apr 2016 10:32:54 +0100 Subject: [PATCH 5/9] Adding extra benchmark --- benchmarks/Benchmark_zmm.cc | 174 ++++++++++++++++++++++ lib/qcd/action/fermion/WilsonFermion5D.cc | 120 ++++++++++++++- 2 files changed, 293 insertions(+), 1 deletion(-) create mode 100644 benchmarks/Benchmark_zmm.cc diff --git a/benchmarks/Benchmark_zmm.cc b/benchmarks/Benchmark_zmm.cc new file mode 100644 index 00000000..f7bc8e8e --- /dev/null +++ b/benchmarks/Benchmark_zmm.cc @@ -0,0 +1,174 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_zmm.cc + + Copyright (C) 2015 + +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include +#include + + +using namespace Grid; +using namespace Grid::QCD; + +void ZmulF(void *ptr1,void *ptr2,void *ptr3); +void Zmul(void *ptr1,void *ptr2,void *ptr3); +void WilsonDslashAvx512(void *ptr1,void *ptr2,void *ptr3); +void WilsonDslashAvx512F(void *ptr1,void *ptr2,void *ptr3); +void TimesIAvx512F(void *ptr1,void *ptr3); +void TimesIAvx512(void *ptr1,void *ptr3); +void TimesMinusIAvx512F(void *ptr1,void *ptr3); +void TimesMinusIAvx512(void *ptr1,void *ptr3); + + +int bench(std::ofstream &os, std::vector &latt4,int Ls); + +int main(int argc,char **argv) +{ + Grid_init(&argc,&argv); + std::ofstream os("zmm.dat"); + + os << "#V Ls Lxy Lzt C++ Asm OMP L1 " < grid({L,L,m*L,m*L}); + bench(os,latt4,Ls); + } + } + } +} + +int bench(std::ofstream &os, std::vector &latt4,int Ls) +{ + + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); + GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); + GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); + GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); + + std::vector simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); + std::vector mpi_layout = GridDefaultMpi(); + int threads = GridThread::GetThreads(); + + std::vector seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + + GridSerialRNG sRNG; sRNG.SeedFixedIntegers(seeds4); + + LatticeFermion src (FGrid); + LatticeFermion tmp (FGrid); + LatticeFermion srce(FrbGrid); + + LatticeFermion resulto(FrbGrid); resulto=zero; + LatticeFermion resulta(FrbGrid); resulta=zero; + LatticeFermion junk(FrbGrid); junk=zero; + LatticeFermion diff(FrbGrid); + LatticeGaugeField Umu(UGrid); + + double mfc, mfa, mfo, mfl1; + + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); + random(RNG5,src); +#if 1 + random(RNG4,Umu); +#else + int mmu=2; + std::vector U(4,UGrid); + for(int mu=0;mu(Umu,mu); + if ( mu!=mmu ) U[mu] = zero; + if ( mu==mmu ) U[mu] = 1.0; + PokeIndex(Umu,U[mu],mu); + } +#endif + pickCheckerboard(Even,srce,src); + + RealD mass=0.1; + RealD M5 =1.8; + DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); + + std::cout<::DhopInternalOMPbench(StencilImpl & st, LebesgueOrder #pragma omp parallel { - for(int jjj=0;jjj<1000;jjj++){ + for(int jjj=0;jjj<100;jjj++){ #pragma omp barrier dslashtime -=usecond(); if ( dag == DaggerYes ) { @@ -538,6 +538,124 @@ void WilsonFermion5D::DhopInternalOMPbench(StencilImpl & st, LebesgueOrder alltime+=usecond(); } + +template +void WilsonFermion5D::DhopInternalL1bench(StencilImpl & st, LebesgueOrder &lo, + DoubledGaugeField & U, + const FermionField &in, FermionField &out,int dag) +{ + // assert((dag==DaggerNo) ||(dag==DaggerYes)); + alltime-=usecond(); + Compressor compressor(dag); + + // Assume balanced KMP_AFFINITY; this is forced in GridThread.h + + int threads = GridThread::GetThreads(); + int HT = GridThread::GetHyperThreads(); + int cores = GridThread::GetCores(); + int nwork = U._grid->oSites(); + + commtime -=usecond(); + auto handle = st.HaloExchangeBegin(in,compressor); + st.HaloExchangeComplete(handle); + commtime +=usecond(); + + jointime -=usecond(); + jointime +=usecond(); + + // Dhop takes the 4d grid from U, and makes a 5d index for fermion + // Not loop ordering and data layout. + // Designed to create + // - per thread reuse in L1 cache for U + // - 8 linear access unit stride streams per thread for Fermion for hw prefetchable. + +#pragma omp parallel + { + for(int jjj=0;jjj<100;jjj++){ +#pragma omp barrier + dslashtime -=usecond(); + if ( dag == DaggerYes ) { + if( this->HandOptDslash ) { +#pragma omp for + for(int ss=0;ssoSites();ss++){ + int sU=0; + for(int s=0;soSites();ss++){ + { + int sd; + for(sd=0;sdAsmOptDslash ) { + // for(int i=0;i<1;i++){ + // for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){ + // PerformanceCounter Counter(i); + // Counter.Start(); + +#pragma omp for + for(int t=0;tHandOptDslash ) { +#pragma omp for + + for(int ss=0;ssoSites();ss++){ + int sU=0; + for(int s=0;soSites();ss++){ + int sU=0; + for(int s=0;s void WilsonFermion5D::DhopInternalCommsOverlapCompute(StencilImpl & st, LebesgueOrder &lo, DoubledGaugeField & U, From b1192a89085c1fbafe90fc1b03c1bc486c80353b Mon Sep 17 00:00:00 2001 From: paboyle Date: Wed, 6 Apr 2016 03:00:07 -0700 Subject: [PATCH 6/9] Benchmark_zmm added --- benchmarks/Benchmark_zmm.cc | 13 ++----------- benchmarks/Make.inc | 6 +++++- lib/Make.inc | 4 ++-- lib/qcd/action/fermion/WilsonFermion5D.h | 10 +++++++++- tests/Make.inc | 22 +++++++++++----------- 5 files changed, 29 insertions(+), 26 deletions(-) diff --git a/benchmarks/Benchmark_zmm.cc b/benchmarks/Benchmark_zmm.cc index f7bc8e8e..a82fb23c 100644 --- a/benchmarks/Benchmark_zmm.cc +++ b/benchmarks/Benchmark_zmm.cc @@ -27,21 +27,11 @@ Author: paboyle /* END LEGAL */ #include #include -#include using namespace Grid; using namespace Grid::QCD; -void ZmulF(void *ptr1,void *ptr2,void *ptr3); -void Zmul(void *ptr1,void *ptr2,void *ptr3); -void WilsonDslashAvx512(void *ptr1,void *ptr2,void *ptr3); -void WilsonDslashAvx512F(void *ptr1,void *ptr2,void *ptr3); -void TimesIAvx512F(void *ptr1,void *ptr3); -void TimesIAvx512(void *ptr1,void *ptr3); -void TimesMinusIAvx512F(void *ptr1,void *ptr3); -void TimesMinusIAvx512(void *ptr1,void *ptr3); - int bench(std::ofstream &os, std::vector &latt4,int Ls); @@ -55,7 +45,7 @@ int main(int argc,char **argv) for(int m=1;m<=2;m++){ for(int Ls=8;Ls<=16;Ls+=8){ std::vector grid({L,L,m*L,m*L}); - bench(os,latt4,Ls); + bench(os,grid,Ls); } } } @@ -134,6 +124,7 @@ int bench(std::ofstream &os, std::vector &latt4,int Ls) mfa = flops*ncall/(t1-t0); std::cout< grid({L,L,m*L,m*L}); @@ -127,7 +127,7 @@ int bench(std::ofstream &os, std::vector &latt4,int Ls) int dag=DaggerNo; t0=usecond(); for(int i=0;i<1;i++){ - Dw.DhopInternalOMPbench(Dw.StencilOdd,Dw.LebesgueEvenOdd,Dw.UmuEven,srce,junk,dag); + Dw.DhopInternalOMPbench(Dw.StencilEven,Dw.LebesgueEvenOdd,Dw.UmuOdd,srce,resulta,dag); } t1=usecond(); mfo = flops*100/(t1-t0); @@ -135,7 +135,7 @@ int bench(std::ofstream &os, std::vector &latt4,int Ls) t0=usecond(); for(int i=0;i<1;i++){ - Dw.DhopInternalL1bench(Dw.StencilOdd,Dw.LebesgueEvenOdd,Dw.UmuEven,srce,junk,dag); + Dw.DhopInternalL1bench(Dw.StencilEven,Dw.LebesgueEvenOdd,Dw.UmuOdd,srce,resulta,dag); } t1=usecond(); mfl1= flops*100/(t1-t0); @@ -147,6 +147,7 @@ int bench(std::ofstream &os, std::vector &latt4,int Ls) << mfo<<" " << mfl1< &latt4,int Ls) Counter.Stop(); Counter.Report(); } +#endif //resulta = (-0.5) * resulta; diff = resulto-resulta; From a524ca2a4baefca48536cdd936776bbdf4d3c423 Mon Sep 17 00:00:00 2001 From: paboyle Date: Wed, 6 Apr 2016 03:35:56 -0700 Subject: [PATCH 8/9] New benchmark update --- benchmarks/Benchmark_zmm.cc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/benchmarks/Benchmark_zmm.cc b/benchmarks/Benchmark_zmm.cc index 12543c6d..5c84ad79 100644 --- a/benchmarks/Benchmark_zmm.cc +++ b/benchmarks/Benchmark_zmm.cc @@ -41,10 +41,14 @@ int main(int argc,char **argv) std::ofstream os("zmm.dat"); os << "#V Ls Lxy Lzt C++ Asm OMP L1 " < grid({L,L,m*L,m*L}); + for(int i=0;i<4;i++) { + std::cout << grid[i]<<"x"; + } + std::cout << Ls< Date: Wed, 6 Apr 2016 06:52:09 -0700 Subject: [PATCH 9/9] Smaller vols too --- benchmarks/Benchmark_zmm.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/Benchmark_zmm.cc b/benchmarks/Benchmark_zmm.cc index 5c84ad79..ebe7282e 100644 --- a/benchmarks/Benchmark_zmm.cc +++ b/benchmarks/Benchmark_zmm.cc @@ -41,7 +41,7 @@ int main(int argc,char **argv) std::ofstream os("zmm.dat"); os << "#V Ls Lxy Lzt C++ Asm OMP L1 " < grid({L,L,m*L,m*L});