From c77b7ee8971c80d4370771d10a916b2280f8a8d2 Mon Sep 17 00:00:00 2001 From: paboyle Date: Mon, 28 Mar 2016 17:55:22 -0600 Subject: [PATCH] AddSub based alternate SU3 routine --- benchmarks/Benchmark_dwf.cc | 4 +- lib/Make.inc | 4 +- lib/qcd/action/fermion/WilsonKernelsAsm.cc | 2 +- lib/simd/Avx512Asm.h | 1044 -------------------- lib/simd/Intel512avx.h | 10 + lib/simd/Intel512avxAddsub.h | 14 +- lib/simd/Intel512wilson.h | 67 +- tests/Make.inc | 22 +- tests/Test_zmm.cc | 35 +- 9 files changed, 116 insertions(+), 1086 deletions(-) delete mode 100644 lib/simd/Avx512Asm.h diff --git a/benchmarks/Benchmark_dwf.cc b/benchmarks/Benchmark_dwf.cc index 815c8590..e14af557 100644 --- a/benchmarks/Benchmark_dwf.cc +++ b/benchmarks/Benchmark_dwf.cc @@ -58,7 +58,7 @@ int main (int argc, char ** argv) std::cout< latt4 = GridDefaultLatt(); - const int Ls=16; + const int Ls=8; GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); @@ -122,7 +122,7 @@ int main (int argc, char ** argv) DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,params); std::cout< #if defined(AVX512) || defined (IMCI) //#if defined (IMCI) -#include +#include #undef VLOAD #undef VSTORE diff --git a/lib/simd/Avx512Asm.h b/lib/simd/Avx512Asm.h deleted file mode 100644 index 8363c2ab..00000000 --- a/lib/simd/Avx512Asm.h +++ /dev/null @@ -1,1044 +0,0 @@ - /************************************************************************************* - - Grid physics library, www.github.com/paboyle/Grid - - Source file: ./lib/simd/Avx512Asm.h - - Copyright (C) 2015 - -Author: paboyle - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - - See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ -#ifndef GRID_ASM_AV512_H -#define GRID_ASM_AV512_H - -// Serialisation elimination: -// i) ZEND -> ZEND1, ZEND2, 6 fold round robin. 
-// ii) TimesI -> TimesI_1, TimesI_2, 6 fold round robin -// - -////////////////////////////////////////////////////////////////////////////////////////// -// Register allocations for Wilson Kernel are precision and IMCI/AVX512 indept -////////////////////////////////////////////////////////////////////////////////////////// -#define result_00 %zmm0 -#define result_01 %zmm1 -#define result_02 %zmm2 - -#define result_10 %zmm3 -#define result_11 %zmm4 -#define result_12 %zmm5 - -#define result_20 %zmm6 -#define result_21 %zmm7 -#define result_22 %zmm8 - -#define result_30 %zmm9 -#define result_31 %zmm10 -#define result_32 %zmm11 - -#define Chi_00 %zmm12 -#define Chi_01 %zmm13 -#define Chi_02 %zmm14 - -#define Chi_10 %zmm15 -#define Chi_11 %zmm16 -#define Chi_12 %zmm17 - -#define UChi_00 %zmm18 -#define UChi_01 %zmm19 -#define UChi_02 %zmm20 - -#define UChi_10 %zmm21 -#define UChi_11 %zmm22 -#define UChi_12 %zmm23 - -#define Uir %zmm24 -//#define ONE %zmm24 -#define Uri %zmm25 - -#define Z0 %zmm26 -#define Z1 %zmm27 -#define Z2 %zmm28 -#define Z3 %zmm29 -#define Z4 %zmm30 -#define Z5 %zmm31 - -#define TMP Chi_00 - -#define Chimu_00 Chi_00 -#define Chimu_01 Chi_01 -#define Chimu_02 Chi_02 -#define Chimu_10 Chi_10 -#define Chimu_11 Chi_11 -#define Chimu_12 Chi_12 -#define Chimu_20 UChi_00 -#define Chimu_21 UChi_01 -#define Chimu_22 UChi_02 -#define Chimu_30 UChi_10 -#define Chimu_31 UChi_11 -#define Chimu_32 UChi_12 - -////////////////////////////////////////////////////////////////////////////////////////// -// CONFIG IMCI/AVX512 -////////////////////////////////////////////////////////////////////////////////////////// - -#ifdef IMCI -#define ASM_IMCI -#endif - -#ifdef AVX512 -#define ASM_AVX512 -#endif - -//////////////////////////////////////////////////////////////////////////////////////////////////// -// Opcodes common -//////////////////////////////////////////////////////////////////////////////////////////////////// - -#define VZEROf(A) "vpxorq " #A "," #A "," #A ";\n" -#define VZEROd(A) "vpxorq " #A "," #A "," #A ";\n" - -#define VTIMESIf(A,DEST, Z) \ - VTIMESI0f(A,DEST, Z) \ - VTIMESI1f(A,DEST, Z) \ - VTIMESI2f(A,DEST, Z) - -#define VTIMESId(A,DEST, Z) \ - VTIMESI0d(A,DEST, Z) \ - VTIMESI1d(A,DEST, Z) \ - VTIMESI2d(A,DEST, Z) - -#define VTIMESMINUSIf(A,DEST, Z) \ - VTIMESMINUSI0f(A,DEST, Z) \ - VTIMESMINUSI1f(A,DEST, Z) \ - VTIMESMINUSI2f(A,DEST, Z) - -#define VTIMESMINUSId(A,DEST, Z) \ - VTIMESMINUSI0d(A,DEST, Z) \ - VTIMESMINUSI1d(A,DEST, Z) \ - VTIMESMINUSI2d(A,DEST, Z) - -#define VACCTIMESIf(A,ACC,tmp) \ - VACCTIMESI0f(A,ACC,tmp) \ - VACCTIMESI1f(A,ACC,tmp) \ - VACCTIMESI2f(A,ACC,tmp) - -#define VACCTIMESId(A,ACC,tmp) \ - VACCTIMESI0d(A,ACC,tmp) \ - VACCTIMESI1d(A,ACC,tmp) \ - VACCTIMESI2d(A,ACC,tmp) - -#define VACCTIMESMINUSIf(A,ACC,tmp) \ - VACCTIMESMINUSI0f(A,ACC,tmp) \ - VACCTIMESMINUSI1f(A,ACC,tmp) \ - VACCTIMESMINUSI2f(A,ACC,tmp) - -#define VACCTIMESMINUSId(A,ACC,tmp) \ - VACCTIMESMINUSI0d(A,ACC,tmp) \ - VACCTIMESMINUSI1d(A,ACC,tmp) \ - VACCTIMESMINUSI2d(A,ACC,tmp) - -#define LOAD64i(A,ptr) __asm__ ( "movq %0, %" #A : : "r"(ptr) : #A ); -#define LOAD64(A,ptr) LOAD64i(A,ptr) - -#define VMOVf(A,DEST) "vmovaps " #A ", " #DEST ";\n" -#define VMOVd(A,DEST) "vmovapd " #A ", " #DEST ";\n" - - -#define VPREFETCHG(O,A) -#define VPREFETCHW(O,A) -//"vprefetche0 "#O"*64("#A");\n" "vprefetche1 ("#O"+12)*64("#A");\n" -#define VEVICT(O,A) -// "clevict0 "#O"*64("#A");\n" - -#define VLOADf(OFF,PTR,DEST) "vmovaps " #OFF "*64(" #PTR "), " #DEST ";\n" -#define VLOADd(OFF,PTR,DEST) "vmovapd " 
#OFF "*64(" #PTR "), " #DEST ";\n" - -#define VADDf(A,B,DEST) "vaddps " #A "," #B "," #DEST ";\n" -#define VADDd(A,B,DEST) "vaddpd " #A "," #B "," #DEST ";\n" - -#define VSUBf(A,B,DEST) "vsubps " #A "," #B "," #DEST ";\n" -#define VSUBd(A,B,DEST) "vsubpd " #A "," #B "," #DEST ";\n" - -#define VADDMEMf(O,A,B,DEST) "vaddps "#O"*64("#A ")," #B "," #DEST ";\n" -#define VADDMEMd(O,A,B,DEST) "vaddpd "#O"*64("#A ")," #B "," #DEST ";\n" - -#define VSUBMEMf(O,A,B,DEST) "vsubps "#O"*64("#A ")," #B "," #DEST ";\n" -#define VSUBMEMd(O,A,B,DEST) "vsubpd "#O"*64("#A ")," #B "," #DEST ";\n" - -#define VMULf(A,B,DEST) "vmulps " #A "," #B "," #DEST ";\n" -#define VMULd(A,B,DEST) "vmulpd " #A "," #B "," #DEST ";\n" - -#define VMADDf(A,B,DEST) "vfmadd231ps " #A "," #B "," #DEST ";\n" -#define VMADDd(A,B,DEST) "vfmadd231pd " #A "," #B "," #DEST ";\n" - -#define VMULMEMf(O,A,B,DEST) "vmulps " #O"*64("#A ")," #B "," #DEST ";\n" -#define VMULMEMd(O,A,B,DEST) "vmulpd " #O"*64("#A ")," #B "," #DEST ";\n" - -#define VMADDMEMf(O,A,B,DEST) "vfmadd231ps " #O"*64("#A "),"#B "," #DEST ";\n" -#define VMADDMEMd(O,A,B,DEST) "vfmadd231pd " #O"*64("#A "),"#B "," #DEST ";\n" - -#define ZLOADf(OFF,PTR,ri,ir) VLOADf(OFF,PTR,ir) VSHUFf(ir,ri) -#define ZLOADd(OFF,PTR,ri,ir) VLOADd(OFF,PTR,ir) VSHUFd(ir,ri) - -#define ZMULf(Ari,Air,B,Criir,Ciirr) VMULf(Ari,B,Criir) VMULf(Air,B,Ciirr) -#define ZMULd(Ari,Air,B,Criir,Ciirr) VMULd(Ari,B,Criir) VMULd(Air,B,Ciirr) - -#define ZMADDf(Ari,Air,B,Criir,Ciirr) VMADDf(Ari,B,Criir) VMADDf(Air,B,Ciirr) -#define ZMADDd(Ari,Air,B,Criir,Ciirr) VMADDd(Ari,B,Criir) VMADDd(Air,B,Ciirr) - -#define ZENDf(Criir,Ciirr, tmp) ZEND1f(Criir,Ciirr, tmp) ZEND2f(Criir,Ciirr, tmp) -#define ZENDd(Criir,Ciirr, tmp) ZEND1d(Criir,Ciirr, tmp) ZEND2d(Criir,Ciirr, tmp) - -#define ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\ - VSHUFMEMf(O,P,tmp) \ - VMULMEMf(O,P,B,Biirr) \ - VMULMEMf(O,P,C,Ciirr) \ - VMULf(tmp,B,Briir) \ - VMULf(tmp,C,Criir) - -#define ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\ - VSHUFMEMd(O,P,tmp) \ - VMULMEMd(O,P,B,Biirr) \ - VMULMEMd(O,P,C,Ciirr) \ - VMULd(tmp,B,Briir) \ - VMULd(tmp,C,Criir) - -#define ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\ - VSHUFMEMf(O,P,tmp) \ - VMADDMEMf(O,P,B,Biirr) \ - VMADDMEMf(O,P,C,Ciirr) \ - VMADDf(tmp,B,Briir) \ - VMADDf(tmp,C,Criir) - -#define TRAP " int3 ;\n" - -#define ZMADDMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) \ - VSHUFMEMd(O,P,tmp) \ - VMADDMEMd(O,P,B,Biirr) \ - VMADDMEMd(O,P,C,Ciirr) \ - VMADDd(tmp,B,Briir) \ - VMADDd(tmp,C,Criir) - -//////////////////////////////////////////////////////////////////////////////////////////////////// -// ISA changed between AVX512 and IMCI and requires arch dependent complex support -//////////////////////////////////////////////////////////////////////////////////////////////////// - -#define VPREFETCHNTA(O,A) -#define VPREFETCH(O,A) - -#define VSTOREf(OFF,PTR,SRC) "vmovaps " #SRC "," #OFF "*64(" #PTR ")" ";\n" -#define VSTOREd(OFF,PTR,SRC) "vmovapd " #SRC "," #OFF "*64(" #PTR ")" ";\n" - -// Swaps Re/Im ; could unify this with IMCI -#define VSHUFd(A,DEST) "vpshufd $0x4e," #A "," #DEST ";\n" -#define VSHUFf(A,DEST) "vpshufd $0xb1," #A "," #DEST ";\n" -#define VSHUFMEMd(OFF,A,DEST) "vpshufd $0x4e, " #OFF"*64("#A ")," #DEST ";\n" // 32 bit level: 1,0,3,2 -#define VSHUFMEMf(OFF,A,DEST) "vpshufd $0xb1, " #OFF"*64("#A ")," #DEST ";\n" // 32 bit level: 2,3,0,1 - - -//////////////////////////////////////////////////////////// -// Knights Landing specials 
-//////////////////////////////////////////////////////////// -#ifdef ASM_AVX512 - -#define MASK_REGS \ - __asm__ ("mov $0xAAAA, %%eax \n"\ - "kmovw %%eax, %%k6 \n"\ - "mov $0x5555, %%eax \n"\ - "kmovw %%eax, %%k7 \n" : : : "%eax"); - -// Merges accumulation for complex dot chain; less efficient under avx512 -//ZEND1d(Criir,Ciirr, tmp) "vaddpd " #Criir "{cdab} ," #Criir "," #Criir"{%k6}" ";\n" -//ZEND2d(Criir,Ciirr, tmp) "vsubpd " #Ciirr "{cdab} ," #Ciirr "," #Criir"{%k7}" ";\n" -//ZEND1f(Criir,Ciirr, tmp) "vaddps " #Criir "{cdab} ," #Criir "," #Criir"{%k6}" ";\n" -//ZEND2f(Criir,Ciirr, tmp) "vsubps " #Ciirr "{cdab} ," #Ciirr "," #Criir"{%k7}" ";\n" -#define ZEND1f(Criir,Ciirr, tmp) "vshufps $0xb1," #Criir "," #Criir "," #tmp ";\n"\ - "vaddps " #tmp "," #Criir "," #Criir"{%k6}" ";\n" - -#define ZEND2f(Criir,Ciirr, tmp) "vshufps $0xb1," #Ciirr "," #Ciirr "," #tmp ";\n"\ - "vsubps " #tmp "," #Ciirr "," #Criir"{%k7}" ";\n" - -#define ZEND1d(Criir,Ciirr, tmp) "vshufpd $0x55," #Criir "," #Criir "," #tmp ";\n"\ - "vaddps " #tmp "," #Criir "," #Criir"{%k6}" ";\n" - -#define ZEND2d(Criir,Ciirr, tmp) "vshufpd $0x55," #Ciirr "," #Ciirr "," #tmp ";\n"\ - "vsubpd " #tmp "," #Ciirr "," #Criir"{%k7};\n" // ri+ir ; ri+ir,rr-ii - -// Further opt possible: KNC -- use swizzle operand ; no addsub. -// KNL -- addsub. Saves 6 ops, 12 cycles; KNL cost of loading "1" as only fmaddsub -// no swizzle on KNL. -#define VTIMESI0f(A,DEST, Z) VSHUFf(A,DEST) -#define VTIMESI1f(A,DEST, Z) "vaddps " #DEST "," #Z "," #DEST"{%k6}" ";\n" -#define VTIMESI2f(A,DEST, Z) "vsubps " #DEST "," #Z "," #DEST"{%k7}" ";\n" - -#define VTIMESI0d(A,DEST, Z) VSHUFd(A,DEST) -#define VTIMESI1d(A,DEST, Z) "vaddpd " #DEST "," #Z "," #DEST"{%k6}" ";\n" -#define VTIMESI2d(A,DEST, Z) "vsubpd " #DEST "," #Z "," #DEST"{%k7}" ";\n" - -#define VTIMESMINUSI0f(A,DEST,Z) VSHUFf(A,DEST) -#define VTIMESMINUSI1f(A,DEST,Z) "vsubps " #DEST "," #Z "," #DEST"{%k6}" ";\n" -#define VTIMESMINUSI2f(A,DEST,Z) "vaddps " #DEST "," #Z "," #DEST"{%k7}" ";\n" - -#define VTIMESMINUSI0d(A,DEST,Z) VSHUFd(A,DEST) -#define VTIMESMINUSI1d(A,DEST,Z) "vsubpd " #DEST "," #Z "," #DEST"{%k6}" ";\n" -#define VTIMESMINUSI2d(A,DEST,Z) "vaddpd " #DEST "," #Z "," #DEST"{%k7}" ";\n" - -#define VACCTIMESMINUSI0f(A,ACC,tmp) VSHUFf(A,tmp) -#define VACCTIMESMINUSI1f(A,ACC,tmp) "vsubps " #tmp "," #ACC "," #ACC"{%k6}" ";\n" -#define VACCTIMESMINUSI2f(A,ACC,tmp) "vaddps " #tmp "," #ACC "," #ACC"{%k7}" ";\n" - -#define VACCTIMESMINUSI0d(A,ACC,tmp) VSHUFd(A,tmp) -#define VACCTIMESMINUSI1d(A,ACC,tmp) "vsubpd " #tmp "," #ACC "," #ACC"{%k6}" ";\n" -#define VACCTIMESMINUSI2d(A,ACC,tmp) "vaddpd " #tmp "," #ACC "," #ACC"{%k7}" ";\n" - -#define VACCTIMESI0f(A,ACC,tmp) VSHUFf(A,tmp) -#define VACCTIMESI1f(A,ACC,tmp) "vaddps " #tmp "," #ACC "," #ACC"{%k6}" ";\n" -#define VACCTIMESI2f(A,ACC,tmp) "vsubps " #tmp "," #ACC "," #ACC"{%k7}" ";\n" - -#define VACCTIMESI0d(A,ACC,tmp) VSHUFd(A,tmp) -#define VACCTIMESI1d(A,ACC,tmp) "vaddpd " #tmp "," #ACC "," #ACC"{%k6}" ";\n" -#define VACCTIMESI2d(A,ACC,tmp) "vsubpd " #tmp "," #ACC "," #ACC"{%k7}" ";\n" - -#define VPERM0f(A,B) "vshuff32x4 $0x4e," #A "," #B "," #B ";\n" -#define VPERM1f(A,B) "vshuff32x4 $0xb1," #A "," #B "," #B ";\n" -#define VPERM2f(A,B) "vshufps $0x4e," #A "," #B "," #B ";\n" -#define VPERM3f(A,B) "vshufps $0xb1," #A "," #B "," #B ";\n" - -#define VPERM0d(A,B) "vshuff64x2 $0x4e," #A "," #B "," #B ";\n" -#define VPERM1d(A,B) "vshuff64x2 $0xb1," #A "," #B "," #B ";\n" -#define VPERM2d(A,B) "vshufpd $0x55," #A "," #B "," #B ";\n" -#define VPERM3d(A,B) 
VMOVd(A,B) - -#endif - -//////////////////////////////////////////////////////////// -// Knights Corner specials -//////////////////////////////////////////////////////////// - -#ifdef ASM_IMCI - -#define MASK_REGS \ - __asm__ ("mov $0xAAAA, %%eax \n"\ - "kmov %%eax, %%k6 \n"\ - "knot %%k6, %%k7 \n" : : : "%eax"); - -#define ZEND1d(Criir,Ciirr, tmp) "vaddpd " #Criir "{cdab} ," #Criir "," #Criir"{%k6}" ";\n" -#define ZEND2d(Criir,Ciirr, tmp) "vsubpd " #Ciirr "{cdab} ," #Ciirr "," #Criir"{%k7}" ";\n" - -#define ZEND1f(Criir,Ciirr, tmp) "vaddps " #Criir "{cdab} ," #Criir "," #Criir"{%k6}" ";\n" -#define ZEND2f(Criir,Ciirr, tmp) "vsubps " #Ciirr "{cdab} ," #Ciirr "," #Criir"{%k7}" ";\n" - -// Further opt possible: KNC -- use swizzle operand ; no addsub. -// KNL -- addsub. Saves 6 ops, 12 cycles; KNL cost of loading "1" as only fmaddsub -// no swizzle on KNL. -#define VTIMESI0f(A,DEST, Z) -#define VTIMESI1f(A,DEST, Z) "vaddps " #A "{cdab}," #Z "," #DEST"{%k7}" ";\n" -#define VTIMESI2f(A,DEST, Z) "vsubps " #A "{cdab}," #Z "," #DEST"{%k6}" ";\n" - -#define VTIMESI0d(A,DEST, Z) -#define VTIMESI1d(A,DEST, Z) "vaddpd " #A "{cdab}," #Z "," #DEST"{%k7}" ";\n" -#define VTIMESI2d(A,DEST, Z) "vsubpd " #A "{cdab}," #Z "," #DEST"{%k6}" ";\n" - -#define VTIMESMINUSI0f(A,DEST,Z) -#define VTIMESMINUSI1f(A,DEST,Z) "vsubps " #A "{cdab}," #Z "," #DEST"{%k7}" ";\n" -#define VTIMESMINUSI2f(A,DEST,Z) "vaddps " #A "{cdab}," #Z "," #DEST"{%k6}" ";\n" - -#define VTIMESMINUSI0d(A,DEST,Z) -#define VTIMESMINUSI1d(A,DEST,Z) "vsubpd " #A "{cdab}," #Z "," #DEST"{%k7}" ";\n" -#define VTIMESMINUSI2d(A,DEST,Z) "vaddpd " #A "{cdab}," #Z "," #DEST"{%k6}" ";\n" - -#define VACCTIMESI0f(A,ACC,tmp) -#define VACCTIMESI1f(A,ACC,tmp) "vaddps " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n" -#define VACCTIMESI2f(A,ACC,tmp) "vsubps " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n" - -#define VACCTIMESI0d(A,ACC,tmp) -#define VACCTIMESI1d(A,ACC,tmp) "vaddpd " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n" -#define VACCTIMESI2d(A,ACC,tmp) "vsubpd " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n" - -#define VACCTIMESMINUSI0f(A,ACC,tmp) -#define VACCTIMESMINUSI1f(A,ACC,tmp) "vsubps " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n" -#define VACCTIMESMINUSI2f(A,ACC,tmp) "vaddps " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n" - - // Acc = Acc - i A -#define VACCTIMESMINUSI0d(A,ACC,tmp) -#define VACCTIMESMINUSI1d(A,ACC,tmp) "vsubpd " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n" -#define VACCTIMESMINUSI2d(A,ACC,tmp) "vaddpd " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n" - -//((1<<6)|(0<<4)|(3<<2)|(2)) == 0100,1110 = 0x4e -//((2<<6)|(3<<4)|(0<<2)|(1)) == 1011,0001 = 0xb1 - -#define VPERM0f(A,B) "vpermf32x4 $0x4e," #A "," #B ";\n" -#define VPERM1f(A,B) "vpermf32x4 $0xb1," #A "," #B ";\n" -#define VPERM2f(A,B) "vmovaps " #A "{badc}," #B ";\n" -#define VPERM3f(A,B) "vmovaps " #A "{cdab}," #B ";\n" - -#define VPERM0d(A,B) "vpermf32x4 $0x4e," #A "," #B ";\n" -#define VPERM1d(A,B) "vmovapd " #A "{badc}," #B ";\n" -#define VPERM2d(A,B) "vmovapd " #A "{cdab}," #B ";\n" -#define VPERM3d(A,B) VMOVd(A,B) - -#endif - -// const SiteSpinor * ptr = & in._odata[offset]; -#define LOAD_CHIMU(PTR) LOAD_CHIMUi(PTR) -#define LOAD_CHI(PTR) LOAD64(%r8,PTR) __asm__ ( LOAD_CHIi ); -#define SAVE_UCHI(PTR) SAVE_UCHIi(PTR) -#define SAVE_CHI(PTR) SAVE_CHIi(PTR) -#define SAVE_RESULT(PTR) SAVE_RESULTi(PTR) - -#define LOAD_CHIMUi \ - LOAD_CHIMU01i \ - LOAD_CHIMU23i ); - - -#define LOAD_CHIMU01i\ - VLOAD(0,%r8,Chimu_00) \ - VLOAD(1,%r8,Chimu_01) \ - VLOAD(2,%r8,Chimu_02) \ - VLOAD(3,%r8,Chimu_10) \ - VLOAD(4,%r8,Chimu_11) \ - 
VLOAD(5,%r8,Chimu_12) - -#define LOAD_CHIMU23i\ - VLOAD(6,%r8,Chimu_20) \ - VLOAD(7,%r8,Chimu_21) \ - VLOAD(8,%r8,Chimu_22) \ - VLOAD(9,%r8,Chimu_30) \ - VLOAD(10,%r8,Chimu_31) \ - VLOAD(11,%r8,Chimu_32) - -#define SHUF_CHIMU23i\ - VSHUFMEM(6,%r8,Chimu_20) \ - VSHUFMEM(7,%r8,Chimu_21) \ - VSHUFMEM(8,%r8,Chimu_22) \ - VSHUFMEM(9,%r8,Chimu_30) \ - VSHUFMEM(10,%r8,Chimu_31) \ - VSHUFMEM(11,%r8,Chimu_32) - - -// const SiteHalfSpinor *ptr = &buf[offset]; - -#define LOAD_CHIi \ - VLOAD(0,%r8,Chi_00) \ - VLOAD(1,%r8,Chi_01) \ - VLOAD(2,%r8,Chi_02) \ - VLOAD(3,%r8,Chi_10) \ - VLOAD(4,%r8,Chi_11) \ - VLOAD(5,%r8,Chi_12) - - -#define SAVE_UCHIi(PTR) \ - LOAD64(%r8,PTR) \ - __asm__ ( \ - VSTORE(0,%r8,UChi_00) \ - VSTORE(1,%r8,UChi_01) \ - VSTORE(2,%r8,UChi_02) \ - VSTORE(3,%r8,UChi_10) \ - VSTORE(4,%r8,UChi_11) \ - VSTORE(5,%r8,UChi_12) \ - ); - -#define SAVE_CHIi(PTR) \ - LOAD64(%r8,PTR) \ - __asm__ ( \ - VSTORE(0,%r8,Chi_00) \ - VSTORE(1,%r8,Chi_01) \ - VSTORE(2,%r8,Chi_02) \ - VSTORE(3,%r8,Chi_10) \ - VSTORE(4,%r8,Chi_11) \ - VSTORE(5,%r8,Chi_12) \ - ); - -#define SAVE_RESULTi(PTR)\ - LOAD64(%r8,PTR) \ - __asm__ ( \ - VSTORE(0,%r8,result_00) \ - VSTORE(1,%r8,result_01) \ - VSTORE(2,%r8,result_02) \ - VSTORE(3,%r8,result_10) \ - VSTORE(4,%r8,result_11) \ - VSTORE(5,%r8,result_12) \ - VSTORE(6,%r8,result_20) \ - VSTORE(7,%r8,result_21) \ - VSTORE(8,%r8,result_22) \ - VSTORE(9,%r8,result_30) \ - VSTORE(10,%r8,result_31) \ - VSTORE(11,%r8,result_32) \ - ); - -// auto ptr = &U._odata[sU](A); -// A plan for lifting loads -// can use Z2/3/4/5/U/U for U field in first step. -// can use Chi_00, Chi_10, U U for U field in second step -// can use Chi_00, Chi_10, Chi_01,11, U U for U field in third step -// Enables to lift ALL loads earlier by a few cycles and alleviate OoO pressure if needed. -// KNL is DUAL issue for FP, and lifting these loads is potentially important. -// Need detailed profile data to be sure. 
- -#define PREFETCH_U(A) \ - LOAD64(%r8,&U._odata[sU](A)) \ - __asm__ ( \ - VPREFETCHG(0,%r8) \ - VPREFETCHG(1,%r8) \ - VPREFETCHG(2,%r8) \ - VPREFETCHG(3,%r8) \ - VPREFETCHG(4,%r8) \ - VPREFETCHG(5,%r8) \ - VPREFETCHG(6,%r8) \ - VPREFETCHG(7,%r8) \ - VPREFETCHG(8,%r8) ); - -#define PREFETCH_R(A) \ - LOAD64(%r8,&out._odata[ss]) \ - __asm__ ( \ - VPREFETCHW(0,%r8) \ - VPREFETCHW(1,%r8) \ - VPREFETCHW(2,%r8) \ - VPREFETCHW(3,%r8) \ - VPREFETCHW(4,%r8) \ - VPREFETCHW(5,%r8) \ - VPREFETCHW(6,%r8) \ - VPREFETCHW(7,%r8) \ - VPREFETCHW(8,%r8) \ - VPREFETCHW(9,%r8) \ - VPREFETCHW(10,%r8) \ - VPREFETCHW(11,%r8) ); - - -#define MULT_2SPIN_DIR(A) MULT_2SPIN(&U._odata[sU](A)) - -#define MULT_2SPIN_DIR_PFXP(A,p) MULT_2SPIN_PFXP(&U._odata[sU](A),p) -#define MULT_2SPIN_DIR_PFYP(A,p) MULT_2SPIN_PFYP(&U._odata[sU](A),p) -#define MULT_2SPIN_DIR_PFZP(A,p) MULT_2SPIN_PFZP(&U._odata[sU](A),p) -#define MULT_2SPIN_DIR_PFTP(A,p) MULT_2SPIN_PFTP(&U._odata[sU](A),p) - -#define MULT_2SPIN_DIR_PFXM(A,p) MULT_2SPIN_PFXM(&U._odata[sU](A),p) -#define MULT_2SPIN_DIR_PFYM(A,p) MULT_2SPIN_PFYM(&U._odata[sU](A),p) -#define MULT_2SPIN_DIR_PFZM(A,p) MULT_2SPIN_PFZM(&U._odata[sU](A),p) -#define MULT_2SPIN_DIR_PFTM(A,p) MULT_2SPIN_PFTM(&U._odata[sU](A),p) - -#define MULT_2SPIN_UNOPT(ptr) \ - LOAD64(%r8,ptr) \ - __asm__ ( \ - ZLOAD (0,%r8,UChi_01,UChi_11) \ - ZLOAD (3,%r8,UChi_02,UChi_12) \ - ZLOAD (6,%r8,Uri,Uir) \ - ZMUL (UChi_01,UChi_11,Chi_00,UChi_00,Z0) \ - ZMUL (UChi_01,UChi_11,Chi_10,UChi_10,Z1) \ - ZMUL (UChi_02,UChi_12,Chi_00,UChi_01,Z2) \ - ZMUL (UChi_02,UChi_12,Chi_10,UChi_11,Z3) \ - ZMUL (Uri,Uir, Chi_00,UChi_02,Z4) \ - ZMUL (Uri,Uir, Chi_10,UChi_12,Z5) \ - \ - ZLOAD (1,%r8,Uri,Uir) \ - ZLOAD (4,%r8,Chi_00, Chi_10) \ - ZMADD (Uri,Uir, Chi_01,UChi_00,Z0) \ - ZMADD (Uri,Uir, Chi_11,UChi_10,Z1) \ - ZLOAD (7,%r8,Uri,Uir) \ - ZMADD (Chi_00, Chi_10,Chi_01,UChi_01,Z2) \ - ZMADD (Chi_00, Chi_10,Chi_11,UChi_11,Z3) \ - ZLOAD (2,%r8,Chi_00,Chi_10) \ - ZMADD(Uri,Uir, Chi_01,UChi_02,Z4) \ - ZMADD(Uri,Uir, Chi_11,UChi_12,Z5) \ - \ - ZLOAD (5,%r8,Uri,Uir) \ - ZMADD (Chi_00,Chi_10, Chi_02,UChi_00,Z0) \ - ZMADD (Chi_00,Chi_10, Chi_12,UChi_10,Z1) \ - ZLOAD (8,%r8,Chi_00,Chi_10) \ - ZMADD (Uri,Uir, Chi_02,UChi_01,Z2) \ - ZMADD (Uri,Uir, Chi_12,UChi_11,Z3) \ - ZMADD(Chi_00,Chi_10, Chi_02,UChi_02,Z4) \ - ZMADD(Chi_00,Chi_10, Chi_12,UChi_12,Z5) \ - \ - ZEND1(UChi_00,Z0,Chi_01) \ - ZEND1(UChi_10,Z1,Chi_11) \ - ZEND1(UChi_01,Z2,Chi_00) \ - ZEND1(UChi_11,Z3,Chi_10) \ - ZEND1(UChi_02,Z4,Chi_02) \ - ZEND1(UChi_12,Z5,Chi_12) \ - ZEND2(UChi_00,Z0,Chi_01) \ - ZEND2(UChi_10,Z1,Chi_11) \ - ZEND2(UChi_01,Z2,Chi_00) \ - ZEND2(UChi_11,Z3,Chi_10) \ - ZEND2(UChi_02,Z4,Chi_02) \ - ZEND2(UChi_12,Z5,Chi_12) ); - -#define MULT_2SPIN(ptr) MULT_2SPIN_PF(ptr,ptr,VPREFETCHG); -#define MULT_2SPIN_PFXM(ptr,pf) MULT_2SPIN_PF(ptr,pf,VPREFETCHNTA) -#define MULT_2SPIN_PFYM(ptr,pf) MULT_2SPIN_PF(ptr,pf,VPREFETCHNTA) -#define MULT_2SPIN_PFZM(ptr,pf) MULT_2SPIN_PF(ptr,pf,VPREFETCHNTA) -#define MULT_2SPIN_PFTM(ptr,pf) MULT_2SPIN_PF(ptr,pf,VPREFETCHNTA) -#define MULT_2SPIN_PFTP(ptr,pf) MULT_2SPIN_PF(ptr,pf,VPREFETCHNTA) -#define MULT_2SPIN_PFZP(ptr,pf) MULT_2SPIN_PF(ptr,pf,VPREFETCHNTA) -#define MULT_2SPIN_PFYP(ptr,pf) MULT_2SPIN_PF(ptr,pf,VPREFETCH) -#define MULT_2SPIN_PFXP(ptr,pf) MULT_2SPIN_PF(ptr,pf,VPREFETCHNTA) - - -#define MULT_2SPIN_PF(ptr,pf,VPF) \ - LOAD64(%r8,ptr) \ - LOAD64(%r9,pf) \ - __asm__ ( \ - ZMULMEM2SP(0,%r8,Uri,Chi_00,Chi_10,UChi_00,Z0,UChi_10,Z1) \ - VPF(0,%r9) \ - ZMULMEM2SP(3,%r8,Uri,Chi_00,Chi_10,UChi_01,Z2,UChi_11,Z3) \ - VPF(1,%r9) \ - 
ZMULMEM2SP(6,%r8,Uri,Chi_00,Chi_10,UChi_02,Z4,UChi_12,Z5) \ - VPF(2,%r9) \ - \ - ZMADDMEM2SP(1,%r8,Uri,Chi_01,Chi_11,UChi_00,Z0,UChi_10,Z1) \ - VPF(3,%r9) \ - ZMADDMEM2SP(4,%r8,Uri,Chi_01,Chi_11,UChi_01,Z2,UChi_11,Z3) \ - VPF(4,%r9) \ - ZMADDMEM2SP(7,%r8,Uri,Chi_01,Chi_11,UChi_02,Z4,UChi_12,Z5) \ - VPF(5,%r9) \ - \ - ZMADDMEM2SP(2,%r8,Uri,Chi_02,Chi_12,UChi_00,Z0,UChi_10,Z1) \ - VPF(6,%r9) \ - ZMADDMEM2SP(5,%r8,Uri,Chi_02,Chi_12,UChi_01,Z2,UChi_11,Z3) \ - VPF(7,%r9) \ - ZMADDMEM2SP(8,%r8,Uri,Chi_02,Chi_12,UChi_02,Z4,UChi_12,Z5) \ - VPF(8,%r9) \ - \ - ZEND1(UChi_00,Z0,Chi_01) \ - ZEND1(UChi_10,Z1,Chi_11) \ - ZEND1(UChi_01,Z2,Chi_00) \ - ZEND1(UChi_11,Z3,Chi_10) \ - VPF(9,%r9) \ - ZEND1(UChi_02,Z4,Chi_02) \ - ZEND1(UChi_12,Z5,Chi_12) \ - ZEND2(UChi_00,Z0,Chi_01) \ - ZEND2(UChi_10,Z1,Chi_11) \ - VPF(10,%r9) \ - ZEND2(UChi_01,Z2,Chi_00) \ - ZEND2(UChi_11,Z3,Chi_10) \ - ZEND2(UChi_02,Z4,Chi_02) \ - VPF(11,%r9) \ - ZEND2(UChi_12,Z5,Chi_12) ); - - -#define MULT_2SPIN_PFNONE(ptr,pf,VPF) \ - LOAD64(%r8,ptr) \ - LOAD64(%r9,pf) \ - __asm__ ( \ - VPF(0,%r9) \ - VPF(1,%r9) \ - VPF(2,%r9) \ - \ - VPF(3,%r9) \ - VPF(4,%r9) \ - VPF(5,%r9) \ - \ - VPF(6,%r9) \ - VPF(7,%r9) \ - VPF(8,%r9) \ - \ - VPF(9,%r9) \ - VPF(10,%r9) \ - VPF(11,%r9) ); - - -// Pretty much Perfectly Pipelined - -////////////////////////////////////////////////////////////////// -// Dirac algebra -////////////////////////////////////////////////////////////////// - -// hspin(0)=fspin(0)+timesI(fspin(3)); -// hspin(1)=fspin(1)+timesI(fspin(2)); -#define XP_PROJMEM(PTR) \ - LOAD64(%r8,PTR) \ - __asm__ ( \ - LOAD_CHIi \ - SHUF_CHIMU23i \ - VACCTIMESI1(Chi_00,Chi_00,Chimu_30) \ - VACCTIMESI1(Chi_01,Chi_01,Chimu_31) \ - VACCTIMESI1(Chi_02,Chi_02,Chimu_32) \ - VACCTIMESI1(Chi_10,Chi_10,Chimu_20) \ - VACCTIMESI1(Chi_11,Chi_11,Chimu_21) \ - VACCTIMESI1(Chi_12,Chi_12,Chimu_22) \ - VACCTIMESI2(Chi_00,Chi_00,Chimu_30) \ - VACCTIMESI2(Chi_01,Chi_01,Chimu_31) \ - VACCTIMESI2(Chi_02,Chi_02,Chimu_32) \ - VACCTIMESI2(Chi_10,Chi_10,Chimu_20) \ - VACCTIMESI2(Chi_11,Chi_11,Chimu_21) \ - VACCTIMESI2(Chi_12,Chi_12,Chimu_22) ); - - -#define YP_PROJMEM(ptr) \ - LOAD64(%r8,ptr) \ - __asm__ ( \ - LOAD_CHIMU01i \ - VSUBMEM(9,%r8 ,Chimu_00,Chi_00) \ - VSUBMEM(10,%r8,Chimu_01,Chi_01) \ - VSUBMEM(11,%r8,Chimu_02,Chi_02) \ - VADDMEM(6,%r8,Chimu_10,Chi_10) \ - VADDMEM(7,%r8,Chimu_11,Chi_11) \ - VADDMEM(8,%r8,Chimu_12,Chi_12) ); - -#define ZP_PROJMEM(PTR) \ - LOAD64(%r8,PTR) \ - __asm__ ( \ - LOAD_CHIi \ - SHUF_CHIMU23i \ - VACCTIMESI1(Chi_00,Chi_00,Chimu_20) \ - VACCTIMESI1(Chi_01,Chi_01,Chimu_21) \ - VACCTIMESI1(Chi_02,Chi_02,Chimu_22) \ - VACCTIMESMINUSI1(Chi_10,Chi_10,Chimu_30) \ - VACCTIMESMINUSI1(Chi_11,Chi_11,Chimu_31) \ - VACCTIMESMINUSI1(Chi_12,Chi_12,Chimu_32) \ - VACCTIMESI2(Chi_00,Chi_00,Chimu_20) \ - VACCTIMESI2(Chi_01,Chi_01,Chimu_21) \ - VACCTIMESI2(Chi_02,Chi_02,Chimu_22) \ - VACCTIMESMINUSI2(Chi_10,Chi_10,Chimu_30) \ - VACCTIMESMINUSI2(Chi_11,Chi_11,Chimu_31) \ - VACCTIMESMINUSI2(Chi_12,Chi_12,Chimu_32) ); - - -#define TP_PROJMEM(ptr) \ - LOAD64(%r8,ptr) \ - __asm__ ( \ - LOAD_CHIMU01i \ - VADDMEM(6,%r8 ,Chimu_00,Chi_00) \ - VADDMEM(7,%r8,Chimu_01,Chi_01) \ - VADDMEM(8,%r8,Chimu_02,Chi_02) \ - VADDMEM(9,%r8,Chimu_10,Chi_10) \ - VADDMEM(10,%r8,Chimu_11,Chi_11) \ - VADDMEM(11,%r8,Chimu_12,Chi_12) ); - -// hspin(0)=fspin(0)-timesI(fspin(3)) -// hspin(1)=fspin(1)-timesI(fspin(2)) - -#define XM_PROJMEM(PTR) \ - LOAD64(%r8,PTR)\ - __asm__ ( \ - SHUF_CHIMU23i \ - LOAD_CHIi \ - VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_30)\ - 
VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_31)\ - VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_32)\ - VACCTIMESMINUSI1(Chi_10,Chi_10,Chimu_20)\ - VACCTIMESMINUSI1(Chi_11,Chi_11,Chimu_21)\ - VACCTIMESMINUSI1(Chi_12,Chi_12,Chimu_22)\ - VACCTIMESMINUSI2(Chi_00,Chi_00,Chimu_30)\ - VACCTIMESMINUSI2(Chi_01,Chi_01,Chimu_31)\ - VACCTIMESMINUSI2(Chi_02,Chi_02,Chimu_32)\ - VACCTIMESMINUSI2(Chi_10,Chi_10,Chimu_20)\ - VACCTIMESMINUSI2(Chi_11,Chi_11,Chimu_21)\ - VACCTIMESMINUSI2(Chi_12,Chi_12,Chimu_22) ); - -#define YM_PROJMEM(ptr) \ - LOAD64(%r8,ptr) \ - __asm__ ( \ - LOAD_CHIMU01i \ - VADDMEM(9,%r8 ,Chimu_00,Chi_00) \ - VADDMEM(10,%r8,Chimu_01,Chi_01) \ - VADDMEM(11,%r8,Chimu_02,Chi_02) \ - VSUBMEM(6,%r8,Chimu_10,Chi_10) \ - VSUBMEM(7,%r8,Chimu_11,Chi_11) \ - VSUBMEM(8,%r8,Chimu_12,Chi_12) ); - -#define ZM_PROJMEM(PTR) \ - LOAD64(%r8,PTR) \ - __asm__ ( \ - SHUF_CHIMU23i \ - LOAD_CHIi \ - VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_20)\ - VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_21)\ - VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_22)\ - VACCTIMESI1(Chi_10,Chi_10,Chimu_30)\ - VACCTIMESI1(Chi_11,Chi_11,Chimu_31)\ - VACCTIMESI1(Chi_12,Chi_12,Chimu_32)\ - VACCTIMESMINUSI2(Chi_00,Chi_00,Chimu_20)\ - VACCTIMESMINUSI2(Chi_01,Chi_01,Chimu_21)\ - VACCTIMESMINUSI2(Chi_02,Chi_02,Chimu_22)\ - VACCTIMESI2(Chi_10,Chi_10,Chimu_30)\ - VACCTIMESI2(Chi_11,Chi_11,Chimu_31)\ - VACCTIMESI2(Chi_12,Chi_12,Chimu_32) ); - -#define TM_PROJMEM(ptr) \ - LOAD64(%r8,ptr) \ - __asm__ ( \ - LOAD_CHIMU01i \ - VSUBMEM(6,%r8 ,Chimu_00,Chi_00) \ - VSUBMEM(7,%r8,Chimu_01,Chi_01) \ - VSUBMEM(8,%r8,Chimu_02,Chi_02) \ - VSUBMEM(9,%r8,Chimu_10,Chi_10) \ - VSUBMEM(10,%r8,Chimu_11,Chi_11) \ - VSUBMEM(11,%r8,Chimu_12,Chi_12) ); - -// fspin(0)=hspin(0) -// fspin(1)=hspin(1) -// fspin(2)=timesMinusI(hspin(1)) -// fspin(3)=timesMinusI(hspin(0)) -#define XP_RECON __asm__ ( \ - VZERO(TMP) \ - VMOV(UChi_00,result_00) \ - VMOV(UChi_01,result_01) \ - VMOV(UChi_02,result_02) \ - VMOV(UChi_10,result_10) \ - VMOV(UChi_11,result_11) \ - VMOV(UChi_12,result_12) \ - VTIMESMINUSI0(UChi_10,result_20,TMP) \ - VTIMESMINUSI0(UChi_11,result_21,TMP) \ - VTIMESMINUSI0(UChi_12,result_22,TMP) \ - VTIMESMINUSI0(UChi_00,result_30,TMP) \ - VTIMESMINUSI0(UChi_01,result_31,TMP) \ - VTIMESMINUSI0(UChi_02,result_32,TMP) \ - VTIMESMINUSI1(UChi_10,result_20,TMP) \ - VTIMESMINUSI1(UChi_11,result_21,TMP) \ - VTIMESMINUSI1(UChi_12,result_22,TMP) \ - VTIMESMINUSI1(UChi_00,result_30,TMP) \ - VTIMESMINUSI1(UChi_01,result_31,TMP) \ - VTIMESMINUSI1(UChi_02,result_32,TMP) \ - VTIMESMINUSI2(UChi_10,result_20,TMP) \ - VTIMESMINUSI2(UChi_11,result_21,TMP) \ - VTIMESMINUSI2(UChi_12,result_22,TMP) \ - VTIMESMINUSI2(UChi_00,result_30,TMP) \ - VTIMESMINUSI2(UChi_01,result_31,TMP) \ - VTIMESMINUSI2(UChi_02,result_32,TMP) \ - ); - // NB could save 6 ops using addsub => 12 cycles -#define XP_RECON_ACCUM __asm__ ( \ - VZERO(TMP)\ - VADD(UChi_00,result_00,result_00)\ - VADD(UChi_01,result_01,result_01)\ - VADD(UChi_02,result_02,result_02)\ - VADD(UChi_10,result_10,result_10)\ - VADD(UChi_11,result_11,result_11)\ - VADD(UChi_12,result_12,result_12)\ - VACCTIMESMINUSI0(UChi_10,result_20,Z0)\ - VACCTIMESMINUSI0(UChi_11,result_21,Z1)\ - VACCTIMESMINUSI0(UChi_12,result_22,Z2)\ - VACCTIMESMINUSI0(UChi_00,result_30,Z3)\ - VACCTIMESMINUSI0(UChi_01,result_31,Z4)\ - VACCTIMESMINUSI0(UChi_02,result_32,Z5)\ - VACCTIMESMINUSI1(UChi_10,result_20,Z0)\ - VACCTIMESMINUSI1(UChi_11,result_21,Z1)\ - VACCTIMESMINUSI1(UChi_12,result_22,Z2)\ - VACCTIMESMINUSI1(UChi_00,result_30,Z3)\ - VACCTIMESMINUSI1(UChi_01,result_31,Z4)\ - 
VACCTIMESMINUSI1(UChi_02,result_32,Z5)\ - VACCTIMESMINUSI2(UChi_10,result_20,Z0)\ - VACCTIMESMINUSI2(UChi_11,result_21,Z1)\ - VACCTIMESMINUSI2(UChi_12,result_22,Z2)\ - VACCTIMESMINUSI2(UChi_00,result_30,Z3)\ - VACCTIMESMINUSI2(UChi_01,result_31,Z4)\ - VACCTIMESMINUSI2(UChi_02,result_32,Z5)\ - ); - -#define XM_RECON __asm__ ( \ - VZERO(TMP)\ - VMOV(UChi_00,result_00)\ - VMOV(UChi_01,result_01)\ - VMOV(UChi_02,result_02)\ - VMOV(UChi_10,result_10)\ - VMOV(UChi_11,result_11)\ - VMOV(UChi_12,result_12)\ - VTIMESI0(UChi_10,result_20,TMP)\ - VTIMESI0(UChi_11,result_21,TMP)\ - VTIMESI0(UChi_12,result_22,TMP)\ - VTIMESI0(UChi_00,result_30,TMP)\ - VTIMESI0(UChi_01,result_31,TMP)\ - VTIMESI0(UChi_02,result_32,TMP)\ - VTIMESI1(UChi_10,result_20,TMP)\ - VTIMESI1(UChi_11,result_21,TMP)\ - VTIMESI1(UChi_12,result_22,TMP)\ - VTIMESI1(UChi_00,result_30,TMP)\ - VTIMESI1(UChi_01,result_31,TMP)\ - VTIMESI1(UChi_02,result_32,TMP)\ - VTIMESI2(UChi_10,result_20,TMP)\ - VTIMESI2(UChi_11,result_21,TMP)\ - VTIMESI2(UChi_12,result_22,TMP)\ - VTIMESI2(UChi_00,result_30,TMP)\ - VTIMESI2(UChi_01,result_31,TMP)\ - VTIMESI2(UChi_02,result_32,TMP)\ - ); - -#define XM_RECON_ACCUM __asm__ ( \ - VADD(UChi_00,result_00,result_00)\ - VADD(UChi_01,result_01,result_01)\ - VADD(UChi_02,result_02,result_02)\ - VADD(UChi_10,result_10,result_10)\ - VADD(UChi_11,result_11,result_11)\ - VADD(UChi_12,result_12,result_12)\ - VACCTIMESI0(UChi_10,result_20,Z0)\ - VACCTIMESI0(UChi_11,result_21,Z1)\ - VACCTIMESI0(UChi_12,result_22,Z2)\ - VACCTIMESI0(UChi_00,result_30,Z3)\ - VACCTIMESI0(UChi_01,result_31,Z4)\ - VACCTIMESI0(UChi_02,result_32,Z5)\ - VACCTIMESI1(UChi_10,result_20,Z0)\ - VACCTIMESI1(UChi_11,result_21,Z1)\ - VACCTIMESI1(UChi_12,result_22,Z2)\ - VACCTIMESI1(UChi_00,result_30,Z3)\ - VACCTIMESI1(UChi_01,result_31,Z4)\ - VACCTIMESI1(UChi_02,result_32,Z5)\ - VACCTIMESI2(UChi_10,result_20,Z0)\ - VACCTIMESI2(UChi_11,result_21,Z1)\ - VACCTIMESI2(UChi_12,result_22,Z2)\ - VACCTIMESI2(UChi_00,result_30,Z3)\ - VACCTIMESI2(UChi_01,result_31,Z4)\ - VACCTIMESI2(UChi_02,result_32,Z5)\ - ); - -#define YP_RECON_ACCUM __asm__ ( \ - VADD(UChi_00,result_00,result_00)\ - VADD(UChi_01,result_01,result_01)\ - VADD(UChi_02,result_02,result_02)\ - VADD(UChi_10,result_10,result_10)\ - VADD(UChi_11,result_11,result_11)\ - VADD(UChi_12,result_12,result_12)\ - VADD(UChi_10,result_20,result_20)\ - VADD(UChi_11,result_21,result_21)\ - VADD(UChi_12,result_22,result_22)\ - VSUB(UChi_00,result_30,result_30)\ - VSUB(UChi_01,result_31,result_31)\ - VSUB(UChi_02,result_32,result_32) ); - -#define YM_RECON_ACCUM __asm__ ( \ - VADD(UChi_00,result_00,result_00)\ - VADD(UChi_01,result_01,result_01)\ - VADD(UChi_02,result_02,result_02)\ - VADD(UChi_10,result_10,result_10)\ - VADD(UChi_11,result_11,result_11)\ - VADD(UChi_12,result_12,result_12)\ - VSUB(UChi_10,result_20,result_20)\ - VSUB(UChi_11,result_21,result_21)\ - VSUB(UChi_12,result_22,result_22)\ - VADD(UChi_00,result_30,result_30)\ - VADD(UChi_01,result_31,result_31)\ - VADD(UChi_02,result_32,result_32) ); - -#define ZP_RECON_ACCUM __asm__ ( \ - VADD(UChi_00,result_00,result_00)\ - VADD(UChi_01,result_01,result_01)\ - VADD(UChi_02,result_02,result_02)\ - VADD(UChi_10,result_10,result_10)\ - VADD(UChi_11,result_11,result_11)\ - VADD(UChi_12,result_12,result_12)\ - VACCTIMESMINUSI0(UChi_00,result_20,Z0)\ - VACCTIMESMINUSI0(UChi_01,result_21,Z1)\ - VACCTIMESMINUSI0(UChi_02,result_22,Z2)\ - VACCTIMESI0(UChi_10,result_30,Z3)\ - VACCTIMESI0(UChi_11,result_31,Z4)\ - VACCTIMESI0(UChi_12,result_32,Z5)\ - 
VACCTIMESMINUSI1(UChi_00,result_20,Z0)\ - VACCTIMESMINUSI1(UChi_01,result_21,Z1)\ - VACCTIMESMINUSI1(UChi_02,result_22,Z2)\ - VACCTIMESI1(UChi_10,result_30,Z3)\ - VACCTIMESI1(UChi_11,result_31,Z4)\ - VACCTIMESI1(UChi_12,result_32,Z5)\ - VACCTIMESMINUSI2(UChi_00,result_20,Z0)\ - VACCTIMESMINUSI2(UChi_01,result_21,Z1)\ - VACCTIMESMINUSI2(UChi_02,result_22,Z2)\ - VACCTIMESI2(UChi_10,result_30,Z3)\ - VACCTIMESI2(UChi_11,result_31,Z4)\ - VACCTIMESI2(UChi_12,result_32,Z5)\ - ); - -#define ZM_RECON_ACCUM __asm__ ( \ - VADD(UChi_00,result_00,result_00)\ - VADD(UChi_01,result_01,result_01)\ - VADD(UChi_02,result_02,result_02)\ - VADD(UChi_10,result_10,result_10)\ - VADD(UChi_11,result_11,result_11)\ - VADD(UChi_12,result_12,result_12)\ - VACCTIMESI0(UChi_00,result_20,Z0)\ - VACCTIMESI0(UChi_01,result_21,Z1)\ - VACCTIMESI0(UChi_02,result_22,Z2)\ - VACCTIMESMINUSI0(UChi_10,result_30,Z3)\ - VACCTIMESMINUSI0(UChi_11,result_31,Z4)\ - VACCTIMESMINUSI0(UChi_12,result_32,Z5)\ - VACCTIMESI1(UChi_00,result_20,Z0)\ - VACCTIMESI1(UChi_01,result_21,Z1)\ - VACCTIMESI1(UChi_02,result_22,Z2)\ - VACCTIMESMINUSI1(UChi_10,result_30,Z3)\ - VACCTIMESMINUSI1(UChi_11,result_31,Z4)\ - VACCTIMESMINUSI1(UChi_12,result_32,Z5)\ - VACCTIMESI2(UChi_00,result_20,Z0)\ - VACCTIMESI2(UChi_01,result_21,Z1)\ - VACCTIMESI2(UChi_02,result_22,Z2)\ - VACCTIMESMINUSI2(UChi_10,result_30,Z3)\ - VACCTIMESMINUSI2(UChi_11,result_31,Z4)\ - VACCTIMESMINUSI2(UChi_12,result_32,Z5)\ - ); - -#define TP_RECON_ACCUM __asm__ ( \ - VADD(UChi_00,result_00,result_00)\ - VADD(UChi_01,result_01,result_01)\ - VADD(UChi_02,result_02,result_02)\ - VADD(UChi_10,result_10,result_10)\ - VADD(UChi_11,result_11,result_11)\ - VADD(UChi_12,result_12,result_12)\ - VADD(UChi_00,result_20,result_20)\ - VADD(UChi_01,result_21,result_21)\ - VADD(UChi_02,result_22,result_22)\ - VADD(UChi_10,result_30,result_30)\ - VADD(UChi_11,result_31,result_31)\ - VADD(UChi_12,result_32,result_32) ); - -#define TM_RECON_ACCUM __asm__ ( \ - VADD(UChi_00,result_00,result_00)\ - VADD(UChi_01,result_01,result_01)\ - VADD(UChi_02,result_02,result_02)\ - VADD(UChi_10,result_10,result_10)\ - VADD(UChi_11,result_11,result_11)\ - VADD(UChi_12,result_12,result_12)\ - VSUB(UChi_00,result_20,result_20)\ - VSUB(UChi_01,result_21,result_21)\ - VSUB(UChi_02,result_22,result_22)\ - VSUB(UChi_10,result_30,result_30)\ - VSUB(UChi_11,result_31,result_31)\ - VSUB(UChi_12,result_32,result_32) ); - -#define PREFETCH_CHIMU(A) - -#define PERMUTE_DIR0 __asm__ ( \ - VPERM0(Chi_00,Chi_00) \ - VPERM0(Chi_01,Chi_01) \ - VPERM0(Chi_02,Chi_02) \ - VPERM0(Chi_10,Chi_10) \ - VPERM0(Chi_11,Chi_11) \ - VPERM0(Chi_12,Chi_12) ); - -#define PERMUTE_DIR1 __asm__ ( \ - VPERM1(Chi_00,Chi_00) \ - VPERM1(Chi_01,Chi_01) \ - VPERM1(Chi_02,Chi_02) \ - VPERM1(Chi_10,Chi_10) \ - VPERM1(Chi_11,Chi_11) \ - VPERM1(Chi_12,Chi_12)); - -#define PERMUTE_DIR2 __asm__ ( \ - VPERM2(Chi_00,Chi_00) \ - VPERM2(Chi_01,Chi_01) \ - VPERM2(Chi_02,Chi_02) \ - VPERM2(Chi_10,Chi_10) \ - VPERM2(Chi_11,Chi_11) \ - VPERM2(Chi_12,Chi_12) ); - -#define PERMUTE_DIR3 __asm__ ( \ - VPERM3(Chi_00,Chi_00) \ - VPERM3(Chi_01,Chi_01) \ - VPERM3(Chi_02,Chi_02) \ - VPERM3(Chi_10,Chi_10) \ - VPERM3(Chi_11,Chi_11) \ - VPERM3(Chi_12,Chi_12) ); - -#endif diff --git a/lib/simd/Intel512avx.h b/lib/simd/Intel512avx.h index 102a962f..506a0b1d 100644 --- a/lib/simd/Intel512avx.h +++ b/lib/simd/Intel512avx.h @@ -84,6 +84,16 @@ Author: paboyle #define ZEND2d(Criir,Ciirr, tmp) "vshufpd $0x55," #Ciirr "," #Ciirr "," #tmp ";\n"\ "vsubpd " #tmp "," #Ciirr "," #Criir"{%k7};\n" // 
ri+ir ; ri+ir,rr-ii + +#define VMOVRDUPd(OFF,A,DEST) "vpshufd $0x44," #OFF "*64(" #A ")," #DEST ";\n" // 32 bit level: 1,0,3,2 +#define VMOVIDUPd(OFF,A,DEST) "vpshufd $0xee," #OFF "*64(" #A ")," #DEST ";\n" // 32 bit level: 3,2,3,2 + +#define VMOVRDUPf(OFF,PTR,DEST) "vmovsldup " #OFF "*64(" #PTR "), " #DEST ";\n" +#define VMOVIDUPf(OFF,PTR,DEST) "vmovshdup " #OFF "*64(" #PTR "), " #DEST ";\n" + +#define VMADDSUBf(A,B,accum) "vfmaddsub231ps " #A "," #B "," #accum ";\n" +#define VMADDSUBd(A,B,accum) "vfmaddsub231pd " #A "," #B "," #accum ";\n" + #define VTIMESI0f(A,DEST, Z) VSHUFf(A,DEST) #define VTIMESI1f(A,DEST, Z) "vaddps " #DEST "," #Z "," #DEST"{%k6}" ";\n" diff --git a/lib/simd/Intel512avxAddsub.h b/lib/simd/Intel512avxAddsub.h index 6741170e..e7c4c175 100644 --- a/lib/simd/Intel512avxAddsub.h +++ b/lib/simd/Intel512avxAddsub.h @@ -28,18 +28,6 @@ Author: paboyle #ifndef GRID_ASM_AV512_ADDSUB_H #define GRID_ASM_AV512_ADDSUB_H -//////////////////////////////////////////////////////////// -// Knights Landing specials -//////////////////////////////////////////////////////////// - -#define VMOVRDUPd(OFF,A,DEST) "vpshufd $0x44," #OFF"*64("#A ")," #DEST ";\n" // 32 bit level: 1,0,3,2 -#define VMOVIDUPd(OFF,A,DEST) "vpshufd $0xee," #OFF"*64("#A ")," #DEST ";\n" // 32 bit level: 3,2,3,2 - -#define VMOVRDUPf(O,P,DEST) "vmovsldup " #OFF "*64(" #PTR "), " #DEST ";\n" -#define VMOVIDUPf(O,P,DEST) "vmovshdup " #OFF "*64(" #PTR "), " #DEST ";\n" - -#define VMADDSUBf(Aii,Bri,accum) "vfmaddsub231ps " #A "," #B "," #accum ";\n" -#define VMADDSUBd(Aii,Bri,accum) "vfmaddsub231pd " #A "," #B "," #accum ";\n" //////////////////////////////////////////////////////////////// // Building blocks for SU3 x 2spinor @@ -48,7 +36,7 @@ Author: paboyle // 6 Chi shuffles ir,ri // 6muls, 30 fmaddsubs //////////////////////////////////////////////////////////////// -#define MULT_ADDSUB_2SPIN_PF(ptr) \ +#define MULT_ADDSUB_2SPIN(ptr) \ LOAD64(%r8,ptr) \ __asm__ ( \ VMOVIDUPf(0,%r8,Z0 ) \ diff --git a/lib/simd/Intel512wilson.h b/lib/simd/Intel512wilson.h index 810f93f6..9c746a39 100644 --- a/lib/simd/Intel512wilson.h +++ b/lib/simd/Intel512wilson.h @@ -66,6 +66,8 @@ Author: paboyle #define Uir %zmm24 //#define ONE %zmm24 #define Uri %zmm25 +#define T1 %zmm24 +#define T2 %zmm25 #define Z0 %zmm26 #define Z1 %zmm27 @@ -288,7 +290,9 @@ Author: paboyle ZEND2(UChi_02,Z4,Chi_02) \ ZEND2(UChi_12,Z5,Chi_12) ); -#define MULT_2SPIN(ptr) MULT_2SPIN_PF(ptr,ptr,VPREFETCHG); +#define MULT_2SPINa(ptr) MULT_2SPIN_PF(ptr,ptr,VPREFETCHG); +#define MULT_2SPIN(ptr) MULT_ADDSUB_2SPIN(ptr); + #define MULT_2SPIN_PFXM(ptr,pf) MULT_2SPIN_PF(ptr,pf,VPREFETCHNTA) #define MULT_2SPIN_PFYM(ptr,pf) MULT_2SPIN_PF(ptr,pf,VPREFETCHNTA) #define MULT_2SPIN_PFZM(ptr,pf) MULT_2SPIN_PF(ptr,pf,VPREFETCHNTA) @@ -750,8 +754,63 @@ Author: paboyle VPERM3(Chi_11,Chi_11) \ VPERM3(Chi_12,Chi_12) ); -#ifdef AVX512 -#include -#endif +#define MULT_ADDSUB_2SPIN1(ptr) \ + LOAD64(%r8,ptr) +/* + * __asm__ ( \ +); + VMUL(Z0,%zmm2,%zmm3) \ +*/ +#define MULT_ADDSUB_2SPIN(ptr) \ + LOAD64(%r8,ptr) \ + __asm__ ( \ + VMOVIDUP(0,%r8,Z0 ) \ + VMOVIDUP(3,%r8,Z1 )\ + VMOVIDUP(6,%r8,Z2 )\ + VSHUF(Chi_00,T1) \ + VSHUF(Chi_10,T2) \ + \ + VMUL(Z0,T1,UChi_00) VMOVRDUP(0,%r8,Z3 ) \ + VMUL(Z0,T2,UChi_10) VMOVRDUP(3,%r8,Z4 ) \ + VMUL(Z1,T1,UChi_01) VMOVRDUP(6,%r8,Z5 ) \ + VMUL(Z1,T2,UChi_11) VMOVIDUP(1,%r8,Z0 ) \ + VMUL(Z2,T1,UChi_02) VMOVIDUP(4,%r8,Z1 ) \ + VMUL(Z2,T2,UChi_12) VMOVIDUP(7,%r8,Z2 ) \ + \ + VMADDSUB(Z3,Chi_00,UChi_00) VSHUF(Chi_01,T1) \ + VMADDSUB(Z3,Chi_10,UChi_10) 
VSHUF(Chi_11,T2) \ + VMADDSUB(Z4,Chi_00,UChi_01) VMOVRDUP(1,%r8,Z3 ) \ + VMADDSUB(Z4,Chi_10,UChi_11)\ + VMADDSUB(Z5,Chi_00,UChi_02) VMOVRDUP(4,%r8,Z4 ) \ + VMADDSUB(Z5,Chi_10,UChi_12)\ + \ + VMADDSUB(Z0,T1,UChi_00) VMOVRDUP(7,%r8,Z5 ) \ + VMADDSUB(Z0,T2,UChi_10)\ + VMADDSUB(Z1,T1,UChi_01) VMOVIDUP(2,%r8,Z0 ) \ + VMADDSUB(Z1,T2,UChi_11)\ + VMADDSUB(Z2,T1,UChi_02) VMOVIDUP(5,%r8,Z1 ) \ + VMADDSUB(Z2,T2,UChi_12) VMOVIDUP(8,%r8,Z2 ) \ + \ + VMADDSUB(Z3,Chi_01,UChi_00) VSHUF(Chi_02,T1) \ + VMADDSUB(Z3,Chi_11,UChi_10) VSHUF(Chi_12,T2) \ + VMADDSUB(Z4,Chi_01,UChi_01) VMOVRDUP(2,%r8,Z3 ) \ + VMADDSUB(Z4,Chi_11,UChi_11)\ + VMADDSUB(Z5,Chi_01,UChi_02) VMOVRDUP(5,%r8,Z4 ) \ + VMADDSUB(Z5,Chi_11,UChi_12)\ + \ + VMADDSUB(Z0,T1,UChi_00) VMOVRDUP(8,%r8,Z5 ) \ + VMADDSUB(Z0,T2,UChi_10)\ + VMADDSUB(Z1,T1,UChi_01)\ + VMADDSUB(Z1,T2,UChi_11)\ + VMADDSUB(Z2,T1,UChi_02)\ + VMADDSUB(Z2,T2,UChi_12)\ + \ + VMADDSUB(Z3,Chi_02,UChi_00)\ + VMADDSUB(Z3,Chi_12,UChi_10)\ + VMADDSUB(Z4,Chi_02,UChi_01)\ + VMADDSUB(Z4,Chi_12,UChi_11)\ + VMADDSUB(Z5,Chi_02,UChi_02)\ + VMADDSUB(Z5,Chi_12,UChi_12)\ + ); #endif diff --git a/tests/Make.inc b/tests/Make.inc index f7c83671..ef9681cd 100644 --- a/tests/Make.inc +++ b/tests/Make.inc @@ -1,13 +1,5 @@ -bin_PROGRAMS = Test_GaugeAction Test_RectPlaq Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cr Test_cf_coarsen_support Test_cf_cr_unprec Test_cheby Test_contfrac_cg Test_contfrac_even_odd Test_contfrac_force Test_cshift Test_cshift_red_black Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_dwf_force Test_dwf_fpgcr Test_dwf_gpforce Test_dwf_hdcr Test_dwf_lanczos Test_gamma Test_gp_rect_force Test_gparity Test_gpdwf_force Test_gpwilson_even_odd Test_hmc_EODWFRatio Test_hmc_EODWFRatio_Gparity Test_hmc_EOWilsonFermionGauge Test_hmc_EOWilsonRatio Test_hmc_GparityIwasakiGauge Test_hmc_GparityWilsonGauge Test_hmc_IwasakiGauge Test_hmc_RectGauge Test_hmc_WilsonFermionGauge Test_hmc_WilsonGauge Test_hmc_WilsonRatio Test_lie_generators Test_main Test_multishift_sqrt Test_nersc_io Test_partfrac_force Test_quenched_update Test_rect_force Test_remez Test_rhmc_EOWilson1p1 Test_rhmc_EOWilsonRatio Test_rhmc_Wilson1p1 Test_rhmc_WilsonRatio Test_rng Test_rng_fixed Test_serialisation Test_simd Test_stencil Test_synthetic_lanczos Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd Test_wilson_force Test_wilson_force_phiMdagMphi Test_wilson_force_phiMphi Test_wilson_tm_even_odd - - -Test_GaugeAction_SOURCES=Test_GaugeAction.cc -Test_GaugeAction_LDADD=-lGrid - - -Test_RectPlaq_SOURCES=Test_RectPlaq.cc -Test_RectPlaq_LDADD=-lGrid +bin_PROGRAMS = Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cr Test_cf_coarsen_support Test_cf_cr_unprec Test_cheby Test_contfrac_cg Test_contfrac_even_odd Test_contfrac_force Test_cshift Test_cshift_red_black Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_dwf_force Test_dwf_fpgcr Test_dwf_gpforce Test_dwf_hdcr Test_dwf_lanczos Test_gamma Test_GaugeAction Test_gparity Test_gpdwf_force Test_gp_rect_force Test_gpwilson_even_odd Test_hmc_EODWFRatio Test_hmc_EODWFRatio_Gparity Test_hmc_EOWilsonFermionGauge Test_hmc_EOWilsonRatio Test_hmc_GparityIwasakiGauge Test_hmc_GparityWilsonGauge Test_hmc_IwasakiGauge Test_hmc_RectGauge Test_hmc_WilsonFermionGauge Test_hmc_WilsonGauge Test_hmc_WilsonRatio Test_lie_generators Test_main Test_multishift_sqrt Test_nersc_io 
Test_partfrac_force Test_quenched_update Test_rect_force Test_RectPlaq Test_remez Test_rhmc_EOWilson1p1 Test_rhmc_EOWilsonRatio Test_rhmc_Wilson1p1 Test_rhmc_WilsonRatio Test_rng Test_rng_fixed Test_serialisation Test_simd Test_stencil Test_synthetic_lanczos Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd Test_wilson_force Test_wilson_force_phiMdagMphi Test_wilson_force_phiMphi Test_wilson_tm_even_odd Test_cayley_cg_SOURCES=Test_cayley_cg.cc @@ -102,8 +94,8 @@ Test_gamma_SOURCES=Test_gamma.cc Test_gamma_LDADD=-lGrid -Test_gp_rect_force_SOURCES=Test_gp_rect_force.cc -Test_gp_rect_force_LDADD=-lGrid +Test_GaugeAction_SOURCES=Test_GaugeAction.cc +Test_GaugeAction_LDADD=-lGrid Test_gparity_SOURCES=Test_gparity.cc @@ -114,6 +106,10 @@ Test_gpdwf_force_SOURCES=Test_gpdwf_force.cc Test_gpdwf_force_LDADD=-lGrid +Test_gp_rect_force_SOURCES=Test_gp_rect_force.cc +Test_gp_rect_force_LDADD=-lGrid + + Test_gpwilson_even_odd_SOURCES=Test_gpwilson_even_odd.cc Test_gpwilson_even_odd_LDADD=-lGrid @@ -190,6 +186,10 @@ Test_rect_force_SOURCES=Test_rect_force.cc Test_rect_force_LDADD=-lGrid +Test_RectPlaq_SOURCES=Test_RectPlaq.cc +Test_RectPlaq_LDADD=-lGrid + + Test_remez_SOURCES=Test_remez.cc Test_remez_LDADD=-lGrid diff --git a/tests/Test_zmm.cc b/tests/Test_zmm.cc index 63476d5c..0417ce8d 100644 --- a/tests/Test_zmm.cc +++ b/tests/Test_zmm.cc @@ -27,7 +27,7 @@ Author: paboyle /* END LEGAL */ #include #include -#include +#include using namespace Grid; @@ -261,6 +261,10 @@ int main(int argc,char **argv) #undef ZLOAD #undef ZMUL #undef ZMADD +#undef VMOVIDUP +#undef VMOVRDUP +#undef VMADDSUB +#undef VSHUF #define VZERO(A) VZEROd(A) #define VTIMESI(A,B,C) VTIMESId(A,B,C) @@ -268,15 +272,19 @@ int main(int argc,char **argv) #define VLOAD(OFF,PTR,DEST) VLOADd(OFF,PTR,DEST) #define VSTORE(OFF,PTR,SRC) VSTOREd(OFF,PTR,SRC) -#define VMUL(Uri,Uir,Chi,UChi,Z) VMULd(Uri,Uir,Chi,UChi,Z) -#define VMADD(Uri,Uir,Chi,UChi,Z) VMADDd(Uri,Uir,Chi,UChi,Z) -#define ZEND1(A,B,C) ZEND1d(A,B,C) -#define ZEND2(A,B,C) ZEND2d(A,B,C) +#define VMUL(Uri,Uir,Chi) VMULd(Uri,Uir,Chi) +#define VMADD(Uri,Uir,Chi) VMADDd(Uri,Uir,Chi) +#define ZEND1(A,B,C) ZEND1d(A,B,C) +#define ZEND2(A,B,C) ZEND2d(A,B,C) #define ZLOAD(A,B,C,D) ZLOADd(A,B,C,D) #define ZMUL(A,B,C,D,E) ZMULd(A,B,C,D,E) #define ZMADD(A,B,C,D,E) ZMADDd(A,B,C,D,E) #define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) #define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) +#define VMOVIDUP(A,B,C) VMOVIDUPd(A,B,C) +#define VMOVRDUP(A,B,C) VMOVRDUPd(A,B,C) +#define VMADDSUB(A,B,accum) VMADDSUBd(A,B,accum) +#define VSHUF(A,B) VSHUFd(A,B) #define zz Z0 @@ -424,17 +432,21 @@ void WilsonDslashAvx512(void *ptr1,void *ptr2,void *ptr3) #undef VTIMESMINUSI #undef ZMULMEM2SP #undef ZMADDMEM2SP +#undef VMOVIDUP +#undef VMOVRDUP +#undef VMADDSUB +#undef VSHUF + #define VZERO(A) VZEROf(A) #define VMOV(A,B) VMOVf(A,B) #define VADD(A,B,C) VADDf(A,B,C) #define VSUB(A,B,C) VSUBf(A,B,C) #define VTIMESI(A,B,C) VTIMESIf(A,B,C) #define VTIMESMINUSI(A,B,C) VTIMESMINUSIf(A,B,C) - #define VLOAD(OFF,PTR,DEST) VLOADf(OFF,PTR,DEST) #define VSTORE(OFF,PTR,SRC) VSTOREf(OFF,PTR,SRC) -#define VMUL(Uri,Uir,Chi,UChi,Z) VMULf(Uri,Uir,Chi,UChi,Z) -#define VMADD(Uri,Uir,Chi,UChi,Z) VMADDf(Uri,Uir,Chi,UChi,Z) +#define VMUL(Uri,Uir,Chi) VMULf(Uri,Uir,Chi) +#define VMADD(Uri,Uir,Chi) VMADDf(Uri,Uir,Chi) #define ZEND1(A,B,C) ZEND1f(A,B,C) #define ZEND2(A,B,C) 
ZEND2f(A,B,C) #define ZLOAD(A,B,C,D) ZLOADf(A,B,C,D) @@ -442,6 +454,10 @@ void WilsonDslashAvx512(void *ptr1,void *ptr2,void *ptr3) #define ZMADD(A,B,C,D,E) ZMADDf(A,B,C,D,E) #define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) #define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) +#define VMOVIDUP(A,B,C) VMOVIDUPf(A,B,C) +#define VMOVRDUP(A,B,C) VMOVRDUPf(A,B,C) +#define VMADDSUB(A,B,accum) VMADDSUBf(A,B,accum) +#define VSHUF(A,B) VSHUFf(A,B) void ZmulF(void *ptr1,void *ptr2,void *ptr3) { @@ -528,7 +544,8 @@ void WilsonDslashAvx512F(void *ptr1,void *ptr2,void *ptr3) LOAD_CHI(ptr1); - MULT_2SPIN(ptr2); + MULT_ADDSUB_2SPIN(ptr2); + //MULT_2SPIN(ptr2); SAVE_UCHI(ptr3);
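
The new MULT_ADDSUB_2SPIN kernel introduced above (Intel512wilson.h / Intel512avxAddsub.h) replaces the ZMUL/ZMADD + ZEND chain of the old Avx512Asm.h routine: it loads each gauge-link element twice with VMOVRDUP/VMOVIDUP (vmovsldup/vmovshdup broadcasts of the real and imaginary parts), swaps the re/im lanes of Chi once with VSHUF, and then accumulates with VMADDSUB (vfmaddsub231), so the serialised real/imaginary recombination that ZEND performed is no longer needed — per the comment in Intel512avxAddsub.h the SU(3) x two-spinor multiply costs 6 multiplies plus 30 fmaddsubs and 6 Chi shuffles. Below is a minimal scalar sketch of that arithmetic, not Grid code: the function and variable names are illustrative, the real kernel keeps everything in zmm registers and interleaves the gauge loads with the fmaddsub chain, and only one colour column of the two-spinor is shown.

```cpp
#include <complex>
#include <cstdio>

// Model of one vfmaddsub231 step on a (re, im) lane pair: the even (real)
// slot computes a*b minus the accumulator, the odd (imag) slot a*b plus it.
static inline void fmaddsub(double a_re, double a_im,
                            double b_re, double b_im,
                            double &acc_re, double &acc_im)
{
  acc_re = a_re * b_re - acc_re;
  acc_im = a_im * b_im + acc_im;
}

// chi_out(i) = sum_j U(i,j) * chi(j) using only duplicated-real and
// duplicated-imag loads of U, one re/im swap of chi, and fmaddsub chains --
// no explicit complex multiply and no separate ZEND-style add/sub pass.
static void su3_mult_addsub(const std::complex<double> U[3][3],
                            const std::complex<double> chi[3],
                            std::complex<double> chi_out[3])
{
  for (int i = 0; i < 3; i++) {
    double acc_re = 0.0, acc_im = 0.0;
    for (int j = 0; j < 3; j++) {
      const double u_re = U[i][j].real();   // VMOVRDUP: real part in both lanes
      const double u_im = U[i][j].imag();   // VMOVIDUP: imag part in both lanes
      const double a_re = chi[j].real();
      const double a_im = chi[j].imag();
      // First pass uses the swapped chi (VSHUF) against the imag broadcast;
      // the second fmaddsub against the real broadcast both completes the
      // complex product (u_re*a_re - u_im*a_im, u_re*a_im + u_im*a_re) and
      // restores the sign of the running accumulator on the real lane.
      fmaddsub(u_im, u_im, a_im, a_re, acc_re, acc_im);  // imag-dup * swap(chi)
      fmaddsub(u_re, u_re, a_re, a_im, acc_re, acc_im);  // real-dup * chi
    }
    chi_out[i] = { acc_re, acc_im };
  }
}

int main()
{
  // Small self-check against std::complex arithmetic.
  std::complex<double> U[3][3], chi[3], out[3];
  for (int i = 0; i < 3; i++) {
    chi[i] = { 0.5 + i, 0.25 - i };
    for (int j = 0; j < 3; j++) U[i][j] = { 1.0 + i - j, 0.5 * (i + j) };
  }
  su3_mult_addsub(U, chi, out);
  for (int i = 0; i < 3; i++) {
    std::complex<double> ref = U[i][0]*chi[0] + U[i][1]*chi[1] + U[i][2]*chi[2];
    std::printf("row %d: addsub (%g,%g)  reference (%g,%g)\n",
                i, out[i].real(), out[i].imag(), ref.real(), ref.imag());
  }
  return 0;
}
```

The sign trick is why the kernel can accumulate across colour columns with fmaddsub alone: the imag-dup pass subtracts the previous real accumulator, and the following real-dup pass subtracts that intermediate again, leaving the accumulator added with the correct sign on both lanes.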