From 4e65ad21aca35ab4d2be71c52e00085fe59284f1 Mon Sep 17 00:00:00 2001
From: paboyle
Date: Wed, 4 Nov 2015 03:15:08 -0800
Subject: [PATCH] Adding a routine for AVX512 / IMCI with explicit assembly implementations

---
 lib/qcd/action/fermion/WilsonKernelsAsm.cc | 314 +++++++++++++++++++++
 1 file changed, 314 insertions(+)
 create mode 100644 lib/qcd/action/fermion/WilsonKernelsAsm.cc

diff --git a/lib/qcd/action/fermion/WilsonKernelsAsm.cc b/lib/qcd/action/fermion/WilsonKernelsAsm.cc
new file mode 100644
index 00000000..9ccd18c6
--- /dev/null
+++ b/lib/qcd/action/fermion/WilsonKernelsAsm.cc
@@ -0,0 +1,314 @@
+#include
+#include
+// NOTE(review): both #include directives above carry no header name in this copy; the targets appear to have been lost -- TODO restore from the repository.
+#undef VLOAD  // clear earlier definitions of the generic macro names that are rebound below
+#undef VSTORE
+#undef VMUL
+#undef VMADD
+#undef ZEND  // NOTE(review): plain ZEND is not redefined in this file; only ZEND1/ZEND2 are defined below
+#undef ZLOAD
+#undef ZMUL
+#undef ZMADD
+#undef VZERO
+#undef VTIMESI
+#undef VTIMESMINUSI
+// Bind each generic macro name to its f-suffixed counterpart (defined outside this file; presumably the single-precision variants -- confirm). NOTE(review): several names defined below (e.g. VMOV, VADD, VPERM0) have no matching #undef above.
+#define VZERO(A) VZEROf(A)
+#define VMOV(A,B) VMOVf(A,B)
+#define VLOAD(OFF,PTR,DEST) VLOADf(OFF,PTR,DEST)
+#define VSTORE(OFF,PTR,SRC) VSTOREf(OFF,PTR,SRC)
+
+#define VADD(A,B,C) VADDf(A,B,C)
+#define VSUB(A,B,C) VSUBf(A,B,C)
+#define VMUL(Uri,Uir,Chi,UChi,Z) VMULf(Uri,Uir,Chi,UChi,Z)
+#define VMADD(Uri,Uir,Chi,UChi,Z) VMADDf(Uri,Uir,Chi,UChi,Z)
+
+#define VTIMESI(A,B,C) VTIMESIf(A,B,C)
+#define VTIMESMINUSI(A,B,C) VTIMESMINUSIf(A,B,C)
+#define VACCTIMESI(A,B,C) VACCTIMESIf(A,B,C)
+#define VACCTIMESMINUSI(A,B,C) VACCTIMESMINUSIf(A,B,C)
+
+#define VTIMESI0(A,B,C) VTIMESI0f(A,B,C)
+#define VTIMESMINUSI0(A,B,C) VTIMESMINUSI0f(A,B,C)
+#define VACCTIMESI0(A,B,C) VACCTIMESI0f(A,B,C)
+#define VACCTIMESMINUSI0(A,B,C) VACCTIMESMINUSI0f(A,B,C)
+
+#define VTIMESI1(A,B,C) VTIMESI1f(A,B,C)
+#define VTIMESMINUSI1(A,B,C) VTIMESMINUSI1f(A,B,C)
+#define VACCTIMESI1(A,B,C) VACCTIMESI1f(A,B,C)
+#define VACCTIMESMINUSI1(A,B,C) VACCTIMESMINUSI1f(A,B,C)
+
+#define VTIMESI2(A,B,C) VTIMESI2f(A,B,C)
+#define VTIMESMINUSI2(A,B,C) VTIMESMINUSI2f(A,B,C)
+#define VACCTIMESI2(A,B,C) VACCTIMESI2f(A,B,C)
+#define VACCTIMESMINUSI2(A,B,C) VACCTIMESMINUSI2f(A,B,C)
+
+#define VACCTIMESI1MEM(A,ACC,O,P) VACCTIMESI1MEMf(A,ACC,O,P)
+#define VACCTIMESI2MEM(A,ACC,O,P) VACCTIMESI2MEMf(A,ACC,O,P)
+#define VACCTIMESMINUSI1MEM(A,ACC,O,P) VACCTIMESMINUSI1MEMf(A,ACC,O,P)
+#define VACCTIMESMINUSI2MEM(A,ACC,O,P) VACCTIMESMINUSI2MEMf(A,ACC,O,P)
+
+#define VPERM0(A,B) VPERM0f(A,B)
+#define VPERM1(A,B) VPERM1f(A,B)
+#define VPERM2(A,B) VPERM2f(A,B)
+#define VPERM3(A,B) VPERM3f(A,B)
+#define VSHUFMEM(OFF,A,DEST) VSHUFMEMf(OFF,A,DEST)
+
+#define ZEND1(A,B,C) ZEND1f(A,B,C)
+#define ZEND2(A,B,C) ZEND2f(A,B,C)
+#define ZLOAD(A,B,C,D) ZLOADf(A,B,C,D)
+#define ZMUL(A,B,C,D,E) ZMULf(A,B,C,D,E)
+#define ZMADD(A,B,C,D,E) ZMADDf(A,B,C,D,E)
+
+#define ZMUL(A,B,C,D,E) ZMULf(A,B,C,D,E)  // NOTE(review): token-for-token repeat of the ZMUL definition a few lines up
+#define ZMADD(A,B,C,D,E) ZMADDf(A,B,C,D,E)  // NOTE(review): token-for-token repeat of the ZMADD definition a few lines up
+
+#define VADDMEM(O,A,B,C) VADDMEMf(O,A,B,C)
+#define VSUBMEM(O,A,B,C) VSUBMEMf(O,A,B,C)
+
+#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
+#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
+
+namespace Grid {
+namespace QCD {
+// DiracOptAsmDhopSite: for site ss, walks eight stencil directions in the order Xm,Ym,Zm,Tm,Tp,Zp,Yp,Xp using macros defined outside this file, then passes &out._odata[ss] to SAVE_RESULT.
+template  // NOTE(review): the template parameter list (and the class/stencil template arguments below) are missing in this copy -- TODO restore
+void WilsonKernels::DiracOptAsmDhopSite(CartesianStencil &st,DoubledGaugeField &U,  // st: queried via GetEntry(ptype,dir,site); U: not named again in this body (macros may reference it -- confirm)
+                                        std::vector > &buf,  // buf: &buf[0] becomes pbuf, the source used for non-local entries
+                                        int ss,int sU,const FermionField &in, FermionField &out,uint64_t *timers)  // ss: site index; in: &in._odata[0] becomes plocal; sU and timers are not named in the live code of this body (macros may reference sU -- confirm)
+{
+  uint64_t now;  // not referenced again in the visible body
+  uint64_t first ;  // assigned __rdtsc() below; never read in the visible body
+  int offset,local,perm, ptype;  // offset/local/perm: per-direction copies of SE->_offset/_is_local/_permute; ptype is passed to GetEntry (presumably an out-parameter -- confirm)
+  const SiteHalfSpinor *pbuf = & buf[0];  // base address used when a stencil entry is not local
+  const SiteSpinor *plocal = & in._odata[0];  // base address used when a stencil entry is local
+  void *pf;  // address of the NEXT direction's source; handed to MULT_2SPIN_DIR_PF* (presumably as a prefetch target -- confirm)
+  int osites = in._grid->oSites();  // modulus for the ss+1 lookahead at the end of the function
+
+  // NOTE(review): the macro invocations below are presumably inline-assembly fragments sharing register state, so statement order matters -- check their definitions before reordering anything.
+  StencilEntry *SE;
+  // STAMP timing hook: the active definition below expands to nothing (its replacement text is a comment); the commented-out form would store __rdtsc() into timers[i]. STAMP is not invoked in this body.
+  //#define STAMP(i) timers[i] = __rdtsc() ;
+#define STAMP(i) //timers[i] = __rdtsc() ;
+
+  MASK_REGS;  // macro defined outside this file
+
+  first = __rdtsc();
+
+  SE=st.GetEntry(ptype,Xm,ss);  // entry for the first direction (Xm) at this site
+
+#if 0  // disabled: explicit VPREFETCH of the Xm source address through %r9
+  if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
+  else pf=(void *)&pbuf[SE->_offset];
+
+  LOAD64(%r9,pf);
+  __asm__(
+  VPREFETCH(0,%r9)
+  VPREFETCH(1,%r9)
+  VPREFETCH(2,%r9)
+  VPREFETCH(3,%r9)
+  VPREFETCH(4,%r9)
+  VPREFETCH(5,%r9)
+  VPREFETCH(6,%r9)
+  VPREFETCH(7,%r9)
+  VPREFETCH(8,%r9)
+  VPREFETCH(9,%r9)
+  VPREFETCH(10,%r9)
+  VPREFETCH(11,%r9)
+  );
+#endif
+  // Every direction below repeats the same steps: copy the entry fields out of SE, look up the next direction and form pf, run xx_PROJMEM (+ optional PERMUTE_DIRn) or LOAD_CHI, then MULT_2SPIN_DIR_PFxx and xx_RECON[_ACCUM].
+  // Xm: copy this direction's entry fields before SE is overwritten by the lookahead
+  offset = SE->_offset;
+  local = SE->_is_local;
+  perm = SE->_permute;
+
+  // Prefetch: look up the next direction (Ym) and form its source address in pf
+  SE=st.GetEntry(ptype,Ym,ss);
+  if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
+  else pf=(void *)&pbuf[SE->_offset];
+
+  if ( local ) {  // local entry: source is plocal[offset]
+    XM_PROJMEM(&plocal[offset]);
+    if ( perm) {
+      PERMUTE_DIR3; // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
+    }
+  } else {  // non-local entry: source is pbuf[offset]
+    LOAD_CHI(&pbuf[offset]);
+  }
+  {
+    MULT_2SPIN_DIR_PFXM(Xm,pf);  // direction tag plus the lookahead address computed above
+  }
+  XM_RECON;  // the first direction uses the plain RECON form; every later direction uses xx_RECON_ACCUM
+
+  // Ym
+  offset = SE->_offset;
+  local = SE->_is_local;
+  perm = SE->_permute;
+
+  // Prefetch: next direction is Zm
+  SE=st.GetEntry(ptype,Zm,ss);
+  if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
+  else pf=(void *)&pbuf[SE->_offset];
+
+  if ( local ) {
+    YM_PROJMEM(&plocal[offset]);
+    if ( perm) {
+      PERMUTE_DIR2; // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
+    }
+  } else {
+    LOAD_CHI(&pbuf[offset]);
+  }
+  {
+    MULT_2SPIN_DIR_PFYM(Ym,pf);
+  }
+  YM_RECON_ACCUM;
+
+  // Zm
+  offset = SE->_offset;
+  local = SE->_is_local;
+  perm = SE->_permute;
+
+  // Prefetch: next direction is Tm
+  SE=st.GetEntry(ptype,Tm,ss);
+  if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
+  else pf=(void *)&pbuf[SE->_offset];
+
+  if ( local ) {
+    ZM_PROJMEM(&plocal[offset]);
+    if ( perm) {
+      PERMUTE_DIR1; // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
+    }
+  } else {
+    LOAD_CHI(&pbuf[offset]);
+  }
+  {
+    MULT_2SPIN_DIR_PFZM(Zm,pf);
+  }
+  ZM_RECON_ACCUM;
+
+  // Tm
+  offset = SE->_offset;
+  local = SE->_is_local;
+  perm = SE->_permute;
+
+  SE=st.GetEntry(ptype,Tp,ss);  // lookahead: next direction is Tp
+  if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
+  else pf=(void *)&pbuf[SE->_offset];
+
+
+  if ( local ) {
+    TM_PROJMEM(&plocal[offset]);
+    if ( perm) {
+      PERMUTE_DIR0; // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
+    }
+  } else {
+    LOAD_CHI(&pbuf[offset]);
+  }
+  {
+    MULT_2SPIN_DIR_PFTM(Tm,pf);
+  }
+  TM_RECON_ACCUM;
+
+  // Tp
+  offset = SE->_offset;
+  local = SE->_is_local;
+  perm = SE->_permute;
+
+  // Prefetch: next direction is Zp
+  SE=st.GetEntry(ptype,Zp,ss);
+  if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
+  else pf=(void *)&pbuf[SE->_offset];
+
+  if ( local ) {
+    TP_PROJMEM(&plocal[offset]);
+    if ( perm) {
+      PERMUTE_DIR0; // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
+    }
+  } else {
+    LOAD_CHI(&pbuf[offset]);
+  }
+  {
+    MULT_2SPIN_DIR_PFTP(Tp,pf);
+  }
+  TP_RECON_ACCUM;
+
+  // Zp
+  offset = SE->_offset;
+  local = SE->_is_local;
+  perm = SE->_permute;
+
+  // Prefetch: next direction is Yp
+  SE=st.GetEntry(ptype,Yp,ss);
+  if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
+  else pf=(void *)&pbuf[SE->_offset];
+
+  if ( local ) {
+    ZP_PROJMEM(&plocal[offset]);
+    if ( perm) {
+      PERMUTE_DIR1; // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
+    }
+  } else {
+    LOAD_CHI(&pbuf[offset]);
+  }
+  {
+    MULT_2SPIN_DIR_PFZP(Zp,pf);
+  }
+  ZP_RECON_ACCUM;
+
+  // Yp
+  offset = SE->_offset;
+  local = SE->_is_local;
+  perm = SE->_permute;
+
+  // Prefetch: next direction is Xp
+  SE=st.GetEntry(ptype,Xp,ss);
+  if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
+  else pf=(void *)&pbuf[SE->_offset];
+
+  if ( local ) {
+    YP_PROJMEM(&plocal[offset]);
+    if ( perm) {
+      PERMUTE_DIR2; // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
+    }
+  } else {
+    LOAD_CHI(&pbuf[offset]);
+  }
+  {
+    MULT_2SPIN_DIR_PFYP(Yp,pf);
+  }
+  YP_RECON_ACCUM;
+
+  // Xp
+  perm = SE->_permute;
+  offset = SE->_offset;
+  local = SE->_is_local;
+
+  // PREFETCH_R(A);
+
+  // Prefetch: the last lookahead targets the Xm entry of site ss+1, wrapping modulo osites
+  SE=st.GetEntry(ptype,Xm,(ss+1)%osites);
+  if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
+  else pf=(void *)&pbuf[SE->_offset];
+
+  if ( local ) {
+    XP_PROJMEM(&plocal[offset]);
+    if ( perm) {
+      PERMUTE_DIR3; // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
+    }
+  } else {
+    LOAD_CHI(&pbuf[offset]);
+  }
+  {
+    MULT_2SPIN_DIR_PFXP(Xp,pf);
+  }
+  XP_RECON_ACCUM;
+
+ debug:  // label only; no goto targeting it appears in the visible code
+  SAVE_RESULT(&out._odata[ss]);  // passes the output slot for site ss to the macro (presumably stores the accumulated result -- confirm)
+
+}
+
+  template class WilsonKernels;  // NOTE(review): explicit instantiation; the template arguments are missing in this copy, so this line and the next read identically
+  template class WilsonKernels;
+
+}}  // closes namespace QCD and namespace Grid