From bf7e3f20d4f7bc26439ef5c310495b67bb9d2fdc Mon Sep 17 00:00:00 2001 From: azusayamaguchi Date: Tue, 21 Feb 2017 14:35:42 +0000 Subject: [PATCH] Staggaered fermion optimised version --- .../fermion/ImprovedStaggeredFermion5D.cc | 29 +- lib/qcd/action/fermion/StaggeredKernels.cc | 64 ++-- lib/qcd/action/fermion/StaggeredKernels.h | 14 +- lib/qcd/action/fermion/StaggeredKernelsAsm.cc | 294 +++++++++--------- .../action/fermion/StaggeredKernelsHand.cc | 27 +- tests/core/Test_staggered5D.cc | 2 +- tests/core/Test_staggered5Dvec.cc | 40 ++- 7 files changed, 273 insertions(+), 197 deletions(-) diff --git a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc index 0455df0d..7068fc3f 100644 --- a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc +++ b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc @@ -237,19 +237,32 @@ void ImprovedStaggeredFermion5D::DhopInternal(StencilImpl & st, LebesgueOr if (dag == DaggerYes) { PARALLEL_FOR_LOOP for (int ss = 0; ss < U._grid->oSites(); ss++) { - for(int s=0;soSites(); ss++) { - for(int s=0;soSites(); + int me,mywork,myoff; + GridThread::GetWorkBarrier(len,me, mywork,myoff); + for (int ss = myoff; ss < myoff+mywork; ss++) { + int sU=ss; + int sF=LLs*sU; + Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out); + } + GridThread::ThreadBarrier(); + } + } +#endif } } diff --git a/lib/qcd/action/fermion/StaggeredKernels.cc b/lib/qcd/action/fermion/StaggeredKernels.cc index 06720c64..597b14ea 100644 --- a/lib/qcd/action/fermion/StaggeredKernels.cc +++ b/lib/qcd/action/fermion/StaggeredKernels.cc @@ -182,65 +182,81 @@ void StaggeredKernels::DhopSiteDepth(StencilImpl &st, LebesgueOrder &lo, D vstream(out, Uchi); }; -// Need controls to do interior, exterior, or both template void StaggeredKernels::DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU, - SiteSpinor *buf, int sF, - int sU, const FermionField &in, FermionField &out) { + SiteSpinor *buf, int LLs, int sU, + const FermionField &in, FermionField &out) { + int dag(1); SiteSpinor naik; SiteSpinor naive; int oneLink =0; int threeLink=1; + Real scale; + if(dag) scale = -1.0; + else scale = 1.0; + switch(Opt) { #ifdef AVX512 case OptInlineAsm: - DhopSiteAsm(st,lo,U,UUU,buf,sF,sU,in,out._odata[sF]); + DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out); break; #endif case OptHandUnroll: - DhopSiteDepthHand(st,lo,U,buf,sF,sU,in,naive,oneLink); - DhopSiteDepthHand(st,lo,UUU,buf,sF,sU,in,naik,threeLink); - out._odata[sF] =-naive-naik; + DhopSiteDepthHand(st,lo,U,UUU,buf,LLs,sU,in,out,dag); break; case OptGeneric: - DhopSiteDepth(st,lo,U,buf,sF,sU,in,naive,oneLink); - DhopSiteDepth(st,lo,UUU,buf,sF,sU,in,naik,threeLink); - out._odata[sF] =-naive-naik; + for(int s=0;s void StaggeredKernels::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU, - SiteSpinor *buf, int sF, + SiteSpinor *buf, int LLs, int sU, const FermionField &in, FermionField &out) { - int oneLink =0; - int threeLink=1; - SiteSpinor naik; - SiteSpinor naive; - static int once; + + int dag(0); + + int oneLink =0; + int threeLink=1; + SiteSpinor naik; + SiteSpinor naive; + static int once; + int sF=LLs*sU; + switch(Opt) { #ifdef AVX512 case OptInlineAsm: - DhopSiteAsm(st,lo,U,UUU,buf,sF,sU,in,out._odata[sF]); + DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out); break; #endif case OptHandUnroll: - DhopSiteDepthHand(st,lo,U,buf,sF,sU,in,naive,oneLink); - DhopSiteDepthHand(st,lo,UUU,buf,sF,sU,in,naik,threeLink); - out._odata[sF] =naive+naik; - break; + DhopSiteDepthHand(st,lo,U,UUU,buf,LLs,sU,in,out,dag); + break; case OptGeneric: - DhopSiteDepth(st,lo,U,buf,sF,sU,in,naive,oneLink); - DhopSiteDepth(st,lo,UUU,buf,sF,sU,in,naik,threeLink); - out._odata[sF] =naive+naik; + + for(int s=0;s diff --git a/lib/qcd/action/fermion/StaggeredKernels.h b/lib/qcd/action/fermion/StaggeredKernels.h index e4cc8cdd..dc91a30c 100644 --- a/lib/qcd/action/fermion/StaggeredKernels.h +++ b/lib/qcd/action/fermion/StaggeredKernels.h @@ -56,17 +56,21 @@ public: void DhopSiteDepth(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteSpinor * buf, int sF, int sU, const FermionField &in, SiteSpinor &out,int threeLink); - void DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteSpinor * buf, - int sF, int sU, const FermionField &in, SiteSpinor &out,int threeLink); + + void DhopSiteDepthHandLocal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteSpinor * buf, + int sF, int sU, const FermionField &in, SiteSpinor&out,int threeLink); + + void DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,SiteSpinor * buf, + int Lls, int sU, const FermionField &in, FermionField &out, int dag); void DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU, SiteSpinor * buf, - int sF, int sU, const FermionField &in, SiteSpinor &out); + int LLs, int sU, const FermionField &in, FermionField &out); void DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU, SiteSpinor * buf, int sF, int sU, const FermionField &in, FermionField &out); - void DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU, SiteSpinor * buf, - int sF, int sU, const FermionField &in, FermionField &out); + void DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU, SiteSpinor *buf, + int LLs, int sU, const FermionField &in, FermionField &out); public: diff --git a/lib/qcd/action/fermion/StaggeredKernelsAsm.cc b/lib/qcd/action/fermion/StaggeredKernelsAsm.cc index 890cf4e5..7f3624a1 100644 --- a/lib/qcd/action/fermion/StaggeredKernelsAsm.cc +++ b/lib/qcd/action/fermion/StaggeredKernelsAsm.cc @@ -1,4 +1,4 @@ - /************************************************************************************* +/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -507,13 +507,37 @@ Author: paboyle VLOAD(2,%%r8,pChi_12) \ : : "r" (a1) : "%r8" ); -#define PF_CHI(a0) \ +#define PF_CHI(a0) +#define PF_CHIa(a0) \ asm ( \ "movq %0, %%r8 \n\t" \ VPREFETCH1(0,%%r8) \ VPREFETCH1(1,%%r8) \ VPREFETCH1(2,%%r8) \ : : "r" (a0) : "%r8" ); \ + +#define PF_GAUGE_XYZT(a0) +#define PF_GAUGE_XYZTa(a0) \ + asm ( \ + "movq %0, %%r8 \n\t" \ + VPREFETCH1(0,%%r8) \ + VPREFETCH1(1,%%r8) \ + VPREFETCH1(2,%%r8) \ + VPREFETCH1(3,%%r8) \ + VPREFETCH1(4,%%r8) \ + VPREFETCH1(5,%%r8) \ + VPREFETCH1(6,%%r8) \ + VPREFETCH1(7,%%r8) \ + VPREFETCH1(8,%%r8) \ + : : "r" (a0) : "%r8" ); \ + +#define PF_GAUGE_LS(a0) +#define PF_GAUGE_LSa(a0) \ + asm ( \ + "movq %0, %%r8 \n\t" \ + VPREFETCH1(0,%%r8) \ + VPREFETCH1(1,%%r8) \ + : : "r" (a0) : "%r8" ); \ #define REDUCE(out) \ @@ -556,40 +580,59 @@ template void StaggeredKernels::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU, - SiteSpinor *buf, int sF, - int sU, const FermionField &in, SiteSpinor &out) + SiteSpinor *buf, int LLs, + int sU, const FermionField &in, FermionField &out) { assert(0); -} +}; -#define PREPARE(X,Y,Z,T,skew,UU) \ - SE0=st.GetEntry(ptype,X+skew,sF); \ - o0 = SE0->_offset; \ - l0 = SE0->_is_local; \ - p0 = SE0->_permute; \ - addr0 = l0 ? (uint64_t) &in._odata[o0] : (uint64_t) &buf[o0]; \ + +//#define CONDITIONAL_MOVE(l,o,out) if ( l ) { out = (uint64_t) &in._odata[o] ; } else { out =(uint64_t) &buf[o]; } + +#define CONDITIONAL_MOVE(l,o,out) { const SiteSpinor *ptr = l? in_p : buf; out = (uint64_t) &ptr[o]; } + +#define PREPARE_XYZT(X,Y,Z,T,skew,UU) \ + PREPARE(X,Y,Z,T,skew,UU); \ + PF_GAUGE_XYZT(gauge0); \ + PF_GAUGE_XYZT(gauge1); \ + PF_GAUGE_XYZT(gauge2); \ + PF_GAUGE_XYZT(gauge3); + +#define PREPARE_LS(X,Y,Z,T,skew,UU) \ + PREPARE(X,Y,Z,T,skew,UU); \ + PF_GAUGE_LS(gauge0); \ + PF_GAUGE_LS(gauge1); \ + PF_GAUGE_LS(gauge2); \ + PF_GAUGE_LS(gauge3); + +#define PREPARE(X,Y,Z,T,skew,UU) \ + SE0=st.GetEntry(ptype,X+skew,sF); \ + o0 = SE0->_offset; \ + l0 = SE0->_is_local; \ + p0 = SE0->_permute; \ + CONDITIONAL_MOVE(l0,o0,addr0); \ PF_CHI(addr0); \ \ SE1=st.GetEntry(ptype,Y+skew,sF); \ o1 = SE1->_offset; \ l1 = SE1->_is_local; \ p1 = SE1->_permute; \ - addr1 = l1 ? (uint64_t) &in._odata[o1] : (uint64_t) &buf[o1]; \ + CONDITIONAL_MOVE(l1,o1,addr1); \ PF_CHI(addr1); \ \ SE2=st.GetEntry(ptype,Z+skew,sF); \ o2 = SE2->_offset; \ l2 = SE2->_is_local; \ p2 = SE2->_permute; \ - addr2 = l2 ? (uint64_t) &in._odata[o2] : (uint64_t) &buf[o2]; \ + CONDITIONAL_MOVE(l2,o2,addr2); \ PF_CHI(addr2); \ \ SE3=st.GetEntry(ptype,T+skew,sF); \ o3 = SE3->_offset; \ l3 = SE3->_is_local; \ p3 = SE3->_permute; \ - addr3 = l3 ? (uint64_t) &in._odata[o3] : (uint64_t) &buf[o3]; \ + CONDITIONAL_MOVE(l3,o3,addr3); \ PF_CHI(addr3); \ \ gauge0 =(uint64_t)&UU._odata[sU]( X ); \ @@ -602,12 +645,13 @@ void StaggeredKernels::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, template <> void StaggeredKernels::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU, - SiteSpinor *buf, int sF, - int sU, const FermionField &in, SiteSpinor &out) + SiteSpinor *buf, int LLs, + int sU, const FermionField &in, FermionField &out) { #ifdef AVX512 uint64_t gauge0,gauge1,gauge2,gauge3; uint64_t addr0,addr1,addr2,addr3; + const SiteSpinor *in_p; in_p = &in._odata[0]; int o0,o1,o2,o3; // offsets int l0,l1,l2,l3; // local @@ -618,42 +662,46 @@ template <> void StaggeredKernels::DhopSiteAsm(StencilImpl StencilEntry *SE2; StencilEntry *SE3; - // Xp, Yp, Zp, Tp + for(int s=0;s template <> void StaggeredKernels::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU, - SiteSpinor *buf, int sF, - int sU, const FermionField &in, SiteSpinor &out) + SiteSpinor *buf, int LLs, + int sU, const FermionField &in, FermionField &out) { #ifdef AVX512 uint64_t gauge0,gauge1,gauge2,gauge3; uint64_t addr0,addr1,addr2,addr3; + const SiteSpinor *in_p; in_p = &in._odata[0]; int o0,o1,o2,o3; // offsets int l0,l1,l2,l3; // local @@ -664,30 +712,34 @@ template <> void StaggeredKernels::DhopSiteAsm(StencilImpl StencilEntry *SE2; StencilEntry *SE3; - // Xp, Yp, Zp, Tp + for(int s=0;s void StaggeredKernels::DhopSiteAsm(StencilImpl VPERM0(Chi_12,Chi_12) ); // This is the single precision 5th direction vectorised kernel + #include template <> void StaggeredKernels::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, - DoubledGaugeField &U, - DoubledGaugeField &UUU, - SiteSpinor *buf, int sF, - int sU, const FermionField &in, SiteSpinor &out) + DoubledGaugeField &U, + DoubledGaugeField &UUU, + SiteSpinor *buf, int LLs, + int sU, const FermionField &in, FermionField &out) { #ifdef AVX512 uint64_t gauge0,gauge1,gauge2,gauge3; uint64_t addr0,addr1,addr2,addr3; + const SiteSpinor *in_p; in_p = &in._odata[0]; int o0,o1,o2,o3; // offsets int l0,l1,l2,l3; // local @@ -731,66 +785,46 @@ template <> void StaggeredKernels::DhopSiteAsm(StencilImpl &st, StencilEntry *SE2; StencilEntry *SE3; - // Xp, Yp, Zp, Tp - PREPARE(Xp,Yp,Zp,Tp,0,U); - LOAD_CHIa(addr0,addr1); - if (l0&&p0) { PERMUTE_DIR3; } - if (l1&&p1) { PERMUTE_DIR2; } - MULT_XYZT(gauge0,gauge1); - LOAD_CHIa(addr2,addr3); - if (l2&&p2) { PERMUTE_DIR1; } - if (l3&&p3) { PERMUTE_DIR0; } - MULT_ADD_XYZT(gauge2,gauge3); + for(int s=0;s template <> void StaggeredKernels::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, - DoubledGaugeField &U, - DoubledGaugeField &UUU, - SiteSpinor *buf, int sF, - int sU, const FermionField &in, SiteSpinor &out) + DoubledGaugeField &U, + DoubledGaugeField &UUU, + SiteSpinor *buf, int LLs, + int sU, const FermionField &in, FermionField &out) { #ifdef AVX512 uint64_t gauge0,gauge1,gauge2,gauge3; uint64_t addr0,addr1,addr2,addr3; + const SiteSpinor *in_p; in_p = &in._odata[0]; int o0,o1,o2,o3; // offsets int l0,l1,l2,l3; // local @@ -801,57 +835,35 @@ template <> void StaggeredKernels::DhopSiteAsm(StencilImpl &st, StencilEntry *SE2; StencilEntry *SE3; - // Xp, Yp, Zp, Tp + for(int s=0;s -void StaggeredKernels::DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, +void StaggeredKernels::DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU, + SiteSpinor *buf, int LLs, + int sU, const FermionField &in, FermionField &out, int dag) { + + SiteSpinor naik; + SiteSpinor naive; + int oneLink =0; + int threeLink=1; + int skew(0); + Real scale(1.0); + + if(dag) scale = -1.0; + + for(int s=0;s +void StaggeredKernels::DhopSiteDepthHandLocal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteSpinor *buf, int sF, int sU, const FermionField &in, SiteSpinor &out,int threeLink) { { diff --git a/tests/core/Test_staggered5D.cc b/tests/core/Test_staggered5D.cc index be31c438..a7b00399 100644 --- a/tests/core/Test_staggered5D.cc +++ b/tests/core/Test_staggered5D.cc @@ -153,7 +153,7 @@ int main (int argc, char ** argv) std::cout< seeds({1,2,3,4}); + /* GridParallelRNG pRNG4(UGrid); GridParallelRNG pRNG5(FGrid); pRNG4.SeedFixedIntegers(seeds); pRNG5.SeedFixedIntegers(seeds); - + */ typedef typename ImprovedStaggeredFermion5DR::FermionField FermionField; typedef typename ImprovedStaggeredFermion5DR::ComplexField ComplexField; typename ImprovedStaggeredFermion5DR::ImplParams params; - FermionField src (FGrid); + FermionField src (FGrid); src=zero; - random(pRNG5,src); + // random(pRNG5,src); /* std::vector site({0,0,0,0,0}); ColourVector cv = zero; @@ -80,10 +81,10 @@ int main (int argc, char ** argv) FermionField result(FGrid); result=zero; FermionField tmp(FGrid); tmp=zero; FermionField err(FGrid); tmp=zero; - FermionField phi (FGrid); random(pRNG5,phi); - FermionField chi (FGrid); random(pRNG5,chi); + FermionField phi (FGrid); phi=1.0;//random(pRNG5,phi); + FermionField chi (FGrid); chi=1.0;//random(pRNG5,chi); - LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(pRNG4,Umu); + LatticeGaugeField Umu(UGrid); Umu=1.0; //SU3::HotConfiguration(pRNG4,Umu); /* for(int mu=1;mu<4;mu++){ @@ -109,15 +110,18 @@ int main (int argc, char ** argv) std::cout<