From 55f65b81b59f857287e6f7cab833644a28ad1baa Mon Sep 17 00:00:00 2001 From: paboyle Date: Thu, 9 Jun 2016 01:12:36 -0700 Subject: [PATCH] Improvements to the assembler interface that let us move chunks of the site and s loop into the kernels. This will save on function call overhead and guarantee L2 prefetching strategy is right since OMP can't distribute the sub-chunks of work. --- benchmarks/Benchmark_dwf.cc | 6 ++- lib/qcd/action/fermion/WilsonFermion.cc | 4 +- lib/qcd/action/fermion/WilsonFermion5D.cc | 54 ++----------------- lib/qcd/action/fermion/WilsonFermion5D.h | 7 +-- lib/qcd/action/fermion/WilsonKernels.cc | 33 +++++++++--- lib/qcd/action/fermion/WilsonKernels.h | 8 +-- lib/qcd/action/fermion/WilsonKernelsAsm.cc | 20 +++---- lib/qcd/action/fermion/WilsonKernelsAsmBody.h | 12 ++++- lib/qcd/action/fermion/WilsonKernelsHand.cc | 8 +-- lib/simd/Intel512wilson.h | 12 +++++ 10 files changed, 77 insertions(+), 87 deletions(-) diff --git a/benchmarks/Benchmark_dwf.cc b/benchmarks/Benchmark_dwf.cc index a40939fd..0bf57182 100644 --- a/benchmarks/Benchmark_dwf.cc +++ b/benchmarks/Benchmark_dwf.cc @@ -132,19 +132,21 @@ int main (int argc, char ** argv) RealD NP = UGrid->_Nprocessors; - for(int doasm=0;doasm<1;doasm++){ + for(int doasm=1;doasm<2;doasm++){ QCD::WilsonKernelsStatic::AsmOpt=doasm; DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,params); std::cout<oSites();sss++){ - Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out); + Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,1,1,in,out); } } else { PARALLEL_FOR_LOOP for(int sss=0;sssoSites();sss++){ - Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,in,out); + Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,1,1,in,out); } } }; diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc index a9928e00..66047b72 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.cc +++ b/lib/qcd/action/fermion/WilsonFermion5D.cc @@ -95,11 +95,6 @@ WilsonFermion5D::WilsonFermion5D(GaugeField &_Umu, // Allocate the required comms buffer ImportGauge(_Umu); - alltime=0; - commtime=0; - jointime=0; - dslashtime=0; - dslash1time=0; } template @@ -291,33 +286,6 @@ void WilsonFermion5D::DhopDerivEO(GaugeField &mat, } -template -void WilsonFermion5D::Report(void) -{ - return; -#if 0 - std::cout< void WilsonFermion5D::DhopDerivOE(GaugeField &mat, const FermionField &A, @@ -341,42 +309,28 @@ void WilsonFermion5D::DhopInternal(StencilImpl & st, LebesgueOrder &lo, const FermionField &in, FermionField &out,int dag) { // assert((dag==DaggerNo) ||(dag==DaggerYes)); - alltime-=usecond(); Compressor compressor(dag); int LLs = in._grid->_rdimensions[0]; - commtime -=usecond(); st.HaloExchange(in,compressor); - commtime +=usecond(); - - jointime -=usecond(); - jointime +=usecond(); // Dhop takes the 4d grid from U, and makes a 5d index for fermion - dslashtime -=usecond(); if ( dag == DaggerYes ) { PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ - for(int s=0;soSites();ss++){ - int sU=lo.Reorder(ss); + int sU=ss; int sF=LLs*sU; - for(int s=0;s Kernels; - double alltime; - double jointime; - double commtime; - double dslashtime; - double dslash1time; + /////////////////////////////////////////////////////////////// // Implement the abstract base /////////////////////////////////////////////////////////////// @@ -139,7 +135,6 @@ namespace Grid { // DoubleStore void ImportGauge(const GaugeField &_Umu); - void Report(void); /////////////////////////////////////////////////////////////// // Data members require to support the functionality /////////////////////////////////////////////////////////////// diff --git a/lib/qcd/action/fermion/WilsonKernels.cc b/lib/qcd/action/fermion/WilsonKernels.cc index a2184ca2..63ba553d 100644 --- a/lib/qcd/action/fermion/WilsonKernels.cc +++ b/lib/qcd/action/fermion/WilsonKernels.cc @@ -40,23 +40,42 @@ WilsonKernels::WilsonKernels(const ImplParams &p): Base(p) {}; template void WilsonKernels::DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U, std::vector > &buf, - int sF,int sU,const FermionField &in, FermionField &out) + int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out) { - if ( AsmOpt ) WilsonKernels::DiracOptAsmDhopSite(st,U,buf,sF,sU,in,out); - else if (HandOpt) WilsonKernels::DiracOptHandDhopSite(st,U,buf,sF,sU,in,out); - else WilsonKernels::DiracOptGenericDhopSite(st,U,buf,sF,sU,in,out); + if ( AsmOpt ) { + + WilsonKernels::DiracOptAsmDhopSite(st,U,buf,sF,sU,Ls,Ns,in,out); + + } else { + + for(int site=0;site::DiracOptHandDhopSite(st,U,buf,sF,sU,in,out); + else WilsonKernels::DiracOptGenericDhopSite(st,U,buf,sF,sU,in,out); + sF++; + } + sU++; + } + + } } template void WilsonKernels::DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U, std::vector > &buf, - int sF,int sU,const FermionField &in, FermionField &out) + int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out) { // No asm implementation yet. // if ( AsmOpt ) WilsonKernels::DiracOptAsmDhopSiteDag(st,U,buf,sF,sU,in,out); // else - if (HandOpt) WilsonKernels::DiracOptHandDhopSiteDag(st,U,buf,sF,sU,in,out); - else WilsonKernels::DiracOptGenericDhopSiteDag(st,U,buf,sF,sU,in,out); + for(int site=0;site::DiracOptHandDhopSiteDag(st,U,buf,sF,sU,in,out); + else WilsonKernels::DiracOptGenericDhopSiteDag(st,U,buf,sF,sU,in,out); + sF++; + } + sU++; + } } diff --git a/lib/qcd/action/fermion/WilsonKernels.h b/lib/qcd/action/fermion/WilsonKernels.h index 3cdcc08e..da6751dd 100644 --- a/lib/qcd/action/fermion/WilsonKernels.h +++ b/lib/qcd/action/fermion/WilsonKernels.h @@ -55,11 +55,11 @@ namespace Grid { void DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U, std::vector > &buf, - int sF,int sU,const FermionField &in, FermionField &out); + int sF, int sU,int Ls, int Ns, const FermionField &in, FermionField &out); void DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U, std::vector > &buf, - int sF,int sU,const FermionField &in,FermionField &out); + int sF,int sU,int Ls, int Ns, const FermionField &in,FermionField &out); void DiracOptDhopDir(StencilImpl &st,DoubledGaugeField &U, std::vector > &buf, @@ -69,7 +69,7 @@ namespace Grid { // Specialised variants void DiracOptGenericDhopSite(StencilImpl &st,DoubledGaugeField &U, std::vector > &buf, - int sF,int sU,const FermionField &in, FermionField &out); + int sF,int sU, const FermionField &in, FermionField &out); void DiracOptGenericDhopSiteDag(StencilImpl &st,DoubledGaugeField &U, std::vector > &buf, @@ -77,7 +77,7 @@ namespace Grid { void DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, std::vector > &buf, - int sF,int sU,const FermionField &in, FermionField &out); + int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out); void DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U, diff --git a/lib/qcd/action/fermion/WilsonKernelsAsm.cc b/lib/qcd/action/fermion/WilsonKernelsAsm.cc index 7d6d838a..bccf72c7 100644 --- a/lib/qcd/action/fermion/WilsonKernelsAsm.cc +++ b/lib/qcd/action/fermion/WilsonKernelsAsm.cc @@ -41,7 +41,7 @@ namespace QCD { template void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, std::vector > &buf, - int ss,int sU,const FermionField &in, FermionField &out) + int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out) { assert(0); } @@ -72,8 +72,8 @@ static int signInit = setupSigns(); template<> void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, - std::vector > &buf, - int ss,int sU,const FermionField &in, FermionField &out) + std::vector > &buf, + int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out) #include #undef VMOVIDUP @@ -87,29 +87,29 @@ void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaug template<> void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, std::vector > &buf, - int ss,int sU,const FermionField &in, FermionField &out) + int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out) #include #endif template void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, std::vector > &buf, - int ss,int sU,const FermionField &in, FermionField &out); + int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out); template void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, std::vector > &buf, - int ss,int sU,const FermionField &in, FermionField &out); + int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out); template void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, std::vector > &buf, - int ss,int sU,const FermionField &in, FermionField &out); + int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out); template void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, std::vector > &buf, - int ss,int sU,const FermionField &in, FermionField &out); + int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out); template void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, std::vector > &buf, - int ss,int sU,const FermionField &in, FermionField &out); + int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out); template void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, std::vector > &buf, - int ss,int sU,const FermionField &in, FermionField &out); + int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out); }} diff --git a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h index 5bd91952..04ed5879 100644 --- a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h +++ b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h @@ -2,19 +2,24 @@ int locala,perma, ptypea; int localb,permb, ptypeb; uint64_t basea, baseb; - + uint64_t basex; const uint64_t plocal =(uint64_t) & in._odata[0]; // vComplexF isigns[2] = { signs[0], signs[1] }; vComplexF *isigns = &signs[0]; MASK_REGS; + + for(int site=0;site::DiracOptHandDhopSite(StencilImpl &st,Dou std::vector > &buf, int sF,int sU,const FermionField &in, FermionField &out) { - DiracOptDhopSite(st,U,buf,sF,sU,in,out); // returns void, will template override for Wilson Nc=3 + assert(0); } template<> @@ -815,7 +815,7 @@ void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st, std::vector > &buf, int sF,int sU,const FermionField &in, FermionField &out) { - DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3 + assert(0); } template<> @@ -823,7 +823,7 @@ void WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,Dou std::vector > &buf, int sF,int sU,const FermionField &in, FermionField &out) { - DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3 + assert(0); } template<> @@ -831,7 +831,7 @@ void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st, std::vector > &buf, int sF,int sU,const FermionField &in, FermionField &out) { - DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3 + assert(0); } diff --git a/lib/simd/Intel512wilson.h b/lib/simd/Intel512wilson.h index 63470458..a2000839 100644 --- a/lib/simd/Intel512wilson.h +++ b/lib/simd/Intel512wilson.h @@ -631,6 +631,18 @@ Author: paboyle VMADDSUB(Z1,T2,UChi_11) \ VMADDSUB(Z2,T1,UChi_02) VMOVIDUP(5,%r8,Z1 ) \ VMADDSUB(Z2,T2,UChi_12) VMOVIDUP(8,%r8,Z2 ) \ + VPREFETCH2(12,%r9) \ + VPREFETCH2(13,%r9) \ + VPREFETCH2(14,%r9) \ + VPREFETCH2(15,%r9) \ + VPREFETCH2(16,%r9) \ + VPREFETCH2(17,%r9) \ + VPREFETCH2(18,%r9) \ + VPREFETCH2(19,%r9) \ + VPREFETCH2(20,%r9) \ + VPREFETCH2(21,%r9) \ + VPREFETCH2(22,%r9) \ + VPREFETCH2(23,%r9) \ /*38*/ \ VMADDSUB(Z3,Chi_01,UChi_00) VSHUF(Chi_02,T1) \ VMADDSUB(Z3,Chi_11,UChi_10) \