diff --git a/benchmarks/Benchmark_staggered.cc b/benchmarks/Benchmark_staggered.cc index 9860e59d..121dc0d5 100644 --- a/benchmarks/Benchmark_staggered.cc +++ b/benchmarks/Benchmark_staggered.cc @@ -115,7 +115,7 @@ int main (int argc, char ** argv) ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0,params); std::cout<::DhopInternal(StencilImpl &st, LebesgueOrder if (dag == DaggerYes) { PARALLEL_FOR_LOOP for (int sss = 0; sss < in._grid->oSites(); sss++) { - Kernels::DhopSiteDag(st, lo, U, UUU, st.CommBuf(), sss, sss, in, out); + Kernels::DhopSiteDag(st, lo, U, UUU, st.CommBuf(), 1, sss, in, out); } } else { PARALLEL_FOR_LOOP for (int sss = 0; sss < in._grid->oSites(); sss++) { - Kernels::DhopSite(st, lo, U, UUU, st.CommBuf(), sss, sss, in, out); + Kernels::DhopSite(st, lo, U, UUU, st.CommBuf(), 1, sss, in, out); } } }; diff --git a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc index 7068fc3f..fdbbc441 100644 --- a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc +++ b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc @@ -228,9 +228,7 @@ void ImprovedStaggeredFermion5D::DhopInternal(StencilImpl & st, LebesgueOr const FermionField &in, FermionField &out,int dag) { Compressor compressor; - int LLs = in._grid->_rdimensions[0]; - st.HaloExchange(in,compressor); // Dhop takes the 4d grid from U, and makes a 5d index for fermion @@ -241,28 +239,11 @@ void ImprovedStaggeredFermion5D::DhopInternal(StencilImpl & st, LebesgueOr Kernels::DhopSiteDag(st, lo, U, UUU, st.CommBuf(), LLs, sU,in, out); } } else { -#if 1 PARALLEL_FOR_LOOP for (int ss = 0; ss < U._grid->oSites(); ss++) { int sU=ss; Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out); } -#else -#pragma omp parallel - { - for(int i=0;i<10;i++){ - int len = U._grid->oSites(); - int me,mywork,myoff; - GridThread::GetWorkBarrier(len,me, mywork,myoff); - for (int ss = myoff; ss < myoff+mywork; ss++) { - int sU=ss; - int sF=LLs*sU; - Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out); - } - GridThread::ThreadBarrier(); - } - } -#endif } } diff --git a/lib/qcd/action/fermion/StaggeredKernels.cc b/lib/qcd/action/fermion/StaggeredKernels.cc index 597b14ea..a62daa13 100644 --- a/lib/qcd/action/fermion/StaggeredKernels.cc +++ b/lib/qcd/action/fermion/StaggeredKernels.cc @@ -186,32 +186,31 @@ template void StaggeredKernels::DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU, SiteSpinor *buf, int LLs, int sU, const FermionField &in, FermionField &out) { - int dag(1); SiteSpinor naik; SiteSpinor naive; int oneLink =0; int threeLink=1; - Real scale; - if(dag) scale = -1.0; - else scale = 1.0; - + int dag=1; switch(Opt) { #ifdef AVX512 + //FIXME; move the sign into the Asm routine case OptInlineAsm: DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out); + for(int s=0;s::DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, Dou template void StaggeredKernels::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU, SiteSpinor *buf, int LLs, - int sU, const FermionField &in, FermionField &out) { - - int dag(0); - - int oneLink =0; - int threeLink=1; - SiteSpinor naik; - SiteSpinor naive; - static int once; - int sF=LLs*sU; - + int sU, const FermionField &in, FermionField &out) +{ + int oneLink =0; + int threeLink=1; + SiteSpinor naik; + SiteSpinor naive; + int dag=0; switch(Opt) { #ifdef AVX512 case OptInlineAsm: @@ -241,22 +236,23 @@ void StaggeredKernels::DhopSite(StencilImpl &st, LebesgueOrder &lo, Double break; #endif case OptHandUnroll: - DhopSiteDepthHand(st,lo,U,UUU,buf,LLs,sU,in,out,dag); - break; + DhopSiteHand(st,lo,U,UUU,buf,LLs,sU,in,out,dag); + break; case OptGeneric: - - for(int s=0;s=0); assert(sU>=0); + DhopSiteDepth(st,lo,U,buf,sF,sU,in,naive,oneLink); + DhopSiteDepth(st,lo,UUU,buf,sF,sU,in,naik,threeLink); + out._odata[sF] =naive+naik; + } break; default: assert(0); break; } - }; template diff --git a/lib/qcd/action/fermion/StaggeredKernels.h b/lib/qcd/action/fermion/StaggeredKernels.h index dc91a30c..a45214d3 100644 --- a/lib/qcd/action/fermion/StaggeredKernels.h +++ b/lib/qcd/action/fermion/StaggeredKernels.h @@ -57,11 +57,11 @@ public: int sF, int sU, const FermionField &in, SiteSpinor &out,int threeLink); - void DhopSiteDepthHandLocal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteSpinor * buf, + void DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteSpinor * buf, int sF, int sU, const FermionField &in, SiteSpinor&out,int threeLink); - void DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,SiteSpinor * buf, - int Lls, int sU, const FermionField &in, FermionField &out, int dag); + void DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,SiteSpinor * buf, + int LLs, int sU, const FermionField &in, FermionField &out, int dag); void DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU, SiteSpinor * buf, int LLs, int sU, const FermionField &in, FermionField &out); diff --git a/lib/qcd/action/fermion/StaggeredKernelsAsm.cc b/lib/qcd/action/fermion/StaggeredKernelsAsm.cc index 7f3624a1..0c62b2a0 100644 --- a/lib/qcd/action/fermion/StaggeredKernelsAsm.cc +++ b/lib/qcd/action/fermion/StaggeredKernelsAsm.cc @@ -517,7 +517,7 @@ Author: paboyle : : "r" (a0) : "%r8" ); \ #define PF_GAUGE_XYZT(a0) -#define PF_GAUGE_XYZTa(a0) \ +#define PF_GAUGE_XYZTa(a0) \ asm ( \ "movq %0, %%r8 \n\t" \ VPREFETCH1(0,%%r8) \ @@ -578,10 +578,10 @@ namespace QCD { template void StaggeredKernels::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, - DoubledGaugeField &U, - DoubledGaugeField &UUU, - SiteSpinor *buf, int LLs, - int sU, const FermionField &in, FermionField &out) + DoubledGaugeField &U, + DoubledGaugeField &UUU, + SiteSpinor *buf, int LLs, + int sU, const FermionField &in, FermionField &out) { assert(0); @@ -611,35 +611,35 @@ void StaggeredKernels::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, o0 = SE0->_offset; \ l0 = SE0->_is_local; \ p0 = SE0->_permute; \ - CONDITIONAL_MOVE(l0,o0,addr0); \ + CONDITIONAL_MOVE(l0,o0,addr0); \ PF_CHI(addr0); \ - \ - SE1=st.GetEntry(ptype,Y+skew,sF); \ - o1 = SE1->_offset; \ - l1 = SE1->_is_local; \ - p1 = SE1->_permute; \ - CONDITIONAL_MOVE(l1,o1,addr1); \ + \ + SE1=st.GetEntry(ptype,Y+skew,sF); \ + o1 = SE1->_offset; \ + l1 = SE1->_is_local; \ + p1 = SE1->_permute; \ + CONDITIONAL_MOVE(l1,o1,addr1); \ PF_CHI(addr1); \ - \ - SE2=st.GetEntry(ptype,Z+skew,sF); \ - o2 = SE2->_offset; \ - l2 = SE2->_is_local; \ - p2 = SE2->_permute; \ - CONDITIONAL_MOVE(l2,o2,addr2); \ + \ + SE2=st.GetEntry(ptype,Z+skew,sF); \ + o2 = SE2->_offset; \ + l2 = SE2->_is_local; \ + p2 = SE2->_permute; \ + CONDITIONAL_MOVE(l2,o2,addr2); \ PF_CHI(addr2); \ - \ - SE3=st.GetEntry(ptype,T+skew,sF); \ - o3 = SE3->_offset; \ - l3 = SE3->_is_local; \ - p3 = SE3->_permute; \ - CONDITIONAL_MOVE(l3,o3,addr3); \ + \ + SE3=st.GetEntry(ptype,T+skew,sF); \ + o3 = SE3->_offset; \ + l3 = SE3->_is_local; \ + p3 = SE3->_permute; \ + CONDITIONAL_MOVE(l3,o3,addr3); \ PF_CHI(addr3); \ \ - gauge0 =(uint64_t)&UU._odata[sU]( X ); \ - gauge1 =(uint64_t)&UU._odata[sU]( Y ); \ - gauge2 =(uint64_t)&UU._odata[sU]( Z ); \ + gauge0 =(uint64_t)&UU._odata[sU]( X ); \ + gauge1 =(uint64_t)&UU._odata[sU]( Y ); \ + gauge2 =(uint64_t)&UU._odata[sU]( Z ); \ gauge3 =(uint64_t)&UU._odata[sU]( T ); - + // This is the single precision 5th direction vectorised kernel #include template <> void StaggeredKernels::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, @@ -762,6 +762,14 @@ template <> void StaggeredKernels::DhopSiteAsm(StencilImpl VPERM0(Chi_11,Chi_11) \ VPERM0(Chi_12,Chi_12) ); +#define PERMUTE01 \ + if ( p0 ) { PERMUTE_DIR3; }\ + if ( p1 ) { PERMUTE_DIR2; } + +#define PERMUTE23 \ + if ( p2 ) { PERMUTE_DIR1; }\ + if ( p3 ) { PERMUTE_DIR0; } + // This is the single precision 5th direction vectorised kernel #include @@ -785,35 +793,50 @@ template <> void StaggeredKernels::DhopSiteAsm(StencilImpl &st, StencilEntry *SE2; StencilEntry *SE3; - for(int s=0;s template <> void StaggeredKernels::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, @@ -835,31 +858,47 @@ template <> void StaggeredKernels::DhopSiteAsm(StencilImpl &st, StencilEntry *SE2; StencilEntry *SE3; - for(int s=0;s -void StaggeredKernels::DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU, - SiteSpinor *buf, int LLs, - int sU, const FermionField &in, FermionField &out, int dag) { - - SiteSpinor naik; - SiteSpinor naive; - int oneLink =0; - int threeLink=1; - int skew(0); - Real scale(1.0); - - if(dag) scale = -1.0; - - for(int s=0;s::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU, + SiteSpinor *buf, int LLs, + int sU, const FermionField &in, FermionField &out, int dag) +{ + SiteSpinor naik; + SiteSpinor naive; + int oneLink =0; + int threeLink=1; + int skew(0); + Real scale(1.0); + + if(dag) scale = -1.0; + + for(int s=0;s -void StaggeredKernels::DhopSiteDepthHandLocal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - SiteSpinor *buf, int sF, - int sU, const FermionField &in, SiteSpinor &out,int threeLink) { +void StaggeredKernels::DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, + SiteSpinor *buf, int sF, + int sU, const FermionField &in, SiteSpinor &out,int threeLink) { typedef typename Simd::scalar_type S; typedef typename Simd::vector_type V; @@ -300,7 +297,6 @@ void StaggeredKernels::DhopSiteDepthHandLocal(StencilImpl &st, LebesgueOrd vstream(out()()(1),even_1+odd_1); vstream(out()()(2),even_2+odd_2); - } } FermOpStaggeredTemplateInstantiate(StaggeredKernels); diff --git a/tests/core/Test_staggered5Dvec.cc b/tests/core/Test_staggered5Dvec.cc index 13374578..ef215ed4 100644 --- a/tests/core/Test_staggered5Dvec.cc +++ b/tests/core/Test_staggered5Dvec.cc @@ -57,34 +57,33 @@ int main (int argc, char ** argv) std::cout< seeds({1,2,3,4}); - /* + GridParallelRNG pRNG4(UGrid); GridParallelRNG pRNG5(FGrid); pRNG4.SeedFixedIntegers(seeds); pRNG5.SeedFixedIntegers(seeds); - */ + typedef typename ImprovedStaggeredFermion5DR::FermionField FermionField; typedef typename ImprovedStaggeredFermion5DR::ComplexField ComplexField; typename ImprovedStaggeredFermion5DR::ImplParams params; - FermionField src (FGrid); src=zero; - - // random(pRNG5,src); + FermionField src (FGrid); + random(pRNG5,src); /* - std::vector site({0,0,0,0,0}); + std::vector site({0,1,2,0,0}); ColourVector cv = zero; cv()()(0)=1.0; src = zero; pokeSite(cv,src,site); */ - FermionField result(FGrid); result=zero; FermionField tmp(FGrid); tmp=zero; FermionField err(FGrid); tmp=zero; - FermionField phi (FGrid); phi=1.0;//random(pRNG5,phi); - FermionField chi (FGrid); chi=1.0;//random(pRNG5,chi); + FermionField phi (FGrid); random(pRNG5,phi); + FermionField chi (FGrid); random(pRNG5,chi); - LatticeGaugeField Umu(UGrid); Umu=1.0; //SU3::HotConfiguration(pRNG4,Umu); + LatticeGaugeField Umu(UGrid); + SU3::HotConfiguration(pRNG4,Umu); /* for(int mu=1;mu<4;mu++){ @@ -103,7 +102,7 @@ int main (int argc, char ** argv) RealD c2=-1.0/24.0; RealD u0=1.0; - ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,c1,c2,u0,params); + ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,c1,c2,u0,params); ImprovedStaggeredFermionVec5dR sDs(Umu,Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,mass,c1,c2,u0,params); std::cout<