From 2c246551d0b4265ac2da1e7e03f676e3d11d5d61 Mon Sep 17 00:00:00 2001 From: paboyle Date: Tue, 7 Feb 2017 01:37:10 -0500 Subject: [PATCH] Overlap comms and compute options in wilson kernels --- lib/qcd/action/fermion/WilsonCompressor.h | 21 +- lib/qcd/action/fermion/WilsonFermion.cc | 11 +- lib/qcd/action/fermion/WilsonFermion5D.cc | 139 ++++- lib/qcd/action/fermion/WilsonFermion5D.h | 17 + lib/qcd/action/fermion/WilsonKernels.cc | 15 +- lib/qcd/action/fermion/WilsonKernels.h | 77 ++- lib/qcd/action/fermion/WilsonKernelsAsm.cc | 50 +- .../action/fermion/WilsonKernelsAsmAvx512.h | 215 +++++++- lib/qcd/action/fermion/WilsonKernelsAsmBody.h | 498 +++++++++--------- lib/qcd/action/fermion/WilsonKernelsAsmQPX.h | 20 +- lib/qcd/action/fermion/WilsonKernelsHand.cc | 32 +- 11 files changed, 729 insertions(+), 366 deletions(-) diff --git a/lib/qcd/action/fermion/WilsonCompressor.h b/lib/qcd/action/fermion/WilsonCompressor.h index 5b29c103..0257b880 100644 --- a/lib/qcd/action/fermion/WilsonCompressor.h +++ b/lib/qcd/action/fermion/WilsonCompressor.h @@ -180,26 +180,31 @@ namespace QCD { const std::vector &distances) : CartesianStencil (grid,npoints,checkerboard,directions,distances) { }; - template < class compressor> void HaloExchangeOpt(const Lattice &source,compressor &compress) { std::vector > reqs; + HaloExchangeOptGather(source,compress); + this->CommunicateBegin(reqs); + this->calls++; + this->CommunicateComplete(reqs); + this->CommsMerge(); + } + + template < class compressor> + void HaloExchangeOptGather(const Lattice &source,compressor &compress) + { + this->calls++; this->Mergers.resize(0); this->Packets.resize(0); this->HaloGatherOpt(source,compress); - this->CommunicateBegin(reqs); - this->CommunicateComplete(reqs); - this->CommsMerge(); // spins - this->calls++; } template < class compressor> void HaloGatherOpt(const Lattice &source,compressor &compress) { - int face_idx=0; - + this->_grid->StencilBarrier(); // conformable(source._grid,_grid); assert(source._grid==this->_grid); this->halogtime-=usecond(); @@ -222,7 +227,9 @@ namespace QCD { // compress.Point(point); // HaloGatherDir(source,compress,point,face_idx); // } + int face_idx=0; if ( dag ) { + std::cout << " Optimised Dagger compress " <HaloGatherDir(source,XpCompress,Xp,face_idx); this->HaloGatherDir(source,YpCompress,Yp,face_idx); this->HaloGatherDir(source,ZpCompress,Zp,face_idx); diff --git a/lib/qcd/action/fermion/WilsonFermion.cc b/lib/qcd/action/fermion/WilsonFermion.cc index 04c3671f..f5b76c1a 100644 --- a/lib/qcd/action/fermion/WilsonFermion.cc +++ b/lib/qcd/action/fermion/WilsonFermion.cc @@ -224,7 +224,7 @@ void WilsonFermion::DerivInternal(StencilImpl &st, DoubledGaugeField &U, //////////////////////// PARALLEL_FOR_LOOP for (int sss = 0; sss < B._grid->oSites(); sss++) { - Kernels::DiracOptDhopDir(st, U, st.CommBuf(), sss, sss, B, Btilde, mu, + Kernels::DhopDir(st, U, st.CommBuf(), sss, sss, B, Btilde, mu, gamma); } @@ -335,8 +335,7 @@ void WilsonFermion::DhopDirDisp(const FermionField &in, FermionField &out, PARALLEL_FOR_LOOP for (int sss = 0; sss < in._grid->oSites(); sss++) { - Kernels::DiracOptDhopDir(Stencil, Umu, Stencil.CommBuf(), sss, sss, in, out, - dirdisp, gamma); + Kernels::DhopDir(Stencil, Umu, Stencil.CommBuf(), sss, sss, in, out, dirdisp, gamma); } }; @@ -353,14 +352,12 @@ void WilsonFermion::DhopInternal(StencilImpl &st, LebesgueOrder &lo, if (dag == DaggerYes) { PARALLEL_FOR_LOOP for (int sss = 0; sss < in._grid->oSites(); sss++) { - Kernels::DiracOptDhopSiteDag(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, - out); + Kernels::DhopSiteDag(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out); } } else { PARALLEL_FOR_LOOP for (int sss = 0; sss < in._grid->oSites(); sss++) { - Kernels::DiracOptDhopSite(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, - out); + Kernels::DhopSite(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out); } } }; diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc index 7fdceb2f..ad65b345 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.cc +++ b/lib/qcd/action/fermion/WilsonFermion5D.cc @@ -182,34 +182,34 @@ void WilsonFermion5D::Report(void) std::vector latt = GridDefaultLatt(); RealD volume = Ls; for(int mu=0;mu_Nprocessors; + RealD NN = _FourDimGrid->NodeCount(); if ( DhopCalls > 0 ) { std::cout << GridLogMessage << "#### Dhop calls report " << std::endl; - std::cout << GridLogMessage << "WilsonFermion5D Number of Dhop Calls : " << DhopCalls << std::endl; - std::cout << GridLogMessage << "WilsonFermion5D Total Communication time : " << DhopCommTime<< " us" << std::endl; - std::cout << GridLogMessage << "WilsonFermion5D CommTime/Calls : " << DhopCommTime / DhopCalls << " us" << std::endl; - std::cout << GridLogMessage << "WilsonFermion5D Total Compute time : " << DhopComputeTime << " us" << std::endl; - std::cout << GridLogMessage << "WilsonFermion5D ComputeTime/Calls : " << DhopComputeTime / DhopCalls << " us" << std::endl; + std::cout << GridLogMessage << "WilsonFermion5D Number of DhopEO Calls : " << DhopCalls << std::endl; + std::cout << GridLogMessage << "WilsonFermion5D TotalTime /Calls : " << DhopTotalTime / DhopCalls << " us" << std::endl; + std::cout << GridLogMessage << "WilsonFermion5D CommTime /Calls : " << DhopCommTime / DhopCalls << " us" << std::endl; + std::cout << GridLogMessage << "WilsonFermion5D FaceTime /Calls : " << DhopFaceTime / DhopCalls << " us" << std::endl; + std::cout << GridLogMessage << "WilsonFermion5D ComputeTime1/Calls : " << DhopComputeTime / DhopCalls << " us" << std::endl; + std::cout << GridLogMessage << "WilsonFermion5D ComputeTime2/Calls : " << DhopComputeTime2/ DhopCalls << " us" << std::endl; RealD mflops = 1344*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl; std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NN << std::endl; - RealD Fullmflops = 1344*volume*DhopCalls/(DhopComputeTime+DhopCommTime)/2; // 2 for red black counting + RealD Fullmflops = 1344*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl; std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl; - + std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl; } if ( DerivCalls > 0 ) { std::cout << GridLogMessage << "#### Deriv calls report "<< std::endl; std::cout << GridLogMessage << "WilsonFermion5D Number of Deriv Calls : " <::ZeroCounters(void) { DhopCalls = 0; DhopCommTime = 0; DhopComputeTime = 0; + DhopComputeTime2= 0; + DhopFaceTime = 0; + DhopTotalTime = 0; DerivCalls = 0; DerivCommTime = 0; @@ -277,7 +280,7 @@ PARALLEL_FOR_LOOP for(int s=0;s::DerivInternal(StencilImpl & st, assert(sF < B._grid->oSites()); assert(sU < U._grid->oSites()); - Kernels::DiracOptDhopDir(st, U, st.CommBuf(), sF, sU, B, Btilde, mu, gamma); + Kernels::DhopDir(st, U, st.CommBuf(), sF, sU, B, Btilde, mu, gamma); //////////////////////////// // spin trace outer product @@ -396,6 +399,86 @@ template void WilsonFermion5D::DhopInternal(StencilImpl & st, LebesgueOrder &lo, DoubledGaugeField & U, const FermionField &in, FermionField &out,int dag) +{ + DhopTotalTime-=usecond(); +#ifdef GRID_OMP + if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) + DhopInternalOverlappedComms(st,lo,U,in,out,dag); + else +#endif + DhopInternalSerialComms(st,lo,U,in,out,dag); + DhopTotalTime+=usecond(); +} + +template +void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, LebesgueOrder &lo, + DoubledGaugeField & U, + const FermionField &in, FermionField &out,int dag) +{ +#ifdef GRID_OMP + // assert((dag==DaggerNo) ||(dag==DaggerYes)); + typedef CartesianCommunicator::CommsRequest_t CommsRequest_t; + + Compressor compressor(dag); + + int LLs = in._grid->_rdimensions[0]; + int len = U._grid->oSites(); + + DhopFaceTime-=usecond(); + st.HaloExchangeOptGather(in,compressor); + DhopFaceTime+=usecond(); + std::vector > reqs; + +#pragma omp parallel + { + int nthreads = omp_get_num_threads(); + int me = omp_get_thread_num(); + int myoff, mywork; + + GridThread::GetWork(len,me-1,mywork,myoff,nthreads-1); + int sF = LLs * myoff; + + if ( me == 0 ) { + DhopCommTime-=usecond(); + st.CommunicateBegin(reqs); + st.CommunicateComplete(reqs); + DhopCommTime+=usecond(); + } else { + // Interior links in stencil + if ( me==1 ) DhopComputeTime-=usecond(); + if (dag == DaggerYes) Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,1,0); + else Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,1,0); + if ( me==1 ) DhopComputeTime+=usecond(); + } + } + + DhopFaceTime-=usecond(); + st.CommsMerge(); + DhopFaceTime+=usecond(); + +#pragma omp parallel + { + int nthreads = omp_get_num_threads(); + int me = omp_get_thread_num(); + int myoff, mywork; + + GridThread::GetWork(len,me,mywork,myoff,nthreads); + int sF = LLs * myoff; + + // Exterior links in stencil + if ( me==0 ) DhopComputeTime2-=usecond(); + if (dag == DaggerYes) Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,0,1); + else Kernels::DhopSite (st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,0,1); + if ( me==0 ) DhopComputeTime2+=usecond(); + }// end parallel region +#else + assert(0); +#endif +} +template +void WilsonFermion5D::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo, + DoubledGaugeField & U, + const FermionField &in, FermionField &out,int dag) { // assert((dag==DaggerNo) ||(dag==DaggerYes)); Compressor compressor(dag); @@ -408,12 +491,30 @@ void WilsonFermion5D::DhopInternal(StencilImpl & st, LebesgueOrder &lo, DhopComputeTime-=usecond(); // Dhop takes the 4d grid from U, and makes a 5d index for fermion + if (dag == DaggerYes) { PARALLEL_FOR_LOOP for (int ss = 0; ss < U._grid->oSites(); ss++) { int sU = ss; int sF = LLs * sU; - Kernels::DiracOptDhopSiteDag(st, lo, U, st.CommBuf(), sF, sU, LLs, 1, in, out); + Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out); + } + } else { + PARALLEL_FOR_LOOP + for (int ss = 0; ss < U._grid->oSites(); ss++) { + int sU = ss; + int sF = LLs * sU; + Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out); + } + } + /* + + if (dag == DaggerYes) { + PARALLEL_FOR_LOOP + for (int ss = 0; ss < U._grid->oSites(); ss++) { + int sU = ss; + int sF = LLs * sU; + Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out); } #ifdef AVX512_SWITCHOFF } else if (stat.is_init() ) { @@ -430,31 +531,35 @@ void WilsonFermion5D::DhopInternal(StencilImpl & st, LebesgueOrder &lo, for(int ss=0;ssoSites();ss++) { int sU=ss; int sF=LLs*sU; - Kernels::DiracOptDhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out); + Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out); } stat.exit(mythread); } stat.accum(nthreads); #endif } else { -#if 0 +#if 1 PARALLEL_FOR_LOOP for (int ss = 0; ss < U._grid->oSites(); ss++) { int sU = ss; int sF = LLs * sU; - Kernels::DiracOptDhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out); + Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out); } #else +#ifdef GRID_OMP #pragma omp parallel +#endif { int len = U._grid->oSites(); int me, myoff,mywork; GridThread::GetWorkBarrier(len,me, mywork,myoff); int sF = LLs * myoff; - Kernels::DiracOptDhopSite(st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out); + Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out); } #endif } + */ + DhopComputeTime+=usecond(); } diff --git a/lib/qcd/action/fermion/WilsonFermion5D.h b/lib/qcd/action/fermion/WilsonFermion5D.h index fb4fa925..76a70d4d 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.h +++ b/lib/qcd/action/fermion/WilsonFermion5D.h @@ -82,6 +82,9 @@ namespace QCD { double DhopCalls; double DhopCommTime; double DhopComputeTime; + double DhopComputeTime2; + double DhopFaceTime; + double DhopTotalTime; double DerivCalls; double DerivCommTime; @@ -145,6 +148,20 @@ namespace QCD { const FermionField &in, FermionField &out, int dag); + + void DhopInternalOverlappedComms(StencilImpl & st, + LebesgueOrder &lo, + DoubledGaugeField &U, + const FermionField &in, + FermionField &out, + int dag); + + void DhopInternalSerialComms(StencilImpl & st, + LebesgueOrder &lo, + DoubledGaugeField &U, + const FermionField &in, + FermionField &out, + int dag); // Constructors WilsonFermion5D(GaugeField &_Umu, diff --git a/lib/qcd/action/fermion/WilsonKernels.cc b/lib/qcd/action/fermion/WilsonKernels.cc index 392c7029..3a70bb5b 100644 --- a/lib/qcd/action/fermion/WilsonKernels.cc +++ b/lib/qcd/action/fermion/WilsonKernels.cc @@ -32,8 +32,8 @@ directory namespace Grid { namespace QCD { -int WilsonKernelsStatic::Opt; - + int WilsonKernelsStatic::Opt = WilsonKernelsStatic::OptGeneric; + int WilsonKernelsStatic::Comms = WilsonKernelsStatic::CommsAndCompute; #ifdef QPX #include @@ -87,9 +87,10 @@ WilsonKernels::WilsonKernels(const ImplParams &p) : Base(p){}; //////////////////////////////////////////// template -void WilsonKernels::DiracOptGenericDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, +void WilsonKernels::GenericDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor *buf, int sF, - int sU, const FermionField &in, FermionField &out) { + int sU, const FermionField &in, FermionField &out, + int interior,int exterior) { SiteHalfSpinor tmp; SiteHalfSpinor chi; SiteHalfSpinor *chi_p; @@ -263,9 +264,9 @@ void WilsonKernels::DiracOptGenericDhopSiteDag(StencilImpl &st, LebesgueOr // Need controls to do interior, exterior, or both template -void WilsonKernels::DiracOptGenericDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, +void WilsonKernels::GenericDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor *buf, int sF, - int sU, const FermionField &in, FermionField &out) { + int sU, const FermionField &in, FermionField &out,int interior,int exterior) { SiteHalfSpinor tmp; SiteHalfSpinor chi; SiteHalfSpinor *chi_p; @@ -438,7 +439,7 @@ void WilsonKernels::DiracOptGenericDhopSite(StencilImpl &st, LebesgueOrder }; template -void WilsonKernels::DiracOptDhopDir( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int sF, +void WilsonKernels::DhopDir( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int sF, int sU, const FermionField &in, FermionField &out, int dir, int gamma) { SiteHalfSpinor tmp; diff --git a/lib/qcd/action/fermion/WilsonKernels.h b/lib/qcd/action/fermion/WilsonKernels.h index c859b33d..20ee87f2 100644 --- a/lib/qcd/action/fermion/WilsonKernels.h +++ b/lib/qcd/action/fermion/WilsonKernels.h @@ -43,8 +43,10 @@ void bgq_l1p_optimisation(int mode); class WilsonKernelsStatic { public: enum { OptGeneric, OptHandUnroll, OptInlineAsm }; + enum { CommsAndCompute, CommsThenCompute }; // S-direction is INNERMOST and takes no part in the parity. static int Opt; // these are a temporary hack + static int Comms; // these are a temporary hack }; template class WilsonKernels : public FermionOperator , public WilsonKernelsStatic { @@ -57,20 +59,23 @@ public: template typename std::enable_if::type - DiracOptDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, - int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) + DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1) { bgq_l1p_optimisation(1); switch(Opt) { #if defined(AVX512) || defined (QPX) case OptInlineAsm: - WilsonKernels::DiracOptAsmDhopSite(st,lo,U,buf,sF,sU,Ls,Ns,in,out); - break; + if(interior&&exterior) WilsonKernels::AsmDhopSite(st,lo,U,buf,sF,sU,Ls,Ns,in,out); + else if (interior) WilsonKernels::AsmDhopSiteInt(st,lo,U,buf,sF,sU,Ls,Ns,in,out); + else if (exterior) WilsonKernels::AsmDhopSiteExt(st,lo,U,buf,sF,sU,Ls,Ns,in,out); + else assert(0); + break; #endif case OptHandUnroll: for (int site = 0; site < Ns; site++) { for (int s = 0; s < Ls; s++) { - WilsonKernels::DiracOptHandDhopSite(st,lo,U,buf,sF,sU,in,out); + if( exterior) WilsonKernels::HandDhopSite(st,lo,U,buf,sF,sU,in,out,interior,exterior); sF++; } sU++; @@ -79,7 +84,7 @@ public: case OptGeneric: for (int site = 0; site < Ns; site++) { for (int s = 0; s < Ls; s++) { - WilsonKernels::DiracOptGenericDhopSite(st,lo,U,buf,sF,sU,in,out); + if( exterior) WilsonKernels::GenericDhopSite(st,lo,U,buf,sF,sU,in,out,interior,exterior); sF++; } sU++; @@ -93,12 +98,12 @@ public: template typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool, void>::type - DiracOptDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, - int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) { + DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1 ) { // no kernel choice for (int site = 0; site < Ns; site++) { for (int s = 0; s < Ls; s++) { - WilsonKernels::DiracOptGenericDhopSite(st, lo, U, buf, sF, sU, in, out); + if( exterior) WilsonKernels::GenericDhopSite(st, lo, U, buf, sF, sU, in, out,interior,exterior); sF++; } sU++; @@ -107,20 +112,23 @@ public: template typename std::enable_if::type - DiracOptDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, - int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) { + DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1) { bgq_l1p_optimisation(1); switch(Opt) { #if defined(AVX512) || defined (QPX) case OptInlineAsm: - WilsonKernels::DiracOptAsmDhopSiteDag(st,lo,U,buf,sF,sU,Ls,Ns,in,out); + if(interior&&exterior) WilsonKernels::AsmDhopSiteDag(st,lo,U,buf,sF,sU,Ls,Ns,in,out); + else if (interior) WilsonKernels::AsmDhopSiteDagInt(st,lo,U,buf,sF,sU,Ls,Ns,in,out); + else if (exterior) WilsonKernels::AsmDhopSiteDagExt(st,lo,U,buf,sF,sU,Ls,Ns,in,out); + else assert(0); break; #endif case OptHandUnroll: for (int site = 0; site < Ns; site++) { for (int s = 0; s < Ls; s++) { - WilsonKernels::DiracOptHandDhopSiteDag(st,lo,U,buf,sF,sU,in,out); + if( exterior) WilsonKernels::HandDhopSiteDag(st,lo,U,buf,sF,sU,in,out,interior,exterior); sF++; } sU++; @@ -129,7 +137,7 @@ public: case OptGeneric: for (int site = 0; site < Ns; site++) { for (int s = 0; s < Ls; s++) { - WilsonKernels::DiracOptGenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out); + if( exterior) WilsonKernels::GenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out,interior,exterior); sF++; } sU++; @@ -143,40 +151,53 @@ public: template typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool,void>::type - DiracOptDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,SiteHalfSpinor * buf, - int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) { + DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,SiteHalfSpinor * buf, + int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1) { for (int site = 0; site < Ns; site++) { for (int s = 0; s < Ls; s++) { - WilsonKernels::DiracOptGenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out); + if( exterior) WilsonKernels::GenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out,interior,exterior); sF++; } sU++; } } - void DiracOptDhopDir(StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor * buf, + void DhopDir(StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor * buf, int sF, int sU, const FermionField &in, FermionField &out, int dirdisp, int gamma); private: // Specialised variants - void DiracOptGenericDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, - int sF, int sU, const FermionField &in, FermionField &out); + void GenericDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, const FermionField &in, FermionField &out,int interior,int exterior); - void DiracOptGenericDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, - int sF, int sU, const FermionField &in, FermionField &out); + void GenericDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, const FermionField &in, FermionField &out,int interior,int exterior); - void DiracOptAsmDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + void AsmDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out); - void DiracOptAsmDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + void AsmDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out); - void DiracOptHandDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, - int sF, int sU, const FermionField &in, FermionField &out); + void AsmDhopSiteInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out); - void DiracOptHandDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, - int sF, int sU, const FermionField &in, FermionField &out); + void AsmDhopSiteDagInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out); + + void AsmDhopSiteExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out); + + void AsmDhopSiteDagExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out); + + + void HandDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, const FermionField &in, FermionField &out,int interior,int exterior); + + void HandDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, const FermionField &in, FermionField &out,int interior,int exterior); public: diff --git a/lib/qcd/action/fermion/WilsonKernelsAsm.cc b/lib/qcd/action/fermion/WilsonKernelsAsm.cc index ab805f4f..f627a939 100644 --- a/lib/qcd/action/fermion/WilsonKernelsAsm.cc +++ b/lib/qcd/action/fermion/WilsonKernelsAsm.cc @@ -35,19 +35,48 @@ Author: Guido Cossu namespace Grid { namespace QCD { - + + /////////////////////////////////////////////////////////// // Default to no assembler implementation /////////////////////////////////////////////////////////// template void -WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, +WilsonKernels::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) { assert(0); } template void -WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, +WilsonKernels::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +{ + assert(0); +} + +template void +WilsonKernels::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +{ + assert(0); +} + +template void +WilsonKernels::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +{ + assert(0); +} + +template void +WilsonKernels::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +{ + assert(0); +} + +template void +WilsonKernels::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) { assert(0); @@ -57,11 +86,22 @@ WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo, #include #define INSTANTIATE_ASM(A)\ -template void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\ +template void WilsonKernels::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\ int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\ \ -template void WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\ +template void WilsonKernels::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\ int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\ +template void WilsonKernels::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\ + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\ + \ +template void WilsonKernels::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\ + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\ +template void WilsonKernels::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\ + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\ + \ +template void WilsonKernels::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\ + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\ + INSTANTIATE_ASM(WilsonImplF); INSTANTIATE_ASM(WilsonImplD); diff --git a/lib/qcd/action/fermion/WilsonKernelsAsmAvx512.h b/lib/qcd/action/fermion/WilsonKernelsAsmAvx512.h index 7b5b9803..6d602a2b 100644 --- a/lib/qcd/action/fermion/WilsonKernelsAsmAvx512.h +++ b/lib/qcd/action/fermion/WilsonKernelsAsmAvx512.h @@ -52,13 +52,37 @@ static Vector signsF; #define MAYBEPERM(A,perm) if (perm) { A ; } #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf) #define COMPLEX_SIGNS(isigns) vComplexF *isigns = &signsF[0]; + + +#define INTERIOR_AND_EXTERIOR +#undef INTERIOR +#undef EXTERIOR ///////////////////////////////////////////////////////////////// // XYZT vectorised, undag Kernel, single ///////////////////////////////////////////////////////////////// #undef KERNEL_DAG +#define INTERIOR_AND_EXTERIOR +#undef INTERIOR +#undef EXTERIOR template<> void -WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, +WilsonKernels::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + +#undef INTERIOR_AND_EXTERIOR +#define INTERIOR +#undef EXTERIOR +template<> void +WilsonKernels::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + +#undef INTERIOR_AND_EXTERIOR +#undef INTERIOR +#define EXTERIOR +template<> void +WilsonKernels::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include @@ -66,9 +90,28 @@ WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & // XYZT vectorised, dag Kernel, single ///////////////////////////////////////////////////////////////// #define KERNEL_DAG +#define INTERIOR_AND_EXTERIOR +#undef INTERIOR +#undef EXTERIOR template<> void -WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +WilsonKernels::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + +#undef INTERIOR_AND_EXTERIOR +#define INTERIOR +#undef EXTERIOR +template<> void +WilsonKernels::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + +#undef INTERIOR_AND_EXTERIOR +#undef INTERIOR +#define EXTERIOR +template<> void +WilsonKernels::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include #undef MAYBEPERM @@ -80,8 +123,29 @@ WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder // Ls vectorised, undag Kernel, single ///////////////////////////////////////////////////////////////// #undef KERNEL_DAG +#define INTERIOR_AND_EXTERIOR +#undef INTERIOR +#undef EXTERIOR template<> void -WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, +WilsonKernels::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + +#undef INTERIOR_AND_EXTERIOR +#define INTERIOR +#undef EXTERIOR +template<> void +WilsonKernels::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + +#undef INTERIOR_AND_EXTERIOR +#undef INTERIOR +#define EXTERIOR +#undef MULT_2SPIN +#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf) +template<> void +WilsonKernels::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include @@ -89,10 +153,30 @@ WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,Lebesgu // Ls vectorised, dag Kernel, single ///////////////////////////////////////////////////////////////// #define KERNEL_DAG +#define INTERIOR_AND_EXTERIOR +#undef INTERIOR +#undef EXTERIOR template<> void -WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, +WilsonKernels::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include + +#undef INTERIOR_AND_EXTERIOR +#define INTERIOR +#undef EXTERIOR +template<> void +WilsonKernels::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + +#undef INTERIOR_AND_EXTERIOR +#undef INTERIOR +#define EXTERIOR +template<> void +WilsonKernels::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #undef COMPLEX_SIGNS #undef MAYBEPERM #undef MULT_2SPIN @@ -110,51 +194,130 @@ static int signInitD = setupSigns(signsD); #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf) #define COMPLEX_SIGNS(isigns) vComplexD *isigns = &signsD[0]; + +#define INTERIOR_AND_EXTERIOR +#undef INTERIOR +#undef EXTERIOR + ///////////////////////////////////////////////////////////////// -// XYZT Vectorised, undag Kernel, double +// XYZT vectorised, undag Kernel, single ///////////////////////////////////////////////////////////////// #undef KERNEL_DAG +#define INTERIOR_AND_EXTERIOR +#undef INTERIOR +#undef EXTERIOR template<> void -WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, +WilsonKernels::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include -///////////////////////////////////////////////////////////////// - +#undef INTERIOR_AND_EXTERIOR +#define INTERIOR +#undef EXTERIOR +template<> void +WilsonKernels::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + +#undef INTERIOR_AND_EXTERIOR +#undef INTERIOR +#define EXTERIOR +template<> void +WilsonKernels::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + ///////////////////////////////////////////////////////////////// -// XYZT Vectorised, dag Kernel, double +// XYZT vectorised, dag Kernel, single ///////////////////////////////////////////////////////////////// #define KERNEL_DAG +#define INTERIOR_AND_EXTERIOR +#undef INTERIOR +#undef EXTERIOR template<> void -WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +WilsonKernels::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include -///////////////////////////////////////////////////////////////// +#undef INTERIOR_AND_EXTERIOR +#define INTERIOR +#undef EXTERIOR +template<> void +WilsonKernels::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + +#undef INTERIOR_AND_EXTERIOR +#undef INTERIOR +#define EXTERIOR +template<> void +WilsonKernels::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #undef MAYBEPERM #undef MULT_2SPIN #define MAYBEPERM(A,B) #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf) -///////////////////////////////////////////////////////////////// -// Ls vectorised, undag Kernel, double -///////////////////////////////////////////////////////////////// -#undef KERNEL_DAG -template<> void -WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) -#include -///////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////// -// Ls vectorised, dag Kernel, double +// Ls vectorised, undag Kernel, single +///////////////////////////////////////////////////////////////// +#undef KERNEL_DAG +#define INTERIOR_AND_EXTERIOR +#undef INTERIOR +#undef EXTERIOR +template<> void +WilsonKernels::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + +#undef INTERIOR_AND_EXTERIOR +#define INTERIOR +#undef EXTERIOR +template<> void +WilsonKernels::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + +#undef INTERIOR_AND_EXTERIOR +#undef INTERIOR +#define EXTERIOR +#undef MULT_2SPIN +#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf) +template<> void +WilsonKernels::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + +///////////////////////////////////////////////////////////////// +// Ls vectorised, dag Kernel, single ///////////////////////////////////////////////////////////////// #define KERNEL_DAG +#define INTERIOR_AND_EXTERIOR +#undef INTERIOR +#undef EXTERIOR template<> void -WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, +WilsonKernels::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include -///////////////////////////////////////////////////////////////// - + +#undef INTERIOR_AND_EXTERIOR +#define INTERIOR +#undef EXTERIOR +template<> void +WilsonKernels::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + +#undef INTERIOR_AND_EXTERIOR +#undef INTERIOR +#define EXTERIOR +template<> void +WilsonKernels::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #undef COMPLEX_SIGNS #undef MAYBEPERM #undef MULT_2SPIN diff --git a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h index 8ec68997..34aba472 100644 --- a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h +++ b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h @@ -1,259 +1,267 @@ +#ifdef KERNEL_DAG +#define DIR0_PROJMEM(base) XP_PROJMEM(base); +#define DIR1_PROJMEM(base) YP_PROJMEM(base); +#define DIR2_PROJMEM(base) ZP_PROJMEM(base); +#define DIR3_PROJMEM(base) TP_PROJMEM(base); +#define DIR4_PROJMEM(base) XM_PROJMEM(base); +#define DIR5_PROJMEM(base) YM_PROJMEM(base); +#define DIR6_PROJMEM(base) ZM_PROJMEM(base); +#define DIR7_PROJMEM(base) TM_PROJMEM(base); +#define DIR0_RECON XP_RECON +#define DIR1_RECON YP_RECON_ACCUM +#define DIR2_RECON ZP_RECON_ACCUM +#define DIR3_RECON TP_RECON_ACCUM +#define DIR4_RECON XM_RECON_ACCUM +#define DIR5_RECON YM_RECON_ACCUM +#define DIR6_RECON ZM_RECON_ACCUM +#define DIR7_RECON TM_RECON_ACCUM +#else +#define DIR0_PROJMEM(base) XM_PROJMEM(base); +#define DIR1_PROJMEM(base) YM_PROJMEM(base); +#define DIR2_PROJMEM(base) ZM_PROJMEM(base); +#define DIR3_PROJMEM(base) TM_PROJMEM(base); +#define DIR4_PROJMEM(base) XP_PROJMEM(base); +#define DIR5_PROJMEM(base) YP_PROJMEM(base); +#define DIR6_PROJMEM(base) ZP_PROJMEM(base); +#define DIR7_PROJMEM(base) TP_PROJMEM(base); +#define DIR0_RECON XM_RECON +#define DIR1_RECON YM_RECON_ACCUM +#define DIR2_RECON ZM_RECON_ACCUM +#define DIR3_RECON TM_RECON_ACCUM +#define DIR4_RECON XP_RECON_ACCUM +#define DIR5_RECON YP_RECON_ACCUM +#define DIR6_RECON ZP_RECON_ACCUM +#define DIR7_RECON TP_RECON_ACCUM +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Comms then compute kernel +//////////////////////////////////////////////////////////////////////////////// +#ifdef INTERIOR_AND_EXTERIOR + +#define ZERO_NMU(A) +#define INTERIOR_BLOCK_XP(a,b,PERMUTE_DIR,PROJMEM,RECON) INTERIOR_BLOCK(a,b,PERMUTE_DIR,PROJMEM,RECON) +#define EXTERIOR_BLOCK_XP(a,b,RECON) EXTERIOR_BLOCK(a,b,RECON) + +#define INTERIOR_BLOCK(a,b,PERMUTE_DIR,PROJMEM,RECON) \ + LOAD64(%r10,isigns); \ + PROJMEM(base); \ + MAYBEPERM(PERMUTE_DIR,perm); + +#define EXTERIOR_BLOCK(a,b,RECON) \ + LOAD_CHI(base); + +#define COMMON_BLOCK(a,b,RECON) \ + base = st.GetInfo(ptype,local,perm,b,ent,plocal); ent++; \ + PREFETCH_CHIMU(base); \ + MULT_2SPIN_DIR_PF(a,basep); \ + LOAD64(%r10,isigns); \ + RECON; + +#define RESULT(base,basep) SAVE_RESULT(base,basep); + +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Pre comms kernel -- prefetch like normal because it is mostly right +//////////////////////////////////////////////////////////////////////////////// +#ifdef INTERIOR + +#define COMMON_BLOCK(a,b,RECON) +#define ZERO_NMU(A) + +// No accumulate for DIR0 +#define EXTERIOR_BLOCK_XP(a,b,RECON) \ + ZERO_PSI; \ + base = st.GetInfo(ptype,local,perm,b,ent,plocal); ent++; + +#define EXTERIOR_BLOCK(a,b,RECON) \ + base = st.GetInfo(ptype,local,perm,b,ent,plocal); ent++; + +#define INTERIOR_BLOCK_XP(a,b,PERMUTE_DIR,PROJMEM,RECON) INTERIOR_BLOCK(a,b,PERMUTE_DIR,PROJMEM,RECON) + +#define INTERIOR_BLOCK(a,b,PERMUTE_DIR,PROJMEM,RECON) \ + LOAD64(%r10,isigns); \ + PROJMEM(base); \ + MAYBEPERM(PERMUTE_DIR,perm); \ + base = st.GetInfo(ptype,local,perm,b,ent,plocal); ent++; \ + PREFETCH_CHIMU(base); \ + MULT_2SPIN_DIR_PF(a,basep); \ + LOAD64(%r10,isigns); \ + RECON; + +#define RESULT(base,basep) SAVE_RESULT(base,basep); + +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Post comms kernel +//////////////////////////////////////////////////////////////////////////////// +#ifdef EXTERIOR + +#define ZERO_NMU(A) nmu=0; + +#define INTERIOR_BLOCK_XP(a,b,PERMUTE_DIR,PROJMEM,RECON) \ + ZERO_PSI; base = st.GetInfo(ptype,local,perm,b,ent,plocal); ent++; + +#define EXTERIOR_BLOCK_XP(a,b,RECON) EXTERIOR_BLOCK(a,b,RECON) + +#define INTERIOR_BLOCK(a,b,PERMUTE_DIR,PROJMEM,RECON) \ + base = st.GetInfo(ptype,local,perm,b,ent,plocal); ent++; + +#define EXTERIOR_BLOCK(a,b,RECON) \ + nmu++; \ + LOAD_CHI(base); \ + MULT_2SPIN_DIR_PF(a,base); \ + base = st.GetInfo(ptype,local,perm,b,ent,plocal); ent++; \ + LOAD64(%r10,isigns); \ + RECON; + +#define COMMON_BLOCK(a,b,RECON) + +#define RESULT(base,basep) if (nmu){ ADD_RESULT(base,base);} + +#endif + { + int nmu; int local,perm, ptype; uint64_t base; uint64_t basep; const uint64_t plocal =(uint64_t) & in._odata[0]; - // vComplexF isigns[2] = { signs[0], signs[1] }; - //COMPLEX_TYPE is vComplexF of vComplexD depending - //on the chosen precision COMPLEX_SIGNS(isigns); MASK_REGS; int nmax=U._grid->oSites(); for(int site=0;site=nmax) ssn=0; + int sUn=lo.Reorder(ssn); +#ifndef EXTERIOR + LOCK_GAUGE(0); +#endif + for(int s=0;s=nmax) ssn=0; - int sUn=lo.Reorder(ssn); - for(int s=0;s shuffle and xor the real part sign bit -#ifdef KERNEL_DAG - YP_PROJMEM(base); -#else - YM_PROJMEM(base); -#endif - MAYBEPERM(PERMUTE_DIR2,perm); - } else { - LOAD_CHI(base); - } - base = st.GetInfo(ptype,local,perm,Zp,ent,plocal); ent++; - PREFETCH_CHIMU(base); - { - MULT_2SPIN_DIR_PFYP(Yp,basep); - } - LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit -#ifdef KERNEL_DAG - YP_RECON_ACCUM; -#else - YM_RECON_ACCUM; -#endif - - //////////////////////////////// - // Zp - //////////////////////////////// - basep = st.GetPFInfo(nent,plocal); nent++; - if ( local ) { - LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit -#ifdef KERNEL_DAG - ZP_PROJMEM(base); -#else - ZM_PROJMEM(base); -#endif - MAYBEPERM(PERMUTE_DIR1,perm); - } else { - LOAD_CHI(base); - } - base = st.GetInfo(ptype,local,perm,Tp,ent,plocal); ent++; - PREFETCH_CHIMU(base); - { - MULT_2SPIN_DIR_PFZP(Zp,basep); - } - LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit -#ifdef KERNEL_DAG - ZP_RECON_ACCUM; -#else - ZM_RECON_ACCUM; -#endif - - //////////////////////////////// - // Tp - //////////////////////////////// - basep = st.GetPFInfo(nent,plocal); nent++; - if ( local ) { - LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit -#ifdef KERNEL_DAG - TP_PROJMEM(base); -#else - TM_PROJMEM(base); -#endif - MAYBEPERM(PERMUTE_DIR0,perm); - } else { - LOAD_CHI(base); - } - base = st.GetInfo(ptype,local,perm,Xm,ent,plocal); ent++; - PREFETCH_CHIMU(base); - { - MULT_2SPIN_DIR_PFTP(Tp,basep); - } - LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit -#ifdef KERNEL_DAG - TP_RECON_ACCUM; -#else - TM_RECON_ACCUM; -#endif - - //////////////////////////////// - // Xm - //////////////////////////////// -#ifndef STREAM_STORE - basep= (uint64_t) &out._odata[ss]; -#endif - // basep= st.GetPFInfo(nent,plocal); nent++; - if ( local ) { - LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit -#ifdef KERNEL_DAG - XM_PROJMEM(base); -#else - XP_PROJMEM(base); -#endif - MAYBEPERM(PERMUTE_DIR3,perm); - } else { - LOAD_CHI(base); - } - base = st.GetInfo(ptype,local,perm,Ym,ent,plocal); ent++; - PREFETCH_CHIMU(base); - { - MULT_2SPIN_DIR_PFXM(Xm,basep); - } - LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit -#ifdef KERNEL_DAG - XM_RECON_ACCUM; -#else - XP_RECON_ACCUM; -#endif - - //////////////////////////////// - // Ym - //////////////////////////////// - basep= st.GetPFInfo(nent,plocal); nent++; - if ( local ) { - LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit -#ifdef KERNEL_DAG - YM_PROJMEM(base); -#else - YP_PROJMEM(base); -#endif - MAYBEPERM(PERMUTE_DIR2,perm); - } else { - LOAD_CHI(base); - } - base = st.GetInfo(ptype,local,perm,Zm,ent,plocal); ent++; - PREFETCH_CHIMU(base); - { - MULT_2SPIN_DIR_PFYM(Ym,basep); - } - LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit -#ifdef KERNEL_DAG - YM_RECON_ACCUM; -#else - YP_RECON_ACCUM; -#endif - - //////////////////////////////// - // Zm - //////////////////////////////// - basep= st.GetPFInfo(nent,plocal); nent++; - if ( local ) { - LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit -#ifdef KERNEL_DAG - ZM_PROJMEM(base); -#else - ZP_PROJMEM(base); -#endif - MAYBEPERM(PERMUTE_DIR1,perm); - } else { - LOAD_CHI(base); - } - base = st.GetInfo(ptype,local,perm,Tm,ent,plocal); ent++; - PREFETCH_CHIMU(base); - { - MULT_2SPIN_DIR_PFZM(Zm,basep); - } - LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit -#ifdef KERNEL_DAG - ZM_RECON_ACCUM; -#else - ZP_RECON_ACCUM; -#endif - - //////////////////////////////// - // Tm - //////////////////////////////// - basep= st.GetPFInfo(nent,plocal); nent++; - if ( local ) { - LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit -#ifdef KERNEL_DAG - TM_PROJMEM(base); -#else - TP_PROJMEM(base); -#endif - MAYBEPERM(PERMUTE_DIR0,perm); - } else { - LOAD_CHI(base); - } - base= (uint64_t) &out._odata[ss]; -#ifndef STREAM_STORE - PREFETCH_CHIMU(base); -#endif - { - MULT_2SPIN_DIR_PFTM(Tm,basep); - } - LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit -#ifdef KERNEL_DAG - TM_RECON_ACCUM; -#else - TP_RECON_ACCUM; -#endif - - basep= st.GetPFInfo(nent,plocal); nent++; - SAVE_RESULT(base,basep); - - } - ssU++; - UNLOCK_GAUGE(0); + base = (uint64_t) &out._odata[ss]; + basep= st.GetPFInfo(nent,plocal); nent++; + RESULT(base,basep); + } + ssU++; + UNLOCK_GAUGE(0); } } + +#undef DIR0_PROJMEM +#undef DIR1_PROJMEM +#undef DIR2_PROJMEM +#undef DIR3_PROJMEM +#undef DIR4_PROJMEM +#undef DIR5_PROJMEM +#undef DIR6_PROJMEM +#undef DIR7_PROJMEM +#undef DIR0_RECON +#undef DIR1_RECON +#undef DIR2_RECON +#undef DIR3_RECON +#undef DIR4_RECON +#undef DIR5_RECON +#undef DIR6_RECON +#undef DIR7_RECON +#undef EXTERIOR_BLOCK +#undef INTERIOR_BLOCK +#undef EXTERIOR_BLOCK_XP +#undef INTERIOR_BLOCK_XP +#undef COMMON_BLOCK +#undef ZERO_NMU +#undef RESULT diff --git a/lib/qcd/action/fermion/WilsonKernelsAsmQPX.h b/lib/qcd/action/fermion/WilsonKernelsAsmQPX.h index 947538ca..612234d7 100644 --- a/lib/qcd/action/fermion/WilsonKernelsAsmQPX.h +++ b/lib/qcd/action/fermion/WilsonKernelsAsmQPX.h @@ -42,13 +42,17 @@ Author: paboyle #define MAYBEPERM(A,perm) if (perm) { A ; } #define MULT_2SPIN(ptr,pf) MULT_2SPIN_QPX(ptr,pf) #define COMPLEX_SIGNS(isigns) + +#define INTERIOR_AND_EXTERIOR +#undef INTERIOR +#undef EXTERIOR ///////////////////////////////////////////////////////////////// // XYZT vectorised, undag Kernel, single ///////////////////////////////////////////////////////////////// #undef KERNEL_DAG template<> void -WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, +WilsonKernels::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include @@ -57,7 +61,7 @@ WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & ///////////////////////////////////////////////////////////////// #define KERNEL_DAG template<> void -WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, +WilsonKernels::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include @@ -71,7 +75,7 @@ WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder ///////////////////////////////////////////////////////////////// #undef KERNEL_DAG template<> void -WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, +WilsonKernels::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include @@ -80,7 +84,7 @@ WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,Lebesgu ///////////////////////////////////////////////////////////////// #define KERNEL_DAG template<> void -WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, +WilsonKernels::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include #undef MAYBEPERM @@ -100,7 +104,7 @@ WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,Lebe ///////////////////////////////////////////////////////////////// #undef KERNEL_DAG template<> void -WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, +WilsonKernels::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include ///////////////////////////////////////////////////////////////// @@ -111,7 +115,7 @@ WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & ///////////////////////////////////////////////////////////////// #define KERNEL_DAG template<> void -WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, +WilsonKernels::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include ///////////////////////////////////////////////////////////////// @@ -125,7 +129,7 @@ WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder ///////////////////////////////////////////////////////////////// #undef KERNEL_DAG template<> void -WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, +WilsonKernels::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include ///////////////////////////////////////////////////////////////// @@ -135,7 +139,7 @@ WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,Lebesgu ///////////////////////////////////////////////////////////////// #define KERNEL_DAG template<> void -WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, +WilsonKernels::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include ///////////////////////////////////////////////////////////////// diff --git a/lib/qcd/action/fermion/WilsonKernelsHand.cc b/lib/qcd/action/fermion/WilsonKernelsHand.cc index f5900832..90496bdf 100644 --- a/lib/qcd/action/fermion/WilsonKernelsHand.cc +++ b/lib/qcd/action/fermion/WilsonKernelsHand.cc @@ -312,8 +312,8 @@ namespace QCD { template void -WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, - int ss,int sU,const FermionField &in, FermionField &out) +WilsonKernels::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int sU,const FermionField &in, FermionField &out,int interior,int exterior) { typedef typename Simd::scalar_type S; typedef typename Simd::vector_type V; @@ -554,8 +554,8 @@ WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,Doub } template -void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, - int ss,int sU,const FermionField &in, FermionField &out) +void WilsonKernels::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int sU,const FermionField &in, FermionField &out,int interior,int exterior) { // std::cout << "Hand op Dhop "<::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder // Specialise Gparity to simple implementation //////////////////////////////////////////////// template<> void -WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, +WilsonKernels::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, SiteHalfSpinor *buf, - int sF,int sU,const FermionField &in, FermionField &out) + int sF,int sU,const FermionField &in, FermionField &out,int internal,int external) { assert(0); } template<> void -WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, +WilsonKernels::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, SiteHalfSpinor *buf, - int sF,int sU,const FermionField &in, FermionField &out) + int sF,int sU,const FermionField &in, FermionField &out,int internal,int external) { assert(0); } template<> void -WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, - int sF,int sU,const FermionField &in, FermionField &out) +WilsonKernels::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int sF,int sU,const FermionField &in, FermionField &out,int internal,int external) { assert(0); } template<> void -WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, - int sF,int sU,const FermionField &in, FermionField &out) +WilsonKernels::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int sF,int sU,const FermionField &in, FermionField &out,int internal,int external) { assert(0); } @@ -835,10 +835,10 @@ WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,Lebes // Need Nc=3 though // #define INSTANTIATE_THEM(A) \ -template void WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\ - int ss,int sU,const FermionField &in, FermionField &out); \ -template void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\ - int ss,int sU,const FermionField &in, FermionField &out); +template void WilsonKernels::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\ + int ss,int sU,const FermionField &in, FermionField &out,int interior,int exterior); \ +template void WilsonKernels::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\ + int ss,int sU,const FermionField &in, FermionField &out,int interior,int exterior); INSTANTIATE_THEM(WilsonImplF); INSTANTIATE_THEM(WilsonImplD);