diff --git a/.gitignore b/.gitignore index 03554467..80ea8e86 100644 --- a/.gitignore +++ b/.gitignore @@ -62,6 +62,7 @@ stamp-h1 config.sub config.guess INSTALL +.dirstamp # Packages # ############ diff --git a/.travis.yml b/.travis.yml index 82066d87..a2154ead 100644 --- a/.travis.yml +++ b/.travis.yml @@ -88,3 +88,7 @@ script: - ../configure CXXFLAGS="-msse4.2 -O3 -std=c++11" LIBS="-lmpfr -lgmp" --enable-precision=single --enable-simd=SSE4 --enable-comms=none - make -j4 - ./benchmarks/Benchmark_dwf --threads 1 + - make clean + - ../configure CXXFLAGS="-msse4.2 -O3 -std=c++11" LIBS="-lmpfr -lgmp" --enable-precision=double --enable-simd=SSE4 --enable-comms=none + - make -j4 + - ./benchmarks/Benchmark_dwf --threads 1 diff --git a/VERSION b/VERSION new file mode 100644 index 00000000..c12f9497 --- /dev/null +++ b/VERSION @@ -0,0 +1,4 @@ +Version : 0.5.0 + +- AVX512, AVX2, AVX, SSE good +- Clang 3.5 and above, ICPC v16 and above, GCC 4.9 and above diff --git a/benchmarks/Benchmark_dwf.cc b/benchmarks/Benchmark_dwf.cc index 0bf57182..5ceec95f 100644 --- a/benchmarks/Benchmark_dwf.cc +++ b/benchmarks/Benchmark_dwf.cc @@ -165,11 +165,11 @@ int main (int argc, char ** argv) if (1) { - typedef WilsonFermion5D WilsonFermion5DF; - LatticeFermionF ssrc(sFGrid); - LatticeFermionF sref(sFGrid); - LatticeFermionF sresult(sFGrid); - WilsonFermion5DF sDw(1,Umu,*sFGrid,*sFrbGrid,*sUGrid,M5,params); + typedef WilsonFermion5D WilsonFermion5DR; + LatticeFermion ssrc(sFGrid); + LatticeFermion sref(sFGrid); + LatticeFermion sresult(sFGrid); + WilsonFermion5DR sDw(1,Umu,*sFGrid,*sFrbGrid,*sUGrid,M5,params); for(int x=0;x site({s,x,y,z,t}); - SpinColourVectorF tmp; + SpinColourVector tmp; peekSite(tmp,src,site); pokeSite(tmp,ssrc,site); }}}}} @@ -217,7 +217,7 @@ int main (int argc, char ** argv) for(int t=0;t site({s,x,y,z,t}); - SpinColourVectorF normal, simd; + SpinColourVector normal, simd; peekSite(normal,result,site); peekSite(simd,sresult,site); sum=sum+norm2(normal-simd); @@ 
-230,8 +230,8 @@ int main (int argc, char ** argv) if (1) { - LatticeFermionF sr_eo(sFGrid); - LatticeFermionF serr(sFGrid); + LatticeFermion sr_eo(sFGrid); + LatticeFermion serr(sFGrid); LatticeFermion ssrc_e (sFrbGrid); LatticeFermion ssrc_o (sFrbGrid); diff --git a/benchmarks/Benchmark_dwf_sweep.cc b/benchmarks/Benchmark_dwf_sweep.cc new file mode 100644 index 00000000..7b4eb66b --- /dev/null +++ b/benchmarks/Benchmark_dwf_sweep.cc @@ -0,0 +1,369 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./benchmarks/Benchmark_dwf_sweep.cc + + Copyright (C) 2015 + +Author: Peter Boyle +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include + +using namespace std; +using namespace Grid; +using namespace Grid::QCD; + +template +struct scal { + d internal; +}; + + Gamma::GammaMatrix Gmu [] = { + Gamma::GammaX, + Gamma::GammaY, + Gamma::GammaZ, + Gamma::GammaT + }; + +void benchDw(std::vector & L, int Ls, int threads, int report =0 ); +void benchsDw(std::vector & L, int Ls, int threads, int report=0 ); + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + const int Ls=16; + int threads = GridThread::GetThreads(); + std::cout< latt4(4,L); + for(int d=4;d>dmin;d--){ + if ( d<=3 ) latt4[d]*=2; + std::cout << GridLogMessage <<"\t"; + for(int d=0;d latt4(4,16); + std::cout< & latt4, int Ls, int threads,int report ) +{ + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); + GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); + GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); + GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); + + std::vector seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + +#ifdef CHECK + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); + LatticeFermion src (FGrid); random(RNG5,src); + LatticeGaugeField Umu(UGrid); + random(RNG4,Umu); +#else + LatticeFermion src (FGrid); src=zero; + LatticeGaugeField Umu(UGrid); Umu=zero; +#endif + + LatticeFermion result(FGrid); result=zero; + LatticeFermion ref(FGrid); ref=zero; + LatticeFermion tmp(FGrid); + LatticeFermion err(FGrid); + + ColourMatrix cm = Complex(1.0,0.0); + + + LatticeGaugeField Umu5d(FGrid); + + // replicate across fifth dimension + for(int ss=0;ssoSites();ss++){ + for(int s=0;s 
U(4,FGrid); + for(int mu=0;mu(Umu5d,mu); + } + +#ifdef CHECK + if (1) + { + ref = zero; + for(int mu=0;mu_Nprocessors; + + DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); + + double t0=usecond(); + Dw.Dhop(src,result,0); + double t1=usecond(); + +#ifdef TIMERS_OFF + int ncall =10; +#else + int ncall =1+(int) ((5.0*1000*1000)/(t1-t0)); +#endif + + if (ncall < 5 ) exit(0); + + Dw.Dhop(src,result,0); + + PerformanceCounter Counter(8); + Counter.Start(); + t0=usecond(); + for(int i=0;i 1.0e-4 ) { + std::cout< & latt4, int Ls, int threads, int report ) +{ + + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); + GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); + GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); + GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); + GridCartesian * sUGrid = SpaceTimeGrid::makeFourDimDWFGrid(latt4,GridDefaultMpi()); + GridCartesian * sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid); + GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid); + + std::vector seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + +#ifdef CHECK_SDW + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); + LatticeFermion src (FGrid); random(RNG5,src); + LatticeGaugeField Umu(UGrid); + random(RNG4,Umu); +#else + LatticeFermion src (FGrid); src=zero; + LatticeGaugeField Umu(UGrid); Umu=zero; +#endif + + LatticeFermion result(FGrid); result=zero; + LatticeFermion ref(FGrid); ref=zero; + LatticeFermion tmp(FGrid); + LatticeFermion err(FGrid); + + ColourMatrix cm = Complex(1.0,0.0); + + LatticeGaugeField Umu5d(FGrid); + + // replicate across fifth dimension + for(int ss=0;ssoSites();ss++){ + for(int s=0;s WilsonFermion5DR; + LatticeFermion ssrc(sFGrid); + LatticeFermion sref(sFGrid); + 
LatticeFermion sresult(sFGrid); + WilsonFermion5DR sDw(1,Umu,*sFGrid,*sFrbGrid,*sUGrid,M5); + + for(int x=0;x site({s,x,y,z,t}); + SpinColourVector tmp; + peekSite(tmp,src,site); + pokeSite(tmp,ssrc,site); + }}}}} + + double t0=usecond(); + sDw.Dhop(ssrc,sresult,0); + double t1=usecond(); + +#ifdef TIMERS_OFF + int ncall =10; +#else + int ncall =1+(int) ((5.0*1000*1000)/(t1-t0)); +#endif + + PerformanceCounter Counter(8); + Counter.Start(); + t0=usecond(); + for(int i=0;i > u_simd_send_buf; diff --git a/lib/algorithms/approx/.dirstamp b/lib/algorithms/approx/.dirstamp new file mode 100644 index 00000000..e69de29b diff --git a/lib/communicator/.dirstamp b/lib/communicator/.dirstamp new file mode 100644 index 00000000..e69de29b diff --git a/lib/qcd/action/fermion/.dirstamp b/lib/qcd/action/fermion/.dirstamp new file mode 100644 index 00000000..e69de29b diff --git a/lib/qcd/action/fermion/DomainWallFermion.h b/lib/qcd/action/fermion/DomainWallFermion.h index b05733aa..8e41aa63 100644 --- a/lib/qcd/action/fermion/DomainWallFermion.h +++ b/lib/qcd/action/fermion/DomainWallFermion.h @@ -63,7 +63,7 @@ namespace Grid { Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);// eps is ignored for higham assert(zdata->n==this->Ls); - std::cout< -void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, +void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, std::vector > &buf, - int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out) + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include #undef VMOVIDUP #undef VMOVRDUP #undef MAYBEPERM #undef MULT_2SPIN +#undef FX +#define FX(A) DWFASM_ ## A #define MAYBEPERM(A,B) #define VMOVIDUP(A,B,C) VBCASTIDUPf(A,B,C) #define VMOVRDUP(A,B,C) VBCASTRDUPf(A,B,C) #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf) template<> -void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, +void 
WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, std::vector > &buf, - int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out) + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include #endif -template void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, +template void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, std::vector > &buf, - int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out); + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out); -template void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, +template void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, std::vector > &buf, - int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out); -template void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out); +template void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, std::vector > &buf, - int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out); -template void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out); +template void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, std::vector > &buf, - int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out); -template void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out); +template void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, std::vector > &buf, - int ss,int sU,int Ls,int Ns,const FermionField &in, 
FermionField &out); -template void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out); +template void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, std::vector > &buf, - int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out); + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out); }} diff --git a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h index aae049e2..4f3ef861 100644 --- a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h +++ b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h @@ -1,35 +1,44 @@ { - int locala,perma, ptypea; - int localb,permb, ptypeb; - uint64_t basea, baseb; - uint64_t basex; + int local,perm, ptype; + uint64_t base; + uint64_t basep; const uint64_t plocal =(uint64_t) & in._odata[0]; // vComplexF isigns[2] = { signs[0], signs[1] }; vComplexF *isigns = &signs[0]; MASK_REGS; - + int nmax=U._grid->oSites(); for(int site=0;site=nmax) ssn=0; + int sUn=lo.Reorder(ssn); for(int s=0;s shuffle and xor the real part sign bit - YM_PROJMEM(baseb); - MAYBEPERM(PERMUTE_DIR2,permb); + YM_PROJMEM(base); + MAYBEPERM(PERMUTE_DIR2,perm); } else { - LOAD_CHI(baseb); + LOAD_CHI(base); } + base = st.GetInfo(ptype,local,perm,Zp,ent,plocal); ent++; + PREFETCH_CHIMU(base); { - MULT_2SPIN_DIR_PFYP(Yp,basea); + MULT_2SPIN_DIR_PFYP(Yp,basep); } LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit YM_RECON_ACCUM; @@ -54,16 +65,18 @@ //////////////////////////////// // Zp //////////////////////////////// - baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++; - if ( locala ) { + basep = st.GetPFInfo(nent,plocal); nent++; + if ( local ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit - ZM_PROJMEM(basea); - MAYBEPERM(PERMUTE_DIR1,perma); + ZM_PROJMEM(base); + MAYBEPERM(PERMUTE_DIR1,perm); } else { - 
LOAD_CHI(basea); + LOAD_CHI(base); } + base = st.GetInfo(ptype,local,perm,Tp,ent,plocal); ent++; + PREFETCH_CHIMU(base); { - MULT_2SPIN_DIR_PFZP(Zp,baseb); + MULT_2SPIN_DIR_PFZP(Zp,basep); } LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit ZM_RECON_ACCUM; @@ -71,16 +84,18 @@ //////////////////////////////// // Tp //////////////////////////////// - basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++; - if ( localb ) { + basep = st.GetPFInfo(nent,plocal); nent++; + if ( local ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit - TM_PROJMEM(baseb); - MAYBEPERM(PERMUTE_DIR0,permb); + TM_PROJMEM(base); + MAYBEPERM(PERMUTE_DIR0,perm); } else { - LOAD_CHI(baseb); + LOAD_CHI(base); } + base = st.GetInfo(ptype,local,perm,Xm,ent,plocal); ent++; + PREFETCH_CHIMU(base); { - MULT_2SPIN_DIR_PFTP(Tp,basea); + MULT_2SPIN_DIR_PFTP(Tp,basep); } LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit TM_RECON_ACCUM; @@ -88,16 +103,19 @@ //////////////////////////////// // Xm //////////////////////////////// - baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++; - if ( locala ) { + basep= (uint64_t) &out._odata[ss]; + // basep= st.GetPFInfo(nent,plocal); nent++; + if ( local ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit - XP_PROJMEM(basea); - MAYBEPERM(PERMUTE_DIR3,perma); + XP_PROJMEM(base); + MAYBEPERM(PERMUTE_DIR3,perm); } else { - LOAD_CHI(basea); + LOAD_CHI(base); } + base = st.GetInfo(ptype,local,perm,Ym,ent,plocal); ent++; + PREFETCH_CHIMU(base); { - MULT_2SPIN_DIR_PFXM(Xm,baseb); + MULT_2SPIN_DIR_PFXM(Xm,basep); } LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit XP_RECON_ACCUM; @@ -105,16 +123,18 @@ //////////////////////////////// // Ym //////////////////////////////// - basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++; - if ( localb ) { + basep= st.GetPFInfo(nent,plocal); nent++; + if ( local ) { LOAD64(%r10,isigns); // 
times i => shuffle and xor the real part sign bit - YP_PROJMEM(baseb); - MAYBEPERM(PERMUTE_DIR2,permb); + YP_PROJMEM(base); + MAYBEPERM(PERMUTE_DIR2,perm); } else { - LOAD_CHI(baseb); + LOAD_CHI(base); } + base = st.GetInfo(ptype,local,perm,Zm,ent,plocal); ent++; + PREFETCH_CHIMU(base); { - MULT_2SPIN_DIR_PFYM(Ym,basea); + MULT_2SPIN_DIR_PFYM(Ym,basep); } LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit YP_RECON_ACCUM; @@ -122,16 +142,18 @@ //////////////////////////////// // Zm //////////////////////////////// - baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++; - if ( locala ) { + basep= st.GetPFInfo(nent,plocal); nent++; + if ( local ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit - ZP_PROJMEM(basea); - MAYBEPERM(PERMUTE_DIR1,perma); + ZP_PROJMEM(base); + MAYBEPERM(PERMUTE_DIR1,perm); } else { - LOAD_CHI(basea); + LOAD_CHI(base); } + base = st.GetInfo(ptype,local,perm,Tm,ent,plocal); ent++; + PREFETCH_CHIMU(base); { - MULT_2SPIN_DIR_PFZM(Zm,baseb); + MULT_2SPIN_DIR_PFZM(Zm,basep); } LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit ZP_RECON_ACCUM; @@ -139,26 +161,26 @@ //////////////////////////////// // Tm //////////////////////////////// - basea = (uint64_t)&out._odata[ss]; - if ( localb ) { + basep= st.GetPFInfo(nent,plocal); nent++; + if ( local ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit - TP_PROJMEM(baseb); - MAYBEPERM(PERMUTE_DIR0,permb); + TP_PROJMEM(base); + MAYBEPERM(PERMUTE_DIR0,perm); } else { - LOAD_CHI(baseb); + LOAD_CHI(base); } + base= (uint64_t) &out._odata[ss]; + PREFETCH_CHIMU(base); { - MULT_2SPIN_DIR_PFTM(Tm,basea); + MULT_2SPIN_DIR_PFTM(Tm,basep); } LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit TP_RECON_ACCUM; - PREFETCH_CHIMU(basex); - SAVE_RESULT(&out._odata[ss]); - + basep= st.GetPFInfo(nent,plocal); nent++; + SAVE_RESULT(base,basep); - ss++; - } - sU++; + } + ssU++; } } diff --git 
a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h.ab b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h.ab new file mode 100644 index 00000000..d50999f6 --- /dev/null +++ b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h.ab @@ -0,0 +1,161 @@ +{ + int locala,perma, ptypea; + int localb,permb, ptypeb; + uint64_t basea, baseb; + const uint64_t plocal =(uint64_t) & in._odata[0]; + + // vComplexF isigns[2] = { signs[0], signs[1] }; + vComplexF *isigns = &signs[0]; + + MASK_REGS; + + for(int site=0;site shuffle and xor the real part sign bit + YM_PROJMEM(baseb); + MAYBEPERM(PERMUTE_DIR2,permb); + } else { + LOAD_CHI(baseb); + } + { + MULT_2SPIN_DIR_PFYP(Yp,basea); + } + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + YM_RECON_ACCUM; + + //////////////////////////////// + // Zp + //////////////////////////////// + baseb = st.GetInfo(ptypeb,localb,permb,Tp,ent,plocal); ent++; + if ( locala ) { + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + ZM_PROJMEM(basea); + MAYBEPERM(PERMUTE_DIR1,perma); + } else { + LOAD_CHI(basea); + } + { + MULT_2SPIN_DIR_PFZP(Zp,baseb); + } + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + ZM_RECON_ACCUM; + + //////////////////////////////// + // Tp + //////////////////////////////// + basea = st.GetInfo(ptypea,locala,perma,Xm,ent,plocal); ent++; + if ( localb ) { + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + TM_PROJMEM(baseb); + MAYBEPERM(PERMUTE_DIR0,permb); + } else { + LOAD_CHI(baseb); + } + { + MULT_2SPIN_DIR_PFTP(Tp,basea); + } + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + TM_RECON_ACCUM; + + //////////////////////////////// + // Xm + //////////////////////////////// + baseb = st.GetInfo(ptypeb,localb,permb,Ym,ent,plocal); ent++; + if ( locala ) { + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + XP_PROJMEM(basea); + MAYBEPERM(PERMUTE_DIR3,perma); + } else { + 
LOAD_CHI(basea); + } + { + MULT_2SPIN_DIR_PFXM(Xm,baseb); + } + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + XP_RECON_ACCUM; + + //////////////////////////////// + // Ym + //////////////////////////////// + basea = st.GetInfo(ptypea,locala,perma,Zm,ent,plocal); ent++; + if ( localb ) { + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + YP_PROJMEM(baseb); + MAYBEPERM(PERMUTE_DIR2,permb); + } else { + LOAD_CHI(baseb); + } + { + MULT_2SPIN_DIR_PFYM(Ym,basea); + } + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + YP_RECON_ACCUM; + + //////////////////////////////// + // Zm + //////////////////////////////// + baseb = st.GetInfo(ptypeb,localb,permb,Tm,ent,plocal); ent++; + if ( locala ) { + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + ZP_PROJMEM(basea); + MAYBEPERM(PERMUTE_DIR1,perma); + } else { + LOAD_CHI(basea); + } + { + MULT_2SPIN_DIR_PFZM(Zm,baseb); + } + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + ZP_RECON_ACCUM; + + //////////////////////////////// + // Tm + //////////////////////////////// + basea = (uint64_t)&out._odata[ss]; + if ( localb ) { + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + TP_PROJMEM(baseb); + MAYBEPERM(PERMUTE_DIR0,permb); + } else { + LOAD_CHI(baseb); + } + baseb = st.GetInfo(ptypeb,localb,permb,Xp,ent,plocal); + { + MULT_2SPIN_DIR_PFTM(Tm,basea); + } + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + TP_RECON_ACCUM; + + SAVE_RESULT(&out._odata[ss],baseb); + + } + ssU++; + } +} diff --git a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h.abc b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h.abc new file mode 100644 index 00000000..5a3e01f7 --- /dev/null +++ b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h.abc @@ -0,0 +1,187 @@ +{ + int locala,perma, ptypea; + int localb,permb, ptypeb; + int localc,permc, ptypec; + uint64_t basea, baseb, basec; 
+ uint64_t basex; + const uint64_t plocal =(uint64_t) & in._odata[0]; + + // vComplexF isigns[2] = { signs[0], signs[1] }; + vComplexF *isigns = &signs[0]; + + MASK_REGS; + + for(int site=0;site shuffle and xor the real part sign bit + YM_PROJMEM(baseb); + MAYBEPERM(PERMUTE_DIR2,permb); + } else { + LOAD_CHI(baseb); + } + { + MULT_2SPIN_DIR_PFYP(Yp,basec); + } + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + YM_RECON_ACCUM; + + //////////////////////////////// + // Zp + //////////////////////////////// + baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++; + PREFETCH_CHIMU(baseb); + label(FX(ZP) ); + if ( localc ) { + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + ZM_PROJMEM(basec); + MAYBEPERM(PERMUTE_DIR1,permc); + } else { + LOAD_CHI(basec); + } + { + MULT_2SPIN_DIR_PFZP(Zp,basea); + } + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + ZM_RECON_ACCUM; + + //////////////////////////////// + // Tp + //////////////////////////////// + basec = st.GetInfo(ptypec,localc,permc,Xp,ent,plocal); ent++; + PREFETCH_CHIMU(basec); + label(FX(TP) ); + if ( locala ) { + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + TM_PROJMEM(basea); + MAYBEPERM(PERMUTE_DIR0,perma); + } else { + LOAD_CHI(basea); + } + { + MULT_2SPIN_DIR_PFTP(Tp,baseb); + } + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + TM_RECON_ACCUM; + + //////////////////////////////// + // Xm + //////////////////////////////// + basea = st.GetInfo(ptypea,locala,perma,Yp,ent,plocal); ent++; + PREFETCH_CHIMU(basea); + label(FX(XM) ); + if ( localb ) { + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + XP_PROJMEM(baseb); + MAYBEPERM(PERMUTE_DIR3,permb); + } else { + LOAD_CHI(baseb); + } + { + MULT_2SPIN_DIR_PFXM(Xm,basec); + } + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + XP_RECON_ACCUM; + + 
//////////////////////////////// + // Ym + //////////////////////////////// + baseb = st.GetInfo(ptypeb,localb,permb,Xp,ent,plocal); ent++; + PREFETCH_CHIMU(baseb); + label(FX(YM) ); + if ( localc ) { + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + YP_PROJMEM(basec); + MAYBEPERM(PERMUTE_DIR2,permc); + } else { + LOAD_CHI(basec); + } + { + MULT_2SPIN_DIR_PFYM(Ym,basea); + } + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + YP_RECON_ACCUM; + + //////////////////////////////// + // Zm + //////////////////////////////// + basec = st.GetInfo(ptypec,localc,permc,Yp,ent,plocal); ent++; + PREFETCH_CHIMU(basec); + label(FX(ZM) ); + if ( locala ) { + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + ZP_PROJMEM(basea); + MAYBEPERM(PERMUTE_DIR1,perma); + } else { + LOAD_CHI(basea); + } + { + MULT_2SPIN_DIR_PFZM(Zm,baseb); + } + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + ZP_RECON_ACCUM; + + //////////////////////////////// + // Tm + //////////////////////////////// + basea = (uint64_t)&out._odata[ss]; + PREFETCH_CHIMU(basea); + label(FX(TM) ); + if ( localb ) { + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + TP_PROJMEM(baseb); + MAYBEPERM(PERMUTE_DIR0,permb); + } else { + LOAD_CHI(baseb); + } + { + MULT_2SPIN_DIR_PFTM(Tm,basec); + } + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + TP_RECON_ACCUM; + + // PREFETCH_CHIMU(basex); + label(FX(SAV) ); + SAVE_RESULT(&out._odata[ss]); + + } + ssU++; + } +} diff --git a/lib/qcd/action/fermion/WilsonKernelsHand.cc b/lib/qcd/action/fermion/WilsonKernelsHand.cc index 757778d3..cb6c01a1 100644 --- a/lib/qcd/action/fermion/WilsonKernelsHand.cc +++ b/lib/qcd/action/fermion/WilsonKernelsHand.cc @@ -312,7 +312,7 @@ namespace QCD { template -void WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U, +void WilsonKernels::DiracOptHandDhopSite(StencilImpl 
&st,LebesgueOrder &lo,DoubledGaugeField &U, std::vector > &buf, int ss,int sU,const FermionField &in, FermionField &out) { @@ -555,7 +555,7 @@ void WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeFiel } template -void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U, +void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, std::vector > &buf, int ss,int sU,const FermionField &in, FermionField &out) { @@ -803,7 +803,7 @@ void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeF // Specialise Gparity to simple implementation //////////////////////////////////////////////// template<> -void WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U, +void WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, std::vector > &buf, int sF,int sU,const FermionField &in, FermionField &out) { @@ -811,7 +811,7 @@ void WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,Dou } template<> -void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U, +void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, std::vector > &buf, int sF,int sU,const FermionField &in, FermionField &out) { @@ -819,7 +819,7 @@ void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st, } template<> -void WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U, +void WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, std::vector > &buf, int sF,int sU,const FermionField &in, FermionField &out) { @@ -827,7 +827,7 @@ void WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,Dou } template<> -void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U, +void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, std::vector > &buf, int sF,int sU,const FermionField &in, 
FermionField &out) { @@ -839,44 +839,44 @@ void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st, ////////////// Wilson ; uses this implementation ///////////////////// // Need Nc=3 though // -template void WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U, +template void WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, std::vector > &buf, int ss,int sU,const FermionField &in, FermionField &out); -template void WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U, +template void WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, std::vector > &buf, int ss,int sU,const FermionField &in, FermionField &out); -template void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U, +template void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, std::vector > &buf, int ss,int sU,const FermionField &in, FermionField &out); -template void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U, +template void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, std::vector > &buf, int ss,int sU,const FermionField &in, FermionField &out); -template void WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U, +template void WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, std::vector > &buf, int ss,int sU,const FermionField &in, FermionField &out); -template void WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U, +template void WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, std::vector > &buf, int ss,int sU,const FermionField &in, FermionField &out); -template void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U, +template void 
WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, std::vector > &buf, int ss,int sU,const FermionField &in, FermionField &out); -template void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U, +template void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, std::vector > &buf, int ss,int sU,const FermionField &in, FermionField &out); -template void WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U, +template void WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, std::vector > &buf, int ss,int sU,const FermionField &in, FermionField &out); -template void WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U, +template void WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, std::vector > &buf, int ss,int sU,const FermionField &in, FermionField &out); -template void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U, +template void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, std::vector > &buf, int ss,int sU,const FermionField &in, FermionField &out); -template void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U, +template void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, std::vector > &buf, int ss,int sU,const FermionField &in, FermionField &out); diff --git a/lib/qcd/hmc/.dirstamp b/lib/qcd/hmc/.dirstamp new file mode 100644 index 00000000..e69de29b diff --git a/lib/qcd/spin/.dirstamp b/lib/qcd/spin/.dirstamp new file mode 100644 index 00000000..e69de29b diff --git a/lib/qcd/utils/.dirstamp b/lib/qcd/utils/.dirstamp new file mode 100644 index 00000000..e69de29b diff --git a/lib/simd/Intel512common.h b/lib/simd/Intel512common.h index 2fea9235..dabbf6d8 100644 --- 
a/lib/simd/Intel512common.h +++ b/lib/simd/Intel512common.h @@ -1,4 +1,4 @@ - /************************************************************************************* + /************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -28,6 +28,11 @@ Author: paboyle #ifndef GRID_ASM_INTEL_COMMON_512_H #define GRID_ASM_INTEL_COMMON_512_H +//////////////////////////////////////////////////////////////////////////////////////////////////// +// Performance options +//////////////////////////////////////////////////////////////////////////////////////////////////// +#undef AVX512_PF_L2_WRITE + //////////////////////////////////////////////////////////////////////////////////////////////////// // Opcodes common //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -37,6 +42,8 @@ Author: paboyle "mov $0x5555, %%eax \n"\ "kmovw %%eax, %%k7 \n" : : : "%eax"); +//#define label(B) __asm__ ( __func__ _LINE__ #B ":\n" ); + #define VZEROf(A) "vpxorq " #A "," #A "," #A ";\n" #define VZEROd(A) "vpxorq " #A "," #A "," #A ";\n" @@ -86,9 +93,16 @@ Author: paboyle #define VMOVf(A,DEST) "vmovaps " #A ", " #DEST ";\n" #define VMOVd(A,DEST) "vmovapd " #A ", " #DEST ";\n" -#define VPREFETCHG(O,A) "prefetcht0 "#O"*64("#A");\n" +#define VPREFETCH1(O,A) "prefetcht0 "#O"*64("#A");\n" #define VPREFETCH2(O,A) "prefetcht1 "#O"*64("#A");\n" +#ifdef AVX512_PF_L2_WRITE #define VPREFETCHW(O,A) "prefetchwt1 "#O"*64("#A");\n" +#else +#define VPREFETCHW(O,A) +#endif +#define VPREFETCHNTA(O,A) +#define VPREFETCH(O,A) + #define VEVICT(O,A) //"vprefetche0 "#O"*64("#A");\n" "vprefetche1 ("#O"+12)*64("#A");\n" @@ -124,8 +138,6 @@ Author: paboyle #define ZLOADf(OFF,PTR,ri,ir) VLOADf(OFF,PTR,ir) VSHUFf(ir,ri) #define ZLOADd(OFF,PTR,ri,ir) VLOADd(OFF,PTR,ir) VSHUFd(ir,ri) -#define VPREFETCHNTA(O,A) -#define VPREFETCH(O,A) #define VSTOREf(OFF,PTR,SRC) "vmovaps " #SRC "," #OFF "*64(" #PTR ")" 
";\n" #define VSTOREd(OFF,PTR,SRC) "vmovapd " #SRC "," #OFF "*64(" #PTR ")" ";\n" diff --git a/lib/simd/Intel512wilson.h b/lib/simd/Intel512wilson.h index 1955cc6d..660d07d6 100644 --- a/lib/simd/Intel512wilson.h +++ b/lib/simd/Intel512wilson.h @@ -104,7 +104,7 @@ Author: paboyle #define LOAD_CHI(PTR) LOAD64(%r8,PTR) __asm__ ( LOAD_CHIi ); #define SAVE_UCHI(PTR) SAVE_UCHIi(PTR) #define SAVE_CHI(PTR) SAVE_CHIi(PTR) -#define SAVE_RESULT(PTR) SAVE_RESULTi(PTR) +#define SAVE_RESULT(PT,R) SAVE_RESULTi(PT,R) #define LOAD_CHIMUi \ LOAD_CHIMU01i \ @@ -169,22 +169,6 @@ Author: paboyle VSTORE(5,%r8,Chi_12) \ ); -#define SAVE_RESULTi(PTR)\ - LOAD64(%r8,PTR) \ - __asm__ ( \ - VSTORE(0,%r8,result_00) \ - VSTORE(1,%r8,result_01) \ - VSTORE(2,%r8,result_02) \ - VSTORE(3,%r8,result_10) \ - VSTORE(4,%r8,result_11) \ - VSTORE(5,%r8,result_12) \ - VSTORE(6,%r8,result_20) \ - VSTORE(7,%r8,result_21) \ - VSTORE(8,%r8,result_22) \ - VSTORE(9,%r8,result_30) \ - VSTORE(10,%r8,result_31) \ - VSTORE(11,%r8,result_32) \ - ); #define MULT_2SPIN_DIR_PFXP(A,p) MULT_2SPIN_PFXP(&U._odata[sU](A),p) #define MULT_2SPIN_DIR_PFYP(A,p) MULT_2SPIN_PFYP(&U._odata[sU](A),p) @@ -277,8 +261,8 @@ Author: paboyle #define XM_PROJMEM(PTR) \ LOAD64(%r8,PTR)\ __asm__ ( \ - SHUF_CHIMU23i \ LOAD_CHIi \ + SHUF_CHIMU23i \ VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_30)\ VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_31)\ VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_32)\ @@ -306,8 +290,8 @@ Author: paboyle #define ZM_PROJMEM(PTR) \ LOAD64(%r8,PTR) \ __asm__ ( \ - SHUF_CHIMU23i \ LOAD_CHIi \ + SHUF_CHIMU23i \ VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_20)\ VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_21)\ VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_22)\ @@ -559,23 +543,95 @@ Author: paboyle VSUB(UChi_02,result_22,result_22)\ VSUB(UChi_12,result_32,result_32) ); -#define PREFETCH_CHIMU(A) \ - LOAD64(%r9,A) \ - __asm__ ( \ - VPREFETCHG(12,%r9)\ - VPREFETCHG(13,%r9)\ - VPREFETCHG(14,%r9)\ - VPREFETCHG(15,%r9)\ - VPREFETCHG(16,%r9)\ - VPREFETCHG(17,%r9)\ - 
VPREFETCHG(18,%r9)\ - VPREFETCHG(19,%r9)\ - VPREFETCHG(20,%r9)\ - VPREFETCHG(21,%r9)\ - VPREFETCHG(22,%r9)\ - VPREFETCHG(23,%r9)); +#define AVX512_PF_L1 +#define AVX512_PF_L2_GAUGE +#define AVX512_PF_L2_TABLE +#undef AVX512_PF_L2_LINEAR -#define PERMUTE_DIR0 __asm__ ( \ +#ifdef AVX512_PF_L2_TABLE +// P1 Fetches the base pointer for next link into L1 with P1 +// M1 Fetches the next site pointer into L2 +#define VPREFETCH_P1(A,B) VPREFETCH1(A,B) +#define VPREFETCH_P2(A,B) +#define VPREFETCH_M1(A,B) VPREFETCH2(A,B) +#define VPREFETCH_M2(A,B) +#endif + +#ifdef AVX512_PF_L2_LINEAR +#define VPREFETCH_M1(A,B) VPREFETCH1(A,B) +#define VPREFETCH_M2(A,B) VPREFETCH2(A,B) +#define VPREFETCH_P1(A,B) +#define VPREFETCH_P2(A,B) +#endif + +#ifdef AVX512_PF_L2_GAUGE +#define VPREFETCH_G1(A,B) VPREFETCH1(A,B) +#define VPREFETCH_G2(A,B) VPREFETCH2(A,B) +#endif + +#define PF_GAUGE(A) \ + LOAD64(%r8,&U._odata[sU](A)) \ + __asm__ ( \ + VPREFETCH_G1(0,%r8) VPREFETCH_G1(1,%r8) \ + VPREFETCH_G1(2,%r8) VPREFETCH_G1(3,%r8) \ + ); + +#define SAVE_RESULTi(PTR,pf) \ + LOAD64(%r8,PTR) \ + LOAD64(%r9,pf) \ + __asm__ ( \ + VSTORE(0,%r8,result_00) VPREFETCH_M1(0,%r9) \ + VSTORE(1,%r8,result_01) VPREFETCH_M1(1,%r9) \ + VSTORE(2,%r8,result_02) VPREFETCH_M1(2,%r9) \ + VSTORE(3,%r8,result_10) VPREFETCH_M1(3,%r9) \ + VSTORE(4,%r8,result_11) VPREFETCH_M1(4,%r9) \ + VSTORE(5,%r8,result_12) VPREFETCH_M1(5,%r9) \ + VSTORE(6,%r8,result_20) VPREFETCH_M1(6,%r9) \ + VSTORE(7,%r8,result_21) VPREFETCH_M1(7,%r9) \ + VSTORE(8,%r8,result_22) VPREFETCH_M1(8,%r9) \ + VSTORE(9,%r8,result_30) VPREFETCH_M1(9,%r9) \ + VSTORE(10,%r8,result_31) VPREFETCH_M1(10,%r9) \ + VSTORE(11,%r8,result_32) VPREFETCH_M1(11,%r9) \ + ); + +#ifdef AVX512_PF_L2_TABLE +#define PREFETCH_CHIMU(A) \ + LOAD64(%r9,A) \ + __asm__ ( \ + VPREFETCH_P1(0,%r9) \ + VPREFETCH_P1(1,%r9) \ + VPREFETCH_P1(2,%r9) \ + VPREFETCH_P1(3,%r9) \ + VPREFETCH_P1(4,%r9) \ + VPREFETCH_P1(5,%r9) \ + VPREFETCH_P1(6,%r9) \ + VPREFETCH_P1(7,%r9) \ + VPREFETCH_P1(8,%r9) \ + 
VPREFETCH_P1(9,%r9) \ + VPREFETCH_P1(10,%r9) \ + VPREFETCH_P1(11,%r9)); + +#else +#define PREFETCH_CHIMU(A) +#endif + +#define PREFETCH1_CHIMU(A) \ + LOAD64(%r9,A) \ + __asm__ ( \ + VPREFETCH_P1(0,%r9) \ + VPREFETCH_P1(1,%r9) \ + VPREFETCH_P1(2,%r9) \ + VPREFETCH_P1(3,%r9) \ + VPREFETCH_P1(4,%r9) \ + VPREFETCH_P1(5,%r9) \ + VPREFETCH_P1(6,%r9) \ + VPREFETCH_P1(7,%r9) \ + VPREFETCH_P1(8,%r9) \ + VPREFETCH_P1(9,%r9) \ + VPREFETCH_P1(10,%r9) \ + VPREFETCH_P1(11,%r9)); + +#define PERMUTE_DIR0 __asm__ ( \ VPERM0(Chi_00,Chi_00) \ VPERM0(Chi_01,Chi_01) \ VPERM0(Chi_02,Chi_02) \ @@ -612,15 +668,15 @@ Author: paboyle LOAD64(%r8,ptr) \ LOAD64(%r9,pf) \ __asm__ ( \ - VPREFETCH2(9,%r8) \ - VPREFETCH2(10,%r8) \ - VPREFETCH2(11,%r8) \ - VPREFETCH2(12,%r8) \ - VPREFETCH2(13,%r8) \ - VPREFETCH2(14,%r8) \ - VPREFETCH2(15,%r8) \ - VPREFETCH2(16,%r8) \ - VPREFETCH2(17,%r8) \ + VPREFETCH_G2(9,%r8) \ + VPREFETCH_G2(10,%r8) \ + VPREFETCH_G2(11,%r8) \ + VPREFETCH_G2(12,%r8) \ + VPREFETCH_G2(13,%r8) \ + VPREFETCH_G2(14,%r8) \ + VPREFETCH_G2(15,%r8) \ + VPREFETCH_G2(16,%r8) \ + VPREFETCH_G2(17,%r8) \ VSHUF(Chi_00,T1) \ VMOVIDUP(0,%r8,Z0 ) \ VMOVIDUP(3,%r8,Z1 ) \ @@ -632,10 +688,10 @@ Author: paboyle VMUL(Z1,T2,UChi_11) VMOVIDUP(1,%r8,Z0 ) \ VMUL(Z2,T1,UChi_02) VMOVIDUP(4,%r8,Z1 ) \ VMUL(Z2,T2,UChi_12) VMOVIDUP(7,%r8,Z2 ) \ - VPREFETCHG(0,%r9) \ - VPREFETCHG(1,%r9) \ - VPREFETCHG(2,%r9) \ - VPREFETCHG(3,%r9) \ + VPREFETCH_M1(0,%r9) \ + VPREFETCH_M1(1,%r9) \ + VPREFETCH_M1(2,%r9) \ + VPREFETCH_M1(3,%r9) \ /*18*/ \ VMADDSUB(Z3,Chi_00,UChi_00) VSHUF(Chi_01,T1) \ VMADDSUB(Z3,Chi_10,UChi_10) \ @@ -643,10 +699,10 @@ Author: paboyle VMADDSUB(Z4,Chi_10,UChi_11) VSHUF(Chi_11,T2) \ VMADDSUB(Z5,Chi_00,UChi_02) VMOVRDUP(4,%r8,Z4 ) \ VMADDSUB(Z5,Chi_10,UChi_12) \ - VPREFETCHG(4,%r9) \ - VPREFETCHG(5,%r9) \ - VPREFETCHG(6,%r9) \ - VPREFETCHG(7,%r9) \ + VPREFETCH_M1(4,%r9) \ + VPREFETCH_M1(5,%r9) \ + VPREFETCH_M1(6,%r9) \ + VPREFETCH_M1(7,%r9) \ /*28*/ \ VMADDSUB(Z0,T1,UChi_00) VMOVRDUP(7,%r8,Z5 ) \ 
VMADDSUB(Z0,T2,UChi_10) \ @@ -673,15 +729,15 @@ Author: paboyle VMADDSUB(Z4,Chi_11,UChi_11) VSHUF(Chi_12,T2) \ VMADDSUB(Z5,Chi_01,UChi_02) VMOVRDUP(5,%r8,Z4 ) \ VMADDSUB(Z5,Chi_11,UChi_12) \ - VPREFETCHG(9,%r8) \ - VPREFETCHG(10,%r8) \ - VPREFETCHG(11,%r8) \ - VPREFETCHG(12,%r8) \ - VPREFETCHG(13,%r8) \ - VPREFETCHG(14,%r8) \ - VPREFETCHG(15,%r8) \ - VPREFETCHG(16,%r8) \ - VPREFETCHG(17,%r8) \ + VPREFETCH_M1(9,%r8) \ + VPREFETCH_M1(10,%r8) \ + VPREFETCH_M1(11,%r8) \ + VPREFETCH_M1(12,%r8) \ + VPREFETCH_M1(13,%r8) \ + VPREFETCH_M1(14,%r8) \ + VPREFETCH_M1(15,%r8) \ + VPREFETCH_M1(16,%r8) \ + VPREFETCH_M1(17,%r8) \ /*48*/ \ VMADDSUB(Z0,T1,UChi_00) VMOVRDUP(8,%r8,Z5 ) \ VMADDSUB(Z0,T2,UChi_10) \ @@ -689,10 +745,10 @@ Author: paboyle VMADDSUB(Z1,T2,UChi_11) \ VMADDSUB(Z2,T1,UChi_02) \ VMADDSUB(Z2,T2,UChi_12) \ - VPREFETCHG(8,%r9) \ - VPREFETCHG(9,%r9) \ - VPREFETCHG(10,%r9) \ - VPREFETCHG(11,%r9) \ + VPREFETCH_M1(8,%r9) \ + VPREFETCH_M1(9,%r9) \ + VPREFETCH_M1(10,%r9) \ + VPREFETCH_M1(11,%r9) \ /*55*/ \ VMADDSUB(Z3,Chi_02,UChi_00) \ VMADDSUB(Z3,Chi_12,UChi_10) \ @@ -711,56 +767,58 @@ Author: paboyle VMULIDUP(0,%r8,T1,UChi_00) VMULIDUP(0,%r8,T2,UChi_10) \ VMULIDUP(3,%r8,T1,UChi_01) VMULIDUP(3,%r8,T2,UChi_11) \ VMULIDUP(6,%r8,T1,UChi_02) VMULIDUP(6,%r8,T2,UChi_12) \ - VPREFETCHG(0,%r9) \ - VPREFETCHG(1,%r9) \ - VPREFETCHG(2,%r9) \ - VPREFETCHG(3,%r9) \ + VPREFETCH_M1(0,%r9) \ + VPREFETCH_M1(1,%r9) \ + VPREFETCH_M1(2,%r9) \ + VPREFETCH_M1(3,%r9) \ /*8*/ \ VSHUF(Chi_01,T1) VSHUF(Chi_11,T2) \ VMADDSUBRDUP(0,%r8,Chi_00,UChi_00) VMADDSUBRDUP(0,%r8,Chi_10,UChi_10) \ VMADDSUBRDUP(3,%r8,Chi_00,UChi_01) VMADDSUBRDUP(3,%r8,Chi_10,UChi_11) \ VMADDSUBRDUP(6,%r8,Chi_00,UChi_02) VMADDSUBRDUP(6,%r8,Chi_10,UChi_12) \ - VPREFETCHG(4,%r9) \ - VPREFETCHG(5,%r9) \ - VPREFETCHG(6,%r9) \ - VPREFETCHG(7,%r9) \ + VPREFETCH_M1(4,%r9) \ + VPREFETCH_M1(5,%r9) \ + VPREFETCH_M1(6,%r9) \ + VPREFETCH_M1(7,%r9) \ /*16*/ \ VMADDSUBIDUP(1,%r8,T1,UChi_00) VMADDSUBIDUP(1,%r8,T2,UChi_10) \ 
VMADDSUBIDUP(4,%r8,T1,UChi_01) VMADDSUBIDUP(4,%r8,T2,UChi_11) \ VMADDSUBIDUP(7,%r8,T1,UChi_02) VMADDSUBIDUP(7,%r8,T2,UChi_12) \ - VPREFETCHG(8,%r9) \ - VPREFETCHG(9,%r9) \ - VPREFETCHG(10,%r9) \ - VPREFETCHG(11,%r9) \ + VPREFETCH_M1(8,%r9) \ + VPREFETCH_M1(9,%r9) \ + VPREFETCH_M1(10,%r9) \ + VPREFETCH_M1(11,%r9) \ /*22*/ \ VSHUF(Chi_02,T1) VSHUF(Chi_12,T2) \ VMADDSUBRDUP(1,%r8,Chi_01,UChi_00) VMADDSUBRDUP(1,%r8,Chi_11,UChi_10) \ VMADDSUBRDUP(4,%r8,Chi_01,UChi_01) VMADDSUBRDUP(4,%r8,Chi_11,UChi_11) \ VMADDSUBRDUP(7,%r8,Chi_01,UChi_02) VMADDSUBRDUP(7,%r8,Chi_11,UChi_12) \ - VPREFETCH2(12,%r9) \ - VPREFETCH2(13,%r9) \ - VPREFETCH2(14,%r9) \ - VPREFETCH2(15,%r9) \ + VPREFETCH_M2(12,%r9) \ + VPREFETCH_M2(13,%r9) \ + VPREFETCH_M2(14,%r9) \ + VPREFETCH_M2(15,%r9) \ /*30*/ \ VMADDSUBIDUP(2,%r8,T1,UChi_00) VMADDSUBIDUP(2,%r8,T2,UChi_10) \ VMADDSUBIDUP(5,%r8,T1,UChi_01) VMADDSUBIDUP(5,%r8,T2,UChi_11) \ - VPREFETCH2(16,%r9) \ - VPREFETCH2(17,%r9) \ - VPREFETCH2(18,%r9) \ - VPREFETCH2(19,%r9) \ + VPREFETCH_M2(16,%r9) \ + VPREFETCH_M2(17,%r9) \ + VPREFETCH_M2(18,%r9) \ + VPREFETCH_M2(19,%r9) \ VMADDSUBIDUP(8,%r8,T1,UChi_02) VMADDSUBIDUP(8,%r8,T2,UChi_12) \ /*36*/ \ VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r8,Chi_12,UChi_10) \ VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r8,Chi_12,UChi_11) \ VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r8,Chi_12,UChi_12) \ - VPREFETCH2(20,%r9) \ - VPREFETCH2(21,%r9) \ - VPREFETCH2(22,%r9) \ - VPREFETCH2(23,%r9) \ - VPREFETCHG(2,%r8) \ - VPREFETCHG(3,%r8) \ - VPREFETCH2(4,%r8) \ - VPREFETCH2(5,%r8) \ + VPREFETCH_M2(20,%r9) \ + VPREFETCH_M2(21,%r9) \ + VPREFETCH_M2(22,%r9) \ + VPREFETCH_M2(23,%r9) \ + VPREFETCH_G1(2,%r8) \ + VPREFETCH_G1(3,%r8) \ + VPREFETCH_G2(4,%r8) \ + VPREFETCH_G2(5,%r8) \ + VPREFETCH_G2(6,%r8) \ + VPREFETCH_G2(7,%r8) \ /*42 insns*/ ); #define MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf) \ @@ -793,8 +851,8 @@ Author: paboyle VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r8,Chi_12,UChi_10) \ 
VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r8,Chi_12,UChi_11) \ VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r8,Chi_12,UChi_12) \ - /* VPREFETCHG(2,%r8)*/ \ - /* VPREFETCHG(3,%r8)*/ \ + /* VPREFETCH1(2,%r8)*/ \ + /* VPREFETCH1(3,%r8)*/ \ /*42 insns*/ ); diff --git a/lib/stencil/.dirstamp b/lib/stencil/.dirstamp new file mode 100644 index 00000000..e69de29b diff --git a/lib/stencil/Lebesgue.cc b/lib/stencil/Lebesgue.cc index 7704e08f..c34b5c96 100644 --- a/lib/stencil/Lebesgue.cc +++ b/lib/stencil/Lebesgue.cc @@ -49,16 +49,25 @@ LebesgueOrder::LebesgueOrder(GridBase *_grid) { grid = _grid; if ( Block[0]==0) ZGraph(); + else if ( Block[1]==0) NoBlocking(); else CartesianBlocking(); } +void LebesgueOrder::NoBlocking(void) +{ + std::cout<oSites();s++){ + _LebesgueReorder.push_back(s); + } +} void LebesgueOrder::CartesianBlocking(void) { _LebesgueReorder.resize(0); - std::cout << GridLogMessage << " CartesianBlocking "; - for(int d=0;d_ndimension; @@ -116,7 +125,8 @@ void LebesgueOrder::IterateI(int ND, void LebesgueOrder::ZGraph(void) { _LebesgueReorder.resize(0); - + + std::cout << GridLogDebug << " Lebesgue order "< Block; + void NoBlocking(void); void CartesianBlocking(void); void IterateO(int ND,int dim, std::vector & xo,