diff --git a/benchmarks/Benchmark_dwf_sweep.cc b/benchmarks/Benchmark_dwf_sweep.cc index 302059a4..94a00903 100644 --- a/benchmarks/Benchmark_dwf_sweep.cc +++ b/benchmarks/Benchmark_dwf_sweep.cc @@ -68,10 +68,12 @@ int main (int argc, char ** argv) std::cout< latt4(4,L); - for(int d=4;d>0;d--){ + for(int d=4;d>dmin;d--){ if ( d<=3 ) latt4[d]*=2; std::cout << GridLogMessage <<"\t"; for(int d=0;d & latt4, int Ls, int threads,int report ) Dw.Dhop(src,result,0); double t1=usecond(); +#ifdef TIMERS_OFF + int ncall =10; +#else int ncall =1+(int) ((5.0*1000*1000)/(t1-t0)); +#endif if (ncall < 5 ) exit(0); @@ -297,7 +303,11 @@ void benchsDw(std::vector & latt4, int Ls, int threads, int report ) sDw.Dhop(ssrc,sresult,0); double t1=usecond(); +#ifdef TIMERS_OFF + int ncall =10; +#else int ncall =1+(int) ((5.0*1000*1000)/(t1-t0)); +#endif PerformanceCounter Counter(8); Counter.Start(); @@ -340,7 +350,9 @@ void benchsDw(std::vector & latt4, int Ls, int threads, int report ) CounterSdw.Start(); t0=usecond(); for(int i=0;i void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, std::vector > &buf, @@ -80,6 +83,8 @@ void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrd #undef VMOVRDUP #undef MAYBEPERM #undef MULT_2SPIN +#undef FX +#define FX(A) DWFASM_ ## A #define MAYBEPERM(A,B) #define VMOVIDUP(A,B,C) VBCASTIDUPf(A,B,C) #define VMOVRDUP(A,B,C) VBCASTRDUPf(A,B,C) diff --git a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h index bd96b7d5..d3e86276 100644 --- a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h +++ b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h @@ -1,8 +1,7 @@ { int locala,perma, ptypea; int localb,permb, ptypeb; - int localc,permc, ptypec; - uint64_t basea, baseb, basec; + uint64_t basea, baseb; uint64_t basex; const uint64_t plocal =(uint64_t) & in._odata[0]; @@ -12,22 +11,15 @@ MASK_REGS; for(int site=0;site shuffle and xor the real part sign bit YM_PROJMEM(baseb); @@ -56,7 +47,7 @@ LOAD_CHI(baseb); } { - MULT_2SPIN_DIR_PFYP(Yp,basec); + MULT_2SPIN_DIR_PFYP(Yp,basea); } LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit YM_RECON_ACCUM; @@ -65,16 +56,15 @@ // Zp //////////////////////////////// baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++; - PREFETCH_CHIMU(baseb); - if ( localc ) { + if ( locala ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit - ZM_PROJMEM(basec); - MAYBEPERM(PERMUTE_DIR1,permc); + ZM_PROJMEM(basea); + MAYBEPERM(PERMUTE_DIR1,perma); } else { - LOAD_CHI(basec); + LOAD_CHI(basea); } { - MULT_2SPIN_DIR_PFZP(Zp,basea); + MULT_2SPIN_DIR_PFZP(Zp,baseb); } LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit ZM_RECON_ACCUM; @@ -82,17 +72,16 @@ //////////////////////////////// // Tp //////////////////////////////// - basec = st.GetInfo(ptypec,localc,permc,Xp,ent,plocal); ent++; - PREFETCH_CHIMU(basec); - if ( locala ) { + basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++; + if ( localb ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit - TM_PROJMEM(basea); - MAYBEPERM(PERMUTE_DIR0,perma); + TM_PROJMEM(baseb); + MAYBEPERM(PERMUTE_DIR0,permb); } else { - LOAD_CHI(basea); + LOAD_CHI(baseb); } { - MULT_2SPIN_DIR_PFTP(Tp,baseb); + MULT_2SPIN_DIR_PFTP(Tp,basea); } LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit TM_RECON_ACCUM; @@ -100,17 +89,16 @@ //////////////////////////////// // Xm //////////////////////////////// - basea = st.GetInfo(ptypea,locala,perma,Yp,ent,plocal); ent++; - PREFETCH_CHIMU(basea); - if ( localb ) { + baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++; + if ( locala ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit - XP_PROJMEM(baseb); - MAYBEPERM(PERMUTE_DIR3,permb); + XP_PROJMEM(basea); + MAYBEPERM(PERMUTE_DIR3,perma); } else { - LOAD_CHI(baseb); + LOAD_CHI(basea); } { - MULT_2SPIN_DIR_PFXM(Xm,basec); + MULT_2SPIN_DIR_PFXM(Xm,baseb); } LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit XP_RECON_ACCUM; @@ -118,14 +106,13 @@ //////////////////////////////// // Ym //////////////////////////////// - baseb = st.GetInfo(ptypeb,localb,permb,Xp,ent,plocal); ent++; - PREFETCH_CHIMU(baseb); - if ( localc ) { + basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++; + if ( localb ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit - YP_PROJMEM(basec); - MAYBEPERM(PERMUTE_DIR2,permc); + YP_PROJMEM(baseb); + MAYBEPERM(PERMUTE_DIR2,permb); } else { - LOAD_CHI(basec); + LOAD_CHI(baseb); } { MULT_2SPIN_DIR_PFYM(Ym,basea); @@ -136,8 +123,7 @@ //////////////////////////////// // Zm //////////////////////////////// - basec = st.GetInfo(ptypec,localc,permc,Yp,ent,plocal); ent++; - PREFETCH_CHIMU(basec); + baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++; if ( locala ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit ZP_PROJMEM(basea); @@ -155,7 +141,6 @@ // Tm //////////////////////////////// basea = (uint64_t)&out._odata[ss]; - PREFETCH_CHIMU(basea); if ( localb ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit TP_PROJMEM(baseb); @@ -163,16 +148,16 @@ } else { LOAD_CHI(baseb); } + baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); { - MULT_2SPIN_DIR_PFTM(Tm,basec); + MULT_2SPIN_DIR_PFTM(Tm,basea); } LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit TP_RECON_ACCUM; - // PREFETCH_CHIMU(basex); - SAVE_RESULT(&out._odata[ss]); - - } + SAVE_RESULT(&out._odata[ss],baseb); + + } ssU++; } } diff --git a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h.abc b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h.abc new file mode 100644 index 00000000..5a3e01f7 --- /dev/null +++ b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h.abc @@ -0,0 +1,187 @@ +{ + int locala,perma, ptypea; + int localb,permb, ptypeb; + int localc,permc, ptypec; + uint64_t basea, baseb, basec; + uint64_t basex; + const uint64_t plocal =(uint64_t) & in._odata[0]; + + // vComplexF isigns[2] = { signs[0], signs[1] }; + vComplexF *isigns = &signs[0]; + + MASK_REGS; + + for(int site=0;site shuffle and xor the real part sign bit + YM_PROJMEM(baseb); + MAYBEPERM(PERMUTE_DIR2,permb); + } else { + LOAD_CHI(baseb); + } + { + MULT_2SPIN_DIR_PFYP(Yp,basec); + } + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + YM_RECON_ACCUM; + + //////////////////////////////// + // Zp + //////////////////////////////// + baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++; + PREFETCH_CHIMU(baseb); + label(FX(ZP) ); + if ( localc ) { + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + ZM_PROJMEM(basec); + MAYBEPERM(PERMUTE_DIR1,permc); + } else { + LOAD_CHI(basec); + } + { + MULT_2SPIN_DIR_PFZP(Zp,basea); + } + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + ZM_RECON_ACCUM; + + //////////////////////////////// + // Tp + //////////////////////////////// + basec = st.GetInfo(ptypec,localc,permc,Xp,ent,plocal); ent++; + PREFETCH_CHIMU(basec); + label(FX(TP) ); + if ( locala ) { + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + TM_PROJMEM(basea); + MAYBEPERM(PERMUTE_DIR0,perma); + } else { + LOAD_CHI(basea); + } + { + MULT_2SPIN_DIR_PFTP(Tp,baseb); + } + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + TM_RECON_ACCUM; + + //////////////////////////////// + // Xm + //////////////////////////////// + basea = st.GetInfo(ptypea,locala,perma,Yp,ent,plocal); ent++; + PREFETCH_CHIMU(basea); + label(FX(XM) ); + if ( localb ) { + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + XP_PROJMEM(baseb); + MAYBEPERM(PERMUTE_DIR3,permb); + } else { + LOAD_CHI(baseb); + } + { + MULT_2SPIN_DIR_PFXM(Xm,basec); + } + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + XP_RECON_ACCUM; + + //////////////////////////////// + // Ym + //////////////////////////////// + baseb = st.GetInfo(ptypeb,localb,permb,Xp,ent,plocal); ent++; + PREFETCH_CHIMU(baseb); + label(FX(YM) ); + if ( localc ) { + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + YP_PROJMEM(basec); + MAYBEPERM(PERMUTE_DIR2,permc); + } else { + LOAD_CHI(basec); + } + { + MULT_2SPIN_DIR_PFYM(Ym,basea); + } + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + YP_RECON_ACCUM; + + //////////////////////////////// + // Zm + //////////////////////////////// + basec = st.GetInfo(ptypec,localc,permc,Yp,ent,plocal); ent++; + PREFETCH_CHIMU(basec); + label(FX(ZM) ); + if ( locala ) { + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + ZP_PROJMEM(basea); + MAYBEPERM(PERMUTE_DIR1,perma); + } else { + LOAD_CHI(basea); + } + { + MULT_2SPIN_DIR_PFZM(Zm,baseb); + } + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + ZP_RECON_ACCUM; + + //////////////////////////////// + // Tm + //////////////////////////////// + basea = (uint64_t)&out._odata[ss]; + PREFETCH_CHIMU(basea); + label(FX(TM) ); + if ( localb ) { + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + TP_PROJMEM(baseb); + MAYBEPERM(PERMUTE_DIR0,permb); + } else { + LOAD_CHI(baseb); + } + { + MULT_2SPIN_DIR_PFTM(Tm,basec); + } + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + TP_RECON_ACCUM; + + // PREFETCH_CHIMU(basex); + label(FX(SAV) ); + SAVE_RESULT(&out._odata[ss]); + + } + ssU++; + } +} diff --git a/lib/simd/Intel512common.h b/lib/simd/Intel512common.h index a3cd980d..6878bcfb 100644 --- a/lib/simd/Intel512common.h +++ b/lib/simd/Intel512common.h @@ -1,4 +1,4 @@ - /************************************************************************************* + /************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -37,6 +37,8 @@ Author: paboyle "mov $0x5555, %%eax \n"\ "kmovw %%eax, %%k7 \n" : : : "%eax"); +//#define label(B) __asm__ ( __func__ __LINE__ #B ":\n" ); + #define VZEROf(A) "vpxorq " #A "," #A "," #A ";\n" #define VZEROd(A) "vpxorq " #A "," #A "," #A ";\n"