From 2d8b5a8191603df67f4939e1ef9c4774c203683b Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 19 May 2015 15:50:47 +0100 Subject: [PATCH] Optimisation... --- lib/qcd/Grid_qcd_wilson_dop.cc | 265 +++++++++++++++++++++++++-------- lib/qcd/Grid_qcd_wilson_dop.h | 2 + scripts/bench_wilson.sh | 9 ++ scripts/wilson.gnu | 7 + 4 files changed, 223 insertions(+), 60 deletions(-) create mode 100755 scripts/bench_wilson.sh create mode 100644 scripts/wilson.gnu diff --git a/lib/qcd/Grid_qcd_wilson_dop.cc b/lib/qcd/Grid_qcd_wilson_dop.cc index 5143daf3..9ef4af0c 100644 --- a/lib/qcd/Grid_qcd_wilson_dop.cc +++ b/lib/qcd/Grid_qcd_wilson_dop.cc @@ -138,16 +138,8 @@ void WilsonMatrix::MooeeInvDag(const LatticeFermion &in, LatticeFermion &out) return ; } -void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out,int dag) +void WilsonMatrix::DhopSite(int ss,const LatticeFermion &in, LatticeFermion &out) { - assert((dag==0) ||(dag==1)); - - WilsonCompressor compressor(dag); - Stencil.HaloExchange(in,comm_buf,compressor); - -PARALLEL_FOR_LOOP - for(int sss=0;sssoSites();sss++){ - vHalfSpinColourVector tmp; vHalfSpinColourVector chi; vSpinColourVector result; @@ -155,16 +147,14 @@ PARALLEL_FOR_LOOP int offset,local,perm, ptype; // int ss = Stencil._LebesgueReorder[sss]; - int ss = sss; int ssu= ss; // Xp - int idx = (Xp+dag*4)%8; - offset = Stencil._offsets [idx][ss]; - local = Stencil._is_local[idx][ss]; - perm = Stencil._permute[idx][ss]; + offset = Stencil._offsets [Xp][ss]; + local = Stencil._is_local[Xp][ss]; + perm = Stencil._permute[Xp][ss]; - ptype = Stencil._permute_type[idx]; + ptype = Stencil._permute_type[Xp]; if ( local && perm ) { spProjXp(tmp,in._odata[offset]); permute(chi,tmp,ptype); @@ -173,16 +163,14 @@ PARALLEL_FOR_LOOP } else { chi=comm_buf[offset]; } - mult(&Uchi(),&Umu._odata[ssu](idx),&chi()); + mult(&Uchi(),&Umu._odata[ssu](Xp),&chi()); spReconXp(result,Uchi); // Yp - idx = (Yp+dag*4)%8; - offset = Stencil._offsets [idx][ss]; - local = Stencil._is_local[idx][ss]; - perm = Stencil._permute[idx][ss]; - ptype = Stencil._permute_type[idx]; - + offset = Stencil._offsets [Yp][ss]; + local = Stencil._is_local[Yp][ss]; + perm = Stencil._permute[Yp][ss]; + ptype = Stencil._permute_type[Yp]; if ( local && perm ) { spProjYp(tmp,in._odata[offset]); permute(chi,tmp,ptype); @@ -191,15 +179,14 @@ PARALLEL_FOR_LOOP } else { chi=comm_buf[offset]; } - mult(&Uchi(),&Umu._odata[ssu](idx),&chi()); + mult(&Uchi(),&Umu._odata[ssu](Yp),&chi()); accumReconYp(result,Uchi); // Zp - idx = (Zp+dag*4)%8; - offset = Stencil._offsets [idx][ss]; - local = Stencil._is_local[idx][ss]; - perm = Stencil._permute[idx][ss]; - ptype = Stencil._permute_type[idx]; + offset = Stencil._offsets [Zp][ss]; + local = Stencil._is_local[Zp][ss]; + perm = Stencil._permute[Zp][ss]; + ptype = Stencil._permute_type[Zp]; if ( local && perm ) { spProjZp(tmp,in._odata[offset]); permute(chi,tmp,ptype); @@ -208,15 +195,14 @@ PARALLEL_FOR_LOOP } else { chi=comm_buf[offset]; } - mult(&Uchi(),&Umu._odata[ssu](idx),&chi()); + mult(&Uchi(),&Umu._odata[ssu](Zp),&chi()); accumReconZp(result,Uchi); // Tp - idx = (Tp+dag*4)%8; - offset = Stencil._offsets [idx][ss]; - local = Stencil._is_local[idx][ss]; - perm = Stencil._permute[idx][ss]; - ptype = Stencil._permute_type[idx]; + offset = Stencil._offsets [Tp][ss]; + local = Stencil._is_local[Tp][ss]; + perm = Stencil._permute[Tp][ss]; + ptype = Stencil._permute_type[Tp]; if ( local && perm ) { spProjTp(tmp,in._odata[offset]); permute(chi,tmp,ptype); @@ -225,15 +211,14 @@ PARALLEL_FOR_LOOP } else { chi=comm_buf[offset]; } - mult(&Uchi(),&Umu._odata[ssu](idx),&chi()); + mult(&Uchi(),&Umu._odata[ssu](Tp),&chi()); accumReconTp(result,Uchi); // Xm - idx = (Xm+dag*4)%8; - offset = Stencil._offsets [idx][ss]; - local = Stencil._is_local[idx][ss]; - perm = Stencil._permute[idx][ss]; - ptype = Stencil._permute_type[idx]; + offset = Stencil._offsets [Xm][ss]; + local = Stencil._is_local[Xm][ss]; + perm = Stencil._permute[Xm][ss]; + ptype = Stencil._permute_type[Xm]; if ( local && perm ) { @@ -244,16 +229,15 @@ PARALLEL_FOR_LOOP } else { chi=comm_buf[offset]; } - mult(&Uchi(),&Umu._odata[ssu](idx),&chi()); + mult(&Uchi(),&Umu._odata[ssu](Xm),&chi()); accumReconXm(result,Uchi); // Ym - idx = (Ym+dag*4)%8; - offset = Stencil._offsets [idx][ss]; - local = Stencil._is_local[idx][ss]; - perm = Stencil._permute[idx][ss]; - ptype = Stencil._permute_type[idx]; + offset = Stencil._offsets [Ym][ss]; + local = Stencil._is_local[Ym][ss]; + perm = Stencil._permute[Ym][ss]; + ptype = Stencil._permute_type[Ym]; if ( local && perm ) { spProjYm(tmp,in._odata[offset]); @@ -263,15 +247,14 @@ PARALLEL_FOR_LOOP } else { chi=comm_buf[offset]; } - mult(&Uchi(),&Umu._odata[ssu](idx),&chi()); + mult(&Uchi(),&Umu._odata[ssu](Ym),&chi()); accumReconYm(result,Uchi); // Zm - idx = (Zm+dag*4)%8; - offset = Stencil._offsets [idx][ss]; - local = Stencil._is_local[idx][ss]; - perm = Stencil._permute[idx][ss]; - ptype = Stencil._permute_type[idx]; + offset = Stencil._offsets [Zm][ss]; + local = Stencil._is_local[Zm][ss]; + perm = Stencil._permute[Zm][ss]; + ptype = Stencil._permute_type[Zm]; if ( local && perm ) { spProjZm(tmp,in._odata[offset]); permute(chi,tmp,ptype); @@ -280,15 +263,14 @@ PARALLEL_FOR_LOOP } else { chi=comm_buf[offset]; } - mult(&Uchi(),&Umu._odata[ssu](idx),&chi()); + mult(&Uchi(),&Umu._odata[ssu](Zm),&chi()); accumReconZm(result,Uchi); // Tm - idx = (Tm+dag*4)%8; - offset = Stencil._offsets [idx][ss]; - local = Stencil._is_local[idx][ss]; - perm = Stencil._permute[idx][ss]; - ptype = Stencil._permute_type[idx]; + offset = Stencil._offsets [Tm][ss]; + local = Stencil._is_local[Tm][ss]; + perm = Stencil._permute[Tm][ss]; + ptype = Stencil._permute_type[Tm]; if ( local && perm ) { spProjTm(tmp,in._odata[offset]); permute(chi,tmp,ptype); @@ -297,12 +279,175 @@ PARALLEL_FOR_LOOP } else { chi=comm_buf[offset]; } - mult(&Uchi(),&Umu._odata[ssu](idx),&chi()); + mult(&Uchi(),&Umu._odata[ssu](Tm),&chi()); accumReconTm(result,Uchi); vstream(out._odata[ss],result); - } +} +void WilsonMatrix::DhopSiteDag(int ss,const LatticeFermion &in, LatticeFermion &out) +{ + vHalfSpinColourVector tmp; + vHalfSpinColourVector chi; + vSpinColourVector result; + vHalfSpinColourVector Uchi; + int offset,local,perm, ptype; + int ssu= ss; + + // Xp + offset = Stencil._offsets [Xm][ss]; + local = Stencil._is_local[Xm][ss]; + perm = Stencil._permute[Xm][ss]; + + ptype = Stencil._permute_type[Xm]; + if ( local && perm ) { + spProjXp(tmp,in._odata[offset]); + permute(chi,tmp,ptype); + } else if ( local ) { + spProjXp(chi,in._odata[offset]); + } else { + chi=comm_buf[offset]; + } + mult(&Uchi(),&Umu._odata[ssu](Xm),&chi()); + spReconXp(result,Uchi); + + // Yp + offset = Stencil._offsets [Ym][ss]; + local = Stencil._is_local[Ym][ss]; + perm = Stencil._permute[Ym][ss]; + ptype = Stencil._permute_type[Ym]; + if ( local && perm ) { + spProjYp(tmp,in._odata[offset]); + permute(chi,tmp,ptype); + } else if ( local ) { + spProjYp(chi,in._odata[offset]); + } else { + chi=comm_buf[offset]; + } + mult(&Uchi(),&Umu._odata[ssu](Ym),&chi()); + accumReconYp(result,Uchi); + + // Zp + offset = Stencil._offsets [Zm][ss]; + local = Stencil._is_local[Zm][ss]; + perm = Stencil._permute[Zm][ss]; + ptype = Stencil._permute_type[Zm]; + if ( local && perm ) { + spProjZp(tmp,in._odata[offset]); + permute(chi,tmp,ptype); + } else if ( local ) { + spProjZp(chi,in._odata[offset]); + } else { + chi=comm_buf[offset]; + } + mult(&Uchi(),&Umu._odata[ssu](Zm),&chi()); + accumReconZp(result,Uchi); + + // Tp + offset = Stencil._offsets [Tm][ss]; + local = Stencil._is_local[Tm][ss]; + perm = Stencil._permute[Tm][ss]; + ptype = Stencil._permute_type[Tm]; + if ( local && perm ) { + spProjTp(tmp,in._odata[offset]); + permute(chi,tmp,ptype); + } else if ( local ) { + spProjTp(chi,in._odata[offset]); + } else { + chi=comm_buf[offset]; + } + mult(&Uchi(),&Umu._odata[ssu](Tm),&chi()); + accumReconTp(result,Uchi); + + // Xm + offset = Stencil._offsets [Xp][ss]; + local = Stencil._is_local[Xp][ss]; + perm = Stencil._permute[Xp][ss]; + ptype = Stencil._permute_type[Xp]; + + if ( local && perm ) + { + spProjXm(tmp,in._odata[offset]); + permute(chi,tmp,ptype); + } else if ( local ) { + spProjXm(chi,in._odata[offset]); + } else { + chi=comm_buf[offset]; + } + mult(&Uchi(),&Umu._odata[ssu](Xp),&chi()); + accumReconXm(result,Uchi); + + + // Ym + offset = Stencil._offsets [Yp][ss]; + local = Stencil._is_local[Yp][ss]; + perm = Stencil._permute[Yp][ss]; + ptype = Stencil._permute_type[Yp]; + + if ( local && perm ) { + spProjYm(tmp,in._odata[offset]); + permute(chi,tmp,ptype); + } else if ( local ) { + spProjYm(chi,in._odata[offset]); + } else { + chi=comm_buf[offset]; + } + mult(&Uchi(),&Umu._odata[ssu](Yp),&chi()); + accumReconYm(result,Uchi); + + // Zm + offset = Stencil._offsets [Zp][ss]; + local = Stencil._is_local[Zp][ss]; + perm = Stencil._permute[Zp][ss]; + ptype = Stencil._permute_type[Zp]; + if ( local && perm ) { + spProjZm(tmp,in._odata[offset]); + permute(chi,tmp,ptype); + } else if ( local ) { + spProjZm(chi,in._odata[offset]); + } else { + chi=comm_buf[offset]; + } + mult(&Uchi(),&Umu._odata[ssu](Zp),&chi()); + accumReconZm(result,Uchi); + + // Tm + offset = Stencil._offsets [Tp][ss]; + local = Stencil._is_local[Tp][ss]; + perm = Stencil._permute[Tp][ss]; + ptype = Stencil._permute_type[Tp]; + if ( local && perm ) { + spProjTm(tmp,in._odata[offset]); + permute(chi,tmp,ptype); + } else if ( local ) { + spProjTm(chi,in._odata[offset]); + } else { + chi=comm_buf[offset]; + } + mult(&Uchi(),&Umu._odata[ssu](Tp),&chi()); + accumReconTm(result,Uchi); + + vstream(out._odata[ss],result); +} + +void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out,int dag) +{ + assert((dag==0) ||(dag==1)); + + WilsonCompressor compressor(dag); + Stencil.HaloExchange(in,comm_buf,compressor); + + if ( dag ) { +PARALLEL_FOR_LOOP + for(int sss=0;sssoSites();sss++){ + DhopSiteDag(sss,in,out); + } + } else { +PARALLEL_FOR_LOOP + for(int sss=0;sssoSites();sss++){ + DhopSite(sss,in,out); + } + } } diff --git a/lib/qcd/Grid_qcd_wilson_dop.h b/lib/qcd/Grid_qcd_wilson_dop.h index 1db95b33..8d93a926 100644 --- a/lib/qcd/Grid_qcd_wilson_dop.h +++ b/lib/qcd/Grid_qcd_wilson_dop.h @@ -46,6 +46,8 @@ namespace Grid { // non-hermitian hopping term; half cb or both void Dhop(const LatticeFermion &in, LatticeFermion &out,int dag); + void DhopSite (int ss,const LatticeFermion &in, LatticeFermion &out); + void DhopSiteDag(int ss,const LatticeFermion &in, LatticeFermion &out); typedef iScalar > matrix; diff --git a/scripts/bench_wilson.sh b/scripts/bench_wilson.sh new file mode 100755 index 00000000..af73d591 --- /dev/null +++ b/scripts/bench_wilson.sh @@ -0,0 +1,9 @@ +for omp in 1 2 4 +do +echo > wilson.t$omp +for vol in 4.4.4.4 4.4.4.8 4.4.8.8 4.8.8.8 8.8.8.8 8.8.8.16 8.8.16.16 8.16.16.16 +do +perf=` ./benchmarks/Grid_wilson --grid $vol --omp $omp | grep mflop | awk '{print $3}'` +echo $vol $perf >> wilson.t$omp +done +done \ No newline at end of file diff --git a/scripts/wilson.gnu b/scripts/wilson.gnu new file mode 100644 index 00000000..69bca5b5 --- /dev/null +++ b/scripts/wilson.gnu @@ -0,0 +1,7 @@ +plot 'wilson.t1' u 2 w l t "AVX1-OMP=1" +replot 'wilson.t2' u 2 w l t "AVX1-OMP=2" +replot 'wilson.t4' u 2 w l t "AVX1-OMP=4" +set terminal 'pdf' +set output 'wilson_clang.pdf' +replot +quit