1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-11-14 01:35:36 +00:00

Optimisation...

This commit is contained in:
Peter Boyle 2015-05-19 15:50:47 +01:00
parent 5f0530b68a
commit 46ab8edf30
4 changed files with 223 additions and 60 deletions

View File

@ -138,16 +138,8 @@ void WilsonMatrix::MooeeInvDag(const LatticeFermion &in, LatticeFermion &out)
return ; return ;
} }
void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out,int dag) void WilsonMatrix::DhopSite(int ss,const LatticeFermion &in, LatticeFermion &out)
{ {
assert((dag==0) ||(dag==1));
WilsonCompressor compressor(dag);
Stencil.HaloExchange<vSpinColourVector,vHalfSpinColourVector,WilsonCompressor>(in,comm_buf,compressor);
PARALLEL_FOR_LOOP
for(int sss=0;sss<grid->oSites();sss++){
vHalfSpinColourVector tmp; vHalfSpinColourVector tmp;
vHalfSpinColourVector chi; vHalfSpinColourVector chi;
vSpinColourVector result; vSpinColourVector result;
@ -155,16 +147,14 @@ PARALLEL_FOR_LOOP
int offset,local,perm, ptype; int offset,local,perm, ptype;
// int ss = Stencil._LebesgueReorder[sss]; // int ss = Stencil._LebesgueReorder[sss];
int ss = sss;
int ssu= ss; int ssu= ss;
// Xp // Xp
int idx = (Xp+dag*4)%8; offset = Stencil._offsets [Xp][ss];
offset = Stencil._offsets [idx][ss]; local = Stencil._is_local[Xp][ss];
local = Stencil._is_local[idx][ss]; perm = Stencil._permute[Xp][ss];
perm = Stencil._permute[idx][ss];
ptype = Stencil._permute_type[idx]; ptype = Stencil._permute_type[Xp];
if ( local && perm ) { if ( local && perm ) {
spProjXp(tmp,in._odata[offset]); spProjXp(tmp,in._odata[offset]);
permute(chi,tmp,ptype); permute(chi,tmp,ptype);
@ -173,16 +163,14 @@ PARALLEL_FOR_LOOP
} else { } else {
chi=comm_buf[offset]; chi=comm_buf[offset];
} }
mult(&Uchi(),&Umu._odata[ssu](idx),&chi()); mult(&Uchi(),&Umu._odata[ssu](Xp),&chi());
spReconXp(result,Uchi); spReconXp(result,Uchi);
// Yp // Yp
idx = (Yp+dag*4)%8; offset = Stencil._offsets [Yp][ss];
offset = Stencil._offsets [idx][ss]; local = Stencil._is_local[Yp][ss];
local = Stencil._is_local[idx][ss]; perm = Stencil._permute[Yp][ss];
perm = Stencil._permute[idx][ss]; ptype = Stencil._permute_type[Yp];
ptype = Stencil._permute_type[idx];
if ( local && perm ) { if ( local && perm ) {
spProjYp(tmp,in._odata[offset]); spProjYp(tmp,in._odata[offset]);
permute(chi,tmp,ptype); permute(chi,tmp,ptype);
@ -191,15 +179,14 @@ PARALLEL_FOR_LOOP
} else { } else {
chi=comm_buf[offset]; chi=comm_buf[offset];
} }
mult(&Uchi(),&Umu._odata[ssu](idx),&chi()); mult(&Uchi(),&Umu._odata[ssu](Yp),&chi());
accumReconYp(result,Uchi); accumReconYp(result,Uchi);
// Zp // Zp
idx = (Zp+dag*4)%8; offset = Stencil._offsets [Zp][ss];
offset = Stencil._offsets [idx][ss]; local = Stencil._is_local[Zp][ss];
local = Stencil._is_local[idx][ss]; perm = Stencil._permute[Zp][ss];
perm = Stencil._permute[idx][ss]; ptype = Stencil._permute_type[Zp];
ptype = Stencil._permute_type[idx];
if ( local && perm ) { if ( local && perm ) {
spProjZp(tmp,in._odata[offset]); spProjZp(tmp,in._odata[offset]);
permute(chi,tmp,ptype); permute(chi,tmp,ptype);
@ -208,15 +195,14 @@ PARALLEL_FOR_LOOP
} else { } else {
chi=comm_buf[offset]; chi=comm_buf[offset];
} }
mult(&Uchi(),&Umu._odata[ssu](idx),&chi()); mult(&Uchi(),&Umu._odata[ssu](Zp),&chi());
accumReconZp(result,Uchi); accumReconZp(result,Uchi);
// Tp // Tp
idx = (Tp+dag*4)%8; offset = Stencil._offsets [Tp][ss];
offset = Stencil._offsets [idx][ss]; local = Stencil._is_local[Tp][ss];
local = Stencil._is_local[idx][ss]; perm = Stencil._permute[Tp][ss];
perm = Stencil._permute[idx][ss]; ptype = Stencil._permute_type[Tp];
ptype = Stencil._permute_type[idx];
if ( local && perm ) { if ( local && perm ) {
spProjTp(tmp,in._odata[offset]); spProjTp(tmp,in._odata[offset]);
permute(chi,tmp,ptype); permute(chi,tmp,ptype);
@ -225,15 +211,14 @@ PARALLEL_FOR_LOOP
} else { } else {
chi=comm_buf[offset]; chi=comm_buf[offset];
} }
mult(&Uchi(),&Umu._odata[ssu](idx),&chi()); mult(&Uchi(),&Umu._odata[ssu](Tp),&chi());
accumReconTp(result,Uchi); accumReconTp(result,Uchi);
// Xm // Xm
idx = (Xm+dag*4)%8; offset = Stencil._offsets [Xm][ss];
offset = Stencil._offsets [idx][ss]; local = Stencil._is_local[Xm][ss];
local = Stencil._is_local[idx][ss]; perm = Stencil._permute[Xm][ss];
perm = Stencil._permute[idx][ss]; ptype = Stencil._permute_type[Xm];
ptype = Stencil._permute_type[idx];
if ( local && perm ) if ( local && perm )
{ {
@ -244,16 +229,15 @@ PARALLEL_FOR_LOOP
} else { } else {
chi=comm_buf[offset]; chi=comm_buf[offset];
} }
mult(&Uchi(),&Umu._odata[ssu](idx),&chi()); mult(&Uchi(),&Umu._odata[ssu](Xm),&chi());
accumReconXm(result,Uchi); accumReconXm(result,Uchi);
// Ym // Ym
idx = (Ym+dag*4)%8; offset = Stencil._offsets [Ym][ss];
offset = Stencil._offsets [idx][ss]; local = Stencil._is_local[Ym][ss];
local = Stencil._is_local[idx][ss]; perm = Stencil._permute[Ym][ss];
perm = Stencil._permute[idx][ss]; ptype = Stencil._permute_type[Ym];
ptype = Stencil._permute_type[idx];
if ( local && perm ) { if ( local && perm ) {
spProjYm(tmp,in._odata[offset]); spProjYm(tmp,in._odata[offset]);
@ -263,15 +247,14 @@ PARALLEL_FOR_LOOP
} else { } else {
chi=comm_buf[offset]; chi=comm_buf[offset];
} }
mult(&Uchi(),&Umu._odata[ssu](idx),&chi()); mult(&Uchi(),&Umu._odata[ssu](Ym),&chi());
accumReconYm(result,Uchi); accumReconYm(result,Uchi);
// Zm // Zm
idx = (Zm+dag*4)%8; offset = Stencil._offsets [Zm][ss];
offset = Stencil._offsets [idx][ss]; local = Stencil._is_local[Zm][ss];
local = Stencil._is_local[idx][ss]; perm = Stencil._permute[Zm][ss];
perm = Stencil._permute[idx][ss]; ptype = Stencil._permute_type[Zm];
ptype = Stencil._permute_type[idx];
if ( local && perm ) { if ( local && perm ) {
spProjZm(tmp,in._odata[offset]); spProjZm(tmp,in._odata[offset]);
permute(chi,tmp,ptype); permute(chi,tmp,ptype);
@ -280,15 +263,14 @@ PARALLEL_FOR_LOOP
} else { } else {
chi=comm_buf[offset]; chi=comm_buf[offset];
} }
mult(&Uchi(),&Umu._odata[ssu](idx),&chi()); mult(&Uchi(),&Umu._odata[ssu](Zm),&chi());
accumReconZm(result,Uchi); accumReconZm(result,Uchi);
// Tm // Tm
idx = (Tm+dag*4)%8; offset = Stencil._offsets [Tm][ss];
offset = Stencil._offsets [idx][ss]; local = Stencil._is_local[Tm][ss];
local = Stencil._is_local[idx][ss]; perm = Stencil._permute[Tm][ss];
perm = Stencil._permute[idx][ss]; ptype = Stencil._permute_type[Tm];
ptype = Stencil._permute_type[idx];
if ( local && perm ) { if ( local && perm ) {
spProjTm(tmp,in._odata[offset]); spProjTm(tmp,in._odata[offset]);
permute(chi,tmp,ptype); permute(chi,tmp,ptype);
@ -297,12 +279,175 @@ PARALLEL_FOR_LOOP
} else { } else {
chi=comm_buf[offset]; chi=comm_buf[offset];
} }
mult(&Uchi(),&Umu._odata[ssu](idx),&chi()); mult(&Uchi(),&Umu._odata[ssu](Tm),&chi());
accumReconTm(result,Uchi);
vstream(out._odata[ss],result);
}
// Per-site kernel that Dhop runs when its dag flag is non-zero.
//
// For outer site ss it accumulates eight contributions into `result` and
// streams the sum into out._odata[ss]. Each contribution follows one pattern:
//   1. look up offset / is_local / permute / permute_type in the Stencil tables,
//   2. obtain the half spinor chi: projected from in._odata[offset] when the
//      neighbour is local (permuted via tmp if the table asks for it),
//      otherwise read from comm_buf[offset],
//   3. multiply by the gauge link Umu._odata[ssu](dir) into Uchi,
//   4. reconstruct (first leg) or accumulate-reconstruct (other legs) into result.
//
// Relative to DhopSite, every spin projector/reconstruct is paired with the
// OPPOSITE direction's stencil entry and gauge link (Xp projector with the Xm
// leg, and so on). This hard-codes what the previous version computed at run
// time as idx = (dir+dag*4)%8.
// NOTE(review): that equivalence assumes the direction enum places Xm..Tm four
// after Xp..Tp; the enum is not visible here, confirm.
// NOTE(review): the final three lines of this block show each statement twice;
// this looks like the side-by-side diff view fusing old and new columns.
//
// ss  : outer site index, used for the stencil tables, the gauge field and the output.
// in  : source fermion field, read at the neighbour offsets.
// out : destination fermion field, only element ss is written.
void WilsonMatrix::DhopSiteDag(int ss,const LatticeFermion &in, LatticeFermion &out)
{
// tmp  : projected half spinor before permutation
// chi  : half spinor fed to the link multiply
// Uchi : link * chi
vHalfSpinColourVector tmp;
vHalfSpinColourVector chi;
vSpinColourVector result;
vHalfSpinColourVector Uchi;
int offset,local,perm, ptype;
// Gauge field is indexed with the same site as the fermion field.
int ssu= ss;
// Xp projector/reconstruct, using the Xm stencil leg and Xm link.
// This first leg initialises result (spRecon); the rest accumulate.
offset = Stencil._offsets [Xm][ss];
local = Stencil._is_local[Xm][ss];
perm = Stencil._permute[Xm][ss];
ptype = Stencil._permute_type[Xm];
if ( local && perm ) {
spProjXp(tmp,in._odata[offset]);
permute(chi,tmp,ptype);
} else if ( local ) {
spProjXp(chi,in._odata[offset]);
} else {
// Non-local neighbour: take the half spinor from comm_buf, the buffer Dhop passes to HaloExchange.
chi=comm_buf[offset];
}
mult(&Uchi(),&Umu._odata[ssu](Xm),&chi());
spReconXp(result,Uchi);
// Yp projector/reconstruct, using the Ym stencil leg and Ym link
offset = Stencil._offsets [Ym][ss];
local = Stencil._is_local[Ym][ss];
perm = Stencil._permute[Ym][ss];
ptype = Stencil._permute_type[Ym];
if ( local && perm ) {
spProjYp(tmp,in._odata[offset]);
permute(chi,tmp,ptype);
} else if ( local ) {
spProjYp(chi,in._odata[offset]);
} else {
chi=comm_buf[offset];
}
mult(&Uchi(),&Umu._odata[ssu](Ym),&chi());
accumReconYp(result,Uchi);
// Zp projector/reconstruct, using the Zm stencil leg and Zm link
offset = Stencil._offsets [Zm][ss];
local = Stencil._is_local[Zm][ss];
perm = Stencil._permute[Zm][ss];
ptype = Stencil._permute_type[Zm];
if ( local && perm ) {
spProjZp(tmp,in._odata[offset]);
permute(chi,tmp,ptype);
} else if ( local ) {
spProjZp(chi,in._odata[offset]);
} else {
chi=comm_buf[offset];
}
mult(&Uchi(),&Umu._odata[ssu](Zm),&chi());
accumReconZp(result,Uchi);
// Tp projector/reconstruct, using the Tm stencil leg and Tm link
offset = Stencil._offsets [Tm][ss];
local = Stencil._is_local[Tm][ss];
perm = Stencil._permute[Tm][ss];
ptype = Stencil._permute_type[Tm];
if ( local && perm ) {
spProjTp(tmp,in._odata[offset]);
permute(chi,tmp,ptype);
} else if ( local ) {
spProjTp(chi,in._odata[offset]);
} else {
chi=comm_buf[offset];
}
mult(&Uchi(),&Umu._odata[ssu](Tm),&chi());
accumReconTp(result,Uchi);
// Xm projector/reconstruct, using the Xp stencil leg and Xp link
offset = Stencil._offsets [Xp][ss];
local = Stencil._is_local[Xp][ss];
perm = Stencil._permute[Xp][ss];
ptype = Stencil._permute_type[Xp];
if ( local && perm )
{
spProjXm(tmp,in._odata[offset]);
permute(chi,tmp,ptype);
} else if ( local ) {
spProjXm(chi,in._odata[offset]);
} else {
chi=comm_buf[offset];
}
mult(&Uchi(),&Umu._odata[ssu](Xp),&chi());
accumReconXm(result,Uchi);
// Ym projector/reconstruct, using the Yp stencil leg and Yp link
offset = Stencil._offsets [Yp][ss];
local = Stencil._is_local[Yp][ss];
perm = Stencil._permute[Yp][ss];
ptype = Stencil._permute_type[Yp];
if ( local && perm ) {
spProjYm(tmp,in._odata[offset]);
permute(chi,tmp,ptype);
} else if ( local ) {
spProjYm(chi,in._odata[offset]);
} else {
chi=comm_buf[offset];
}
mult(&Uchi(),&Umu._odata[ssu](Yp),&chi());
accumReconYm(result,Uchi);
// Zm projector/reconstruct, using the Zp stencil leg and Zp link
offset = Stencil._offsets [Zp][ss];
local = Stencil._is_local[Zp][ss];
perm = Stencil._permute[Zp][ss];
ptype = Stencil._permute_type[Zp];
if ( local && perm ) {
spProjZm(tmp,in._odata[offset]);
permute(chi,tmp,ptype);
} else if ( local ) {
spProjZm(chi,in._odata[offset]);
} else {
chi=comm_buf[offset];
}
mult(&Uchi(),&Umu._odata[ssu](Zp),&chi());
accumReconZm(result,Uchi);
// Tm projector/reconstruct, using the Tp stencil leg and Tp link
offset = Stencil._offsets [Tp][ss];
local = Stencil._is_local[Tp][ss];
perm = Stencil._permute[Tp][ss];
ptype = Stencil._permute_type[Tp];
if ( local && perm ) {
spProjTm(tmp,in._odata[offset]);
permute(chi,tmp,ptype);
} else if ( local ) {
spProjTm(chi,in._odata[offset]);
} else {
chi=comm_buf[offset];
}
mult(&Uchi(),&Umu._odata[ssu](Tp),&chi());
// Last accumulation, then stream the finished site result into the output field.
accumReconTm(result,Uchi); accumReconTm(result,Uchi);
vstream(out._odata[ss],result); vstream(out._odata[ss],result);
} }
// Apply the hopping term to every outer site of the grid.
//
// in  : source fermion field.
// out : destination fermion field; each site kernel writes out._odata[site].
// dag : 0 runs DhopSite at every site, 1 runs DhopSiteDag; any other value
//       trips the assert.
//
// The halo exchange runs first and fills comm_buf, which the site kernels read
// for non-local neighbours.
// NOTE(review): the doubled closing brace on the last line looks like residue
// of the side-by-side diff view rather than real code; confirm against the repository.
void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out,int dag)
{
assert((dag==0) ||(dag==1));
// The compressor is built with the same dag flag that selects the kernel below.
WilsonCompressor compressor(dag);
Stencil.HaloExchange<vSpinColourVector,vHalfSpinColourVector,WilsonCompressor>(in,comm_buf,compressor);
// The dag test is made once, outside the site loops, so each kernel uses fixed
// direction indices instead of computing them per site.
// PARALLEL_FOR_LOOP is a macro defined elsewhere; presumably it marks the
// following for-loop for threaded execution (TODO confirm).
if ( dag ) {
PARALLEL_FOR_LOOP
for(int sss=0;sss<grid->oSites();sss++){
DhopSiteDag(sss,in,out);
}
} else {
PARALLEL_FOR_LOOP
for(int sss=0;sss<grid->oSites();sss++){
DhopSite(sss,in,out);
}
}
} }

View File

@ -46,6 +46,8 @@ namespace Grid {
// non-hermitian hopping term; half cb or both // non-hermitian hopping term; half cb or both
void Dhop(const LatticeFermion &in, LatticeFermion &out,int dag); void Dhop(const LatticeFermion &in, LatticeFermion &out,int dag);
// Per-site hopping kernels called by Dhop for each outer site index ss:
// DhopSite when dag==0, DhopSiteDag when dag==1. Each reads neighbours of
// site ss from `in` and writes the result to element ss of `out`.
void DhopSite (int ss,const LatticeFermion &in, LatticeFermion &out);
void DhopSiteDag(int ss,const LatticeFermion &in, LatticeFermion &out);
typedef iScalar<iMatrix<vComplex, Nc> > matrix; typedef iScalar<iMatrix<vComplex, Nc> > matrix;

9
scripts/bench_wilson.sh Executable file
View File

@ -0,0 +1,9 @@
#!/usr/bin/env bash
# Benchmark sweep for the Wilson kernel.
#
# For every thread count in the outer list, runs ./benchmarks/Grid_wilson over a
# series of lattice volumes and appends one "<volume> <mflop figure>" line per
# volume to wilson.t<threads>.
#
# Run from the directory that contains ./benchmarks/ (the path is relative).

for omp in 1 2 4
do
  results="wilson.t$omp"
  # Start the results file afresh. As before, this leaves one empty first line.
  echo > "$results"
  for vol in 4.4.4.4 4.4.4.8 4.4.8.8 4.8.8.8 8.8.8.8 8.8.8.16 8.8.16.16 8.16.16.16
  do
    # Third whitespace-separated field of each benchmark output line containing "mflop".
    perf=$(./benchmarks/Grid_wilson --grid "$vol" --omp "$omp" | grep mflop | awk '{print $3}')
    # $perf is deliberately left unquoted: if several lines matched, word
    # splitting flattens them onto a single output line, as the original did.
    echo "$vol" $perf >> "$results"
  done
done

7
scripts/wilson.gnu Normal file
View File

@ -0,0 +1,7 @@
# Plot the results files wilson.t1 / wilson.t2 / wilson.t4 as three curves.
# Only column 2 is selected, so gnuplot supplies the x value itself
# (the row number within the file).
plot 'wilson.t1' using 2 with lines title "AVX1-OMP=1"
replot 'wilson.t2' using 2 with lines title "AVX1-OMP=2"
replot 'wilson.t4' using 2 with lines title "AVX1-OMP=4"

# Switch to the PDF terminal and redraw the same three curves into a file.
set terminal 'pdf'
set output 'wilson_clang.pdf'
replot
quit