1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-11-10 07:55:35 +00:00

Enable reordering of the loops in the assembler for cache friendly.

This gets in the way of L2 prefetching however. Do next next link in stencil
prefetching.
This commit is contained in:
paboyle 2016-06-19 11:45:58 -07:00
parent d6737e4bd8
commit 6d58cb2a68
15 changed files with 670 additions and 116 deletions

View File

@ -0,0 +1,358 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./benchmarks/Benchmark_dwf.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid.h>
#include <PerfCount.h>
using namespace std;
using namespace Grid;
using namespace Grid::QCD;
template<class d>
struct scal {
d internal;
};
Gamma::GammaMatrix Gmu [] = {
Gamma::GammaX,
Gamma::GammaY,
Gamma::GammaZ,
Gamma::GammaT
};
void benchDw(std::vector<int> & L, int Ls, int threads, int report =0 );
void benchsDw(std::vector<int> & L, int Ls, int threads, int report=0 );
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
const int Ls=16;
int threads = GridThread::GetThreads();
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
if ( getenv("ASMOPT") ) {
QCD::WilsonKernelsStatic::AsmOpt=1;
} else {
QCD::WilsonKernelsStatic::AsmOpt=0;
}
std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
std::cout<<GridLogMessage << "Volume \t\t\tProcs \t Dw \t eoDw \t sDw \t eosDw (Mflop/s) "<<std::endl;
std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
int Lmax=32;
if ( getenv("LMAX") ) Lmax=atoi(getenv("LMAX"));
for (int L=8;L<Lmax;L*=2){
std::vector<int> latt4(4,L);
for(int d=4;d>0;d--){
if ( d<=3 ) latt4[d]*=2;
std::cout << GridLogMessage <<"\t";
for(int d=0;d<Nd;d++){
std::cout<<latt4[d]<<"x";
}
std::cout <<Ls<<"\t" ;
benchDw (latt4,Ls,threads,0);
benchsDw(latt4,Ls,threads,0);
std::cout<<std::endl;
}
}
std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
{
std::vector<int> latt4(4,16);
std::cout<<GridLogMessage << "16^4 Dw miss rate"<<std::endl;
benchDw (latt4,Ls,threads,1);
std::cout<<GridLogMessage << "16^4 sDw miss rate"<<std::endl;
benchsDw(latt4,Ls,threads,1);
}
Grid_finalize();
}
#undef CHECK
void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
{
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
#ifdef CHECK
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
LatticeFermion src (FGrid); random(RNG5,src);
LatticeGaugeField Umu(UGrid);
random(RNG4,Umu);
#else
LatticeFermion src (FGrid); src=zero;
LatticeGaugeField Umu(UGrid); Umu=zero;
#endif
LatticeFermion result(FGrid); result=zero;
LatticeFermion ref(FGrid); ref=zero;
LatticeFermion tmp(FGrid);
LatticeFermion err(FGrid);
ColourMatrix cm = Complex(1.0,0.0);
LatticeGaugeField Umu5d(FGrid);
// replicate across fifth dimension
for(int ss=0;ss<Umu._grid->oSites();ss++){
for(int s=0;s<Ls;s++){
Umu5d._odata[Ls*ss+s] = Umu._odata[ss];
}
}
////////////////////////////////////
// Naive wilson implementation
////////////////////////////////////
std::vector<LatticeColourMatrix> U(4,FGrid);
for(int mu=0;mu<Nd;mu++){
U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
}
#ifdef CHECK
if (1)
{
ref = zero;
for(int mu=0;mu<Nd;mu++){
tmp = U[mu]*Cshift(src,mu+1,1);
ref=ref + tmp - Gamma(Gmu[mu])*tmp;
tmp =adj(U[mu])*src;
tmp =Cshift(tmp,mu+1,-1);
ref=ref + tmp + Gamma(Gmu[mu])*tmp;
}
ref = -0.5*ref;
}
#endif
RealD mass=0.1;
RealD M5 =1.8;
RealD NP = UGrid->_Nprocessors;
DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
double t0=usecond();
Dw.Dhop(src,result,0);
double t1=usecond();
int ncall =1+(int) ((5.0*1000*1000)/(t1-t0));
if (ncall < 5 ) exit(0);
Dw.Dhop(src,result,0);
PerformanceCounter Counter(8);
Counter.Start();
t0=usecond();
for(int i=0;i<ncall;i++){
Dw.Dhop(src,result,0);
}
t1=usecond();
Counter.Stop();
if ( report ) {
Counter.Report();
}
if ( ! report )
{
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=1344*volume*ncall;
std::cout <<"\t"<<NP<< "\t"<<flops/(t1-t0)<< "\t";
}
#ifdef CHECK
err = ref-result;
RealD errd = norm2(err);
if ( errd> 1.0e-4 ) {
std::cout<<GridLogMessage << "oops !!! norm diff "<< norm2(err)<<std::endl;
exit(-1);
}
#endif
LatticeFermion src_e (FrbGrid);
LatticeFermion src_o (FrbGrid);
LatticeFermion r_e (FrbGrid);
LatticeFermion r_o (FrbGrid);
LatticeFermion r_eo (FGrid);
pickCheckerboard(Even,src_e,src);
pickCheckerboard(Odd,src_o,src);
{
Dw.DhopEO(src_o,r_e,DaggerNo);
double t0=usecond();
for(int i=0;i<ncall;i++){
Dw.DhopEO(src_o,r_e,DaggerNo);
}
double t1=usecond();
if(!report){
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=(1344.0*volume*ncall)/2;
std::cout<< flops/(t1-t0);
}
}
}
#undef CHECK_SDW
void benchsDw(std::vector<int> & latt4, int Ls, int threads, int report )
{
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
GridCartesian * sUGrid = SpaceTimeGrid::makeFourDimDWFGrid(latt4,GridDefaultMpi());
GridCartesian * sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
#ifdef CHECK_SDW
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
LatticeFermion src (FGrid); random(RNG5,src);
LatticeGaugeField Umu(UGrid);
random(RNG4,Umu);
#else
LatticeFermion src (FGrid); src=zero;
LatticeGaugeField Umu(UGrid); Umu=zero;
#endif
LatticeFermion result(FGrid); result=zero;
LatticeFermion ref(FGrid); ref=zero;
LatticeFermion tmp(FGrid);
LatticeFermion err(FGrid);
ColourMatrix cm = Complex(1.0,0.0);
LatticeGaugeField Umu5d(FGrid);
// replicate across fifth dimension
for(int ss=0;ss<Umu._grid->oSites();ss++){
for(int s=0;s<Ls;s++){
Umu5d._odata[Ls*ss+s] = Umu._odata[ss];
}
}
RealD mass=0.1;
RealD M5 =1.8;
typedef WilsonFermion5D<DomainWallRedBlack5dImplF> WilsonFermion5DF;
LatticeFermionF ssrc(sFGrid);
LatticeFermionF sref(sFGrid);
LatticeFermionF sresult(sFGrid);
WilsonFermion5DF sDw(1,Umu,*sFGrid,*sFrbGrid,*sUGrid,M5);
for(int x=0;x<latt4[0];x++){
for(int y=0;y<latt4[1];y++){
for(int z=0;z<latt4[2];z++){
for(int t=0;t<latt4[3];t++){
for(int s=0;s<Ls;s++){
std::vector<int> site({s,x,y,z,t});
SpinColourVectorF tmp;
peekSite(tmp,src,site);
pokeSite(tmp,ssrc,site);
}}}}}
double t0=usecond();
sDw.Dhop(ssrc,sresult,0);
double t1=usecond();
int ncall =1+(int) ((5.0*1000*1000)/(t1-t0));
PerformanceCounter Counter(8);
Counter.Start();
t0=usecond();
for(int i=0;i<ncall;i++){
sDw.Dhop(ssrc,sresult,0);
}
t1=usecond();
Counter.Stop();
if ( report ) {
Counter.Report();
} else {
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=1344*volume*ncall;
std::cout<<"\t"<< flops/(t1-t0);
}
LatticeFermionF sr_eo(sFGrid);
LatticeFermionF serr(sFGrid);
LatticeFermion ssrc_e (sFrbGrid);
LatticeFermion ssrc_o (sFrbGrid);
LatticeFermion sr_e (sFrbGrid);
LatticeFermion sr_o (sFrbGrid);
pickCheckerboard(Even,ssrc_e,ssrc);
pickCheckerboard(Odd,ssrc_o,ssrc);
setCheckerboard(sr_eo,ssrc_o);
setCheckerboard(sr_eo,ssrc_e);
sr_e = zero;
sr_o = zero;
sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
PerformanceCounter CounterSdw(8);
CounterSdw.Start();
t0=usecond();
for(int i=0;i<ncall;i++){
sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
}
t1=usecond();
CounterSdw.Stop();
if ( report ) {
CounterSdw.Report();
} else {
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=(1344.0*volume*ncall)/2;
std::cout<<"\t"<< flops/(t1-t0);
}
}

View File

@ -63,7 +63,7 @@ namespace Grid {
Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);// eps is ignored for higham Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);// eps is ignored for higham
assert(zdata->n==this->Ls); assert(zdata->n==this->Ls);
std::cout<<GridLogMessage << "DomainWallFermion with Ls="<<this->Ls<<std::endl; // std::cout<<GridLogMessage << "DomainWallFermion with Ls="<<this->Ls<<std::endl;
// Call base setter // Call base setter
this->SetCoefficientsTanh(zdata,1.0,0.0); this->SetCoefficientsTanh(zdata,1.0,0.0);

View File

@ -53,6 +53,8 @@ namespace QCD {
StencilEven(&Hgrid,npoint,Even,directions,displacements), // source is Even StencilEven(&Hgrid,npoint,Even,directions,displacements), // source is Even
StencilOdd (&Hgrid,npoint,Odd ,directions,displacements), // source is Odd StencilOdd (&Hgrid,npoint,Odd ,directions,displacements), // source is Odd
mass(_mass), mass(_mass),
Lebesgue(_grid),
LebesgueEvenOdd(_cbgrid),
Umu(&Fgrid), Umu(&Fgrid),
UmuEven(&Hgrid), UmuEven(&Hgrid),
UmuOdd (&Hgrid) UmuOdd (&Hgrid)
@ -228,7 +230,7 @@ PARALLEL_FOR_LOOP
out.checkerboard = in.checkerboard; out.checkerboard = in.checkerboard;
DhopInternal(Stencil,Umu,in,out,dag); DhopInternal(Stencil,Lebesgue,Umu,in,out,dag);
} }
template<class Impl> template<class Impl>
@ -239,7 +241,7 @@ PARALLEL_FOR_LOOP
assert(in.checkerboard==Even); assert(in.checkerboard==Even);
out.checkerboard = Odd; out.checkerboard = Odd;
DhopInternal(StencilEven,UmuOdd,in,out,dag); DhopInternal(StencilEven,LebesgueEvenOdd,UmuOdd,in,out,dag);
} }
template<class Impl> template<class Impl>
@ -250,7 +252,7 @@ PARALLEL_FOR_LOOP
assert(in.checkerboard==Odd); assert(in.checkerboard==Odd);
out.checkerboard = Even; out.checkerboard = Even;
DhopInternal(StencilOdd,UmuEven,in,out,dag); DhopInternal(StencilOdd,LebesgueEvenOdd,UmuEven,in,out,dag);
} }
template<class Impl> template<class Impl>
@ -285,7 +287,7 @@ PARALLEL_FOR_LOOP
}; };
template<class Impl> template<class Impl>
void WilsonFermion<Impl>::DhopInternal(StencilImpl & st,DoubledGaugeField & U, void WilsonFermion<Impl>::DhopInternal(StencilImpl & st,LebesgueOrder& lo,DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag) const FermionField &in, FermionField &out,int dag)
{ {
assert((dag==DaggerNo) ||(dag==DaggerYes)); assert((dag==DaggerNo) ||(dag==DaggerYes));
@ -296,12 +298,12 @@ PARALLEL_FOR_LOOP
if ( dag == DaggerYes ) { if ( dag == DaggerYes ) {
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){ for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,1,1,in,out); Kernels::DiracOptDhopSiteDag(st,lo,U,st.comm_buf,sss,sss,1,1,in,out);
} }
} else { } else {
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){ for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,1,1,in,out); Kernels::DiracOptDhopSite(st,lo,U,st.comm_buf,sss,sss,1,1,in,out);
} }
} }
}; };

View File

@ -111,7 +111,7 @@ namespace Grid {
const FermionField &B, const FermionField &B,
int dag); int dag);
void DhopInternal(StencilImpl & st,DoubledGaugeField & U, void DhopInternal(StencilImpl & st,LebesgueOrder & lo,DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag) ; const FermionField &in, FermionField &out,int dag) ;
// Constructor // Constructor
@ -146,6 +146,10 @@ namespace Grid {
DoubledGaugeField Umu; DoubledGaugeField Umu;
DoubledGaugeField UmuEven; DoubledGaugeField UmuEven;
DoubledGaugeField UmuOdd; DoubledGaugeField UmuOdd;
LebesgueOrder Lebesgue;
LebesgueOrder LebesgueEvenOdd;
}; };

View File

@ -321,14 +321,14 @@ PARALLEL_FOR_LOOP
for(int ss=0;ss<U._grid->oSites();ss++){ for(int ss=0;ss<U._grid->oSites();ss++){
int sU=ss; int sU=ss;
int sF=LLs*sU; int sF=LLs*sU;
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,LLs,1,in,out); Kernels::DiracOptDhopSiteDag(st,lo,U,st.comm_buf,sF,sU,LLs,1,in,out);
} }
} else { } else {
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for(int ss=0;ss<U._grid->oSites();ss++){ for(int ss=0;ss<U._grid->oSites();ss++){
int sU=ss; int sU=ss;
int sF=LLs*sU; int sF=LLs*sU;
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,LLs,1,in,out); Kernels::DiracOptDhopSite(st,lo,U,st.comm_buf,sF,sU,LLs,1,in,out);
} }
} }
} }

View File

@ -38,20 +38,20 @@ template<class Impl>
WilsonKernels<Impl>::WilsonKernels(const ImplParams &p): Base(p) {}; WilsonKernels<Impl>::WilsonKernels(const ImplParams &p): Base(p) {};
template<class Impl> template<class Impl>
void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U, void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf, std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out) int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out)
{ {
if ( AsmOpt ) { if ( AsmOpt ) {
WilsonKernels<Impl>::DiracOptAsmDhopSite(st,U,buf,sF,sU,Ls,Ns,in,out); WilsonKernels<Impl>::DiracOptAsmDhopSite(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
} else { } else {
for(int site=0;site<Ns;site++) { for(int site=0;site<Ns;site++) {
for(int s=0;s<Ls;s++) { for(int s=0;s<Ls;s++) {
if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSite(st,U,buf,sF,sU,in,out); if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSite(st,lo,U,buf,sF,sU,in,out);
else WilsonKernels<Impl>::DiracOptGenericDhopSite(st,U,buf,sF,sU,in,out); else WilsonKernels<Impl>::DiracOptGenericDhopSite(st,lo,U,buf,sF,sU,in,out);
sF++; sF++;
} }
sU++; sU++;
@ -61,17 +61,17 @@ void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
} }
template<class Impl> template<class Impl>
void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U, void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf, std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out) int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out)
{ {
// No asm implementation yet. // No asm implementation yet.
// if ( AsmOpt ) WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st,U,buf,sF,sU,in,out); // if ( AsmOpt ) WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
// else // else
for(int site=0;site<Ns;site++) { for(int site=0;site<Ns;site++) {
for(int s=0;s<Ls;s++) { for(int s=0;s<Ls;s++) {
if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st,U,buf,sF,sU,in,out); if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
else WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,U,buf,sF,sU,in,out); else WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
sF++; sF++;
} }
sU++; sU++;
@ -84,7 +84,7 @@ void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField
//////////////////////////////////////////// ////////////////////////////////////////////
template<class Impl> template<class Impl>
void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(StencilImpl &st,DoubledGaugeField &U, void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf, std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out) int sF,int sU,const FermionField &in, FermionField &out)
{ {
@ -262,7 +262,7 @@ void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(StencilImpl &st,DoubledGaug
// Need controls to do interior, exterior, or both // Need controls to do interior, exterior, or both
template<class Impl> template<class Impl>
void WilsonKernels<Impl>::DiracOptGenericDhopSite(StencilImpl &st,DoubledGaugeField &U, void WilsonKernels<Impl>::DiracOptGenericDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf, std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out) int sF,int sU,const FermionField &in, FermionField &out)
{ {

View File

@ -53,11 +53,11 @@ namespace Grid {
public: public:
void DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U, void DiracOptDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf, std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF, int sU,int Ls, int Ns, const FermionField &in, FermionField &out); int sF, int sU,int Ls, int Ns, const FermionField &in, FermionField &out);
void DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U, void DiracOptDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf, std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,int Ls, int Ns, const FermionField &in,FermionField &out); int sF,int sU,int Ls, int Ns, const FermionField &in,FermionField &out);
@ -67,24 +67,24 @@ namespace Grid {
private: private:
// Specialised variants // Specialised variants
void DiracOptGenericDhopSite(StencilImpl &st,DoubledGaugeField &U, void DiracOptGenericDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf, std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU, const FermionField &in, FermionField &out); int sF,int sU, const FermionField &in, FermionField &out);
void DiracOptGenericDhopSiteDag(StencilImpl &st,DoubledGaugeField &U, void DiracOptGenericDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf, std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in,FermionField &out); int sF,int sU,const FermionField &in,FermionField &out);
void DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, void DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf, std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out); int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out);
void DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U, void DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf, std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out); int sF,int sU,const FermionField &in, FermionField &out);
void DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U, void DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf, std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out); int sF,int sU,const FermionField &in, FermionField &out);
public: public:

View File

@ -39,9 +39,9 @@ namespace QCD {
// Default to no assembler implementation // Default to no assembler implementation
/////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////
template<class Impl> template<class Impl>
void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf, std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out) int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
{ {
assert(0); assert(0);
} }
@ -71,9 +71,9 @@ static int signInit = setupSigns();
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf) #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
template<> template<>
void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf, std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out) int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#undef VMOVIDUP #undef VMOVIDUP
@ -85,31 +85,31 @@ void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaug
#define VMOVRDUP(A,B,C) VBCASTRDUPf(A,B,C) #define VMOVRDUP(A,B,C) VBCASTRDUPf(A,B,C)
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf) #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
template<> template<>
void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf, std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out) int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#endif #endif
template void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, template void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf, std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out); int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);
template void WilsonKernels<WilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, template void WilsonKernels<WilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf, std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out); int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);
template void WilsonKernels<GparityWilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, template void WilsonKernels<GparityWilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf, std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out); int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);
template void WilsonKernels<GparityWilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, template void WilsonKernels<GparityWilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf, std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out); int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);
template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf, std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out); int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);
template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf, std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out); int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);
}} }}

View File

@ -1,7 +1,8 @@
{ {
int locala,perma, ptypea; int locala,perma, ptypea;
int localb,permb, ptypeb; int localb,permb, ptypeb;
uint64_t basea, baseb; int localc,permc, ptypec;
uint64_t basea, baseb, basec;
uint64_t basex; uint64_t basex;
const uint64_t plocal =(uint64_t) & in._odata[0]; const uint64_t plocal =(uint64_t) & in._odata[0];
@ -11,14 +12,22 @@
MASK_REGS; MASK_REGS;
for(int site=0;site<Ns;site++) { for(int site=0;site<Ns;site++) {
int sU=lo.Reorder(ssU);
for(int s=0;s<Ls;s++) { for(int s=0;s<Ls;s++) {
ss =sU*Ls+s;
//////////////////////////////// ////////////////////////////////
// Xp // Xp
//////////////////////////////// ////////////////////////////////
int ent=ss*8;// 2*Ndim int ent=ss*8;// 2*Ndim
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++; basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
PREFETCH_CHIMU(basea);
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++; baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
PREFETCH_CHIMU(baseb);
basec = st.GetInfo(ptypec,localc,permc,Zp,ent,plocal); ent++;
PREFETCH_CHIMU(basec);
basex = basea; basex = basea;
if ( locala ) { if ( locala ) {
@ -38,6 +47,7 @@
// Yp // Yp
//////////////////////////////// ////////////////////////////////
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++; basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
PREFETCH_CHIMU(basea);
if ( localb ) { if ( localb ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
YM_PROJMEM(baseb); YM_PROJMEM(baseb);
@ -46,7 +56,7 @@
LOAD_CHI(baseb); LOAD_CHI(baseb);
} }
{ {
MULT_2SPIN_DIR_PFYP(Yp,basea); MULT_2SPIN_DIR_PFYP(Yp,basec);
} }
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
YM_RECON_ACCUM; YM_RECON_ACCUM;
@ -55,15 +65,16 @@
// Zp // Zp
//////////////////////////////// ////////////////////////////////
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++; baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
if ( locala ) { PREFETCH_CHIMU(baseb);
if ( localc ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
ZM_PROJMEM(basea); ZM_PROJMEM(basec);
MAYBEPERM(PERMUTE_DIR1,perma); MAYBEPERM(PERMUTE_DIR1,permc);
} else { } else {
LOAD_CHI(basea); LOAD_CHI(basec);
} }
{ {
MULT_2SPIN_DIR_PFZP(Zp,baseb); MULT_2SPIN_DIR_PFZP(Zp,basea);
} }
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
ZM_RECON_ACCUM; ZM_RECON_ACCUM;
@ -71,16 +82,17 @@
//////////////////////////////// ////////////////////////////////
// Tp // Tp
//////////////////////////////// ////////////////////////////////
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++; basec = st.GetInfo(ptypec,localc,permc,Xp,ent,plocal); ent++;
if ( localb ) { PREFETCH_CHIMU(basec);
if ( locala ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
TM_PROJMEM(baseb); TM_PROJMEM(basea);
MAYBEPERM(PERMUTE_DIR0,permb); MAYBEPERM(PERMUTE_DIR0,perma);
} else { } else {
LOAD_CHI(baseb); LOAD_CHI(basea);
} }
{ {
MULT_2SPIN_DIR_PFTP(Tp,basea); MULT_2SPIN_DIR_PFTP(Tp,baseb);
} }
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
TM_RECON_ACCUM; TM_RECON_ACCUM;
@ -88,16 +100,17 @@
//////////////////////////////// ////////////////////////////////
// Xm // Xm
//////////////////////////////// ////////////////////////////////
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++; basea = st.GetInfo(ptypea,locala,perma,Yp,ent,plocal); ent++;
if ( locala ) { PREFETCH_CHIMU(basea);
if ( localb ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
XP_PROJMEM(basea); XP_PROJMEM(baseb);
MAYBEPERM(PERMUTE_DIR3,perma); MAYBEPERM(PERMUTE_DIR3,permb);
} else { } else {
LOAD_CHI(basea); LOAD_CHI(baseb);
} }
{ {
MULT_2SPIN_DIR_PFXM(Xm,baseb); MULT_2SPIN_DIR_PFXM(Xm,basec);
} }
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
XP_RECON_ACCUM; XP_RECON_ACCUM;
@ -105,13 +118,14 @@
//////////////////////////////// ////////////////////////////////
// Ym // Ym
//////////////////////////////// ////////////////////////////////
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++; baseb = st.GetInfo(ptypeb,localb,permb,Xp,ent,plocal); ent++;
if ( localb ) { PREFETCH_CHIMU(baseb);
if ( localc ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
YP_PROJMEM(baseb); YP_PROJMEM(basec);
MAYBEPERM(PERMUTE_DIR2,permb); MAYBEPERM(PERMUTE_DIR2,permc);
} else { } else {
LOAD_CHI(baseb); LOAD_CHI(basec);
} }
{ {
MULT_2SPIN_DIR_PFYM(Ym,basea); MULT_2SPIN_DIR_PFYM(Ym,basea);
@ -122,7 +136,8 @@
//////////////////////////////// ////////////////////////////////
// Zm // Zm
//////////////////////////////// ////////////////////////////////
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++; basec = st.GetInfo(ptypec,localc,permc,Yp,ent,plocal); ent++;
PREFETCH_CHIMU(basec);
if ( locala ) { if ( locala ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
ZP_PROJMEM(basea); ZP_PROJMEM(basea);
@ -140,6 +155,7 @@
// Tm // Tm
//////////////////////////////// ////////////////////////////////
basea = (uint64_t)&out._odata[ss]; basea = (uint64_t)&out._odata[ss];
PREFETCH_CHIMU(basea);
if ( localb ) { if ( localb ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
TP_PROJMEM(baseb); TP_PROJMEM(baseb);
@ -148,17 +164,15 @@
LOAD_CHI(baseb); LOAD_CHI(baseb);
} }
{ {
MULT_2SPIN_DIR_PFTM(Tm,basea); MULT_2SPIN_DIR_PFTM(Tm,basec);
} }
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
TP_RECON_ACCUM; TP_RECON_ACCUM;
PREFETCH_CHIMU(basex); // PREFETCH_CHIMU(basex);
SAVE_RESULT(&out._odata[ss]); SAVE_RESULT(&out._odata[ss]);
ss++; }
} ssU++;
sU++;
} }
} }

View File

@ -0,0 +1,163 @@
{
int locala,perma, ptypea;
int localb,permb, ptypeb;
uint64_t basea, baseb;
uint64_t basex;
const uint64_t plocal =(uint64_t) & in._odata[0];
// vComplexF isigns[2] = { signs[0], signs[1] };
vComplexF *isigns = &signs[0];
MASK_REGS;
for(int site=0;site<Ns;site++) {
int sU=lo.Reorder(ssU);
for(int s=0;s<Ls;s++) {
ss=sU*Ls+s;
////////////////////////////////
// Xp
////////////////////////////////
int ent=ss*8;// 2*Ndim
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
PREFETCH_CHIMU(basea);
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
basex = basea;
if ( locala ) {
LOAD64(%r10,isigns);
XM_PROJMEM(basea);
MAYBEPERM(PERMUTE_DIR3,perma);
} else {
LOAD_CHI(basea);
}
{
MULT_2SPIN_DIR_PFXP(Xp,baseb);
}
LOAD64(%r10,isigns);
XM_RECON;
////////////////////////////////
// Yp
////////////////////////////////
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
if ( localb ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
YM_PROJMEM(baseb);
MAYBEPERM(PERMUTE_DIR2,permb);
} else {
LOAD_CHI(baseb);
}
{
MULT_2SPIN_DIR_PFYP(Yp,basea);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
YM_RECON_ACCUM;
////////////////////////////////
// Zp
////////////////////////////////
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
if ( locala ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
ZM_PROJMEM(basea);
MAYBEPERM(PERMUTE_DIR1,perma);
} else {
LOAD_CHI(basea);
}
{
MULT_2SPIN_DIR_PFZP(Zp,baseb);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
ZM_RECON_ACCUM;
////////////////////////////////
// Tp
////////////////////////////////
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
if ( localb ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
TM_PROJMEM(baseb);
MAYBEPERM(PERMUTE_DIR0,permb);
} else {
LOAD_CHI(baseb);
}
{
MULT_2SPIN_DIR_PFTP(Tp,basea);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
TM_RECON_ACCUM;
////////////////////////////////
// Xm
////////////////////////////////
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
if ( locala ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
XP_PROJMEM(basea);
MAYBEPERM(PERMUTE_DIR3,perma);
} else {
LOAD_CHI(basea);
}
{
MULT_2SPIN_DIR_PFXM(Xm,baseb);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
XP_RECON_ACCUM;
////////////////////////////////
// Ym
////////////////////////////////
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
if ( localb ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
YP_PROJMEM(baseb);
MAYBEPERM(PERMUTE_DIR2,permb);
} else {
LOAD_CHI(baseb);
}
{
MULT_2SPIN_DIR_PFYM(Ym,basea);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
YP_RECON_ACCUM;
////////////////////////////////
// Zm
////////////////////////////////
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
if ( locala ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
ZP_PROJMEM(basea);
MAYBEPERM(PERMUTE_DIR1,perma);
} else {
LOAD_CHI(basea);
}
{
MULT_2SPIN_DIR_PFZM(Zm,baseb);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
ZP_RECON_ACCUM;
////////////////////////////////
// Tm
////////////////////////////////
basea = (uint64_t)&out._odata[ss];
if ( localb ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
TP_PROJMEM(baseb);
MAYBEPERM(PERMUTE_DIR0,permb);
} else {
LOAD_CHI(baseb);
}
{
MULT_2SPIN_DIR_PFTM(Tm,basea);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
TP_RECON_ACCUM;
SAVE_RESULT(&out._odata[ss]);
}
ssU++;
}
}

View File

@ -312,7 +312,7 @@ namespace QCD {
template<class Impl> template<class Impl>
void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U, void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf, std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out) int ss,int sU,const FermionField &in, FermionField &out)
{ {
@ -555,7 +555,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeFiel
} }
template<class Impl> template<class Impl>
void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U, void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf, std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out) int ss,int sU,const FermionField &in, FermionField &out)
{ {
@ -803,7 +803,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeF
// Specialise Gparity to simple implementation // Specialise Gparity to simple implementation
//////////////////////////////////////////////// ////////////////////////////////////////////////
template<> template<>
void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U, void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf, std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out) int sF,int sU,const FermionField &in, FermionField &out)
{ {
@ -811,7 +811,7 @@ void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,Dou
} }
template<> template<>
void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U, void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf, std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out) int sF,int sU,const FermionField &in, FermionField &out)
{ {
@ -819,7 +819,7 @@ void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,
} }
template<> template<>
void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U, void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf, std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out) int sF,int sU,const FermionField &in, FermionField &out)
{ {
@ -827,7 +827,7 @@ void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,Dou
} }
template<> template<>
void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U, void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf, std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out) int sF,int sU,const FermionField &in, FermionField &out)
{ {
@ -839,44 +839,44 @@ void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,
////////////// Wilson ; uses this implementation ///////////////////// ////////////// Wilson ; uses this implementation /////////////////////
// Need Nc=3 though // // Need Nc=3 though //
template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U, template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf, std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out); int ss,int sU,const FermionField &in, FermionField &out);
template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U, template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf, std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out); int ss,int sU,const FermionField &in, FermionField &out);
template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U, template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf, std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out); int ss,int sU,const FermionField &in, FermionField &out);
template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U, template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf, std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out); int ss,int sU,const FermionField &in, FermionField &out);
template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U, template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf, std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out); int ss,int sU,const FermionField &in, FermionField &out);
template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U, template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf, std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out); int ss,int sU,const FermionField &in, FermionField &out);
template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U, template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf, std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out); int ss,int sU,const FermionField &in, FermionField &out);
template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U, template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf, std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out); int ss,int sU,const FermionField &in, FermionField &out);
template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U, template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf, std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out); int ss,int sU,const FermionField &in, FermionField &out);
template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U, template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf, std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out); int ss,int sU,const FermionField &in, FermionField &out);
template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U, template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf, std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out); int ss,int sU,const FermionField &in, FermionField &out);
template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U, template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf, std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out); int ss,int sU,const FermionField &in, FermionField &out);

View File

@ -88,7 +88,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define VPREFETCHG(O,A) "prefetcht0 "#O"*64("#A");\n" #define VPREFETCHG(O,A) "prefetcht0 "#O"*64("#A");\n"
#define VPREFETCH2(O,A) "prefetcht1 "#O"*64("#A");\n" #define VPREFETCH2(O,A) "prefetcht1 "#O"*64("#A");\n"
#define VPREFETCHP(O,A) "prefetcht1 "#O"*64("#A");\n"
#define VPREFETCHW(O,A) "prefetchwt1 "#O"*64("#A");\n" #define VPREFETCHW(O,A) "prefetchwt1 "#O"*64("#A");\n"
#define VPREFETCHNTA(O,A)
#define VPREFETCH(O,A)
#define VEVICT(O,A) #define VEVICT(O,A)
//"vprefetche0 "#O"*64("#A");\n" "vprefetche1 ("#O"+12)*64("#A");\n" //"vprefetche0 "#O"*64("#A");\n" "vprefetche1 ("#O"+12)*64("#A");\n"
@ -124,8 +128,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define ZLOADf(OFF,PTR,ri,ir) VLOADf(OFF,PTR,ir) VSHUFf(ir,ri) #define ZLOADf(OFF,PTR,ri,ir) VLOADf(OFF,PTR,ir) VSHUFf(ir,ri)
#define ZLOADd(OFF,PTR,ri,ir) VLOADd(OFF,PTR,ir) VSHUFd(ir,ri) #define ZLOADd(OFF,PTR,ri,ir) VLOADd(OFF,PTR,ir) VSHUFd(ir,ri)
#define VPREFETCHNTA(O,A)
#define VPREFETCH(O,A)
#define VSTOREf(OFF,PTR,SRC) "vmovaps " #SRC "," #OFF "*64(" #PTR ")" ";\n" #define VSTOREf(OFF,PTR,SRC) "vmovaps " #SRC "," #OFF "*64(" #PTR ")" ";\n"
#define VSTOREd(OFF,PTR,SRC) "vmovapd " #SRC "," #OFF "*64(" #PTR ")" ";\n" #define VSTOREd(OFF,PTR,SRC) "vmovapd " #SRC "," #OFF "*64(" #PTR ")" ";\n"

View File

@ -559,22 +559,23 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
VSUB(UChi_02,result_22,result_22)\ VSUB(UChi_02,result_22,result_22)\
VSUB(UChi_12,result_32,result_32) ); VSUB(UChi_12,result_32,result_32) );
#define PREFETCH_CHIMU(A) \ #define PREFETCH_CHIMU(A)
/*
LOAD64(%r9,A) \ LOAD64(%r9,A) \
__asm__ ( \ __asm__ ( \
VPREFETCHG(12,%r9)\ VPREFETCHG(0,%r9)\
VPREFETCHG(13,%r9)\ VPREFETCHG(1,%r9)\
VPREFETCHG(14,%r9)\ VPREFETCHG(2,%r9)\
VPREFETCHG(15,%r9)\ VPREFETCHG(3,%r9)\
VPREFETCHG(16,%r9)\ VPREFETCHG(4,%r9)\
VPREFETCHG(17,%r9)\ VPREFETCHG(5,%r9)\
VPREFETCHG(18,%r9)\ VPREFETCHG(6,%r9)\
VPREFETCHG(19,%r9)\ VPREFETCHG(7,%r9)\
VPREFETCHG(20,%r9)\ VPREFETCHG(8,%r9)\
VPREFETCHG(21,%r9)\ VPREFETCHG(9,%r9)\
VPREFETCHG(22,%r9)\ VPREFETCHG(10,%r9)\
VPREFETCHG(23,%r9)); VPREFETCHG(11,%r9));
*/
#define PERMUTE_DIR0 __asm__ ( \ #define PERMUTE_DIR0 __asm__ ( \
VPERM0(Chi_00,Chi_00) \ VPERM0(Chi_00,Chi_00) \
VPERM0(Chi_01,Chi_01) \ VPERM0(Chi_01,Chi_01) \
@ -612,8 +613,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
LOAD64(%r8,ptr) \ LOAD64(%r8,ptr) \
LOAD64(%r9,pf) \ LOAD64(%r9,pf) \
__asm__ ( \ __asm__ ( \
VPREFETCH2(9,%r8) \ VPREFETCH2(9,%r8) VPREFETCH2(10,%r8) \
VPREFETCH2(10,%r8) \
VPREFETCH2(11,%r8) \ VPREFETCH2(11,%r8) \
VPREFETCH2(12,%r8) \ VPREFETCH2(12,%r8) \
VPREFETCH2(13,%r8) \ VPREFETCH2(13,%r8) \

View File

@ -49,16 +49,25 @@ LebesgueOrder::LebesgueOrder(GridBase *_grid)
{ {
grid = _grid; grid = _grid;
if ( Block[0]==0) ZGraph(); if ( Block[0]==0) ZGraph();
else if ( Block[1]==0) NoBlocking();
else CartesianBlocking(); else CartesianBlocking();
} }
void LebesgueOrder::NoBlocking(void)
{
std::cout<<GridLogDebug<<"Lexicographic : no cache blocking"<<std::endl;
_LebesgueReorder.resize(0);
for ( int s = 0 ; s!= grid->oSites();s++){
_LebesgueReorder.push_back(s);
}
}
void LebesgueOrder::CartesianBlocking(void) void LebesgueOrder::CartesianBlocking(void)
{ {
_LebesgueReorder.resize(0); _LebesgueReorder.resize(0);
std::cout << GridLogMessage << " CartesianBlocking "; std::cout << GridLogDebug << " CartesianBlocking ";
for(int d=0;d<Block.size();d++) std::cout <<Block[d]<<" "; // for(int d=0;d<Block.size();d++) std::cout <<Block[d]<<" ";
std::cout<<std::endl; // std::cout<<std::endl;
IndexInteger ND = grid->_ndimension; IndexInteger ND = grid->_ndimension;
@ -116,7 +125,8 @@ void LebesgueOrder::IterateI(int ND,
void LebesgueOrder::ZGraph(void) void LebesgueOrder::ZGraph(void)
{ {
_LebesgueReorder.resize(0); _LebesgueReorder.resize(0);
std::cout << GridLogDebug << " Lebesgue order "<<std::endl;
// Align up dimensions to power of two. // Align up dimensions to power of two.
const IndexInteger one=1; const IndexInteger one=1;

View File

@ -59,6 +59,7 @@ namespace Grid {
// Cartesian stencil blocking strategy // Cartesian stencil blocking strategy
///////////////////////////////// /////////////////////////////////
static std::vector<int> Block; static std::vector<int> Block;
void NoBlocking(void);
void CartesianBlocking(void); void CartesianBlocking(void);
void IterateO(int ND,int dim, void IterateO(int ND,int dim,
std::vector<IndexInteger> & xo, std::vector<IndexInteger> & xo,