mirror of
https://github.com/paboyle/Grid.git
synced 2025-04-03 18:55:56 +01:00
Enable reordering of the loops in the assembler for cache friendly.
This gets in the way of L2 prefetching however. Do next next link in stencil prefetching.
This commit is contained in:
parent
d6737e4bd8
commit
6d58cb2a68
358
benchmarks/Benchmark_dwf_sweep.cc
Normal file
358
benchmarks/Benchmark_dwf_sweep.cc
Normal file
@ -0,0 +1,358 @@
|
||||
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./benchmarks/Benchmark_dwf.cc
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid.h>
|
||||
#include <PerfCount.h>
|
||||
|
||||
using namespace std;
|
||||
using namespace Grid;
|
||||
using namespace Grid::QCD;
|
||||
|
||||
template<class d>
|
||||
struct scal {
|
||||
d internal;
|
||||
};
|
||||
|
||||
Gamma::GammaMatrix Gmu [] = {
|
||||
Gamma::GammaX,
|
||||
Gamma::GammaY,
|
||||
Gamma::GammaZ,
|
||||
Gamma::GammaT
|
||||
};
|
||||
|
||||
void benchDw(std::vector<int> & L, int Ls, int threads, int report =0 );
|
||||
void benchsDw(std::vector<int> & L, int Ls, int threads, int report=0 );
|
||||
|
||||
int main (int argc, char ** argv)
|
||||
{
|
||||
Grid_init(&argc,&argv);
|
||||
|
||||
const int Ls=16;
|
||||
int threads = GridThread::GetThreads();
|
||||
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
|
||||
|
||||
if ( getenv("ASMOPT") ) {
|
||||
QCD::WilsonKernelsStatic::AsmOpt=1;
|
||||
} else {
|
||||
QCD::WilsonKernelsStatic::AsmOpt=0;
|
||||
}
|
||||
|
||||
std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
|
||||
std::cout<<GridLogMessage << "Volume \t\t\tProcs \t Dw \t eoDw \t sDw \t eosDw (Mflop/s) "<<std::endl;
|
||||
std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
|
||||
|
||||
int Lmax=32;
|
||||
if ( getenv("LMAX") ) Lmax=atoi(getenv("LMAX"));
|
||||
for (int L=8;L<Lmax;L*=2){
|
||||
std::vector<int> latt4(4,L);
|
||||
for(int d=4;d>0;d--){
|
||||
if ( d<=3 ) latt4[d]*=2;
|
||||
std::cout << GridLogMessage <<"\t";
|
||||
for(int d=0;d<Nd;d++){
|
||||
std::cout<<latt4[d]<<"x";
|
||||
}
|
||||
std::cout <<Ls<<"\t" ;
|
||||
benchDw (latt4,Ls,threads,0);
|
||||
benchsDw(latt4,Ls,threads,0);
|
||||
std::cout<<std::endl;
|
||||
}
|
||||
}
|
||||
std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
|
||||
{
|
||||
std::vector<int> latt4(4,16);
|
||||
std::cout<<GridLogMessage << "16^4 Dw miss rate"<<std::endl;
|
||||
benchDw (latt4,Ls,threads,1);
|
||||
std::cout<<GridLogMessage << "16^4 sDw miss rate"<<std::endl;
|
||||
benchsDw(latt4,Ls,threads,1);
|
||||
}
|
||||
|
||||
Grid_finalize();
|
||||
}
|
||||
|
||||
#undef CHECK
|
||||
|
||||
void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
|
||||
{
|
||||
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
|
||||
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
||||
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
|
||||
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
|
||||
|
||||
std::vector<int> seeds4({1,2,3,4});
|
||||
std::vector<int> seeds5({5,6,7,8});
|
||||
|
||||
#ifdef CHECK
|
||||
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
|
||||
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
|
||||
LatticeFermion src (FGrid); random(RNG5,src);
|
||||
LatticeGaugeField Umu(UGrid);
|
||||
random(RNG4,Umu);
|
||||
#else
|
||||
LatticeFermion src (FGrid); src=zero;
|
||||
LatticeGaugeField Umu(UGrid); Umu=zero;
|
||||
#endif
|
||||
|
||||
LatticeFermion result(FGrid); result=zero;
|
||||
LatticeFermion ref(FGrid); ref=zero;
|
||||
LatticeFermion tmp(FGrid);
|
||||
LatticeFermion err(FGrid);
|
||||
|
||||
ColourMatrix cm = Complex(1.0,0.0);
|
||||
|
||||
|
||||
LatticeGaugeField Umu5d(FGrid);
|
||||
|
||||
// replicate across fifth dimension
|
||||
for(int ss=0;ss<Umu._grid->oSites();ss++){
|
||||
for(int s=0;s<Ls;s++){
|
||||
Umu5d._odata[Ls*ss+s] = Umu._odata[ss];
|
||||
}
|
||||
}
|
||||
|
||||
////////////////////////////////////
|
||||
// Naive wilson implementation
|
||||
////////////////////////////////////
|
||||
std::vector<LatticeColourMatrix> U(4,FGrid);
|
||||
for(int mu=0;mu<Nd;mu++){
|
||||
U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
|
||||
}
|
||||
|
||||
#ifdef CHECK
|
||||
if (1)
|
||||
{
|
||||
ref = zero;
|
||||
for(int mu=0;mu<Nd;mu++){
|
||||
|
||||
tmp = U[mu]*Cshift(src,mu+1,1);
|
||||
ref=ref + tmp - Gamma(Gmu[mu])*tmp;
|
||||
|
||||
tmp =adj(U[mu])*src;
|
||||
tmp =Cshift(tmp,mu+1,-1);
|
||||
ref=ref + tmp + Gamma(Gmu[mu])*tmp;
|
||||
}
|
||||
ref = -0.5*ref;
|
||||
}
|
||||
#endif
|
||||
|
||||
RealD mass=0.1;
|
||||
RealD M5 =1.8;
|
||||
RealD NP = UGrid->_Nprocessors;
|
||||
|
||||
DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
|
||||
|
||||
double t0=usecond();
|
||||
Dw.Dhop(src,result,0);
|
||||
double t1=usecond();
|
||||
|
||||
int ncall =1+(int) ((5.0*1000*1000)/(t1-t0));
|
||||
|
||||
if (ncall < 5 ) exit(0);
|
||||
|
||||
Dw.Dhop(src,result,0);
|
||||
|
||||
PerformanceCounter Counter(8);
|
||||
Counter.Start();
|
||||
t0=usecond();
|
||||
for(int i=0;i<ncall;i++){
|
||||
Dw.Dhop(src,result,0);
|
||||
}
|
||||
t1=usecond();
|
||||
Counter.Stop();
|
||||
if ( report ) {
|
||||
Counter.Report();
|
||||
}
|
||||
|
||||
if ( ! report )
|
||||
{
|
||||
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
|
||||
double flops=1344*volume*ncall;
|
||||
std::cout <<"\t"<<NP<< "\t"<<flops/(t1-t0)<< "\t";
|
||||
}
|
||||
|
||||
#ifdef CHECK
|
||||
err = ref-result;
|
||||
RealD errd = norm2(err);
|
||||
if ( errd> 1.0e-4 ) {
|
||||
std::cout<<GridLogMessage << "oops !!! norm diff "<< norm2(err)<<std::endl;
|
||||
exit(-1);
|
||||
}
|
||||
#endif
|
||||
|
||||
LatticeFermion src_e (FrbGrid);
|
||||
LatticeFermion src_o (FrbGrid);
|
||||
LatticeFermion r_e (FrbGrid);
|
||||
LatticeFermion r_o (FrbGrid);
|
||||
LatticeFermion r_eo (FGrid);
|
||||
|
||||
pickCheckerboard(Even,src_e,src);
|
||||
pickCheckerboard(Odd,src_o,src);
|
||||
|
||||
{
|
||||
Dw.DhopEO(src_o,r_e,DaggerNo);
|
||||
double t0=usecond();
|
||||
for(int i=0;i<ncall;i++){
|
||||
Dw.DhopEO(src_o,r_e,DaggerNo);
|
||||
}
|
||||
double t1=usecond();
|
||||
|
||||
if(!report){
|
||||
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
|
||||
double flops=(1344.0*volume*ncall)/2;
|
||||
std::cout<< flops/(t1-t0);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#undef CHECK_SDW
|
||||
void benchsDw(std::vector<int> & latt4, int Ls, int threads, int report )
|
||||
{
|
||||
|
||||
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
|
||||
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
||||
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
|
||||
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
|
||||
GridCartesian * sUGrid = SpaceTimeGrid::makeFourDimDWFGrid(latt4,GridDefaultMpi());
|
||||
GridCartesian * sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
|
||||
GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
|
||||
|
||||
std::vector<int> seeds4({1,2,3,4});
|
||||
std::vector<int> seeds5({5,6,7,8});
|
||||
|
||||
#ifdef CHECK_SDW
|
||||
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
|
||||
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
|
||||
LatticeFermion src (FGrid); random(RNG5,src);
|
||||
LatticeGaugeField Umu(UGrid);
|
||||
random(RNG4,Umu);
|
||||
#else
|
||||
LatticeFermion src (FGrid); src=zero;
|
||||
LatticeGaugeField Umu(UGrid); Umu=zero;
|
||||
#endif
|
||||
|
||||
LatticeFermion result(FGrid); result=zero;
|
||||
LatticeFermion ref(FGrid); ref=zero;
|
||||
LatticeFermion tmp(FGrid);
|
||||
LatticeFermion err(FGrid);
|
||||
|
||||
ColourMatrix cm = Complex(1.0,0.0);
|
||||
|
||||
LatticeGaugeField Umu5d(FGrid);
|
||||
|
||||
// replicate across fifth dimension
|
||||
for(int ss=0;ss<Umu._grid->oSites();ss++){
|
||||
for(int s=0;s<Ls;s++){
|
||||
Umu5d._odata[Ls*ss+s] = Umu._odata[ss];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
RealD mass=0.1;
|
||||
RealD M5 =1.8;
|
||||
|
||||
typedef WilsonFermion5D<DomainWallRedBlack5dImplF> WilsonFermion5DF;
|
||||
LatticeFermionF ssrc(sFGrid);
|
||||
LatticeFermionF sref(sFGrid);
|
||||
LatticeFermionF sresult(sFGrid);
|
||||
WilsonFermion5DF sDw(1,Umu,*sFGrid,*sFrbGrid,*sUGrid,M5);
|
||||
|
||||
for(int x=0;x<latt4[0];x++){
|
||||
for(int y=0;y<latt4[1];y++){
|
||||
for(int z=0;z<latt4[2];z++){
|
||||
for(int t=0;t<latt4[3];t++){
|
||||
for(int s=0;s<Ls;s++){
|
||||
std::vector<int> site({s,x,y,z,t});
|
||||
SpinColourVectorF tmp;
|
||||
peekSite(tmp,src,site);
|
||||
pokeSite(tmp,ssrc,site);
|
||||
}}}}}
|
||||
|
||||
double t0=usecond();
|
||||
sDw.Dhop(ssrc,sresult,0);
|
||||
double t1=usecond();
|
||||
|
||||
int ncall =1+(int) ((5.0*1000*1000)/(t1-t0));
|
||||
|
||||
PerformanceCounter Counter(8);
|
||||
Counter.Start();
|
||||
t0=usecond();
|
||||
for(int i=0;i<ncall;i++){
|
||||
sDw.Dhop(ssrc,sresult,0);
|
||||
}
|
||||
t1=usecond();
|
||||
Counter.Stop();
|
||||
|
||||
if ( report ) {
|
||||
Counter.Report();
|
||||
} else {
|
||||
|
||||
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
|
||||
double flops=1344*volume*ncall;
|
||||
std::cout<<"\t"<< flops/(t1-t0);
|
||||
}
|
||||
|
||||
|
||||
LatticeFermionF sr_eo(sFGrid);
|
||||
LatticeFermionF serr(sFGrid);
|
||||
|
||||
LatticeFermion ssrc_e (sFrbGrid);
|
||||
LatticeFermion ssrc_o (sFrbGrid);
|
||||
LatticeFermion sr_e (sFrbGrid);
|
||||
LatticeFermion sr_o (sFrbGrid);
|
||||
|
||||
pickCheckerboard(Even,ssrc_e,ssrc);
|
||||
pickCheckerboard(Odd,ssrc_o,ssrc);
|
||||
|
||||
setCheckerboard(sr_eo,ssrc_o);
|
||||
setCheckerboard(sr_eo,ssrc_e);
|
||||
|
||||
sr_e = zero;
|
||||
sr_o = zero;
|
||||
|
||||
sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
|
||||
PerformanceCounter CounterSdw(8);
|
||||
CounterSdw.Start();
|
||||
t0=usecond();
|
||||
for(int i=0;i<ncall;i++){
|
||||
sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
|
||||
}
|
||||
t1=usecond();
|
||||
CounterSdw.Stop();
|
||||
|
||||
if ( report ) {
|
||||
CounterSdw.Report();
|
||||
} else {
|
||||
|
||||
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
|
||||
double flops=(1344.0*volume*ncall)/2;
|
||||
std::cout<<"\t"<< flops/(t1-t0);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -63,7 +63,7 @@ namespace Grid {
|
||||
Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);// eps is ignored for higham
|
||||
assert(zdata->n==this->Ls);
|
||||
|
||||
std::cout<<GridLogMessage << "DomainWallFermion with Ls="<<this->Ls<<std::endl;
|
||||
// std::cout<<GridLogMessage << "DomainWallFermion with Ls="<<this->Ls<<std::endl;
|
||||
// Call base setter
|
||||
this->SetCoefficientsTanh(zdata,1.0,0.0);
|
||||
|
||||
|
@ -53,6 +53,8 @@ namespace QCD {
|
||||
StencilEven(&Hgrid,npoint,Even,directions,displacements), // source is Even
|
||||
StencilOdd (&Hgrid,npoint,Odd ,directions,displacements), // source is Odd
|
||||
mass(_mass),
|
||||
Lebesgue(_grid),
|
||||
LebesgueEvenOdd(_cbgrid),
|
||||
Umu(&Fgrid),
|
||||
UmuEven(&Hgrid),
|
||||
UmuOdd (&Hgrid)
|
||||
@ -228,7 +230,7 @@ PARALLEL_FOR_LOOP
|
||||
|
||||
out.checkerboard = in.checkerboard;
|
||||
|
||||
DhopInternal(Stencil,Umu,in,out,dag);
|
||||
DhopInternal(Stencil,Lebesgue,Umu,in,out,dag);
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
@ -239,7 +241,7 @@ PARALLEL_FOR_LOOP
|
||||
assert(in.checkerboard==Even);
|
||||
out.checkerboard = Odd;
|
||||
|
||||
DhopInternal(StencilEven,UmuOdd,in,out,dag);
|
||||
DhopInternal(StencilEven,LebesgueEvenOdd,UmuOdd,in,out,dag);
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
@ -250,7 +252,7 @@ PARALLEL_FOR_LOOP
|
||||
assert(in.checkerboard==Odd);
|
||||
out.checkerboard = Even;
|
||||
|
||||
DhopInternal(StencilOdd,UmuEven,in,out,dag);
|
||||
DhopInternal(StencilOdd,LebesgueEvenOdd,UmuEven,in,out,dag);
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
@ -285,7 +287,7 @@ PARALLEL_FOR_LOOP
|
||||
};
|
||||
|
||||
template<class Impl>
|
||||
void WilsonFermion<Impl>::DhopInternal(StencilImpl & st,DoubledGaugeField & U,
|
||||
void WilsonFermion<Impl>::DhopInternal(StencilImpl & st,LebesgueOrder& lo,DoubledGaugeField & U,
|
||||
const FermionField &in, FermionField &out,int dag)
|
||||
{
|
||||
assert((dag==DaggerNo) ||(dag==DaggerYes));
|
||||
@ -296,12 +298,12 @@ PARALLEL_FOR_LOOP
|
||||
if ( dag == DaggerYes ) {
|
||||
PARALLEL_FOR_LOOP
|
||||
for(int sss=0;sss<in._grid->oSites();sss++){
|
||||
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,1,1,in,out);
|
||||
Kernels::DiracOptDhopSiteDag(st,lo,U,st.comm_buf,sss,sss,1,1,in,out);
|
||||
}
|
||||
} else {
|
||||
PARALLEL_FOR_LOOP
|
||||
for(int sss=0;sss<in._grid->oSites();sss++){
|
||||
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,1,1,in,out);
|
||||
Kernels::DiracOptDhopSite(st,lo,U,st.comm_buf,sss,sss,1,1,in,out);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
@ -111,7 +111,7 @@ namespace Grid {
|
||||
const FermionField &B,
|
||||
int dag);
|
||||
|
||||
void DhopInternal(StencilImpl & st,DoubledGaugeField & U,
|
||||
void DhopInternal(StencilImpl & st,LebesgueOrder & lo,DoubledGaugeField & U,
|
||||
const FermionField &in, FermionField &out,int dag) ;
|
||||
|
||||
// Constructor
|
||||
@ -146,6 +146,10 @@ namespace Grid {
|
||||
DoubledGaugeField Umu;
|
||||
DoubledGaugeField UmuEven;
|
||||
DoubledGaugeField UmuOdd;
|
||||
|
||||
LebesgueOrder Lebesgue;
|
||||
LebesgueOrder LebesgueEvenOdd;
|
||||
|
||||
|
||||
};
|
||||
|
||||
|
@ -321,14 +321,14 @@ PARALLEL_FOR_LOOP
|
||||
for(int ss=0;ss<U._grid->oSites();ss++){
|
||||
int sU=ss;
|
||||
int sF=LLs*sU;
|
||||
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,LLs,1,in,out);
|
||||
Kernels::DiracOptDhopSiteDag(st,lo,U,st.comm_buf,sF,sU,LLs,1,in,out);
|
||||
}
|
||||
} else {
|
||||
PARALLEL_FOR_LOOP
|
||||
for(int ss=0;ss<U._grid->oSites();ss++){
|
||||
int sU=ss;
|
||||
int sF=LLs*sU;
|
||||
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,LLs,1,in,out);
|
||||
Kernels::DiracOptDhopSite(st,lo,U,st.comm_buf,sF,sU,LLs,1,in,out);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -38,20 +38,20 @@ template<class Impl>
|
||||
WilsonKernels<Impl>::WilsonKernels(const ImplParams &p): Base(p) {};
|
||||
|
||||
template<class Impl>
|
||||
void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||
void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out)
|
||||
{
|
||||
if ( AsmOpt ) {
|
||||
|
||||
WilsonKernels<Impl>::DiracOptAsmDhopSite(st,U,buf,sF,sU,Ls,Ns,in,out);
|
||||
WilsonKernels<Impl>::DiracOptAsmDhopSite(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
|
||||
|
||||
} else {
|
||||
|
||||
for(int site=0;site<Ns;site++) {
|
||||
for(int s=0;s<Ls;s++) {
|
||||
if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSite(st,U,buf,sF,sU,in,out);
|
||||
else WilsonKernels<Impl>::DiracOptGenericDhopSite(st,U,buf,sF,sU,in,out);
|
||||
if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSite(st,lo,U,buf,sF,sU,in,out);
|
||||
else WilsonKernels<Impl>::DiracOptGenericDhopSite(st,lo,U,buf,sF,sU,in,out);
|
||||
sF++;
|
||||
}
|
||||
sU++;
|
||||
@ -61,17 +61,17 @@ void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||
void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out)
|
||||
{
|
||||
// No asm implementation yet.
|
||||
// if ( AsmOpt ) WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st,U,buf,sF,sU,in,out);
|
||||
// if ( AsmOpt ) WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
|
||||
// else
|
||||
for(int site=0;site<Ns;site++) {
|
||||
for(int s=0;s<Ls;s++) {
|
||||
if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st,U,buf,sF,sU,in,out);
|
||||
else WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,U,buf,sF,sU,in,out);
|
||||
if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
|
||||
else WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
|
||||
sF++;
|
||||
}
|
||||
sU++;
|
||||
@ -84,7 +84,7 @@ void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField
|
||||
////////////////////////////////////////////
|
||||
|
||||
template<class Impl>
|
||||
void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||
void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int sF,int sU,const FermionField &in, FermionField &out)
|
||||
{
|
||||
@ -262,7 +262,7 @@ void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(StencilImpl &st,DoubledGaug
|
||||
|
||||
// Need controls to do interior, exterior, or both
|
||||
template<class Impl>
|
||||
void WilsonKernels<Impl>::DiracOptGenericDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||
void WilsonKernels<Impl>::DiracOptGenericDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int sF,int sU,const FermionField &in, FermionField &out)
|
||||
{
|
||||
|
@ -53,11 +53,11 @@ namespace Grid {
|
||||
|
||||
public:
|
||||
|
||||
void DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||
void DiracOptDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int sF, int sU,int Ls, int Ns, const FermionField &in, FermionField &out);
|
||||
|
||||
void DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||
void DiracOptDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int sF,int sU,int Ls, int Ns, const FermionField &in,FermionField &out);
|
||||
|
||||
@ -67,24 +67,24 @@ namespace Grid {
|
||||
|
||||
private:
|
||||
// Specialised variants
|
||||
void DiracOptGenericDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||
void DiracOptGenericDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int sF,int sU, const FermionField &in, FermionField &out);
|
||||
|
||||
void DiracOptGenericDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||
void DiracOptGenericDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int sF,int sU,const FermionField &in,FermionField &out);
|
||||
|
||||
void DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||
void DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out);
|
||||
|
||||
|
||||
void DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||
void DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int sF,int sU,const FermionField &in, FermionField &out);
|
||||
|
||||
void DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||
void DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int sF,int sU,const FermionField &in, FermionField &out);
|
||||
public:
|
||||
|
@ -39,9 +39,9 @@ namespace QCD {
|
||||
// Default to no assembler implementation
|
||||
///////////////////////////////////////////////////////////
|
||||
template<class Impl>
|
||||
void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||
void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||
{
|
||||
assert(0);
|
||||
}
|
||||
@ -71,9 +71,9 @@ static int signInit = setupSigns();
|
||||
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
|
||||
|
||||
template<>
|
||||
void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||
void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
#undef VMOVIDUP
|
||||
@ -85,31 +85,31 @@ void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaug
|
||||
#define VMOVRDUP(A,B,C) VBCASTRDUPf(A,B,C)
|
||||
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
|
||||
template<>
|
||||
void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||
void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
#endif
|
||||
|
||||
template void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||
template void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
||||
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
||||
|
||||
template void WilsonKernels<WilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||
template void WilsonKernels<WilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
||||
template void WilsonKernels<GparityWilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
||||
template void WilsonKernels<GparityWilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
||||
template void WilsonKernels<GparityWilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
||||
template void WilsonKernels<GparityWilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
||||
template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
||||
template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
||||
template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
||||
template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
||||
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
||||
}}
|
||||
|
||||
|
@ -1,7 +1,8 @@
|
||||
{
|
||||
int locala,perma, ptypea;
|
||||
int localb,permb, ptypeb;
|
||||
uint64_t basea, baseb;
|
||||
int localc,permc, ptypec;
|
||||
uint64_t basea, baseb, basec;
|
||||
uint64_t basex;
|
||||
const uint64_t plocal =(uint64_t) & in._odata[0];
|
||||
|
||||
@ -11,14 +12,22 @@
|
||||
MASK_REGS;
|
||||
|
||||
for(int site=0;site<Ns;site++) {
|
||||
int sU=lo.Reorder(ssU);
|
||||
|
||||
for(int s=0;s<Ls;s++) {
|
||||
ss =sU*Ls+s;
|
||||
|
||||
////////////////////////////////
|
||||
// Xp
|
||||
////////////////////////////////
|
||||
int ent=ss*8;// 2*Ndim
|
||||
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
|
||||
PREFETCH_CHIMU(basea);
|
||||
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
|
||||
PREFETCH_CHIMU(baseb);
|
||||
basec = st.GetInfo(ptypec,localc,permc,Zp,ent,plocal); ent++;
|
||||
PREFETCH_CHIMU(basec);
|
||||
|
||||
basex = basea;
|
||||
|
||||
if ( locala ) {
|
||||
@ -38,6 +47,7 @@
|
||||
// Yp
|
||||
////////////////////////////////
|
||||
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
|
||||
PREFETCH_CHIMU(basea);
|
||||
if ( localb ) {
|
||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||
YM_PROJMEM(baseb);
|
||||
@ -46,7 +56,7 @@
|
||||
LOAD_CHI(baseb);
|
||||
}
|
||||
{
|
||||
MULT_2SPIN_DIR_PFYP(Yp,basea);
|
||||
MULT_2SPIN_DIR_PFYP(Yp,basec);
|
||||
}
|
||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||
YM_RECON_ACCUM;
|
||||
@ -55,15 +65,16 @@
|
||||
// Zp
|
||||
////////////////////////////////
|
||||
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
|
||||
if ( locala ) {
|
||||
PREFETCH_CHIMU(baseb);
|
||||
if ( localc ) {
|
||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||
ZM_PROJMEM(basea);
|
||||
MAYBEPERM(PERMUTE_DIR1,perma);
|
||||
ZM_PROJMEM(basec);
|
||||
MAYBEPERM(PERMUTE_DIR1,permc);
|
||||
} else {
|
||||
LOAD_CHI(basea);
|
||||
LOAD_CHI(basec);
|
||||
}
|
||||
{
|
||||
MULT_2SPIN_DIR_PFZP(Zp,baseb);
|
||||
MULT_2SPIN_DIR_PFZP(Zp,basea);
|
||||
}
|
||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||
ZM_RECON_ACCUM;
|
||||
@ -71,16 +82,17 @@
|
||||
////////////////////////////////
|
||||
// Tp
|
||||
////////////////////////////////
|
||||
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
|
||||
if ( localb ) {
|
||||
basec = st.GetInfo(ptypec,localc,permc,Xp,ent,plocal); ent++;
|
||||
PREFETCH_CHIMU(basec);
|
||||
if ( locala ) {
|
||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||
TM_PROJMEM(baseb);
|
||||
MAYBEPERM(PERMUTE_DIR0,permb);
|
||||
TM_PROJMEM(basea);
|
||||
MAYBEPERM(PERMUTE_DIR0,perma);
|
||||
} else {
|
||||
LOAD_CHI(baseb);
|
||||
LOAD_CHI(basea);
|
||||
}
|
||||
{
|
||||
MULT_2SPIN_DIR_PFTP(Tp,basea);
|
||||
MULT_2SPIN_DIR_PFTP(Tp,baseb);
|
||||
}
|
||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||
TM_RECON_ACCUM;
|
||||
@ -88,16 +100,17 @@
|
||||
////////////////////////////////
|
||||
// Xm
|
||||
////////////////////////////////
|
||||
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
|
||||
if ( locala ) {
|
||||
basea = st.GetInfo(ptypea,locala,perma,Yp,ent,plocal); ent++;
|
||||
PREFETCH_CHIMU(basea);
|
||||
if ( localb ) {
|
||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||
XP_PROJMEM(basea);
|
||||
MAYBEPERM(PERMUTE_DIR3,perma);
|
||||
XP_PROJMEM(baseb);
|
||||
MAYBEPERM(PERMUTE_DIR3,permb);
|
||||
} else {
|
||||
LOAD_CHI(basea);
|
||||
LOAD_CHI(baseb);
|
||||
}
|
||||
{
|
||||
MULT_2SPIN_DIR_PFXM(Xm,baseb);
|
||||
MULT_2SPIN_DIR_PFXM(Xm,basec);
|
||||
}
|
||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||
XP_RECON_ACCUM;
|
||||
@ -105,13 +118,14 @@
|
||||
////////////////////////////////
|
||||
// Ym
|
||||
////////////////////////////////
|
||||
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
|
||||
if ( localb ) {
|
||||
baseb = st.GetInfo(ptypeb,localb,permb,Xp,ent,plocal); ent++;
|
||||
PREFETCH_CHIMU(baseb);
|
||||
if ( localc ) {
|
||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||
YP_PROJMEM(baseb);
|
||||
MAYBEPERM(PERMUTE_DIR2,permb);
|
||||
YP_PROJMEM(basec);
|
||||
MAYBEPERM(PERMUTE_DIR2,permc);
|
||||
} else {
|
||||
LOAD_CHI(baseb);
|
||||
LOAD_CHI(basec);
|
||||
}
|
||||
{
|
||||
MULT_2SPIN_DIR_PFYM(Ym,basea);
|
||||
@ -122,7 +136,8 @@
|
||||
////////////////////////////////
|
||||
// Zm
|
||||
////////////////////////////////
|
||||
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
|
||||
basec = st.GetInfo(ptypec,localc,permc,Yp,ent,plocal); ent++;
|
||||
PREFETCH_CHIMU(basec);
|
||||
if ( locala ) {
|
||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||
ZP_PROJMEM(basea);
|
||||
@ -140,6 +155,7 @@
|
||||
// Tm
|
||||
////////////////////////////////
|
||||
basea = (uint64_t)&out._odata[ss];
|
||||
PREFETCH_CHIMU(basea);
|
||||
if ( localb ) {
|
||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||
TP_PROJMEM(baseb);
|
||||
@ -148,17 +164,15 @@
|
||||
LOAD_CHI(baseb);
|
||||
}
|
||||
{
|
||||
MULT_2SPIN_DIR_PFTM(Tm,basea);
|
||||
MULT_2SPIN_DIR_PFTM(Tm,basec);
|
||||
}
|
||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||
TP_RECON_ACCUM;
|
||||
|
||||
PREFETCH_CHIMU(basex);
|
||||
// PREFETCH_CHIMU(basex);
|
||||
SAVE_RESULT(&out._odata[ss]);
|
||||
|
||||
|
||||
ss++;
|
||||
}
|
||||
sU++;
|
||||
}
|
||||
ssU++;
|
||||
}
|
||||
}
|
||||
|
163
lib/qcd/action/fermion/WilsonKernelsAsmBody.h.ab
Normal file
163
lib/qcd/action/fermion/WilsonKernelsAsmBody.h.ab
Normal file
@ -0,0 +1,163 @@
|
||||
{
|
||||
int locala,perma, ptypea;
|
||||
int localb,permb, ptypeb;
|
||||
uint64_t basea, baseb;
|
||||
uint64_t basex;
|
||||
const uint64_t plocal =(uint64_t) & in._odata[0];
|
||||
|
||||
// vComplexF isigns[2] = { signs[0], signs[1] };
|
||||
vComplexF *isigns = &signs[0];
|
||||
|
||||
MASK_REGS;
|
||||
|
||||
for(int site=0;site<Ns;site++) {
|
||||
int sU=lo.Reorder(ssU);
|
||||
for(int s=0;s<Ls;s++) {
|
||||
ss=sU*Ls+s;
|
||||
////////////////////////////////
|
||||
// Xp
|
||||
////////////////////////////////
|
||||
int ent=ss*8;// 2*Ndim
|
||||
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
|
||||
PREFETCH_CHIMU(basea);
|
||||
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
|
||||
basex = basea;
|
||||
|
||||
if ( locala ) {
|
||||
LOAD64(%r10,isigns);
|
||||
XM_PROJMEM(basea);
|
||||
MAYBEPERM(PERMUTE_DIR3,perma);
|
||||
} else {
|
||||
LOAD_CHI(basea);
|
||||
}
|
||||
{
|
||||
MULT_2SPIN_DIR_PFXP(Xp,baseb);
|
||||
}
|
||||
LOAD64(%r10,isigns);
|
||||
XM_RECON;
|
||||
|
||||
////////////////////////////////
|
||||
// Yp
|
||||
////////////////////////////////
|
||||
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
|
||||
if ( localb ) {
|
||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||
YM_PROJMEM(baseb);
|
||||
MAYBEPERM(PERMUTE_DIR2,permb);
|
||||
} else {
|
||||
LOAD_CHI(baseb);
|
||||
}
|
||||
{
|
||||
MULT_2SPIN_DIR_PFYP(Yp,basea);
|
||||
}
|
||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||
YM_RECON_ACCUM;
|
||||
|
||||
////////////////////////////////
|
||||
// Zp
|
||||
////////////////////////////////
|
||||
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
|
||||
if ( locala ) {
|
||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||
ZM_PROJMEM(basea);
|
||||
MAYBEPERM(PERMUTE_DIR1,perma);
|
||||
} else {
|
||||
LOAD_CHI(basea);
|
||||
}
|
||||
{
|
||||
MULT_2SPIN_DIR_PFZP(Zp,baseb);
|
||||
}
|
||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||
ZM_RECON_ACCUM;
|
||||
|
||||
////////////////////////////////
|
||||
// Tp
|
||||
////////////////////////////////
|
||||
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
|
||||
if ( localb ) {
|
||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||
TM_PROJMEM(baseb);
|
||||
MAYBEPERM(PERMUTE_DIR0,permb);
|
||||
} else {
|
||||
LOAD_CHI(baseb);
|
||||
}
|
||||
{
|
||||
MULT_2SPIN_DIR_PFTP(Tp,basea);
|
||||
}
|
||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||
TM_RECON_ACCUM;
|
||||
|
||||
////////////////////////////////
|
||||
// Xm
|
||||
////////////////////////////////
|
||||
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
|
||||
if ( locala ) {
|
||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||
XP_PROJMEM(basea);
|
||||
MAYBEPERM(PERMUTE_DIR3,perma);
|
||||
} else {
|
||||
LOAD_CHI(basea);
|
||||
}
|
||||
{
|
||||
MULT_2SPIN_DIR_PFXM(Xm,baseb);
|
||||
}
|
||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||
XP_RECON_ACCUM;
|
||||
|
||||
////////////////////////////////
|
||||
// Ym
|
||||
////////////////////////////////
|
||||
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
|
||||
if ( localb ) {
|
||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||
YP_PROJMEM(baseb);
|
||||
MAYBEPERM(PERMUTE_DIR2,permb);
|
||||
} else {
|
||||
LOAD_CHI(baseb);
|
||||
}
|
||||
{
|
||||
MULT_2SPIN_DIR_PFYM(Ym,basea);
|
||||
}
|
||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||
YP_RECON_ACCUM;
|
||||
|
||||
////////////////////////////////
|
||||
// Zm
|
||||
////////////////////////////////
|
||||
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
|
||||
if ( locala ) {
|
||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||
ZP_PROJMEM(basea);
|
||||
MAYBEPERM(PERMUTE_DIR1,perma);
|
||||
} else {
|
||||
LOAD_CHI(basea);
|
||||
}
|
||||
{
|
||||
MULT_2SPIN_DIR_PFZM(Zm,baseb);
|
||||
}
|
||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||
ZP_RECON_ACCUM;
|
||||
|
||||
////////////////////////////////
|
||||
// Tm
|
||||
////////////////////////////////
|
||||
basea = (uint64_t)&out._odata[ss];
|
||||
if ( localb ) {
|
||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||
TP_PROJMEM(baseb);
|
||||
MAYBEPERM(PERMUTE_DIR0,permb);
|
||||
} else {
|
||||
LOAD_CHI(baseb);
|
||||
}
|
||||
{
|
||||
MULT_2SPIN_DIR_PFTM(Tm,basea);
|
||||
}
|
||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||
TP_RECON_ACCUM;
|
||||
|
||||
SAVE_RESULT(&out._odata[ss]);
|
||||
|
||||
}
|
||||
ssU++;
|
||||
}
|
||||
}
|
@ -312,7 +312,7 @@ namespace QCD {
|
||||
|
||||
|
||||
template<class Impl>
|
||||
void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||
void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int ss,int sU,const FermionField &in, FermionField &out)
|
||||
{
|
||||
@ -555,7 +555,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeFiel
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||
void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int ss,int sU,const FermionField &in, FermionField &out)
|
||||
{
|
||||
@ -803,7 +803,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeF
|
||||
// Specialise Gparity to simple implementation
|
||||
////////////////////////////////////////////////
|
||||
template<>
|
||||
void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||
void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int sF,int sU,const FermionField &in, FermionField &out)
|
||||
{
|
||||
@ -811,7 +811,7 @@ void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,Dou
|
||||
}
|
||||
|
||||
template<>
|
||||
void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||
void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int sF,int sU,const FermionField &in, FermionField &out)
|
||||
{
|
||||
@ -819,7 +819,7 @@ void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,
|
||||
}
|
||||
|
||||
template<>
|
||||
void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||
void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int sF,int sU,const FermionField &in, FermionField &out)
|
||||
{
|
||||
@ -827,7 +827,7 @@ void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,Dou
|
||||
}
|
||||
|
||||
template<>
|
||||
void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||
void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int sF,int sU,const FermionField &in, FermionField &out)
|
||||
{
|
||||
@ -839,44 +839,44 @@ void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,
|
||||
////////////// Wilson ; uses this implementation /////////////////////
|
||||
// Need Nc=3 though //
|
||||
|
||||
template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||
template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int ss,int sU,const FermionField &in, FermionField &out);
|
||||
template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||
template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int ss,int sU,const FermionField &in, FermionField &out);
|
||||
template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||
template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int ss,int sU,const FermionField &in, FermionField &out);
|
||||
template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||
template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int ss,int sU,const FermionField &in, FermionField &out);
|
||||
|
||||
|
||||
template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||
template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int ss,int sU,const FermionField &in, FermionField &out);
|
||||
template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||
template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int ss,int sU,const FermionField &in, FermionField &out);
|
||||
template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||
template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int ss,int sU,const FermionField &in, FermionField &out);
|
||||
template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||
template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int ss,int sU,const FermionField &in, FermionField &out);
|
||||
|
||||
|
||||
template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||
template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int ss,int sU,const FermionField &in, FermionField &out);
|
||||
template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||
template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int ss,int sU,const FermionField &in, FermionField &out);
|
||||
template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||
template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int ss,int sU,const FermionField &in, FermionField &out);
|
||||
template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||
template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int ss,int sU,const FermionField &in, FermionField &out);
|
||||
|
||||
|
@ -88,7 +88,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
#define VPREFETCHG(O,A) "prefetcht0 "#O"*64("#A");\n"
|
||||
#define VPREFETCH2(O,A) "prefetcht1 "#O"*64("#A");\n"
|
||||
#define VPREFETCHP(O,A) "prefetcht1 "#O"*64("#A");\n"
|
||||
#define VPREFETCHW(O,A) "prefetchwt1 "#O"*64("#A");\n"
|
||||
#define VPREFETCHNTA(O,A)
|
||||
#define VPREFETCH(O,A)
|
||||
|
||||
#define VEVICT(O,A)
|
||||
|
||||
//"vprefetche0 "#O"*64("#A");\n" "vprefetche1 ("#O"+12)*64("#A");\n"
|
||||
@ -124,8 +128,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
#define ZLOADf(OFF,PTR,ri,ir) VLOADf(OFF,PTR,ir) VSHUFf(ir,ri)
|
||||
#define ZLOADd(OFF,PTR,ri,ir) VLOADd(OFF,PTR,ir) VSHUFd(ir,ri)
|
||||
|
||||
#define VPREFETCHNTA(O,A)
|
||||
#define VPREFETCH(O,A)
|
||||
|
||||
#define VSTOREf(OFF,PTR,SRC) "vmovaps " #SRC "," #OFF "*64(" #PTR ")" ";\n"
|
||||
#define VSTOREd(OFF,PTR,SRC) "vmovapd " #SRC "," #OFF "*64(" #PTR ")" ";\n"
|
||||
|
@ -559,22 +559,23 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
VSUB(UChi_02,result_22,result_22)\
|
||||
VSUB(UChi_12,result_32,result_32) );
|
||||
|
||||
#define PREFETCH_CHIMU(A) \
|
||||
#define PREFETCH_CHIMU(A)
|
||||
/*
|
||||
LOAD64(%r9,A) \
|
||||
__asm__ ( \
|
||||
VPREFETCHG(12,%r9)\
|
||||
VPREFETCHG(13,%r9)\
|
||||
VPREFETCHG(14,%r9)\
|
||||
VPREFETCHG(15,%r9)\
|
||||
VPREFETCHG(16,%r9)\
|
||||
VPREFETCHG(17,%r9)\
|
||||
VPREFETCHG(18,%r9)\
|
||||
VPREFETCHG(19,%r9)\
|
||||
VPREFETCHG(20,%r9)\
|
||||
VPREFETCHG(21,%r9)\
|
||||
VPREFETCHG(22,%r9)\
|
||||
VPREFETCHG(23,%r9));
|
||||
|
||||
VPREFETCHG(0,%r9)\
|
||||
VPREFETCHG(1,%r9)\
|
||||
VPREFETCHG(2,%r9)\
|
||||
VPREFETCHG(3,%r9)\
|
||||
VPREFETCHG(4,%r9)\
|
||||
VPREFETCHG(5,%r9)\
|
||||
VPREFETCHG(6,%r9)\
|
||||
VPREFETCHG(7,%r9)\
|
||||
VPREFETCHG(8,%r9)\
|
||||
VPREFETCHG(9,%r9)\
|
||||
VPREFETCHG(10,%r9)\
|
||||
VPREFETCHG(11,%r9));
|
||||
*/
|
||||
#define PERMUTE_DIR0 __asm__ ( \
|
||||
VPERM0(Chi_00,Chi_00) \
|
||||
VPERM0(Chi_01,Chi_01) \
|
||||
@ -612,8 +613,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
LOAD64(%r8,ptr) \
|
||||
LOAD64(%r9,pf) \
|
||||
__asm__ ( \
|
||||
VPREFETCH2(9,%r8) \
|
||||
VPREFETCH2(10,%r8) \
|
||||
VPREFETCH2(9,%r8) VPREFETCH2(10,%r8) \
|
||||
VPREFETCH2(11,%r8) \
|
||||
VPREFETCH2(12,%r8) \
|
||||
VPREFETCH2(13,%r8) \
|
||||
|
@ -49,16 +49,25 @@ LebesgueOrder::LebesgueOrder(GridBase *_grid)
|
||||
{
|
||||
grid = _grid;
|
||||
if ( Block[0]==0) ZGraph();
|
||||
else if ( Block[1]==0) NoBlocking();
|
||||
else CartesianBlocking();
|
||||
}
|
||||
|
||||
void LebesgueOrder::NoBlocking(void)
|
||||
{
|
||||
std::cout<<GridLogDebug<<"Lexicographic : no cache blocking"<<std::endl;
|
||||
_LebesgueReorder.resize(0);
|
||||
for ( int s = 0 ; s!= grid->oSites();s++){
|
||||
_LebesgueReorder.push_back(s);
|
||||
}
|
||||
}
|
||||
void LebesgueOrder::CartesianBlocking(void)
|
||||
{
|
||||
_LebesgueReorder.resize(0);
|
||||
|
||||
std::cout << GridLogMessage << " CartesianBlocking ";
|
||||
for(int d=0;d<Block.size();d++) std::cout <<Block[d]<<" ";
|
||||
std::cout<<std::endl;
|
||||
std::cout << GridLogDebug << " CartesianBlocking ";
|
||||
// for(int d=0;d<Block.size();d++) std::cout <<Block[d]<<" ";
|
||||
// std::cout<<std::endl;
|
||||
|
||||
IndexInteger ND = grid->_ndimension;
|
||||
|
||||
@ -116,7 +125,8 @@ void LebesgueOrder::IterateI(int ND,
|
||||
void LebesgueOrder::ZGraph(void)
|
||||
{
|
||||
_LebesgueReorder.resize(0);
|
||||
|
||||
|
||||
std::cout << GridLogDebug << " Lebesgue order "<<std::endl;
|
||||
// Align up dimensions to power of two.
|
||||
const IndexInteger one=1;
|
||||
|
||||
|
@ -59,6 +59,7 @@ namespace Grid {
|
||||
// Cartesian stencil blocking strategy
|
||||
/////////////////////////////////
|
||||
static std::vector<int> Block;
|
||||
void NoBlocking(void);
|
||||
void CartesianBlocking(void);
|
||||
void IterateO(int ND,int dim,
|
||||
std::vector<IndexInteger> & xo,
|
||||
|
Loading…
x
Reference in New Issue
Block a user