mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-10 07:55:35 +00:00
Enable reordering of the loops in the assembler for cache friendly.
This gets in the way of L2 prefetching however. Do next next link in stencil prefetching.
This commit is contained in:
parent
d6737e4bd8
commit
6d58cb2a68
358
benchmarks/Benchmark_dwf_sweep.cc
Normal file
358
benchmarks/Benchmark_dwf_sweep.cc
Normal file
@ -0,0 +1,358 @@
|
|||||||
|
|
||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./benchmarks/Benchmark_dwf.cc
|
||||||
|
|
||||||
|
Copyright (C) 2015
|
||||||
|
|
||||||
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
#include <Grid.h>
|
||||||
|
#include <PerfCount.h>
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
using namespace Grid;
|
||||||
|
using namespace Grid::QCD;
|
||||||
|
|
||||||
|
template<class d>
|
||||||
|
struct scal {
|
||||||
|
d internal;
|
||||||
|
};
|
||||||
|
|
||||||
|
Gamma::GammaMatrix Gmu [] = {
|
||||||
|
Gamma::GammaX,
|
||||||
|
Gamma::GammaY,
|
||||||
|
Gamma::GammaZ,
|
||||||
|
Gamma::GammaT
|
||||||
|
};
|
||||||
|
|
||||||
|
void benchDw(std::vector<int> & L, int Ls, int threads, int report =0 );
|
||||||
|
void benchsDw(std::vector<int> & L, int Ls, int threads, int report=0 );
|
||||||
|
|
||||||
|
int main (int argc, char ** argv)
|
||||||
|
{
|
||||||
|
Grid_init(&argc,&argv);
|
||||||
|
|
||||||
|
const int Ls=16;
|
||||||
|
int threads = GridThread::GetThreads();
|
||||||
|
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
|
||||||
|
|
||||||
|
if ( getenv("ASMOPT") ) {
|
||||||
|
QCD::WilsonKernelsStatic::AsmOpt=1;
|
||||||
|
} else {
|
||||||
|
QCD::WilsonKernelsStatic::AsmOpt=0;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "Volume \t\t\tProcs \t Dw \t eoDw \t sDw \t eosDw (Mflop/s) "<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
|
||||||
|
|
||||||
|
int Lmax=32;
|
||||||
|
if ( getenv("LMAX") ) Lmax=atoi(getenv("LMAX"));
|
||||||
|
for (int L=8;L<Lmax;L*=2){
|
||||||
|
std::vector<int> latt4(4,L);
|
||||||
|
for(int d=4;d>0;d--){
|
||||||
|
if ( d<=3 ) latt4[d]*=2;
|
||||||
|
std::cout << GridLogMessage <<"\t";
|
||||||
|
for(int d=0;d<Nd;d++){
|
||||||
|
std::cout<<latt4[d]<<"x";
|
||||||
|
}
|
||||||
|
std::cout <<Ls<<"\t" ;
|
||||||
|
benchDw (latt4,Ls,threads,0);
|
||||||
|
benchsDw(latt4,Ls,threads,0);
|
||||||
|
std::cout<<std::endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
|
||||||
|
{
|
||||||
|
std::vector<int> latt4(4,16);
|
||||||
|
std::cout<<GridLogMessage << "16^4 Dw miss rate"<<std::endl;
|
||||||
|
benchDw (latt4,Ls,threads,1);
|
||||||
|
std::cout<<GridLogMessage << "16^4 sDw miss rate"<<std::endl;
|
||||||
|
benchsDw(latt4,Ls,threads,1);
|
||||||
|
}
|
||||||
|
|
||||||
|
Grid_finalize();
|
||||||
|
}
|
||||||
|
|
||||||
|
#undef CHECK
|
||||||
|
|
||||||
|
void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
|
||||||
|
{
|
||||||
|
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
|
||||||
|
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
||||||
|
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
|
||||||
|
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
|
||||||
|
|
||||||
|
std::vector<int> seeds4({1,2,3,4});
|
||||||
|
std::vector<int> seeds5({5,6,7,8});
|
||||||
|
|
||||||
|
#ifdef CHECK
|
||||||
|
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
|
||||||
|
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
|
||||||
|
LatticeFermion src (FGrid); random(RNG5,src);
|
||||||
|
LatticeGaugeField Umu(UGrid);
|
||||||
|
random(RNG4,Umu);
|
||||||
|
#else
|
||||||
|
LatticeFermion src (FGrid); src=zero;
|
||||||
|
LatticeGaugeField Umu(UGrid); Umu=zero;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
LatticeFermion result(FGrid); result=zero;
|
||||||
|
LatticeFermion ref(FGrid); ref=zero;
|
||||||
|
LatticeFermion tmp(FGrid);
|
||||||
|
LatticeFermion err(FGrid);
|
||||||
|
|
||||||
|
ColourMatrix cm = Complex(1.0,0.0);
|
||||||
|
|
||||||
|
|
||||||
|
LatticeGaugeField Umu5d(FGrid);
|
||||||
|
|
||||||
|
// replicate across fifth dimension
|
||||||
|
for(int ss=0;ss<Umu._grid->oSites();ss++){
|
||||||
|
for(int s=0;s<Ls;s++){
|
||||||
|
Umu5d._odata[Ls*ss+s] = Umu._odata[ss];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////
|
||||||
|
// Naive wilson implementation
|
||||||
|
////////////////////////////////////
|
||||||
|
std::vector<LatticeColourMatrix> U(4,FGrid);
|
||||||
|
for(int mu=0;mu<Nd;mu++){
|
||||||
|
U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef CHECK
|
||||||
|
if (1)
|
||||||
|
{
|
||||||
|
ref = zero;
|
||||||
|
for(int mu=0;mu<Nd;mu++){
|
||||||
|
|
||||||
|
tmp = U[mu]*Cshift(src,mu+1,1);
|
||||||
|
ref=ref + tmp - Gamma(Gmu[mu])*tmp;
|
||||||
|
|
||||||
|
tmp =adj(U[mu])*src;
|
||||||
|
tmp =Cshift(tmp,mu+1,-1);
|
||||||
|
ref=ref + tmp + Gamma(Gmu[mu])*tmp;
|
||||||
|
}
|
||||||
|
ref = -0.5*ref;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
RealD mass=0.1;
|
||||||
|
RealD M5 =1.8;
|
||||||
|
RealD NP = UGrid->_Nprocessors;
|
||||||
|
|
||||||
|
DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
|
||||||
|
|
||||||
|
double t0=usecond();
|
||||||
|
Dw.Dhop(src,result,0);
|
||||||
|
double t1=usecond();
|
||||||
|
|
||||||
|
int ncall =1+(int) ((5.0*1000*1000)/(t1-t0));
|
||||||
|
|
||||||
|
if (ncall < 5 ) exit(0);
|
||||||
|
|
||||||
|
Dw.Dhop(src,result,0);
|
||||||
|
|
||||||
|
PerformanceCounter Counter(8);
|
||||||
|
Counter.Start();
|
||||||
|
t0=usecond();
|
||||||
|
for(int i=0;i<ncall;i++){
|
||||||
|
Dw.Dhop(src,result,0);
|
||||||
|
}
|
||||||
|
t1=usecond();
|
||||||
|
Counter.Stop();
|
||||||
|
if ( report ) {
|
||||||
|
Counter.Report();
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( ! report )
|
||||||
|
{
|
||||||
|
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
|
||||||
|
double flops=1344*volume*ncall;
|
||||||
|
std::cout <<"\t"<<NP<< "\t"<<flops/(t1-t0)<< "\t";
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef CHECK
|
||||||
|
err = ref-result;
|
||||||
|
RealD errd = norm2(err);
|
||||||
|
if ( errd> 1.0e-4 ) {
|
||||||
|
std::cout<<GridLogMessage << "oops !!! norm diff "<< norm2(err)<<std::endl;
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
LatticeFermion src_e (FrbGrid);
|
||||||
|
LatticeFermion src_o (FrbGrid);
|
||||||
|
LatticeFermion r_e (FrbGrid);
|
||||||
|
LatticeFermion r_o (FrbGrid);
|
||||||
|
LatticeFermion r_eo (FGrid);
|
||||||
|
|
||||||
|
pickCheckerboard(Even,src_e,src);
|
||||||
|
pickCheckerboard(Odd,src_o,src);
|
||||||
|
|
||||||
|
{
|
||||||
|
Dw.DhopEO(src_o,r_e,DaggerNo);
|
||||||
|
double t0=usecond();
|
||||||
|
for(int i=0;i<ncall;i++){
|
||||||
|
Dw.DhopEO(src_o,r_e,DaggerNo);
|
||||||
|
}
|
||||||
|
double t1=usecond();
|
||||||
|
|
||||||
|
if(!report){
|
||||||
|
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
|
||||||
|
double flops=(1344.0*volume*ncall)/2;
|
||||||
|
std::cout<< flops/(t1-t0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
#undef CHECK_SDW
|
||||||
|
void benchsDw(std::vector<int> & latt4, int Ls, int threads, int report )
|
||||||
|
{
|
||||||
|
|
||||||
|
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
|
||||||
|
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
||||||
|
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
|
||||||
|
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
|
||||||
|
GridCartesian * sUGrid = SpaceTimeGrid::makeFourDimDWFGrid(latt4,GridDefaultMpi());
|
||||||
|
GridCartesian * sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
|
||||||
|
GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
|
||||||
|
|
||||||
|
std::vector<int> seeds4({1,2,3,4});
|
||||||
|
std::vector<int> seeds5({5,6,7,8});
|
||||||
|
|
||||||
|
#ifdef CHECK_SDW
|
||||||
|
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
|
||||||
|
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
|
||||||
|
LatticeFermion src (FGrid); random(RNG5,src);
|
||||||
|
LatticeGaugeField Umu(UGrid);
|
||||||
|
random(RNG4,Umu);
|
||||||
|
#else
|
||||||
|
LatticeFermion src (FGrid); src=zero;
|
||||||
|
LatticeGaugeField Umu(UGrid); Umu=zero;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
LatticeFermion result(FGrid); result=zero;
|
||||||
|
LatticeFermion ref(FGrid); ref=zero;
|
||||||
|
LatticeFermion tmp(FGrid);
|
||||||
|
LatticeFermion err(FGrid);
|
||||||
|
|
||||||
|
ColourMatrix cm = Complex(1.0,0.0);
|
||||||
|
|
||||||
|
LatticeGaugeField Umu5d(FGrid);
|
||||||
|
|
||||||
|
// replicate across fifth dimension
|
||||||
|
for(int ss=0;ss<Umu._grid->oSites();ss++){
|
||||||
|
for(int s=0;s<Ls;s++){
|
||||||
|
Umu5d._odata[Ls*ss+s] = Umu._odata[ss];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
RealD mass=0.1;
|
||||||
|
RealD M5 =1.8;
|
||||||
|
|
||||||
|
typedef WilsonFermion5D<DomainWallRedBlack5dImplF> WilsonFermion5DF;
|
||||||
|
LatticeFermionF ssrc(sFGrid);
|
||||||
|
LatticeFermionF sref(sFGrid);
|
||||||
|
LatticeFermionF sresult(sFGrid);
|
||||||
|
WilsonFermion5DF sDw(1,Umu,*sFGrid,*sFrbGrid,*sUGrid,M5);
|
||||||
|
|
||||||
|
for(int x=0;x<latt4[0];x++){
|
||||||
|
for(int y=0;y<latt4[1];y++){
|
||||||
|
for(int z=0;z<latt4[2];z++){
|
||||||
|
for(int t=0;t<latt4[3];t++){
|
||||||
|
for(int s=0;s<Ls;s++){
|
||||||
|
std::vector<int> site({s,x,y,z,t});
|
||||||
|
SpinColourVectorF tmp;
|
||||||
|
peekSite(tmp,src,site);
|
||||||
|
pokeSite(tmp,ssrc,site);
|
||||||
|
}}}}}
|
||||||
|
|
||||||
|
double t0=usecond();
|
||||||
|
sDw.Dhop(ssrc,sresult,0);
|
||||||
|
double t1=usecond();
|
||||||
|
|
||||||
|
int ncall =1+(int) ((5.0*1000*1000)/(t1-t0));
|
||||||
|
|
||||||
|
PerformanceCounter Counter(8);
|
||||||
|
Counter.Start();
|
||||||
|
t0=usecond();
|
||||||
|
for(int i=0;i<ncall;i++){
|
||||||
|
sDw.Dhop(ssrc,sresult,0);
|
||||||
|
}
|
||||||
|
t1=usecond();
|
||||||
|
Counter.Stop();
|
||||||
|
|
||||||
|
if ( report ) {
|
||||||
|
Counter.Report();
|
||||||
|
} else {
|
||||||
|
|
||||||
|
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
|
||||||
|
double flops=1344*volume*ncall;
|
||||||
|
std::cout<<"\t"<< flops/(t1-t0);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
LatticeFermionF sr_eo(sFGrid);
|
||||||
|
LatticeFermionF serr(sFGrid);
|
||||||
|
|
||||||
|
LatticeFermion ssrc_e (sFrbGrid);
|
||||||
|
LatticeFermion ssrc_o (sFrbGrid);
|
||||||
|
LatticeFermion sr_e (sFrbGrid);
|
||||||
|
LatticeFermion sr_o (sFrbGrid);
|
||||||
|
|
||||||
|
pickCheckerboard(Even,ssrc_e,ssrc);
|
||||||
|
pickCheckerboard(Odd,ssrc_o,ssrc);
|
||||||
|
|
||||||
|
setCheckerboard(sr_eo,ssrc_o);
|
||||||
|
setCheckerboard(sr_eo,ssrc_e);
|
||||||
|
|
||||||
|
sr_e = zero;
|
||||||
|
sr_o = zero;
|
||||||
|
|
||||||
|
sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
|
||||||
|
PerformanceCounter CounterSdw(8);
|
||||||
|
CounterSdw.Start();
|
||||||
|
t0=usecond();
|
||||||
|
for(int i=0;i<ncall;i++){
|
||||||
|
sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
|
||||||
|
}
|
||||||
|
t1=usecond();
|
||||||
|
CounterSdw.Stop();
|
||||||
|
|
||||||
|
if ( report ) {
|
||||||
|
CounterSdw.Report();
|
||||||
|
} else {
|
||||||
|
|
||||||
|
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
|
||||||
|
double flops=(1344.0*volume*ncall)/2;
|
||||||
|
std::cout<<"\t"<< flops/(t1-t0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
@ -63,7 +63,7 @@ namespace Grid {
|
|||||||
Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);// eps is ignored for higham
|
Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);// eps is ignored for higham
|
||||||
assert(zdata->n==this->Ls);
|
assert(zdata->n==this->Ls);
|
||||||
|
|
||||||
std::cout<<GridLogMessage << "DomainWallFermion with Ls="<<this->Ls<<std::endl;
|
// std::cout<<GridLogMessage << "DomainWallFermion with Ls="<<this->Ls<<std::endl;
|
||||||
// Call base setter
|
// Call base setter
|
||||||
this->SetCoefficientsTanh(zdata,1.0,0.0);
|
this->SetCoefficientsTanh(zdata,1.0,0.0);
|
||||||
|
|
||||||
|
@ -53,6 +53,8 @@ namespace QCD {
|
|||||||
StencilEven(&Hgrid,npoint,Even,directions,displacements), // source is Even
|
StencilEven(&Hgrid,npoint,Even,directions,displacements), // source is Even
|
||||||
StencilOdd (&Hgrid,npoint,Odd ,directions,displacements), // source is Odd
|
StencilOdd (&Hgrid,npoint,Odd ,directions,displacements), // source is Odd
|
||||||
mass(_mass),
|
mass(_mass),
|
||||||
|
Lebesgue(_grid),
|
||||||
|
LebesgueEvenOdd(_cbgrid),
|
||||||
Umu(&Fgrid),
|
Umu(&Fgrid),
|
||||||
UmuEven(&Hgrid),
|
UmuEven(&Hgrid),
|
||||||
UmuOdd (&Hgrid)
|
UmuOdd (&Hgrid)
|
||||||
@ -228,7 +230,7 @@ PARALLEL_FOR_LOOP
|
|||||||
|
|
||||||
out.checkerboard = in.checkerboard;
|
out.checkerboard = in.checkerboard;
|
||||||
|
|
||||||
DhopInternal(Stencil,Umu,in,out,dag);
|
DhopInternal(Stencil,Lebesgue,Umu,in,out,dag);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
@ -239,7 +241,7 @@ PARALLEL_FOR_LOOP
|
|||||||
assert(in.checkerboard==Even);
|
assert(in.checkerboard==Even);
|
||||||
out.checkerboard = Odd;
|
out.checkerboard = Odd;
|
||||||
|
|
||||||
DhopInternal(StencilEven,UmuOdd,in,out,dag);
|
DhopInternal(StencilEven,LebesgueEvenOdd,UmuOdd,in,out,dag);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
@ -250,7 +252,7 @@ PARALLEL_FOR_LOOP
|
|||||||
assert(in.checkerboard==Odd);
|
assert(in.checkerboard==Odd);
|
||||||
out.checkerboard = Even;
|
out.checkerboard = Even;
|
||||||
|
|
||||||
DhopInternal(StencilOdd,UmuEven,in,out,dag);
|
DhopInternal(StencilOdd,LebesgueEvenOdd,UmuEven,in,out,dag);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
@ -285,7 +287,7 @@ PARALLEL_FOR_LOOP
|
|||||||
};
|
};
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonFermion<Impl>::DhopInternal(StencilImpl & st,DoubledGaugeField & U,
|
void WilsonFermion<Impl>::DhopInternal(StencilImpl & st,LebesgueOrder& lo,DoubledGaugeField & U,
|
||||||
const FermionField &in, FermionField &out,int dag)
|
const FermionField &in, FermionField &out,int dag)
|
||||||
{
|
{
|
||||||
assert((dag==DaggerNo) ||(dag==DaggerYes));
|
assert((dag==DaggerNo) ||(dag==DaggerYes));
|
||||||
@ -296,12 +298,12 @@ PARALLEL_FOR_LOOP
|
|||||||
if ( dag == DaggerYes ) {
|
if ( dag == DaggerYes ) {
|
||||||
PARALLEL_FOR_LOOP
|
PARALLEL_FOR_LOOP
|
||||||
for(int sss=0;sss<in._grid->oSites();sss++){
|
for(int sss=0;sss<in._grid->oSites();sss++){
|
||||||
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,1,1,in,out);
|
Kernels::DiracOptDhopSiteDag(st,lo,U,st.comm_buf,sss,sss,1,1,in,out);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
PARALLEL_FOR_LOOP
|
PARALLEL_FOR_LOOP
|
||||||
for(int sss=0;sss<in._grid->oSites();sss++){
|
for(int sss=0;sss<in._grid->oSites();sss++){
|
||||||
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,1,1,in,out);
|
Kernels::DiracOptDhopSite(st,lo,U,st.comm_buf,sss,sss,1,1,in,out);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
@ -111,7 +111,7 @@ namespace Grid {
|
|||||||
const FermionField &B,
|
const FermionField &B,
|
||||||
int dag);
|
int dag);
|
||||||
|
|
||||||
void DhopInternal(StencilImpl & st,DoubledGaugeField & U,
|
void DhopInternal(StencilImpl & st,LebesgueOrder & lo,DoubledGaugeField & U,
|
||||||
const FermionField &in, FermionField &out,int dag) ;
|
const FermionField &in, FermionField &out,int dag) ;
|
||||||
|
|
||||||
// Constructor
|
// Constructor
|
||||||
@ -146,6 +146,10 @@ namespace Grid {
|
|||||||
DoubledGaugeField Umu;
|
DoubledGaugeField Umu;
|
||||||
DoubledGaugeField UmuEven;
|
DoubledGaugeField UmuEven;
|
||||||
DoubledGaugeField UmuOdd;
|
DoubledGaugeField UmuOdd;
|
||||||
|
|
||||||
|
LebesgueOrder Lebesgue;
|
||||||
|
LebesgueOrder LebesgueEvenOdd;
|
||||||
|
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -321,14 +321,14 @@ PARALLEL_FOR_LOOP
|
|||||||
for(int ss=0;ss<U._grid->oSites();ss++){
|
for(int ss=0;ss<U._grid->oSites();ss++){
|
||||||
int sU=ss;
|
int sU=ss;
|
||||||
int sF=LLs*sU;
|
int sF=LLs*sU;
|
||||||
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,LLs,1,in,out);
|
Kernels::DiracOptDhopSiteDag(st,lo,U,st.comm_buf,sF,sU,LLs,1,in,out);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
PARALLEL_FOR_LOOP
|
PARALLEL_FOR_LOOP
|
||||||
for(int ss=0;ss<U._grid->oSites();ss++){
|
for(int ss=0;ss<U._grid->oSites();ss++){
|
||||||
int sU=ss;
|
int sU=ss;
|
||||||
int sF=LLs*sU;
|
int sF=LLs*sU;
|
||||||
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,LLs,1,in,out);
|
Kernels::DiracOptDhopSite(st,lo,U,st.comm_buf,sF,sU,LLs,1,in,out);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -38,20 +38,20 @@ template<class Impl>
|
|||||||
WilsonKernels<Impl>::WilsonKernels(const ImplParams &p): Base(p) {};
|
WilsonKernels<Impl>::WilsonKernels(const ImplParams &p): Base(p) {};
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out)
|
int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
if ( AsmOpt ) {
|
if ( AsmOpt ) {
|
||||||
|
|
||||||
WilsonKernels<Impl>::DiracOptAsmDhopSite(st,U,buf,sF,sU,Ls,Ns,in,out);
|
WilsonKernels<Impl>::DiracOptAsmDhopSite(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
for(int site=0;site<Ns;site++) {
|
for(int site=0;site<Ns;site++) {
|
||||||
for(int s=0;s<Ls;s++) {
|
for(int s=0;s<Ls;s++) {
|
||||||
if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSite(st,U,buf,sF,sU,in,out);
|
if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSite(st,lo,U,buf,sF,sU,in,out);
|
||||||
else WilsonKernels<Impl>::DiracOptGenericDhopSite(st,U,buf,sF,sU,in,out);
|
else WilsonKernels<Impl>::DiracOptGenericDhopSite(st,lo,U,buf,sF,sU,in,out);
|
||||||
sF++;
|
sF++;
|
||||||
}
|
}
|
||||||
sU++;
|
sU++;
|
||||||
@ -61,17 +61,17 @@ void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
|||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out)
|
int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
// No asm implementation yet.
|
// No asm implementation yet.
|
||||||
// if ( AsmOpt ) WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st,U,buf,sF,sU,in,out);
|
// if ( AsmOpt ) WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
|
||||||
// else
|
// else
|
||||||
for(int site=0;site<Ns;site++) {
|
for(int site=0;site<Ns;site++) {
|
||||||
for(int s=0;s<Ls;s++) {
|
for(int s=0;s<Ls;s++) {
|
||||||
if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st,U,buf,sF,sU,in,out);
|
if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
|
||||||
else WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,U,buf,sF,sU,in,out);
|
else WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
|
||||||
sF++;
|
sF++;
|
||||||
}
|
}
|
||||||
sU++;
|
sU++;
|
||||||
@ -84,7 +84,7 @@ void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField
|
|||||||
////////////////////////////////////////////
|
////////////////////////////////////////////
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU,const FermionField &in, FermionField &out)
|
int sF,int sU,const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
@ -262,7 +262,7 @@ void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(StencilImpl &st,DoubledGaug
|
|||||||
|
|
||||||
// Need controls to do interior, exterior, or both
|
// Need controls to do interior, exterior, or both
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonKernels<Impl>::DiracOptGenericDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
void WilsonKernels<Impl>::DiracOptGenericDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU,const FermionField &in, FermionField &out)
|
int sF,int sU,const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
|
@ -53,11 +53,11 @@ namespace Grid {
|
|||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
void DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
void DiracOptDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF, int sU,int Ls, int Ns, const FermionField &in, FermionField &out);
|
int sF, int sU,int Ls, int Ns, const FermionField &in, FermionField &out);
|
||||||
|
|
||||||
void DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
void DiracOptDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU,int Ls, int Ns, const FermionField &in,FermionField &out);
|
int sF,int sU,int Ls, int Ns, const FermionField &in,FermionField &out);
|
||||||
|
|
||||||
@ -67,24 +67,24 @@ namespace Grid {
|
|||||||
|
|
||||||
private:
|
private:
|
||||||
// Specialised variants
|
// Specialised variants
|
||||||
void DiracOptGenericDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
void DiracOptGenericDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU, const FermionField &in, FermionField &out);
|
int sF,int sU, const FermionField &in, FermionField &out);
|
||||||
|
|
||||||
void DiracOptGenericDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
void DiracOptGenericDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU,const FermionField &in,FermionField &out);
|
int sF,int sU,const FermionField &in,FermionField &out);
|
||||||
|
|
||||||
void DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
void DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out);
|
int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out);
|
||||||
|
|
||||||
|
|
||||||
void DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
void DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU,const FermionField &in, FermionField &out);
|
int sF,int sU,const FermionField &in, FermionField &out);
|
||||||
|
|
||||||
void DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
void DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU,const FermionField &in, FermionField &out);
|
int sF,int sU,const FermionField &in, FermionField &out);
|
||||||
public:
|
public:
|
||||||
|
@ -39,9 +39,9 @@ namespace QCD {
|
|||||||
// Default to no assembler implementation
|
// Default to no assembler implementation
|
||||||
///////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
assert(0);
|
assert(0);
|
||||||
}
|
}
|
||||||
@ -71,9 +71,9 @@ static int signInit = setupSigns();
|
|||||||
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
|
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
|
||||||
|
|
||||||
template<>
|
template<>
|
||||||
void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||||
|
|
||||||
#undef VMOVIDUP
|
#undef VMOVIDUP
|
||||||
@ -85,31 +85,31 @@ void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaug
|
|||||||
#define VMOVRDUP(A,B,C) VBCASTRDUPf(A,B,C)
|
#define VMOVRDUP(A,B,C) VBCASTRDUPf(A,B,C)
|
||||||
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
|
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
|
||||||
template<>
|
template<>
|
||||||
void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
template void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
template void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
||||||
|
|
||||||
template void WilsonKernels<WilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
template void WilsonKernels<WilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
||||||
template void WilsonKernels<GparityWilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
template void WilsonKernels<GparityWilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
||||||
template void WilsonKernels<GparityWilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
template void WilsonKernels<GparityWilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
||||||
template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
||||||
template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
||||||
}}
|
}}
|
||||||
|
|
||||||
|
@ -1,7 +1,8 @@
|
|||||||
{
|
{
|
||||||
int locala,perma, ptypea;
|
int locala,perma, ptypea;
|
||||||
int localb,permb, ptypeb;
|
int localb,permb, ptypeb;
|
||||||
uint64_t basea, baseb;
|
int localc,permc, ptypec;
|
||||||
|
uint64_t basea, baseb, basec;
|
||||||
uint64_t basex;
|
uint64_t basex;
|
||||||
const uint64_t plocal =(uint64_t) & in._odata[0];
|
const uint64_t plocal =(uint64_t) & in._odata[0];
|
||||||
|
|
||||||
@ -11,14 +12,22 @@
|
|||||||
MASK_REGS;
|
MASK_REGS;
|
||||||
|
|
||||||
for(int site=0;site<Ns;site++) {
|
for(int site=0;site<Ns;site++) {
|
||||||
|
int sU=lo.Reorder(ssU);
|
||||||
|
|
||||||
for(int s=0;s<Ls;s++) {
|
for(int s=0;s<Ls;s++) {
|
||||||
|
ss =sU*Ls+s;
|
||||||
|
|
||||||
////////////////////////////////
|
////////////////////////////////
|
||||||
// Xp
|
// Xp
|
||||||
////////////////////////////////
|
////////////////////////////////
|
||||||
int ent=ss*8;// 2*Ndim
|
int ent=ss*8;// 2*Ndim
|
||||||
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
|
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
|
||||||
|
PREFETCH_CHIMU(basea);
|
||||||
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
|
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
|
||||||
|
PREFETCH_CHIMU(baseb);
|
||||||
|
basec = st.GetInfo(ptypec,localc,permc,Zp,ent,plocal); ent++;
|
||||||
|
PREFETCH_CHIMU(basec);
|
||||||
|
|
||||||
basex = basea;
|
basex = basea;
|
||||||
|
|
||||||
if ( locala ) {
|
if ( locala ) {
|
||||||
@ -38,6 +47,7 @@
|
|||||||
// Yp
|
// Yp
|
||||||
////////////////////////////////
|
////////////////////////////////
|
||||||
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
|
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
|
||||||
|
PREFETCH_CHIMU(basea);
|
||||||
if ( localb ) {
|
if ( localb ) {
|
||||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
YM_PROJMEM(baseb);
|
YM_PROJMEM(baseb);
|
||||||
@ -46,7 +56,7 @@
|
|||||||
LOAD_CHI(baseb);
|
LOAD_CHI(baseb);
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
MULT_2SPIN_DIR_PFYP(Yp,basea);
|
MULT_2SPIN_DIR_PFYP(Yp,basec);
|
||||||
}
|
}
|
||||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
YM_RECON_ACCUM;
|
YM_RECON_ACCUM;
|
||||||
@ -55,15 +65,16 @@
|
|||||||
// Zp
|
// Zp
|
||||||
////////////////////////////////
|
////////////////////////////////
|
||||||
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
|
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
|
||||||
if ( locala ) {
|
PREFETCH_CHIMU(baseb);
|
||||||
|
if ( localc ) {
|
||||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
ZM_PROJMEM(basea);
|
ZM_PROJMEM(basec);
|
||||||
MAYBEPERM(PERMUTE_DIR1,perma);
|
MAYBEPERM(PERMUTE_DIR1,permc);
|
||||||
} else {
|
} else {
|
||||||
LOAD_CHI(basea);
|
LOAD_CHI(basec);
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
MULT_2SPIN_DIR_PFZP(Zp,baseb);
|
MULT_2SPIN_DIR_PFZP(Zp,basea);
|
||||||
}
|
}
|
||||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
ZM_RECON_ACCUM;
|
ZM_RECON_ACCUM;
|
||||||
@ -71,16 +82,17 @@
|
|||||||
////////////////////////////////
|
////////////////////////////////
|
||||||
// Tp
|
// Tp
|
||||||
////////////////////////////////
|
////////////////////////////////
|
||||||
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
|
basec = st.GetInfo(ptypec,localc,permc,Xp,ent,plocal); ent++;
|
||||||
if ( localb ) {
|
PREFETCH_CHIMU(basec);
|
||||||
|
if ( locala ) {
|
||||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
TM_PROJMEM(baseb);
|
TM_PROJMEM(basea);
|
||||||
MAYBEPERM(PERMUTE_DIR0,permb);
|
MAYBEPERM(PERMUTE_DIR0,perma);
|
||||||
} else {
|
} else {
|
||||||
LOAD_CHI(baseb);
|
LOAD_CHI(basea);
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
MULT_2SPIN_DIR_PFTP(Tp,basea);
|
MULT_2SPIN_DIR_PFTP(Tp,baseb);
|
||||||
}
|
}
|
||||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
TM_RECON_ACCUM;
|
TM_RECON_ACCUM;
|
||||||
@ -88,16 +100,17 @@
|
|||||||
////////////////////////////////
|
////////////////////////////////
|
||||||
// Xm
|
// Xm
|
||||||
////////////////////////////////
|
////////////////////////////////
|
||||||
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
|
basea = st.GetInfo(ptypea,locala,perma,Yp,ent,plocal); ent++;
|
||||||
if ( locala ) {
|
PREFETCH_CHIMU(basea);
|
||||||
|
if ( localb ) {
|
||||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
XP_PROJMEM(basea);
|
XP_PROJMEM(baseb);
|
||||||
MAYBEPERM(PERMUTE_DIR3,perma);
|
MAYBEPERM(PERMUTE_DIR3,permb);
|
||||||
} else {
|
} else {
|
||||||
LOAD_CHI(basea);
|
LOAD_CHI(baseb);
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
MULT_2SPIN_DIR_PFXM(Xm,baseb);
|
MULT_2SPIN_DIR_PFXM(Xm,basec);
|
||||||
}
|
}
|
||||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
XP_RECON_ACCUM;
|
XP_RECON_ACCUM;
|
||||||
@ -105,13 +118,14 @@
|
|||||||
////////////////////////////////
|
////////////////////////////////
|
||||||
// Ym
|
// Ym
|
||||||
////////////////////////////////
|
////////////////////////////////
|
||||||
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
|
baseb = st.GetInfo(ptypeb,localb,permb,Xp,ent,plocal); ent++;
|
||||||
if ( localb ) {
|
PREFETCH_CHIMU(baseb);
|
||||||
|
if ( localc ) {
|
||||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
YP_PROJMEM(baseb);
|
YP_PROJMEM(basec);
|
||||||
MAYBEPERM(PERMUTE_DIR2,permb);
|
MAYBEPERM(PERMUTE_DIR2,permc);
|
||||||
} else {
|
} else {
|
||||||
LOAD_CHI(baseb);
|
LOAD_CHI(basec);
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
MULT_2SPIN_DIR_PFYM(Ym,basea);
|
MULT_2SPIN_DIR_PFYM(Ym,basea);
|
||||||
@ -122,7 +136,8 @@
|
|||||||
////////////////////////////////
|
////////////////////////////////
|
||||||
// Zm
|
// Zm
|
||||||
////////////////////////////////
|
////////////////////////////////
|
||||||
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
|
basec = st.GetInfo(ptypec,localc,permc,Yp,ent,plocal); ent++;
|
||||||
|
PREFETCH_CHIMU(basec);
|
||||||
if ( locala ) {
|
if ( locala ) {
|
||||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
ZP_PROJMEM(basea);
|
ZP_PROJMEM(basea);
|
||||||
@ -140,6 +155,7 @@
|
|||||||
// Tm
|
// Tm
|
||||||
////////////////////////////////
|
////////////////////////////////
|
||||||
basea = (uint64_t)&out._odata[ss];
|
basea = (uint64_t)&out._odata[ss];
|
||||||
|
PREFETCH_CHIMU(basea);
|
||||||
if ( localb ) {
|
if ( localb ) {
|
||||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
TP_PROJMEM(baseb);
|
TP_PROJMEM(baseb);
|
||||||
@ -148,17 +164,15 @@
|
|||||||
LOAD_CHI(baseb);
|
LOAD_CHI(baseb);
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
MULT_2SPIN_DIR_PFTM(Tm,basea);
|
MULT_2SPIN_DIR_PFTM(Tm,basec);
|
||||||
}
|
}
|
||||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
TP_RECON_ACCUM;
|
TP_RECON_ACCUM;
|
||||||
|
|
||||||
PREFETCH_CHIMU(basex);
|
// PREFETCH_CHIMU(basex);
|
||||||
SAVE_RESULT(&out._odata[ss]);
|
SAVE_RESULT(&out._odata[ss]);
|
||||||
|
|
||||||
|
|
||||||
ss++;
|
}
|
||||||
}
|
ssU++;
|
||||||
sU++;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
163
lib/qcd/action/fermion/WilsonKernelsAsmBody.h.ab
Normal file
163
lib/qcd/action/fermion/WilsonKernelsAsmBody.h.ab
Normal file
@ -0,0 +1,163 @@
|
|||||||
|
{
|
||||||
|
int locala,perma, ptypea;
|
||||||
|
int localb,permb, ptypeb;
|
||||||
|
uint64_t basea, baseb;
|
||||||
|
uint64_t basex;
|
||||||
|
const uint64_t plocal =(uint64_t) & in._odata[0];
|
||||||
|
|
||||||
|
// vComplexF isigns[2] = { signs[0], signs[1] };
|
||||||
|
vComplexF *isigns = &signs[0];
|
||||||
|
|
||||||
|
MASK_REGS;
|
||||||
|
|
||||||
|
for(int site=0;site<Ns;site++) {
|
||||||
|
int sU=lo.Reorder(ssU);
|
||||||
|
for(int s=0;s<Ls;s++) {
|
||||||
|
ss=sU*Ls+s;
|
||||||
|
////////////////////////////////
|
||||||
|
// Xp
|
||||||
|
////////////////////////////////
|
||||||
|
int ent=ss*8;// 2*Ndim
|
||||||
|
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
|
||||||
|
PREFETCH_CHIMU(basea);
|
||||||
|
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
|
||||||
|
basex = basea;
|
||||||
|
|
||||||
|
if ( locala ) {
|
||||||
|
LOAD64(%r10,isigns);
|
||||||
|
XM_PROJMEM(basea);
|
||||||
|
MAYBEPERM(PERMUTE_DIR3,perma);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(basea);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFXP(Xp,baseb);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns);
|
||||||
|
XM_RECON;
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Yp
|
||||||
|
////////////////////////////////
|
||||||
|
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
|
||||||
|
if ( localb ) {
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
YM_PROJMEM(baseb);
|
||||||
|
MAYBEPERM(PERMUTE_DIR2,permb);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(baseb);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFYP(Yp,basea);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
YM_RECON_ACCUM;
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Zp
|
||||||
|
////////////////////////////////
|
||||||
|
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
|
||||||
|
if ( locala ) {
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
ZM_PROJMEM(basea);
|
||||||
|
MAYBEPERM(PERMUTE_DIR1,perma);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(basea);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFZP(Zp,baseb);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
ZM_RECON_ACCUM;
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Tp
|
||||||
|
////////////////////////////////
|
||||||
|
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
|
||||||
|
if ( localb ) {
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
TM_PROJMEM(baseb);
|
||||||
|
MAYBEPERM(PERMUTE_DIR0,permb);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(baseb);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFTP(Tp,basea);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
TM_RECON_ACCUM;
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Xm
|
||||||
|
////////////////////////////////
|
||||||
|
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
|
||||||
|
if ( locala ) {
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
XP_PROJMEM(basea);
|
||||||
|
MAYBEPERM(PERMUTE_DIR3,perma);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(basea);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFXM(Xm,baseb);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
XP_RECON_ACCUM;
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Ym
|
||||||
|
////////////////////////////////
|
||||||
|
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
|
||||||
|
if ( localb ) {
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
YP_PROJMEM(baseb);
|
||||||
|
MAYBEPERM(PERMUTE_DIR2,permb);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(baseb);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFYM(Ym,basea);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
YP_RECON_ACCUM;
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Zm
|
||||||
|
////////////////////////////////
|
||||||
|
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
|
||||||
|
if ( locala ) {
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
ZP_PROJMEM(basea);
|
||||||
|
MAYBEPERM(PERMUTE_DIR1,perma);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(basea);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFZM(Zm,baseb);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
ZP_RECON_ACCUM;
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Tm
|
||||||
|
////////////////////////////////
|
||||||
|
basea = (uint64_t)&out._odata[ss];
|
||||||
|
if ( localb ) {
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
TP_PROJMEM(baseb);
|
||||||
|
MAYBEPERM(PERMUTE_DIR0,permb);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(baseb);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFTM(Tm,basea);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
TP_RECON_ACCUM;
|
||||||
|
|
||||||
|
SAVE_RESULT(&out._odata[ss]);
|
||||||
|
|
||||||
|
}
|
||||||
|
ssU++;
|
||||||
|
}
|
||||||
|
}
|
@ -312,7 +312,7 @@ namespace QCD {
|
|||||||
|
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out)
|
int ss,int sU,const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
@ -555,7 +555,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeFiel
|
|||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out)
|
int ss,int sU,const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
@ -803,7 +803,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeF
|
|||||||
// Specialise Gparity to simple implementation
|
// Specialise Gparity to simple implementation
|
||||||
////////////////////////////////////////////////
|
////////////////////////////////////////////////
|
||||||
template<>
|
template<>
|
||||||
void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU,const FermionField &in, FermionField &out)
|
int sF,int sU,const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
@ -811,7 +811,7 @@ void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,Dou
|
|||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template<>
|
||||||
void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU,const FermionField &in, FermionField &out)
|
int sF,int sU,const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
@ -819,7 +819,7 @@ void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,
|
|||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template<>
|
||||||
void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU,const FermionField &in, FermionField &out)
|
int sF,int sU,const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
@ -827,7 +827,7 @@ void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,Dou
|
|||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template<>
|
||||||
void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU,const FermionField &in, FermionField &out)
|
int sF,int sU,const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
@ -839,44 +839,44 @@ void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,
|
|||||||
////////////// Wilson ; uses this implementation /////////////////////
|
////////////// Wilson ; uses this implementation /////////////////////
|
||||||
// Need Nc=3 though //
|
// Need Nc=3 though //
|
||||||
|
|
||||||
template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out);
|
int ss,int sU,const FermionField &in, FermionField &out);
|
||||||
template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out);
|
int ss,int sU,const FermionField &in, FermionField &out);
|
||||||
template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out);
|
int ss,int sU,const FermionField &in, FermionField &out);
|
||||||
template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out);
|
int ss,int sU,const FermionField &in, FermionField &out);
|
||||||
|
|
||||||
|
|
||||||
template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out);
|
int ss,int sU,const FermionField &in, FermionField &out);
|
||||||
template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out);
|
int ss,int sU,const FermionField &in, FermionField &out);
|
||||||
template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out);
|
int ss,int sU,const FermionField &in, FermionField &out);
|
||||||
template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out);
|
int ss,int sU,const FermionField &in, FermionField &out);
|
||||||
|
|
||||||
|
|
||||||
template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out);
|
int ss,int sU,const FermionField &in, FermionField &out);
|
||||||
template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out);
|
int ss,int sU,const FermionField &in, FermionField &out);
|
||||||
template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out);
|
int ss,int sU,const FermionField &in, FermionField &out);
|
||||||
template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out);
|
int ss,int sU,const FermionField &in, FermionField &out);
|
||||||
|
|
||||||
|
@ -88,7 +88,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
|
|
||||||
#define VPREFETCHG(O,A) "prefetcht0 "#O"*64("#A");\n"
|
#define VPREFETCHG(O,A) "prefetcht0 "#O"*64("#A");\n"
|
||||||
#define VPREFETCH2(O,A) "prefetcht1 "#O"*64("#A");\n"
|
#define VPREFETCH2(O,A) "prefetcht1 "#O"*64("#A");\n"
|
||||||
|
#define VPREFETCHP(O,A) "prefetcht1 "#O"*64("#A");\n"
|
||||||
#define VPREFETCHW(O,A) "prefetchwt1 "#O"*64("#A");\n"
|
#define VPREFETCHW(O,A) "prefetchwt1 "#O"*64("#A");\n"
|
||||||
|
#define VPREFETCHNTA(O,A)
|
||||||
|
#define VPREFETCH(O,A)
|
||||||
|
|
||||||
#define VEVICT(O,A)
|
#define VEVICT(O,A)
|
||||||
|
|
||||||
//"vprefetche0 "#O"*64("#A");\n" "vprefetche1 ("#O"+12)*64("#A");\n"
|
//"vprefetche0 "#O"*64("#A");\n" "vprefetche1 ("#O"+12)*64("#A");\n"
|
||||||
@ -124,8 +128,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
#define ZLOADf(OFF,PTR,ri,ir) VLOADf(OFF,PTR,ir) VSHUFf(ir,ri)
|
#define ZLOADf(OFF,PTR,ri,ir) VLOADf(OFF,PTR,ir) VSHUFf(ir,ri)
|
||||||
#define ZLOADd(OFF,PTR,ri,ir) VLOADd(OFF,PTR,ir) VSHUFd(ir,ri)
|
#define ZLOADd(OFF,PTR,ri,ir) VLOADd(OFF,PTR,ir) VSHUFd(ir,ri)
|
||||||
|
|
||||||
#define VPREFETCHNTA(O,A)
|
|
||||||
#define VPREFETCH(O,A)
|
|
||||||
|
|
||||||
#define VSTOREf(OFF,PTR,SRC) "vmovaps " #SRC "," #OFF "*64(" #PTR ")" ";\n"
|
#define VSTOREf(OFF,PTR,SRC) "vmovaps " #SRC "," #OFF "*64(" #PTR ")" ";\n"
|
||||||
#define VSTOREd(OFF,PTR,SRC) "vmovapd " #SRC "," #OFF "*64(" #PTR ")" ";\n"
|
#define VSTOREd(OFF,PTR,SRC) "vmovapd " #SRC "," #OFF "*64(" #PTR ")" ";\n"
|
||||||
|
@ -559,22 +559,23 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
VSUB(UChi_02,result_22,result_22)\
|
VSUB(UChi_02,result_22,result_22)\
|
||||||
VSUB(UChi_12,result_32,result_32) );
|
VSUB(UChi_12,result_32,result_32) );
|
||||||
|
|
||||||
#define PREFETCH_CHIMU(A) \
|
#define PREFETCH_CHIMU(A)
|
||||||
|
/*
|
||||||
LOAD64(%r9,A) \
|
LOAD64(%r9,A) \
|
||||||
__asm__ ( \
|
__asm__ ( \
|
||||||
VPREFETCHG(12,%r9)\
|
VPREFETCHG(0,%r9)\
|
||||||
VPREFETCHG(13,%r9)\
|
VPREFETCHG(1,%r9)\
|
||||||
VPREFETCHG(14,%r9)\
|
VPREFETCHG(2,%r9)\
|
||||||
VPREFETCHG(15,%r9)\
|
VPREFETCHG(3,%r9)\
|
||||||
VPREFETCHG(16,%r9)\
|
VPREFETCHG(4,%r9)\
|
||||||
VPREFETCHG(17,%r9)\
|
VPREFETCHG(5,%r9)\
|
||||||
VPREFETCHG(18,%r9)\
|
VPREFETCHG(6,%r9)\
|
||||||
VPREFETCHG(19,%r9)\
|
VPREFETCHG(7,%r9)\
|
||||||
VPREFETCHG(20,%r9)\
|
VPREFETCHG(8,%r9)\
|
||||||
VPREFETCHG(21,%r9)\
|
VPREFETCHG(9,%r9)\
|
||||||
VPREFETCHG(22,%r9)\
|
VPREFETCHG(10,%r9)\
|
||||||
VPREFETCHG(23,%r9));
|
VPREFETCHG(11,%r9));
|
||||||
|
*/
|
||||||
#define PERMUTE_DIR0 __asm__ ( \
|
#define PERMUTE_DIR0 __asm__ ( \
|
||||||
VPERM0(Chi_00,Chi_00) \
|
VPERM0(Chi_00,Chi_00) \
|
||||||
VPERM0(Chi_01,Chi_01) \
|
VPERM0(Chi_01,Chi_01) \
|
||||||
@ -612,8 +613,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
LOAD64(%r8,ptr) \
|
LOAD64(%r8,ptr) \
|
||||||
LOAD64(%r9,pf) \
|
LOAD64(%r9,pf) \
|
||||||
__asm__ ( \
|
__asm__ ( \
|
||||||
VPREFETCH2(9,%r8) \
|
VPREFETCH2(9,%r8) VPREFETCH2(10,%r8) \
|
||||||
VPREFETCH2(10,%r8) \
|
|
||||||
VPREFETCH2(11,%r8) \
|
VPREFETCH2(11,%r8) \
|
||||||
VPREFETCH2(12,%r8) \
|
VPREFETCH2(12,%r8) \
|
||||||
VPREFETCH2(13,%r8) \
|
VPREFETCH2(13,%r8) \
|
||||||
|
@ -49,16 +49,25 @@ LebesgueOrder::LebesgueOrder(GridBase *_grid)
|
|||||||
{
|
{
|
||||||
grid = _grid;
|
grid = _grid;
|
||||||
if ( Block[0]==0) ZGraph();
|
if ( Block[0]==0) ZGraph();
|
||||||
|
else if ( Block[1]==0) NoBlocking();
|
||||||
else CartesianBlocking();
|
else CartesianBlocking();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void LebesgueOrder::NoBlocking(void)
|
||||||
|
{
|
||||||
|
std::cout<<GridLogDebug<<"Lexicographic : no cache blocking"<<std::endl;
|
||||||
|
_LebesgueReorder.resize(0);
|
||||||
|
for ( int s = 0 ; s!= grid->oSites();s++){
|
||||||
|
_LebesgueReorder.push_back(s);
|
||||||
|
}
|
||||||
|
}
|
||||||
void LebesgueOrder::CartesianBlocking(void)
|
void LebesgueOrder::CartesianBlocking(void)
|
||||||
{
|
{
|
||||||
_LebesgueReorder.resize(0);
|
_LebesgueReorder.resize(0);
|
||||||
|
|
||||||
std::cout << GridLogMessage << " CartesianBlocking ";
|
std::cout << GridLogDebug << " CartesianBlocking ";
|
||||||
for(int d=0;d<Block.size();d++) std::cout <<Block[d]<<" ";
|
// for(int d=0;d<Block.size();d++) std::cout <<Block[d]<<" ";
|
||||||
std::cout<<std::endl;
|
// std::cout<<std::endl;
|
||||||
|
|
||||||
IndexInteger ND = grid->_ndimension;
|
IndexInteger ND = grid->_ndimension;
|
||||||
|
|
||||||
@ -116,7 +125,8 @@ void LebesgueOrder::IterateI(int ND,
|
|||||||
void LebesgueOrder::ZGraph(void)
|
void LebesgueOrder::ZGraph(void)
|
||||||
{
|
{
|
||||||
_LebesgueReorder.resize(0);
|
_LebesgueReorder.resize(0);
|
||||||
|
|
||||||
|
std::cout << GridLogDebug << " Lebesgue order "<<std::endl;
|
||||||
// Align up dimensions to power of two.
|
// Align up dimensions to power of two.
|
||||||
const IndexInteger one=1;
|
const IndexInteger one=1;
|
||||||
|
|
||||||
|
@ -59,6 +59,7 @@ namespace Grid {
|
|||||||
// Cartesian stencil blocking strategy
|
// Cartesian stencil blocking strategy
|
||||||
/////////////////////////////////
|
/////////////////////////////////
|
||||||
static std::vector<int> Block;
|
static std::vector<int> Block;
|
||||||
|
void NoBlocking(void);
|
||||||
void CartesianBlocking(void);
|
void CartesianBlocking(void);
|
||||||
void IterateO(int ND,int dim,
|
void IterateO(int ND,int dim,
|
||||||
std::vector<IndexInteger> & xo,
|
std::vector<IndexInteger> & xo,
|
||||||
|
Loading…
Reference in New Issue
Block a user