mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-13 01:05:36 +00:00
Improvements to the assembler interface that let us move chunks of the
site and s loop into the kernels. This will save on function call overhead and guarantee L2 prefetching strategy is right since OMP can't distribute the sub-chunks of work.
This commit is contained in:
parent
d9408893b3
commit
55f65b81b5
@ -132,19 +132,21 @@ int main (int argc, char ** argv)
|
|||||||
|
|
||||||
RealD NP = UGrid->_Nprocessors;
|
RealD NP = UGrid->_Nprocessors;
|
||||||
|
|
||||||
for(int doasm=0;doasm<1;doasm++){
|
for(int doasm=1;doasm<2;doasm++){
|
||||||
|
|
||||||
QCD::WilsonKernelsStatic::AsmOpt=doasm;
|
QCD::WilsonKernelsStatic::AsmOpt=doasm;
|
||||||
|
|
||||||
DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,params);
|
DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,params);
|
||||||
|
|
||||||
std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
|
std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
|
||||||
int ncall =50;
|
int ncall =10;
|
||||||
if (1) {
|
if (1) {
|
||||||
|
|
||||||
double t0=usecond();
|
double t0=usecond();
|
||||||
for(int i=0;i<ncall;i++){
|
for(int i=0;i<ncall;i++){
|
||||||
|
__SSC_START;
|
||||||
Dw.Dhop(src,result,0);
|
Dw.Dhop(src,result,0);
|
||||||
|
__SSC_STOP;
|
||||||
}
|
}
|
||||||
double t1=usecond();
|
double t1=usecond();
|
||||||
|
|
||||||
|
@ -296,12 +296,12 @@ PARALLEL_FOR_LOOP
|
|||||||
if ( dag == DaggerYes ) {
|
if ( dag == DaggerYes ) {
|
||||||
PARALLEL_FOR_LOOP
|
PARALLEL_FOR_LOOP
|
||||||
for(int sss=0;sss<in._grid->oSites();sss++){
|
for(int sss=0;sss<in._grid->oSites();sss++){
|
||||||
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out);
|
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,1,1,in,out);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
PARALLEL_FOR_LOOP
|
PARALLEL_FOR_LOOP
|
||||||
for(int sss=0;sss<in._grid->oSites();sss++){
|
for(int sss=0;sss<in._grid->oSites();sss++){
|
||||||
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,in,out);
|
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,1,1,in,out);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
@ -95,11 +95,6 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
|
|||||||
|
|
||||||
// Allocate the required comms buffer
|
// Allocate the required comms buffer
|
||||||
ImportGauge(_Umu);
|
ImportGauge(_Umu);
|
||||||
alltime=0;
|
|
||||||
commtime=0;
|
|
||||||
jointime=0;
|
|
||||||
dslashtime=0;
|
|
||||||
dslash1time=0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
@ -291,33 +286,6 @@ void WilsonFermion5D<Impl>::DhopDerivEO(GaugeField &mat,
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
template<class Impl>
|
|
||||||
void WilsonFermion5D<Impl>::Report(void)
|
|
||||||
{
|
|
||||||
return;
|
|
||||||
#if 0
|
|
||||||
std::cout<<GridLogMessage << "******************** WilsonFermion"<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "Wilson5d time "<<alltime <<" us"<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "HaloBegin time "<<commtime <<" us"<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "Dslash time "<<dslashtime<<" us"<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "Dslash1 time "<<dslash1time<<" us"<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "HaloComplete time "<<jointime<<" us"<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "******************** Stencil"<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "Stencil all gather time "<<Stencil.halogtime<<" us"<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "Stencil nosplice gather time "<<Stencil.nosplicetime<<" us"<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "Stencil splice gather time "<<Stencil.splicetime<<" us"<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "********************"<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "Stencil gather "<<Stencil.gathertime<<" us"<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "Stencil gather simd "<<Stencil.gathermtime<<" us"<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "Stencil merge simd "<<Stencil.mergetime<<" us"<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "Stencil spin simd "<<Stencil.spintime<<" us"<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "********************"<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "Stencil MB/s "<<(double)Stencil.comms_bytes/Stencil.commtime<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "Stencil comm time "<<Stencil.commtime<<" us"<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "Stencil join time "<<Stencil.jointime<<" us"<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "********************"<<std::endl;
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
|
void WilsonFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
|
||||||
const FermionField &A,
|
const FermionField &A,
|
||||||
@ -341,43 +309,29 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
|
|||||||
const FermionField &in, FermionField &out,int dag)
|
const FermionField &in, FermionField &out,int dag)
|
||||||
{
|
{
|
||||||
// assert((dag==DaggerNo) ||(dag==DaggerYes));
|
// assert((dag==DaggerNo) ||(dag==DaggerYes));
|
||||||
alltime-=usecond();
|
|
||||||
Compressor compressor(dag);
|
Compressor compressor(dag);
|
||||||
|
|
||||||
int LLs = in._grid->_rdimensions[0];
|
int LLs = in._grid->_rdimensions[0];
|
||||||
|
|
||||||
commtime -=usecond();
|
|
||||||
st.HaloExchange(in,compressor);
|
st.HaloExchange(in,compressor);
|
||||||
commtime +=usecond();
|
|
||||||
|
|
||||||
jointime -=usecond();
|
|
||||||
jointime +=usecond();
|
|
||||||
|
|
||||||
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
|
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
|
||||||
dslashtime -=usecond();
|
|
||||||
if ( dag == DaggerYes ) {
|
if ( dag == DaggerYes ) {
|
||||||
PARALLEL_FOR_LOOP
|
PARALLEL_FOR_LOOP
|
||||||
for(int ss=0;ss<U._grid->oSites();ss++){
|
for(int ss=0;ss<U._grid->oSites();ss++){
|
||||||
for(int s=0;s<LLs;s++){
|
|
||||||
int sU=ss;
|
int sU=ss;
|
||||||
int sF=s+LLs*sU;
|
int sF=LLs*sU;
|
||||||
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
|
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,LLs,1,in,out);
|
||||||
}
|
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
PARALLEL_FOR_LOOP
|
PARALLEL_FOR_LOOP
|
||||||
for(int ss=0;ss<U._grid->oSites();ss++){
|
for(int ss=0;ss<U._grid->oSites();ss++){
|
||||||
int sU=lo.Reorder(ss);
|
int sU=ss;
|
||||||
int sF=LLs*sU;
|
int sF=LLs*sU;
|
||||||
for(int s=0;s<LLs;s++){
|
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,LLs,1,in,out);
|
||||||
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out);
|
|
||||||
sF++;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
dslashtime +=usecond();
|
|
||||||
alltime+=usecond();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
|
@ -60,11 +60,7 @@ namespace Grid {
|
|||||||
public:
|
public:
|
||||||
INHERIT_IMPL_TYPES(Impl);
|
INHERIT_IMPL_TYPES(Impl);
|
||||||
typedef WilsonKernels<Impl> Kernels;
|
typedef WilsonKernels<Impl> Kernels;
|
||||||
double alltime;
|
|
||||||
double jointime;
|
|
||||||
double commtime;
|
|
||||||
double dslashtime;
|
|
||||||
double dslash1time;
|
|
||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
// Implement the abstract base
|
// Implement the abstract base
|
||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
@ -139,7 +135,6 @@ namespace Grid {
|
|||||||
// DoubleStore
|
// DoubleStore
|
||||||
void ImportGauge(const GaugeField &_Umu);
|
void ImportGauge(const GaugeField &_Umu);
|
||||||
|
|
||||||
void Report(void);
|
|
||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
// Data members require to support the functionality
|
// Data members require to support the functionality
|
||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
|
@ -40,23 +40,42 @@ WilsonKernels<Impl>::WilsonKernels(const ImplParams &p): Base(p) {};
|
|||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU,const FermionField &in, FermionField &out)
|
int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
if ( AsmOpt ) WilsonKernels<Impl>::DiracOptAsmDhopSite(st,U,buf,sF,sU,in,out);
|
if ( AsmOpt ) {
|
||||||
else if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSite(st,U,buf,sF,sU,in,out);
|
|
||||||
|
WilsonKernels<Impl>::DiracOptAsmDhopSite(st,U,buf,sF,sU,Ls,Ns,in,out);
|
||||||
|
|
||||||
|
} else {
|
||||||
|
|
||||||
|
for(int site=0;site<Ns;site++) {
|
||||||
|
for(int s=0;s<Ls;s++) {
|
||||||
|
if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSite(st,U,buf,sF,sU,in,out);
|
||||||
else WilsonKernels<Impl>::DiracOptGenericDhopSite(st,U,buf,sF,sU,in,out);
|
else WilsonKernels<Impl>::DiracOptGenericDhopSite(st,U,buf,sF,sU,in,out);
|
||||||
|
sF++;
|
||||||
|
}
|
||||||
|
sU++;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU,const FermionField &in, FermionField &out)
|
int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
// No asm implementation yet.
|
// No asm implementation yet.
|
||||||
// if ( AsmOpt ) WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st,U,buf,sF,sU,in,out);
|
// if ( AsmOpt ) WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st,U,buf,sF,sU,in,out);
|
||||||
// else
|
// else
|
||||||
|
for(int site=0;site<Ns;site++) {
|
||||||
|
for(int s=0;s<Ls;s++) {
|
||||||
if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st,U,buf,sF,sU,in,out);
|
if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st,U,buf,sF,sU,in,out);
|
||||||
else WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,U,buf,sF,sU,in,out);
|
else WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,U,buf,sF,sU,in,out);
|
||||||
|
sF++;
|
||||||
|
}
|
||||||
|
sU++;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -55,11 +55,11 @@ namespace Grid {
|
|||||||
|
|
||||||
void DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
void DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU,const FermionField &in, FermionField &out);
|
int sF, int sU,int Ls, int Ns, const FermionField &in, FermionField &out);
|
||||||
|
|
||||||
void DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
void DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU,const FermionField &in,FermionField &out);
|
int sF,int sU,int Ls, int Ns, const FermionField &in,FermionField &out);
|
||||||
|
|
||||||
void DiracOptDhopDir(StencilImpl &st,DoubledGaugeField &U,
|
void DiracOptDhopDir(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
@ -77,7 +77,7 @@ namespace Grid {
|
|||||||
|
|
||||||
void DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
void DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU,const FermionField &in, FermionField &out);
|
int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out);
|
||||||
|
|
||||||
|
|
||||||
void DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
void DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
|
@ -41,7 +41,7 @@ namespace QCD {
|
|||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out)
|
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
assert(0);
|
assert(0);
|
||||||
}
|
}
|
||||||
@ -73,7 +73,7 @@ static int signInit = setupSigns();
|
|||||||
template<>
|
template<>
|
||||||
void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out)
|
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||||
|
|
||||||
#undef VMOVIDUP
|
#undef VMOVIDUP
|
||||||
@ -87,29 +87,29 @@ void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaug
|
|||||||
template<>
|
template<>
|
||||||
void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out)
|
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
template void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
template void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out);
|
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
||||||
|
|
||||||
template void WilsonKernels<WilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
template void WilsonKernels<WilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out);
|
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
||||||
template void WilsonKernels<GparityWilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
template void WilsonKernels<GparityWilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out);
|
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
||||||
template void WilsonKernels<GparityWilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
template void WilsonKernels<GparityWilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out);
|
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
||||||
template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out);
|
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
||||||
template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out);
|
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
||||||
}}
|
}}
|
||||||
|
|
||||||
|
@ -2,19 +2,24 @@
|
|||||||
int locala,perma, ptypea;
|
int locala,perma, ptypea;
|
||||||
int localb,permb, ptypeb;
|
int localb,permb, ptypeb;
|
||||||
uint64_t basea, baseb;
|
uint64_t basea, baseb;
|
||||||
|
uint64_t basex;
|
||||||
const uint64_t plocal =(uint64_t) & in._odata[0];
|
const uint64_t plocal =(uint64_t) & in._odata[0];
|
||||||
|
|
||||||
// vComplexF isigns[2] = { signs[0], signs[1] };
|
// vComplexF isigns[2] = { signs[0], signs[1] };
|
||||||
vComplexF *isigns = &signs[0];
|
vComplexF *isigns = &signs[0];
|
||||||
|
|
||||||
MASK_REGS;
|
MASK_REGS;
|
||||||
|
|
||||||
|
for(int site=0;site<Ns;site++) {
|
||||||
|
for(int s=0;s<Ls;s++) {
|
||||||
|
|
||||||
////////////////////////////////
|
////////////////////////////////
|
||||||
// Xp
|
// Xp
|
||||||
////////////////////////////////
|
////////////////////////////////
|
||||||
int ent=ss*8;// 2*Ndim
|
int ent=ss*8;// 2*Ndim
|
||||||
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
|
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
|
||||||
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
|
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
|
||||||
|
basex = basea;
|
||||||
|
|
||||||
if ( locala ) {
|
if ( locala ) {
|
||||||
LOAD64(%r10,isigns);
|
LOAD64(%r10,isigns);
|
||||||
@ -149,5 +154,8 @@
|
|||||||
TP_RECON_ACCUM;
|
TP_RECON_ACCUM;
|
||||||
|
|
||||||
SAVE_RESULT(&out._odata[ss]);
|
SAVE_RESULT(&out._odata[ss]);
|
||||||
|
ss++;
|
||||||
|
}
|
||||||
|
sU++;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -807,7 +807,7 @@ void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,Dou
|
|||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU,const FermionField &in, FermionField &out)
|
int sF,int sU,const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
DiracOptDhopSite(st,U,buf,sF,sU,in,out); // returns void, will template override for Wilson Nc=3
|
assert(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template<>
|
||||||
@ -815,7 +815,7 @@ void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,
|
|||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU,const FermionField &in, FermionField &out)
|
int sF,int sU,const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
|
assert(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template<>
|
||||||
@ -823,7 +823,7 @@ void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,Dou
|
|||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU,const FermionField &in, FermionField &out)
|
int sF,int sU,const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
|
assert(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template<>
|
||||||
@ -831,7 +831,7 @@ void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,
|
|||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU,const FermionField &in, FermionField &out)
|
int sF,int sU,const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
|
assert(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -631,6 +631,18 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
VMADDSUB(Z1,T2,UChi_11) \
|
VMADDSUB(Z1,T2,UChi_11) \
|
||||||
VMADDSUB(Z2,T1,UChi_02) VMOVIDUP(5,%r8,Z1 ) \
|
VMADDSUB(Z2,T1,UChi_02) VMOVIDUP(5,%r8,Z1 ) \
|
||||||
VMADDSUB(Z2,T2,UChi_12) VMOVIDUP(8,%r8,Z2 ) \
|
VMADDSUB(Z2,T2,UChi_12) VMOVIDUP(8,%r8,Z2 ) \
|
||||||
|
VPREFETCH2(12,%r9) \
|
||||||
|
VPREFETCH2(13,%r9) \
|
||||||
|
VPREFETCH2(14,%r9) \
|
||||||
|
VPREFETCH2(15,%r9) \
|
||||||
|
VPREFETCH2(16,%r9) \
|
||||||
|
VPREFETCH2(17,%r9) \
|
||||||
|
VPREFETCH2(18,%r9) \
|
||||||
|
VPREFETCH2(19,%r9) \
|
||||||
|
VPREFETCH2(20,%r9) \
|
||||||
|
VPREFETCH2(21,%r9) \
|
||||||
|
VPREFETCH2(22,%r9) \
|
||||||
|
VPREFETCH2(23,%r9) \
|
||||||
/*38*/ \
|
/*38*/ \
|
||||||
VMADDSUB(Z3,Chi_01,UChi_00) VSHUF(Chi_02,T1) \
|
VMADDSUB(Z3,Chi_01,UChi_00) VSHUF(Chi_02,T1) \
|
||||||
VMADDSUB(Z3,Chi_11,UChi_10) \
|
VMADDSUB(Z3,Chi_11,UChi_10) \
|
||||||
|
Loading…
Reference in New Issue
Block a user